In [1]:
#Based on this tutorial - http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html

import pandas as pd
import pylab as pl
import numpy as np 
import geopandas as gp
%pylab inline

Populating the interactive namespace from numpy and matplotlib


### Data munging for all DV routes

In [2]:
demographics = pd.read_excel('../data/DemoData.xlsx')
demographics['tract'] = demographics.ID.astype(str).str.zfill(9)

In [3]:
shp = gp.GeoDataFrame.from_file('../data/merged.json')

shp = shp.merge(demographics, on='tract')

shp.rename(columns={
    'Destination?': 'is_dest',
    'Origin?': 'is_orig',
    'Total Population': 'pop',
    '18 to 24 yrs': '_18_to_24',
    '65 and Above': 'above_65',
    'Median Age': 'age',
    'Non-white': 'nonwhite',
    'English less than "very well"': 'nonenglish',
    'Annaul Individual Income below 10000': 'income_below_10000',
    'Annaul Individual Income below 15000': 'income_below_15000',
    'Median Income': 'income',
    'Below 100 percent of the poverty level': 'below_pov',
    '100 to 149 percent of the poverty level': '_100_149_pov',
    'At or above 150 percent of the poverty level': 'above_150_pov',
    'Citizenship status': 'noncitizen',
    'Place of Birth': 'foreignborn'
}, inplace=True)

In [4]:
lehd = pd.read_csv('../data/merged.csv', dtype={'tract': str})

lehd = lehd.drop([u'Unnamed: 0', u'Both', u'Destination?', u'DollarVanLine', u'Origin?',
       u'average_commute_time', u'average_walk_distance', u'geometry',
       u'number_of_commuters',  u'w_county_tract', u'h_county_tract'], 1)

shp = shp.merge(lehd, on='tract')

In [5]:
shp.dropna(how="all", inplace=True)

In [6]:
routes = pd.read_csv('../data/csv/masterroutes.csv', dtype={'CT2010': str, 'BoroCode': str})

routes['county_code'] = ""
routes['county_code'][routes['BoroCode'] == '1'] = '061' # Manhattan
routes['county_code'][routes['BoroCode'] == '2'] = '005' # Bronx
routes['county_code'][routes['BoroCode'] == '3'] = '047' # Brooklyn
routes['county_code'][routes['BoroCode'] == '4'] = '081' # Queens
routes['county_code'][routes['BoroCode'] == '5'] = '085' # Staten Island

routes['tract'] = routes['county_code'] + routes['CT2010']

routes.drop(['FID',
 'FID_1',
 'CTLabel',
 'BoroCode',
 'BoroName',
 'CT2010',
 'BoroCT2010',
 'CDEligibil',
 'NTACode',
 'NTAName',
 'PUMA',
 'Shape_Leng',
 'Shape_Area',
 'Count_'
], 1, inplace = True)



A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [7]:
routes_data = shp.merge(routes, on = 'tract')

In [8]:
routes_data.columns

Index([                 u'Both',               u'is_dest',
               u'DollarVanLine',               u'is_orig',
        u'average_commute_time', u'average_walk_distance',
                    u'geometry',   u'number_of_commuters',
                       u'tract',                    u'ID',
                   u'Geography',                   u'pop',
                   u'_18_to_24',              u'above_65',
                         u'age',              u'nonwhite',
                  u'nonenglish',    u'income_below_10000',
          u'income_below_15000',                u'income',
                   u'below_pov',          u'_100_149_pov',
               u'above_150_pov',            u'noncitizen',
                 u'foreignborn',        u'wac_total_jobs',
              u'wac_jobs_lt_29',        u'wac_jobs_30_54',
              u'wac_jobs_gt_55',      u'wac_minc_lt_1250',
          u'wac_minc_1251_3333',      u'wac_minc_gt_3333',
              u'rac_total_jobs',        u'rac_jobs_lt_29

### Individual routes
#### Chinatown

In [9]:
#Origins
china_data_o = shp.ix[:,4:]
labels = shp['DollarVanLine']
china_data_o = china_data_o[[
        u'average_commute_time', u'average_walk_distance',
        u'number_of_commuters',u'pop',
                       u'_18_to_24',                    u'above_65',
                         u'age',             u'nonwhite',
                  u'nonenglish',    u'income_below_10000',
          u'income_below_15000',                u'income',
                  u'noncitizen',           u'foreignborn',
              u'wac_total_jobs',
              u'wac_jobs_lt_29',        u'wac_jobs_30_54',
              u'wac_jobs_gt_55',      u'wac_minc_lt_1250',
          u'wac_minc_1251_3333',      u'wac_minc_gt_3333',
              u'rac_total_jobs',        u'rac_jobs_lt_29',
              u'rac_jobs_30_54',        u'rac_jobs_gt_55',
            u'rac_minc_lt_1250',    u'rac_minc_1251_3333',
            u'rac_minc_gt_3333'
]]
china_data_o = routes_data[routes_data['OriginChina'] == 1]

#Dropping poverty because nans while computing covariance matrix
china_data_std_o = china_data_o

# Standardize Value Range

for c in [
        u'average_commute_time', u'average_walk_distance',
        u'number_of_commuters',u'pop',
                       u'_18_to_24',                    u'above_65',
                         u'age',             u'nonwhite',
                  u'nonenglish',    u'income_below_10000',
          u'income_below_15000',                u'income',
                 u'noncitizen',
                 u'foreignborn',        u'wac_total_jobs',
              u'wac_jobs_lt_29',        u'wac_jobs_30_54',
              u'wac_jobs_gt_55',      u'wac_minc_lt_1250',
          u'wac_minc_1251_3333',      u'wac_minc_gt_3333',
              u'rac_total_jobs',        u'rac_jobs_lt_29',
              u'rac_jobs_30_54',        u'rac_jobs_gt_55',
            u'rac_minc_lt_1250',    u'rac_minc_1251_3333',
            u'rac_minc_gt_3333'
]:
    china_data_std_o[c] = ((china_data_std_o[c] - china_data_std_o[c].mean())/china_data_std_o[c].std())
    


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [10]:
#Destination
china_data_d = shp.ix[:,4:]
labels = shp['DollarVanLine']
china_data_d = china_data_d[[
        u'average_commute_time', u'average_walk_distance',
        u'number_of_commuters',u'pop',
                       u'_18_to_24',                    u'above_65',
                         u'age',             u'nonwhite',
                  u'nonenglish',    u'income_below_10000',
          u'income_below_15000',                u'income',
                  u'noncitizen',           u'foreignborn',
              u'wac_total_jobs',
              u'wac_jobs_lt_29',        u'wac_jobs_30_54',
              u'wac_jobs_gt_55',      u'wac_minc_lt_1250',
          u'wac_minc_1251_3333',      u'wac_minc_gt_3333',
              u'rac_total_jobs',        u'rac_jobs_lt_29',
              u'rac_jobs_30_54',        u'rac_jobs_gt_55',
            u'rac_minc_lt_1250',    u'rac_minc_1251_3333',
            u'rac_minc_gt_3333'
]]
china_data_d = routes_data[routes_data['ChinaDest'] == 1]

#Dropping poverty because nans while computing covariance matrix
china_data_std_d = china_data_d

# Standardize Value Range

for c in [
        u'average_commute_time', u'average_walk_distance',
        u'number_of_commuters',u'pop',
                       u'_18_to_24',                    u'above_65',
                         u'age',             u'nonwhite',
                  u'nonenglish',    u'income_below_10000',
          u'income_below_15000',                u'income',
                 u'noncitizen',
                 u'foreignborn',        u'wac_total_jobs',
              u'wac_jobs_lt_29',        u'wac_jobs_30_54',
              u'wac_jobs_gt_55',      u'wac_minc_lt_1250',
          u'wac_minc_1251_3333',      u'wac_minc_gt_3333',
              u'rac_total_jobs',        u'rac_jobs_lt_29',
              u'rac_jobs_30_54',        u'rac_jobs_gt_55',
            u'rac_minc_lt_1250',    u'rac_minc_1251_3333',
            u'rac_minc_gt_3333'
]:
    china_data_std_d[c] = ((china_data_std_d[c] - china_data_std_d[c].mean())/china_data_std_d[c].std())
    


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


#### Flatbush

In [11]:
#Origins
flat_route_o = shp.ix[:,4:]
labels = shp['DollarVanLine']
flat_route_o = flat_route_o[[
        u'average_commute_time', u'average_walk_distance',
        u'number_of_commuters',u'pop',
                       u'_18_to_24',                    u'above_65',
                         u'age',             u'nonwhite',
                  u'nonenglish',    u'income_below_10000',
          u'income_below_15000',                u'income',
                  u'noncitizen',           u'foreignborn',
              u'wac_total_jobs',
              u'wac_jobs_lt_29',        u'wac_jobs_30_54',
              u'wac_jobs_gt_55',      u'wac_minc_lt_1250',
          u'wac_minc_1251_3333',      u'wac_minc_gt_3333',
              u'rac_total_jobs',        u'rac_jobs_lt_29',
              u'rac_jobs_30_54',        u'rac_jobs_gt_55',
            u'rac_minc_lt_1250',    u'rac_minc_1251_3333',
            u'rac_minc_gt_3333'
]]
flat_route_o = routes_data[routes_data['OriginFlat'] == 1]

#Dropping poverty because nans while computing covariance matrix
flat_route_std_o = flat_route_o

# Standardize Value Range

for c in [
        u'average_commute_time', u'average_walk_distance',
        u'number_of_commuters',u'pop',
                       u'_18_to_24',                    u'above_65',
                         u'age',             u'nonwhite',
                  u'nonenglish',    u'income_below_10000',
          u'income_below_15000',                u'income',
                 u'noncitizen',
                 u'foreignborn',        u'wac_total_jobs',
              u'wac_jobs_lt_29',        u'wac_jobs_30_54',
              u'wac_jobs_gt_55',      u'wac_minc_lt_1250',
          u'wac_minc_1251_3333',      u'wac_minc_gt_3333',
              u'rac_total_jobs',        u'rac_jobs_lt_29',
              u'rac_jobs_30_54',        u'rac_jobs_gt_55',
            u'rac_minc_lt_1250',    u'rac_minc_1251_3333',
            u'rac_minc_gt_3333'
]:
    flat_route_std_o[c] = ((flat_route_std_o[c] - flat_route_std_o[c].mean())/flat_route_std_o[c].std())
    


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [12]:
#Destination
flat_route_d = shp.ix[:,4:]
labels = shp['DollarVanLine']
flat_route_d = flat_route_d[[
        u'average_commute_time', u'average_walk_distance',
        u'number_of_commuters',u'pop',
                       u'_18_to_24',                    u'above_65',
                         u'age',             u'nonwhite',
                  u'nonenglish',    u'income_below_10000',
          u'income_below_15000',                u'income',
                  u'noncitizen',           u'foreignborn',
              u'wac_total_jobs',
              u'wac_jobs_lt_29',        u'wac_jobs_30_54',
              u'wac_jobs_gt_55',      u'wac_minc_lt_1250',
          u'wac_minc_1251_3333',      u'wac_minc_gt_3333',
              u'rac_total_jobs',        u'rac_jobs_lt_29',
              u'rac_jobs_30_54',        u'rac_jobs_gt_55',
            u'rac_minc_lt_1250',    u'rac_minc_1251_3333',
            u'rac_minc_gt_3333'
]]
flat_route_d = routes_data[routes_data['FlatbushDest'] == 1]

#Dropping poverty because nans while computing covariance matrix
flat_route_std_d = flat_route_d

# Standardize Value Range

for c in [
        u'average_commute_time', u'average_walk_distance',
        u'number_of_commuters',u'pop',
                       u'_18_to_24',                    u'above_65',
                         u'age',             u'nonwhite',
                  u'nonenglish',    u'income_below_10000',
          u'income_below_15000',                u'income',
                 u'noncitizen',
                 u'foreignborn',        u'wac_total_jobs',
              u'wac_jobs_lt_29',        u'wac_jobs_30_54',
              u'wac_jobs_gt_55',      u'wac_minc_lt_1250',
          u'wac_minc_1251_3333',      u'wac_minc_gt_3333',
              u'rac_total_jobs',        u'rac_jobs_lt_29',
              u'rac_jobs_30_54',        u'rac_jobs_gt_55',
            u'rac_minc_lt_1250',    u'rac_minc_1251_3333',
            u'rac_minc_gt_3333'
]:
    flat_route_std_d[c] = ((flat_route_std_d[c] - flat_route_std_d[c].mean())/flat_route_std_d[c].std())
    


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


#### Jamaica

In [19]:
#Destination

jam_route_o = shp.ix[:,4:]
labels = shp['DollarVanLine']
jam_route_o = jam_route_o[[
        u'average_commute_time', u'average_walk_distance',
        u'number_of_commuters',u'pop',
                       u'_18_to_24',                    u'above_65',
                         u'age',             u'nonwhite',
                  u'nonenglish',    u'income_below_10000',
          u'income_below_15000',                u'income',
                  u'noncitizen',           u'foreignborn',
              u'wac_total_jobs',
              u'wac_jobs_lt_29',        u'wac_jobs_30_54',
              u'wac_jobs_gt_55',      u'wac_minc_lt_1250',
          u'wac_minc_1251_3333',      u'wac_minc_gt_3333',
              u'rac_total_jobs',        u'rac_jobs_lt_29',
              u'rac_jobs_30_54',        u'rac_jobs_gt_55',
            u'rac_minc_lt_1250',    u'rac_minc_1251_3333',
            u'rac_minc_gt_3333'
]]
jam_route_o = routes_data[routes_data['OriginJamica'] == 1]

#Dropping poverty because nans while computing covariance matrix
jam_route_std_o = jam_route_o

# Standardize Value Range

for c in [
        u'average_commute_time', u'average_walk_distance',
        u'number_of_commuters',u'pop',
                       u'_18_to_24',                    u'above_65',
                         u'age',             u'nonwhite',
                  u'nonenglish',    u'income_below_10000',
          u'income_below_15000',                u'income',
                 u'noncitizen',
                 u'foreignborn',        u'wac_total_jobs',
              u'wac_jobs_lt_29',        u'wac_jobs_30_54',
              u'wac_jobs_gt_55',      u'wac_minc_lt_1250',
          u'wac_minc_1251_3333',      u'wac_minc_gt_3333',
              u'rac_total_jobs',        u'rac_jobs_lt_29',
              u'rac_jobs_30_54',        u'rac_jobs_gt_55',
            u'rac_minc_lt_1250',    u'rac_minc_1251_3333',
            u'rac_minc_gt_3333'
]:
    jam_route_std_o[c] = ((jam_route_std_o[c] - jam_route_std_o[c].mean())/jam_route_std_o[c].std())
    


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [20]:
#Destination
jam_route_d = shp.ix[:,4:]
labels = shp['DollarVanLine']
jam_route_d = jam_route_d[[
        u'average_commute_time', u'average_walk_distance',
        u'number_of_commuters',u'pop',
                       u'_18_to_24',                    u'above_65',
                         u'age',             u'nonwhite',
                  u'nonenglish',    u'income_below_10000',
          u'income_below_15000',                u'income',
                  u'noncitizen',           u'foreignborn',
              u'wac_total_jobs',
              u'wac_jobs_lt_29',        u'wac_jobs_30_54',
              u'wac_jobs_gt_55',      u'wac_minc_lt_1250',
          u'wac_minc_1251_3333',      u'wac_minc_gt_3333',
              u'rac_total_jobs',        u'rac_jobs_lt_29',
              u'rac_jobs_30_54',        u'rac_jobs_gt_55',
            u'rac_minc_lt_1250',    u'rac_minc_1251_3333',
            u'rac_minc_gt_3333'
]]
jam_route_d = routes_data[routes_data['Jamica'] == 1]

#Dropping poverty because nans while computing covariance matrix
jam_route_std_d = jam_route_d

# Standardize Value Range

for c in [
        u'average_commute_time', u'average_walk_distance',
        u'number_of_commuters',u'pop',
                       u'_18_to_24',                    u'above_65',
                         u'age',             u'nonwhite',
                  u'nonenglish',    u'income_below_10000',
          u'income_below_15000',                u'income',
                 u'noncitizen',
                 u'foreignborn',        u'wac_total_jobs',
              u'wac_jobs_lt_29',        u'wac_jobs_30_54',
              u'wac_jobs_gt_55',      u'wac_minc_lt_1250',
          u'wac_minc_1251_3333',      u'wac_minc_gt_3333',
              u'rac_total_jobs',        u'rac_jobs_lt_29',
              u'rac_jobs_30_54',        u'rac_jobs_gt_55',
            u'rac_minc_lt_1250',    u'rac_minc_1251_3333',
            u'rac_minc_gt_3333'
]:
    jam_route_std_d[c] = ((jam_route_std_d[c] - jam_route_std_d[c].mean())/jam_route_std_d[c].std())
    


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### PCA 

In [21]:
def pca(data, data_std):
    mean_vec = np.mean(data_std, axis=0)
    cov_mat = (data_std - mean_vec).T.dot((data_std - mean_vec)) / (data_std.shape[0]-1)
    #print('Covariance matrix \n%s' %cov_mat)

    eig_vals, eig_vecs = np.linalg.eig(cov_mat)
    cor_mat1 = np.corrcoef(data_std.T)

    eig_vals, eig_vecs = np.linalg.eig(cor_mat1)

    #print('Eigenvectors \n%s' %eig_vecs)
    #print('\nEigenvalues \n%s' %eig_vals)

    cor_mat2 = np.corrcoef(data.T)

    eig_vals, eig_vecs = np.linalg.eig(cor_mat2)
    u,s,v = np.linalg.svd(data_std.T)

    # Make a list of (eigenvalue, eigenvector) tuples
    eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:,i]) for i in range(len(eig_vals))]

    eig_pairs.sort()
    eig_pairs.reverse()

    #print('Eigenvalues in descending order:')
    for i in eig_pairs:
        print(i[0], i[1])# max(i[1]), np.mean(i[1]), np.std(i[1]), min(i[1]) )

In [25]:
pca(jam_route_o, jam_route_std_o)
print len(jam_route_o)
print len(jam_route_std_o)

ValueError: operands could not be broadcast together with shapes (64) (4) 

In [23]:
pca(china_data_o, china_data_std_o)

ValueError: operands could not be broadcast together with shapes (216) (4) 

In [26]:
mean_vec = np.mean(jam_route_std_o, axis=0)
cov_mat = (jam_route_std_o - mean_vec).T.dot((jam_route_std_o - mean_vec)) / (jam_route_std_o.shape[0]-1)
#print('Covariance matrix \n%s' %cov_mat)

eig_vals, eig_vecs = np.linalg.eig(cov_mat)
cor_mat1 = np.corrcoef(data_std.T)

eig_vals, eig_vecs = np.linalg.eig(cor_mat1)

#print('Eigenvectors \n%s' %eig_vecs)
#print('\nEigenvalues \n%s' %eig_vals)

cor_mat2 = np.corrcoef(data.T)

eig_vals, eig_vecs = np.linalg.eig(cor_mat2)
u,s,v = np.linalg.svd(data_std.T)

# Make a list of (eigenvalue, eigenvector) tuples
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:,i]) for i in range(len(eig_vals))]

eig_pairs.sort()
eig_pairs.reverse()

#print('Eigenvalues in descending order:')
for i in eig_pairs:
    print(i[0], i[1])# max(i[1]), np.mean(i[1]), np.std(i[1]), min(i[1]) )

ValueError: operands could not be broadcast together with shapes (64) (4) 

### Features

In [None]:
jam_route_std_o.columns[np.abs([-0.24744552, -0.22750989,  0.26752878,  0.23646118, -0.01404069,
       -0.00187555, -0.002666  , -0.08972109, -0.05356957, -0.06964585,
       -0.07788279,  0.15675925, -0.05945666, -0.01701496,  0.18997601,
        0.19389964,  0.18683487,  0.18248547,  0.19743281,  0.19256351,
        0.17890414,  0.28774482,  0.27153521,  0.28622749,  0.25451958,
        0.2163573 ,  0.1689631 ,  0.28377035]) > 0.20]

In [None]:
# Not Using this

matrix_w = np.hstack((eig_pairs[0][1].reshape(28,1),
                      eig_pairs[1][1].reshape(28,1)))

#print('Matrix W:\n', matrix_w)
Y = data_std.dot(matrix_w)
from sklearn.decomposition import PCA as sklearnPCA
sklearn_pca = sklearnPCA(n_components=2)
Y_sklearn = sklearn_pca.fit_transform(data_std)
from sklearn.decomposition import SparsePCA as sparsePCA
sparse_pca = sparsePCA(n_components=4)
sparse_pca.fit(data_std)
Y_sparse = sparse_pca.transform(data_std)

In [None]:
len()

In [None]:
routes_data.columns

In [None]:
len(jam_route_o)