In [231]:
from __future__ import print_function

# Jupyter display
from IPython.display import display

# json
import json

# widgets
import ipywidgets as widgets
import bqplot as bq
import ipyleaflet as ll

# numerics
import pandas as pd
import numpy as np
from sklearn import linear_model

# colormap
import matplotlib as mpl
import matplotlib.cm
import matplotlib.colors

def n_colors(n, colormap=mpl.cm.Blues):
    """Return a list of ``n`` hex colour strings sampled from ``colormap``.

    The colormap is evaluated at ``n`` evenly spaced positions between 0.0
    and 1.0 (both inclusive). Only the first three components of each
    sample are passed to ``rgb2hex``.
    """
    positions = np.linspace(0.0, 1.0, n)
    hex_codes = []
    for sample in colormap(positions):
        hex_codes.append(mpl.colors.rgb2hex(sample[0:3]))
    return hex_codes

def data_to_colors(data, colormap=mpl.cm.plasma):
    """Return one hex colour string per value in ``data``.

    ``data`` is first rescaled with a default-constructed
    ``mpl.colors.Normalize`` and the result is looked up in ``colormap``.
    Only the first three components of each looked-up colour are passed to
    ``rgb2hex``.
    """
    scaled = mpl.colors.Normalize()(data)
    return [mpl.colors.rgb2hex(sample[0:3]) for sample in colormap(scaled)]

In [232]:
# Load the merged analysis dataset; the path is relative to the current working directory.
data = pd.read_csv('./../analysis_data/merged_data.csv')

In [233]:
# Keep only the eight race/ethnicity columns; these are the regression features.
race_data = data.loc[:, [
    'hispanic',
    'white',
    'black',
    'american_indian',
    'asian',
    'pac_islander',
    'other_races',
    'two_races',
]]

In [234]:
# Divide every race column by 100 (presumably percent -> fraction; confirm the source units).
# NOTE(review): race_data is rebound from itself, so re-running this cell divides by 100 again.
race_data = race_data / 100.

In [235]:
# Regression target: the 'score' column of the loaded dataset.
result = data['score']

In [236]:
# Linear regression with no intercept term (fit_intercept=False).
clf = linear_model.LinearRegression(fit_intercept=False)

In [237]:
# Fit score against the eight scaled race columns (one row per sample).
clf.fit(race_data.values, result.values)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=False)

In [238]:
# Fitted coefficients, one per race column, in the order the columns were selected.
clf.coef_

array([  8.53779252,   7.23122887,   6.87597854, -11.30570004,
         8.33566622,  13.47011833,  22.60022925,  25.02258295])

In [239]:
# The model was created with fit_intercept=False, so no intercept was estimated.
clf.intercept_

0.0

In [240]:
# Residuals: observed score minus the score predicted from the race columns alone.
race_corrected_scores = result.values - clf.predict(race_data.values)

In [241]:
# Inspect the residuals (one value per input row).
race_corrected_scores

array([-0.66824351, -0.13678397, -0.68460998, -0.12926007,  0.18275362,
       -0.53382416,  0.09590031,  0.08165585,  0.31821955,  0.07203872,
        0.08359963,  0.44225876,  0.02452543, -0.13185275, -0.24675444,
       -0.10308096,  0.07683122,  0.21375581,  0.29287825,  0.93558465,
       -0.06722998, -0.06038546, -0.06514861,  0.31351156, -0.30858819])

In [242]:
# Work on a deep copy so that later edits to this frame do not modify `data`.
race_corrected_data = data.copy(deep=True)

In [243]:
# 'Unnamed: 0' is the name pandas gives a column with no header; relabel it 'zipcode'.
# The rename is applied in place on the copy.
race_corrected_data.rename(columns={'Unnamed: 0': 'zipcode'}, inplace=True)

In [244]:
# Overwrite the raw score with its race-corrected residual.
race_corrected_data['score'] = race_corrected_scores

In [245]:
# Persist the race-corrected dataset next to the input file.
# NOTE(review): to_csv writes the DataFrame index by default, so reading this file back
# will produce a new 'Unnamed: 0' column; pass index=False if that is not wanted.
race_corrected_data.to_csv('./../analysis_data/race_corrected_merged_data.csv')

In [246]:
# Correlation of every column with the last column of the frame (the corrected 'score').
# `.iloc` is used for positional selection; the `.ix` indexer is no longer available in pandas.
race_corrected_data.corr().iloc[:, -1]

zipcode                0.235022
pct_bachelors         -0.007863
labor_part_rate        0.435326
pct_welfare            0.398628
low_pov_idx            0.426255
labor_idx              0.002488
env_health_idx         0.282486
hispanic               0.000420
white                  0.000682
black                  0.000238
american_indian        0.000211
asian                  0.000496
pac_islander           0.000355
other_races            0.000304
two_races              0.000700
housing_cost          -0.282558
transportation_cost    0.559355
pub_school_score      -0.127418
pr_school_score       -0.286654
rest_score            -0.210170
rest_proximity        -0.485048
crime_index           -0.474078
score                  1.000000
Name: score, dtype: float64

## Test: white vs. all other groups combined

In [247]:
# Split the race columns into 'white' on its own and every other group.
white_data = data.loc[:, ['white']]
other_race_data = data.loc[:, [
    'hispanic',
    'black',
    'american_indian',
    'asian',
    'pac_islander',
    'other_races',
    'two_races',
]]

In [248]:
# Divide both frames by 100 (presumably percent -> fraction; confirm the source units).
# NOTE(review): both names are rebound from themselves, so re-running this cell divides again.
other_race_data = other_race_data / 100.
white_data = white_data / 100.

In [249]:
# Row-wise sum of the seven non-white columns, giving a single combined feature per row.
# NOTE(review): this variable is the combined total, not the 'other_races' column of `data`.
other_races = other_race_data.sum(axis=1)

In [250]:
clf2 = linear_model.LinearRegression(fit_intercept=False)

In [251]:
clf.fit(np.array([other_races.values, white_data.values.flatten()]).T, result.values)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=False)

In [252]:
other_race_corrected_scores = result.values - \
            clf.predict(np.array([other_races.values, white_data.values.flatten()]).T)

In [253]:
# Inspect the residuals of the two-feature (other races combined, white) regression.
other_race_corrected_scores

array([-0.37893909, -0.24284901, -0.8797087 , -0.17084286,  0.12171745,
       -0.6107209 ,  0.14540864,  0.26859612,  0.31393066, -0.01587959,
       -0.11789936,  0.5411346 ,  0.18897944,  0.00228196,  0.1387698 ,
        0.1077993 , -0.22842552, -0.07852794,  0.35971443,  0.98541225,
       -0.04101819,  0.18391665, -0.20202538,  0.32261695, -0.71620061])

In [254]:
# Work on a deep copy so that later edits to this frame do not modify `data`.
other_race_corrected_data = data.copy(deep=True)

In [255]:
# 'Unnamed: 0' is the name pandas gives a column with no header; relabel it 'zipcode'.
# The rename is applied in place on the copy.
other_race_corrected_data.rename(columns={'Unnamed: 0': 'zipcode'}, inplace=True)

In [256]:
# Overwrite the raw score with the residual from the two-feature regression.
other_race_corrected_data['score'] = other_race_corrected_scores

In [257]:
# Correlations with the score corrected by the eight-feature model, shown again for comparison.
# `.iloc` is used for positional selection; the `.ix` indexer is no longer available in pandas.
race_corrected_data.corr().iloc[:, -1]

zipcode                0.235022
pct_bachelors         -0.007863
labor_part_rate        0.435326
pct_welfare            0.398628
low_pov_idx            0.426255
labor_idx              0.002488
env_health_idx         0.282486
hispanic               0.000420
white                  0.000682
black                  0.000238
american_indian        0.000211
asian                  0.000496
pac_islander           0.000355
other_races            0.000304
two_races              0.000700
housing_cost          -0.282558
transportation_cost    0.559355
pub_school_score      -0.127418
pr_school_score       -0.286654
rest_score            -0.210170
rest_proximity        -0.485048
crime_index           -0.474078
score                  1.000000
Name: score, dtype: float64

In [258]:
# Correlation of every column with the last column (the score corrected by the two-feature model).
# `.iloc` is used for positional selection; the `.ix` indexer is no longer available in pandas.
other_race_corrected_data.corr().iloc[:, -1]

zipcode                0.085783
pct_bachelors         -0.021863
labor_part_rate        0.304789
pct_welfare            0.320157
low_pov_idx            0.444121
labor_idx              0.038270
env_health_idx         0.574074
hispanic              -0.020540
white                  0.000730
black                 -0.146189
american_indian       -0.170569
asian                  0.052830
pac_islander          -0.036748
other_races            0.216303
two_races              0.283502
housing_cost          -0.304907
transportation_cost    0.575663
pub_school_score      -0.336108
pr_school_score       -0.553049
rest_score            -0.464413
rest_proximity        -0.512932
crime_index           -0.456346
score                  1.000000
Name: score, dtype: float64

In [259]:
# Persist the dataset corrected with the two-feature model next to the input file.
# NOTE(review): to_csv writes the DataFrame index by default, so reading this file back
# will produce a new 'Unnamed: 0' column; pass index=False if that is not wanted.
other_race_corrected_data.to_csv('./../analysis_data/other_race_corrected_merged_data.csv')