In [None]:
import pandas as pd

# Column labels, in file order. The data file is read with header=None below,
# so these names are attached positionally after loading.
column_names = [
    'state', 'county', 'community', 'communityname', 'fold',
    'population', 'householdsize', 'racepctblack', 'racePctWhite', 'racePctAsian',
    'racePctHisp', 'agePct12t21', 'agePct12t29', 'agePct16t24', 'agePct65up',
    'numbUrban', 'pctUrban', 'medIncome', 'pctWWage', 'pctWFarmSelf',
    'pctWInvInc', 'pctWSocSec', 'pctWPubAsst', 'pctWRetire', 'medFamInc',
    'perCapInc', 'whitePerCap', 'blackPerCap', 'indianPerCap', 'AsianPerCap',
    'OtherPerCap', 'HispPerCap', 'NumUnderPov', 'PctPopUnderPov', 'PctLess9thGrade',
    'PctNotHSGrad', 'PctBSorMore', 'PctUnemployed', 'PctEmploy', 'PctEmplManu',
    'PctEmplProfServ', 'PctOccupManu', 'PctOccupMgmtProf', 'MalePctDivorce', 'MalePctNevMarr',
    'FemalePctDiv', 'TotalPctDiv', 'PersPerFam', 'PctFam2Par', 'PctKids2Par',
    'PctYoungKids2Par', 'PctTeen2Par', 'PctWorkMomYoungKids', 'PctWorkMom', 'NumIlleg',
    'PctIlleg', 'NumImmig', 'PctImmigRecent', 'PctImmigRec5', 'PctImmigRec8',
    'PctImmigRec10', 'PctRecentImmig', 'PctRecImmig5', 'PctRecImmig8', 'PctRecImmig10',
    'PctSpeakEnglOnly', 'PctNotSpeakEnglWell', 'PctLargHouseFam', 'PctLargHouseOccup', 'PersPerOccupHous',
    'PersPerOwnOccHous', 'PersPerRentOccHous', 'PctPersOwnOccup', 'PctPersDenseHous', 'PctHousLess3BR',
    'MedNumBR', 'HousVacant', 'PctHousOccup', 'PctHousOwnOcc', 'PctVacantBoarded',
    'PctVacMore6Mos', 'MedYrHousBuilt', 'PctHousNoPhone', 'PctWOFullPlumb', 'OwnOccLowQuart',
    'OwnOccMedVal', 'OwnOccHiQuart', 'RentLowQ', 'RentMedian', 'RentHighQ',
    'MedRent', 'MedRentPctHousInc', 'MedOwnCostPctInc', 'MedOwnCostPctIncNoMtg', 'NumInShelters',
    'NumStreet', 'PctForeignBorn', 'PctBornSameState', 'PctSameHouse85', 'PctSameCity85',
    'PctSameState85', 'LemasSwornFT', 'LemasSwFTPerPop', 'LemasSwFTFieldOps', 'LemasSwFTFieldPerPop',
    'LemasTotalReq', 'LemasTotReqPerPop', 'PolicReqPerOffic', 'PolicPerPop', 'RacialMatchCommPol',
    'PctPolicWhite', 'PctPolicBlack', 'PctPolicHisp', 'PctPolicAsian', 'PctPolicMinor',
    'OfficAssgnDrugUnits', 'NumKindsDrugsSeiz', 'PolicAveOTWorked', 'LandArea', 'PopDens',
    'PctUsePubTrans', 'PolicCars', 'PolicOperBudg', 'LemasPctPolicOnPatr', 'LemasGangUnitDeploy',
    'LemasPctOfficDrugUn', 'PolicBudgPerPop', 'ViolentCrimesPerPop',
]

# '?' is parsed as NaN in addition to pandas' default NA markers.
df = pd.read_csv(
    './communities.data',
    sep=',',
    header=None,
    na_values='?',
    keep_default_na=True,
)
df.columns = column_names
df

In [None]:
# Do GBR, Ridge, Lasso on all 122 variables (same as last week)
# Target: an independent copy of the 'ViolentCrimesPerPop' column.
y = df.loc[:, 'ViolentCrimesPerPop'].copy()

# Features: everything except the five leading columns (state ... fold) and the target.
X = df.drop(
    labels=['state', 'county', 'community', 'communityname', 'fold', 'ViolentCrimesPerPop'],
    axis='columns',
)

In [None]:
# Lasso
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


# Candidate Lasso penalties: 100 log-spaced alphas between 1e-3 and 1e1.
grid = dict(estimator__alpha=np.logspace(-3, 1, 100))

# Median-impute missing values, standardise, then fit the Lasso.
pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
    ('standardise', StandardScaler()),
    ('estimator', Lasso()),
])

# Inner loop (hyperparameter optimisation): a 3-fold grid search over alpha.
# Inside the outer loop below it only ever sees the outer training portion
# (2/3 of the data), which it splits again into its own 2/3 train / 1/3 validation folds.
grid_search_cv = GridSearchCV(pipe, grid, cv=3)

# Outer loop: passing the grid search itself to cross_validate gives nested CV.
# cross_validate splits the data three times into train (2/3) and test (1/3);
# each time the training part is handed to GridSearchCV, which tunes alpha with
# its own cross-validation. (Last week the data was split only once.)
#
# cv_result['test_score']  - R-squared of the best model found by the grid search,
#                            evaluated on the held-out third (one value per outer fold).
# cv_result['train_score'] - R-squared of that same best model on the two thirds
#                            that the grid search was given.
cv_result = cross_validate(grid_search_cv, X, y, cv=3, return_train_score=True)
train_score = cv_result['train_score']
test_score = cv_result['test_score']

# Mean score across the outer folds, with the standard deviation as error bars.
plt.bar(
    x=[0, 1],
    height=[train_score.mean(), test_score.mean()],
    yerr=[train_score.std(), test_score.std()],
    tick_label=['Train', 'Test'],
)
plt.title('Lasso CV')
plt.show()

# Refit the grid search on all of the data so it can be used for prediction afterwards.
_ = grid_search_cv.fit(X, y)

In [None]:
# Now check the predictions from our Lasso
# Predict on the same rows the grid search was fitted on and start the
# prediction table, keeping X's index so rows line up with the source data.
y_pred = grid_search_cv.predict(X)
pred_df = pd.DataFrame({'Lasso': y_pred}, index=X.index)

In [None]:
# Ridge
from sklearn.linear_model import Ridge

# Candidate Ridge penalties: 100 log-spaced alphas between 1e-3 and 1e2.
grid = dict(estimator__alpha=np.logspace(-3, 2, 100))

# Median-impute missing values, standardise, then fit the Ridge regression.
pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
    ('standardise', StandardScaler()),
    ('estimator', Ridge()),
])

# Nested CV: the 3-fold grid search (inner loop) is scored by a 3-fold cross_validate (outer loop).
grid_search_cv = GridSearchCV(pipe, grid, cv=3)
cv_result = cross_validate(grid_search_cv, X, y, cv=3, return_train_score=True)
train_score = cv_result['train_score']
test_score = cv_result['test_score']

# Mean score across the outer folds, with the standard deviation as error bars.
plt.bar(
    x=[0, 1],
    height=[train_score.mean(), test_score.mean()],
    yerr=[train_score.std(), test_score.std()],
    tick_label=['Train', 'Test'],
)
plt.title('Ridge CV')
plt.show()

# Refit on all of the data and record the predictions for those same rows.
grid_search_cv.fit(X, y)
y_pred = grid_search_cv.predict(X)
pred_df['Ridge'] = y_pred

In [None]:
# GBR
from sklearn.ensemble import GradientBoostingRegressor

# Small 3 x 3 x 2 grid for the gradient-boosting regressor.
grid = dict(
    estimator__learning_rate=np.logspace(-2, -0.5, 3),
    estimator__min_samples_split=np.logspace(-2, 0, 3),  # 0.01, 0.1, 1.0
    estimator__max_depth=np.arange(1, 5, 2),             # 1, 3
)

# NOTE(review): the imputer here uses SimpleImputer's default strategy, whereas the
# Lasso/Ridge pipelines request strategy='median' - confirm this is intended.
pipe = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('standardise', StandardScaler()),
    ('estimator', GradientBoostingRegressor(random_state=42)),
])

# Nested CV: the 3-fold grid search (inner loop) is scored by a 3-fold cross_validate (outer loop).
grid_search_cv = GridSearchCV(pipe, grid, cv=3)
cv_result = cross_validate(grid_search_cv, X, y, cv=3, return_train_score=True)
train_score = cv_result['train_score']
test_score = cv_result['test_score']

# Mean score across the outer folds, with the standard deviation as error bars.
plt.bar(
    x=[0, 1],
    height=[train_score.mean(), test_score.mean()],
    yerr=[train_score.std(), test_score.std()],
    tick_label=['Train', 'Test'],
)
plt.title('GBR CV')
plt.show()

# Refit on all of the data and record the predictions for those same rows.
grid_search_cv.fit(X, y)
pred = grid_search_cv.predict(X)
pred_df['GBR'] = pred

In [None]:
# Now drop variables involving race
# racepctblack: percentage of population that is african american (numeric - decimal)
# racePctWhite: percentage of population that is caucasian (numeric - decimal)
# racePctAsian: percentage of population that is of asian heritage (numeric - decimal)
# racePctHisp: percentage of population that is of hispanic heritage (numeric - decimal)
# whitePerCap: per capita income for caucasians (numeric - decimal)
# blackPerCap: per capita income for african americans (numeric - decimal)
# indianPerCap: per capita income for native americans (numeric - decimal)
# AsianPerCap: per capita income for people with asian heritage (numeric - decimal)
# OtherPerCap: per capita income for people with 'other' heritage (numeric - decimal)
# HispPerCap: per capita income for people with hispanic heritage (numeric - decimal)
 
# The ten race-related columns described above.
race_variables = [
    'racepctblack',
    'racePctWhite',
    'racePctAsian',
    'racePctHisp',
    'whitePerCap',
    'blackPerCap',
    'indianPerCap',
    'AsianPerCap',
    'OtherPerCap',
    'HispPerCap',
]

# Feature matrix with those columns removed; X itself is left untouched.
X_wo_race = X.drop(labels=race_variables, axis='columns')

In [None]:
# Lasso w/o race variables
# Same Lasso setup as the all-variables run, fitted on X_wo_race instead of X.
# Candidate penalties: 100 log-spaced alphas between 1e-3 and 1e1.
grid = {
    'estimator__alpha': np.logspace(-3, 1, 100)
}

# Median-impute missing values, standardise, then fit the Lasso.
pipe = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
    ('standardise', StandardScaler()),
    ('estimator', Lasso())
])

# Nested CV: the 3-fold grid search (inner loop) is scored by a 3-fold cross_validate (outer loop).
grid_search_cv = GridSearchCV(estimator=pipe, param_grid=grid, cv=3)
cv_result = cross_validate(estimator=grid_search_cv, X=X_wo_race, y=y, cv=3, return_train_score=True)
test_score = cv_result['test_score']
train_score = cv_result['train_score']

# Mean score across the outer folds, with the standard deviation as error bars.
plt.bar(x=range(2),
        height=[np.mean(train_score), np.mean(test_score)],
        tick_label=['Train', 'Test'],
        yerr=[np.std(train_score), np.std(test_score)])
# The title names the feature set so this figure is not confused with the
# all-variables Lasso plot, which is titled plain 'Lasso CV'.
plt.title('Lasso CV (w/o race variables)')
plt.show()

# Refit on the full reduced feature set and record the predictions for those same rows.
grid_search_cv.fit(X=X_wo_race, y=y)
pred = grid_search_cv.predict(X=X_wo_race)
pred_df['Lasso_wo_race'] = pred

In [None]:
# Ridge w/o race variables
# Same Ridge setup as the all-variables run, fitted on X_wo_race instead of X.
# Candidate penalties: 100 log-spaced alphas between 1e-3 and 1e2.
grid = {
    'estimator__alpha': np.logspace(-3, 2, 100)
}

# Median-impute missing values, standardise, then fit the Ridge regression.
pipe = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
    ('standardise', StandardScaler()),
    ('estimator', Ridge())
])

# Nested CV: the 3-fold grid search (inner loop) is scored by a 3-fold cross_validate (outer loop).
grid_search_cv = GridSearchCV(estimator=pipe, param_grid=grid, cv=3)
cv_result = cross_validate(grid_search_cv, X=X_wo_race, y=y, cv=3, return_train_score=True)
test_score = cv_result['test_score']
train_score = cv_result['train_score']

# Mean score across the outer folds, with the standard deviation as error bars.
plt.bar(x=range(2),
        height=[np.mean(train_score), np.mean(test_score)],
        tick_label=['Train', 'Test'],
        yerr=[np.std(train_score), np.std(test_score)])
# The title names the feature set so this figure is not confused with the
# all-variables Ridge plot, which is titled plain 'Ridge CV'.
plt.title('Ridge CV (w/o race variables)')
plt.show()

# Refit on the full reduced feature set and record the predictions for those same rows.
grid_search_cv.fit(X=X_wo_race, y=y)
pred = grid_search_cv.predict(X=X_wo_race)
pred_df['Ridge_wo_race'] = pred

In [None]:
# GBR w/o race variables
from sklearn.ensemble import GradientBoostingRegressor

# Same gradient-boosting setup as the all-variables run, fitted on X_wo_race instead of X.
# Small 3 x 3 x 2 hyperparameter grid.
grid = {
    'estimator__learning_rate': np.logspace(-2, -0.5, 3),
    'estimator__min_samples_split': np.logspace(-2, 0, 3),  # 0.01, 0.1, 1.0
    'estimator__max_depth': np.arange(1, 5, 2)              # 1, 3
}

# NOTE(review): the imputer here uses SimpleImputer's default strategy, whereas the
# Lasso/Ridge pipelines request strategy='median' - confirm this is intended.
pipe = Pipeline([
    ('imputer', SimpleImputer()),
    ('standardise', StandardScaler()),
    ('estimator', GradientBoostingRegressor(random_state=42))
])

# Nested CV: the 3-fold grid search (inner loop) is scored by a 3-fold cross_validate (outer loop).
grid_search_cv = GridSearchCV(estimator=pipe, param_grid=grid, cv=3)
cv_result = cross_validate(estimator=grid_search_cv, X=X_wo_race, y=y, cv=3, return_train_score=True)
test_score = cv_result['test_score']
train_score = cv_result['train_score']

# Mean score across the outer folds, with the standard deviation as error bars.
plt.bar(x=range(2),
        height=[np.mean(train_score), np.mean(test_score)],
        tick_label=['Train', 'Test'],
        yerr=[np.std(train_score), np.std(test_score)])
# The title names the feature set so this figure is not confused with the
# all-variables GBR plot, which is titled plain 'GBR CV'.
plt.title('GBR CV (w/o race variables)')
plt.show()

# Refit on the full reduced feature set and record the predictions for those same rows.
grid_search_cv.fit(X=X_wo_race, y=y)
pred = grid_search_cv.predict(X=X_wo_race)
pred_df['GBR_wo_race'] = pred

In [None]:
# Show the prediction table: one column per fitted model, indexed like X.
# These are predictions on the same rows each model was fitted on.
pred_df

In [None]:
# Community names, used to label the communities in the top-50 comparison plot below
# Despite the name this is a Series: the 'communityname' column, sharing df's index.
cities_df = df.loc[:, 'communityname']
cities_df

In [None]:
# Index labels of the 50 communities with the highest actual target value.
top_50_index = y.sort_values(ascending=False)[:50].index

width = 0.1  # thickness of a single bar within a community's group
positions = np.arange(len(top_50_index))

# One series per bar in each group: the actual values first, then every model's predictions.
series = {'Actual': y}
series.update({name: pred_df[name] for name in pred_df.columns})

fig, ax = plt.subplots(figsize=(10, 20))

for idx, (label, values) in enumerate(series.items()):
    # Each series is shifted by one bar thickness so the bars sit side by side.
    ax.barh(y=positions + idx * width,
            width=values.loc[top_50_index],
            height=width,
            label=label)

# Centre each tick on its group of bars. The offset is derived from the number
# of series rather than hard-coded, so labels stay aligned if models are added or removed.
group_centre = (len(series) - 1) * width / 2
ax.set_yticks(positions + group_centre)
ax.set_yticklabels(cities_df.loc[top_50_index])

ax.invert_yaxis()  # highest actual value at the top
ax.set_xlabel('ViolentCrimesPerPop')
ax.set_title('Top 50 communities by actual ViolentCrimesPerPop: actual vs. predicted')
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Let's look at the correlation between 'race' related variables and other variables
# Pairwise correlations of every non-race feature (rows) with each race variable (columns).
race_corr = X.corr()[race_variables].drop(index=race_variables)

# Show all rows for this table only. option_context restores the previous
# 'display.max_rows' setting on exit, so re-running other cells (e.g. `df`)
# does not start dumping every row.
with pd.option_context('display.max_rows', None):
    display(race_corr)  # `display` is provided by the IPython/Jupyter kernel