In [None]:
import pandas as pd

# Column names for communities.data, in file order. The file is read with
# header=None, so the names are supplied here.
column_names = [
    'state', 'county', 'community', 'communityname', 'fold',
    'population', 'householdsize', 'racepctblack', 'racePctWhite',
    'racePctAsian', 'racePctHisp', 'agePct12t21', 'agePct12t29',
    'agePct16t24', 'agePct65up', 'numbUrban', 'pctUrban', 'medIncome',
    'pctWWage', 'pctWFarmSelf', 'pctWInvInc', 'pctWSocSec', 'pctWPubAsst',
    'pctWRetire', 'medFamInc', 'perCapInc', 'whitePerCap', 'blackPerCap',
    'indianPerCap', 'AsianPerCap', 'OtherPerCap', 'HispPerCap',
    'NumUnderPov', 'PctPopUnderPov', 'PctLess9thGrade', 'PctNotHSGrad',
    'PctBSorMore', 'PctUnemployed', 'PctEmploy', 'PctEmplManu',
    'PctEmplProfServ', 'PctOccupManu', 'PctOccupMgmtProf', 'MalePctDivorce',
    'MalePctNevMarr', 'FemalePctDiv', 'TotalPctDiv', 'PersPerFam',
    'PctFam2Par', 'PctKids2Par', 'PctYoungKids2Par', 'PctTeen2Par',
    'PctWorkMomYoungKids', 'PctWorkMom', 'NumIlleg', 'PctIlleg', 'NumImmig',
    'PctImmigRecent', 'PctImmigRec5', 'PctImmigRec8', 'PctImmigRec10',
    'PctRecentImmig', 'PctRecImmig5', 'PctRecImmig8', 'PctRecImmig10',
    'PctSpeakEnglOnly', 'PctNotSpeakEnglWell', 'PctLargHouseFam',
    'PctLargHouseOccup', 'PersPerOccupHous', 'PersPerOwnOccHous',
    'PersPerRentOccHous', 'PctPersOwnOccup', 'PctPersDenseHous',
    'PctHousLess3BR', 'MedNumBR', 'HousVacant', 'PctHousOccup',
    'PctHousOwnOcc', 'PctVacantBoarded', 'PctVacMore6Mos', 'MedYrHousBuilt',
    'PctHousNoPhone', 'PctWOFullPlumb', 'OwnOccLowQuart', 'OwnOccMedVal',
    'OwnOccHiQuart', 'RentLowQ', 'RentMedian', 'RentHighQ', 'MedRent',
    'MedRentPctHousInc', 'MedOwnCostPctInc', 'MedOwnCostPctIncNoMtg',
    'NumInShelters', 'NumStreet', 'PctForeignBorn', 'PctBornSameState',
    'PctSameHouse85', 'PctSameCity85', 'PctSameState85', 'LemasSwornFT',
    'LemasSwFTPerPop', 'LemasSwFTFieldOps', 'LemasSwFTFieldPerPop',
    'LemasTotalReq', 'LemasTotReqPerPop', 'PolicReqPerOffic', 'PolicPerPop',
    'RacialMatchCommPol', 'PctPolicWhite', 'PctPolicBlack', 'PctPolicHisp',
    'PctPolicAsian', 'PctPolicMinor', 'OfficAssgnDrugUnits',
    'NumKindsDrugsSeiz', 'PolicAveOTWorked', 'LandArea', 'PopDens',
    'PctUsePubTrans', 'PolicCars', 'PolicOperBudg', 'LemasPctPolicOnPatr',
    'LemasGangUnitDeploy', 'LemasPctOfficDrugUn', 'PolicBudgPerPop',
    'ViolentCrimesPerPop',
]

# '?' entries in the raw file are parsed as NaN (in addition to pandas'
# default NA markers).
df = pd.read_csv(
    filepath_or_buffer='./communities.data',
    sep=',',
    header=None,
    na_values='?',
    keep_default_na=True,
)

# Assigning the names after loading (rather than via `names=`) makes pandas
# raise if the file's column count does not match the list.
df.columns = column_names

# Preview the first rows only instead of rendering the whole frame.
df.head()

In [None]:
# Fit Ridge and Lasso on all 122 predictor variables (same as previously).
# The target is split off into `y`; `X` keeps every column except the target
# and the five leading columns listed below.
target_column = 'ViolentCrimesPerPop'
excluded_columns = ['state', 'county', 'community', 'communityname', 'fold', target_column]

y = df[target_column].copy()
X = df.drop(columns=excluded_columns)

In [None]:
# We're going to compare the results of LASSO and Ridge regression
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


# Nested cross-validation for LASSO:
#   - inner loop (GridSearchCV, 3 folds) selects the regularisation strength
#   - outer loop (cross_validate, 3 folds) scores the tuned search
# Candidate alphas: 100 log-spaced values from 1e-3 to 1e-1.
grid = dict(estimator__alpha=np.logspace(start=-3, stop=-1, num=100))

# Impute missing values with the column median, standardise, then fit Lasso.
# The step names are referenced by the grid key above.
pipe = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median', missing_values=np.nan)),
        ('standardise', StandardScaler()),
        ('estimator', Lasso()),
    ]
)

grid_search_cv_lasso = GridSearchCV(
    estimator=pipe,
    param_grid=grid,
    cv=3,
)

cv_result_lasso = cross_validate(
    estimator=grid_search_cv_lasso,
    X=X,
    y=y,
    cv=3,
    return_train_score=True,
)

In [None]:
# Now do the same for Ridge
from sklearn.linear_model import Ridge

# Candidate alphas for Ridge: 100 log-spaced values from 1e-3 to 1e2.
grid = dict(estimator__alpha=np.logspace(start=-3, stop=2, num=100))

# Same preprocessing as the Lasso pipeline (median imputation, then
# standardisation), with Ridge as the final estimator.
pipe = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median', missing_values=np.nan)),
        ('standardise', StandardScaler()),
        ('estimator', Ridge()),
    ]
)

# Inner 3-fold grid search wrapped in an outer 3-fold cross-validation.
grid_search_cv_ridge = GridSearchCV(
    estimator=pipe,
    param_grid=grid,
    cv=3,
)
cv_result_ridge = cross_validate(
    estimator=grid_search_cv_ridge,
    X=X,
    y=y,
    cv=3,
    return_train_score=True,
)

In [None]:
# Compare the outer-loop train and test scores for both LASSO and Ridge.
# No `scoring` argument was given to cross_validate, so the values are the
# estimators' default score (R^2 for regressors).
test_score_lasso = cv_result_lasso['test_score']
train_score_lasso = cv_result_lasso['train_score']
test_score_ridge = cv_result_ridge['test_score']
train_score_ridge = cv_result_ridge['train_score']

# Keep labels and score arrays in matching order so bar heights and error
# bars are derived from the same list.
bar_labels = ['Train (Lasso)', 'Test (Lasso)', 'Train (Ridge)', 'Test (Ridge)']
bar_scores = [train_score_lasso, test_score_lasso, train_score_ridge, test_score_ridge]

fig, ax = plt.subplots()
ax.bar(
    x=range(len(bar_scores)),
    height=[np.mean(scores) for scores in bar_scores],
    yerr=[np.std(scores) for scores in bar_scores],  # spread across outer folds
    tick_label=bar_labels,
)
ax.set_title('Outer CV Results')
ax.set_ylabel('Mean $R^2$ across outer folds (error bars: std)')
plt.show()

In [None]:
# Now refit both grid searches on the entire dataset.
grid_search_cv_lasso.fit(X=X, y=y)
# A trailing semicolon hides the estimator repr without rebinding `_`;
# assigning to `_` would stop IPython from updating its last-output variables.
grid_search_cv_ridge.fit(X=X, y=y);

In [None]:
# Extract the coefficient vector from the final ('estimator') step of each
# search's best pipeline so the two models can be compared.
coef_lasso = grid_search_cv_lasso.best_estimator_.named_steps['estimator'].coef_
coef_ridge = grid_search_cv_ridge.best_estimator_.named_steps['estimator'].coef_

In [None]:
# Compare the coefficients produced by Lasso vs Ridge.
# NOTE: Lasso shrinks some coefficients to 0, whilst Ridge does not.
width = 0.4

# Tick labels are the column names the imputer step saw when it was fitted.
# NOTE(review): this assumes the imputer passed every input column through,
# so the names line up one-to-one with the coefficients - confirm.
feature_names = grid_search_cv_lasso.best_estimator_.named_steps.imputer.feature_names_in_

fig, ax = plt.subplots(figsize=(10, 16))
ind = np.arange(len(coef_lasso))
ax.barh(ind, coef_lasso, width, label='Lasso')
ax.barh(ind + width, coef_ridge, width, label='Ridge')

# Put the first feature at the top of the chart.
ax.invert_yaxis()
# The two bars of a pair are centred on `ind` and `ind + width`, so the
# midpoint of the pair is `ind + width / 2`; place the label there.
ax.set_yticks(ticks=ind + width / 2, labels=feature_names)
ax.set_xlabel('Coefficient (features standardised before fitting)')
ax.set_ylabel('Feature')
ax.set_title('Lasso vs Ridge coefficients')
ax.legend()
plt.tight_layout()
plt.show()