In [None]:
# read in the data

import pandas as pd

#from google.colab import files  # uncomment if using colab
#uploades = files.upload()  # uncomment if using colab
# Column names for the data file, in file order (the file itself has no header row).
column_names = [
    'state', 'county', 'community', 'communityname', 'fold',
    'population', 'householdsize', 'racepctblack', 'racePctWhite', 'racePctAsian', 'racePctHisp',
    'agePct12t21', 'agePct12t29', 'agePct16t24', 'agePct65up', 'numbUrban', 'pctUrban',
    'medIncome', 'pctWWage', 'pctWFarmSelf', 'pctWInvInc', 'pctWSocSec', 'pctWPubAsst', 'pctWRetire',
    'medFamInc', 'perCapInc', 'whitePerCap', 'blackPerCap', 'indianPerCap', 'AsianPerCap',
    'OtherPerCap', 'HispPerCap', 'NumUnderPov', 'PctPopUnderPov', 'PctLess9thGrade', 'PctNotHSGrad',
    'PctBSorMore', 'PctUnemployed', 'PctEmploy', 'PctEmplManu', 'PctEmplProfServ', 'PctOccupManu',
    'PctOccupMgmtProf', 'MalePctDivorce', 'MalePctNevMarr', 'FemalePctDiv', 'TotalPctDiv',
    'PersPerFam', 'PctFam2Par', 'PctKids2Par', 'PctYoungKids2Par', 'PctTeen2Par',
    'PctWorkMomYoungKids', 'PctWorkMom', 'NumIlleg', 'PctIlleg', 'NumImmig', 'PctImmigRecent',
    'PctImmigRec5', 'PctImmigRec8', 'PctImmigRec10', 'PctRecentImmig', 'PctRecImmig5',
    'PctRecImmig8', 'PctRecImmig10', 'PctSpeakEnglOnly', 'PctNotSpeakEnglWell', 'PctLargHouseFam',
    'PctLargHouseOccup', 'PersPerOccupHous', 'PersPerOwnOccHous', 'PersPerRentOccHous',
    'PctPersOwnOccup', 'PctPersDenseHous', 'PctHousLess3BR', 'MedNumBR', 'HousVacant',
    'PctHousOccup', 'PctHousOwnOcc', 'PctVacantBoarded', 'PctVacMore6Mos', 'MedYrHousBuilt',
    'PctHousNoPhone', 'PctWOFullPlumb', 'OwnOccLowQuart', 'OwnOccMedVal', 'OwnOccHiQuart',
    'RentLowQ', 'RentMedian', 'RentHighQ', 'MedRent', 'MedRentPctHousInc', 'MedOwnCostPctInc',
    'MedOwnCostPctIncNoMtg', 'NumInShelters', 'NumStreet', 'PctForeignBorn', 'PctBornSameState',
    'PctSameHouse85', 'PctSameCity85', 'PctSameState85', 'LemasSwornFT', 'LemasSwFTPerPop',
    'LemasSwFTFieldOps', 'LemasSwFTFieldPerPop', 'LemasTotalReq', 'LemasTotReqPerPop',
    'PolicReqPerOffic', 'PolicPerPop', 'RacialMatchCommPol', 'PctPolicWhite', 'PctPolicBlack',
    'PctPolicHisp', 'PctPolicAsian', 'PctPolicMinor', 'OfficAssgnDrugUnits', 'NumKindsDrugsSeiz',
    'PolicAveOTWorked', 'LandArea', 'PopDens', 'PctUsePubTrans', 'PolicCars', 'PolicOperBudg',
    'LemasPctPolicOnPatr', 'LemasGangUnitDeploy', 'LemasPctOfficDrugUn', 'PolicBudgPerPop',
    'ViolentCrimesPerPop',
]

# Comma-separated file without a header; '?' marks a missing value and is read
# as NaN (in addition to pandas' default NaN markers).
df = pd.read_csv('./communities.data', header=None, na_values='?')
df.columns = column_names
df

In [None]:
# Sanity check - we should have 1994 rows and 128 columns.
# The shape is printed for the reader and also asserted, so that a truncated or
# wrong data file stops the notebook here instead of producing misleading
# results in the modelling cells below.
n_rows, n_cols = df.shape
print(f'There are {n_rows} rows')
print(f'and {n_cols} columns')
assert (n_rows, n_cols) == (1994, 128), f'unexpected data shape: {(n_rows, n_cols)}'

In [None]:
# Split the frame into target and predictors.
# y is a copy of the ViolentCrimesPerPop column; X is everything else except
# the first five columns ('state' ... 'fold'), which are not used as features.
y = df.loc[:, 'ViolentCrimesPerPop'].copy()
X = df.drop(
    ['state', 'county', 'community', 'communityname', 'fold', 'ViolentCrimesPerPop'],
    axis='columns',
)

In [None]:
# Quick overview of every variable.
# Lift pandas' column display limit so no columns are elided, and pass
# include='all' so non-numeric columns are summarised as well as numeric ones.
pd.set_option('display.max_columns', None)
df.describe(include='all')

In [None]:
# Summary statistics for our dependent variable, ViolentCrimesPerPop
df.loc[:, 'ViolentCrimesPerPop'].describe()

In [None]:
# Group 1: OLS
# Group 2: LASSO

# Task: break into two groups. Group 1 will present OLS results for the
# prediction question. Then group 1 will switch and present LASSO for the
# causal question. Group 2 argues for LASSO (prediction question) and 
# for OLS (causal question)

# Brief: The government needs advice on how to reduce crime in the country.

# They have a budget of $1 million.
# This means they have limited resources and must be selective about how
# they use them.

# The govt has two questions they want answered:

# A) Prediction Question: Which areas should they target with efforts to 
#     reduce crime?
#     Be clear about: the characteristics of the areas to target

# B) Causal Question: What interventions should they employ to reduce crime?


# They put out a tender for this policy advice.
# Two consulting firms compete to win the contract.

# Consulting firm one chose to use OLS for their model.
# Consulting firm two chose to use LASSO for their model.

# The judge will be the "govt" and they will choose to award the $1m contract to the 
# consulting firm who provides the "best" policy advice

# Your job: to convince the judge why your consulting firm's model is the best

# Details of what to present: argue how your model helps to solve the 
# a) prediction problem and b) a causal problem 
#    argue why your model is the best for these two purposes (and why other models are problematic)
#     you have permission to play devil's advocate
#  For prediction: 
#    1) Describe the model that you used (features you started out with and included in model)
#    2) How did you decide which variables to include in the model?
#    3) How did you create a prediction model (e.g. type of CV procedure? Gridsearch range?)
#    4) What are the in-sample and out-of-sample scores for the prediction models?
#    5) What are the 'important features' that come out of your model?
#    6) Explain how you will use the model predictions to solve violent crime
#    think about: how will you use prediction scores? Use feature importance
#     results? 
#  For causal estimation: 
#    1) Describe the model that you used (features you started out with and included in model)
#    2) How did you decide which variables to include in the causal model?
#    3) How did you create a causal model (did you regularise? why or why not,
#        e.g. a sparse model; two features that correlate differently with the treatment and similarly with the outcome)?
#    4) Explain why your model is better for causal estimation (e.g. bias vs variance)
#    5) Present the coefficient size of the variables
#    6) Advise the govt which interventions will reduce crime
#    7) Talk about potential problems with your conclusions/ recommendations (e.g. bias from omitted variables)

In [None]:
# Group 1: Run an OLS

# you will use a theory-based model (make up your own theory about 
# which are the important features - go into variable list and pick features)
# please pick roughly 20 features....I have picked 11 for you below

# step 1: create dataframes
# Target is the violent crime column; predictors are the 11 hand-picked features.
# Both are copies, so later changes to them cannot leak back into df.
y_ols = df.loc[:, 'ViolentCrimesPerPop'].copy()
X_ols = df.loc[:, [
    'PolicBudgPerPop',
    'NumStreet',
    'PctNotSpeakEnglWell',
    'PctFam2Par',
    'PctPopUnderPov',
    'perCapInc',
    'agePct12t21',
    'agePct12t29',
    'agePct16t24',
    'agePct65up',
    'LemasTotReqPerPop',
]].copy()







In [None]:
# Group 1: Run an OLS
# step 2: look at your dataframes: know which features you must clean
X_ols.describe()

In [None]:
# Group 1: Run an OLS
# step 3: Impute median values of missing X variables
# We use a pipeline because, later, when we do ML you need to partition 
# your data and you may only want to do things in one partition of the data 
# and not the other

# Let's create a ML pipeline where we fill these with median values
# NOTE: We'll also add a data standardization step to the pipeline
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
# Ordinary Least Square - all variables using sklearn
from sklearn.linear_model import LinearRegression

# Build the estimator once and hand that same object to the pipeline, so that
# `ols` is the model that actually gets fitted rather than an unused spare.
ols = LinearRegression()

# Pipeline steps run in order on every fit/score call:
#   1. 'imputer'     - replace NaN with the column median
#   2. 'standardise' - StandardScaler on every column
#   3. 'estimator'   - ordinary least squares on the transformed features
pipe = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
    ('standardise', StandardScaler()),
    ('estimator', ols)
])

In [None]:
# Group 1: Run an OLS
# step 4: run your regression
# Fit the pipeline on every row of the OLS feature set.
pipe.fit(X_ols, y_ols)
# Goodness of fit, scored on the same rows that were used for fitting,
# i.e. this is the in-sample R-squared.
print('R-squared for OLS fit:')
pipe.score(X_ols, y_ols)


In [None]:
# Group 1: Run an OLS

# step 5: check the goodness of the fit out-of-sample

# Let's get an idea of how generalisable the OLS regression is,
# using train/test splitting.
# Note: OLS is trained on 2/3 of the rows and tested on the remaining 1/3.
# The split is made on the raw features; the imputation and scaling steps live
# inside the pipeline, so they are fitted on the training rows only.

from sklearn.base import clone
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

X_ols_train, X_ols_test, y_ols_train, y_ols_test = train_test_split(X_ols, y_ols, test_size=0.33, random_state=42)

# Fit an unfitted copy of the pipeline, NOT `pipe` itself: `pipe` holds the
# whole-dataset fit from step 4, and refitting it here would silently make the
# coefficients reported in step 6 come from the training rows only.
holdout_pipe = clone(pipe)
holdout_pipe.fit(X=X_ols_train, y=y_ols_train)
R_squared_train = holdout_pipe.score(X=X_ols_train, y=y_ols_train)
R_squared_test = holdout_pipe.score(X=X_ols_test, y=y_ols_test)

plt.bar(x=[0,1], height=[R_squared_train, R_squared_test], tick_label=['Train', 'Test'])
plt.ylabel(r'$R^2$')
plt.title('test vs train scores')
plt.show()


print('R-squared for OLS fit - out-of-sample:')
R_squared_test

In [None]:
# Group 1: Run an OLS

# step 6: represent your results in a table and graph
# One-row table of the fitted OLS coefficients (signed), one column per feature.
# The features were standardised inside the pipeline before the regression,
# so the coefficients are on the standardised scale.
estimator = pipe.named_steps['estimator']

coef_df = pd.DataFrame(
    [estimator.coef_],
    index=['OLS'],
    columns=pipe.named_steps['imputer'].feature_names_in_,
)
coef_df



In [None]:
# Group 1: Run an OLS
# step 6: represent your results in a table and graph

# And graphically: one horizontal bar per feature, bar length = signed coefficient.
# Bar positions and labels are both derived from the feature names recorded by
# the fitted imputer, so the two cannot get out of step.
ols_feature_names = pipe.named_steps.imputer.feature_names_in_
bar_positions = range(len(ols_feature_names))

fig, ax = plt.subplots()
ax.barh(y=bar_positions, width=estimator.coef_)
ax.set_yticks(ticks=bar_positions, labels=ols_feature_names)
ax.invert_yaxis()  # first feature at the top, matching the table order
ax.set_xlabel('Coefficient (standardised features)')
ax.set_title('OLS coefficients')
plt.tight_layout()
plt.show()

In [None]:
# Group 2: Run a LASSO

# Step 1: create dataframes
# Target is a copy of ViolentCrimesPerPop; the predictors are all remaining
# columns apart from the first five ('state' ... 'fold').
y_ml_lasso = df.loc[:, 'ViolentCrimesPerPop'].copy()
X_ml_lasso = df.drop(
    ['state', 'county', 'community', 'communityname', 'fold', 'ViolentCrimesPerPop'],
    axis='columns',
)


In [None]:
# Group 2: Run a LASSO
# step 2: look at your dataframes: know which features you must clean
X_ml_lasso.describe()

In [None]:
# Group 2: Run a LASSO

# step 3: Impute median values of missing X variables, standardise and set lasso
# We use a pipeline because, later, when we do ML you need to partition 
# your data and you may only want to do things in one partition of the data 
# and not the other

# Let's create a ML pipeline where we fill these with median values
# NOTE: We'll also add a data standardization step to the pipeline
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso

# Median imputation -> standardisation -> Lasso (default settings; alpha is
# tuned later via the grid search).
# NOTE: this rebinds the name `pipe`, which until now referred to the OLS pipeline.
pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median', missing_values=np.nan)),
    ('standardise', StandardScaler()),
    ('estimator', Lasso()),
])

In [None]:
# Group 2: Run a LASSO

# step 4: set your gridsearchCV values
# 100 candidate penalties, evenly spaced on a log scale from 10^-3 to 10^1.
# The key names the 'alpha' parameter of the pipeline step called 'estimator'.
grid = dict(estimator__alpha=np.logspace(start=-3, stop=1, num=100))


In [None]:
# Group 2: Run a LASSO

# step 5: Do CV: one outer fold (1/3 test; 2/3 train) and 3 inner folds

# For all the ML models, we're going to use GridSearchCV to optimise the hyperparameters
# Problem: How to ensure that the models produced in this way are generalisable/minimise overfitting?
# Answer: We'll split the whole dataset into training data and test data, run the GridSearchCV on the training data, 
# then look at the test vs train score
# If we're happy with this, then we can proceed to fitting the full dataset

# Imported here as well so the Lasso cells can be run without first running the
# OLS cell that imports matplotlib.
import matplotlib.pyplot as plt

# Outer split: 1/3 of the rows are held out and never seen by the grid search.
X_lasso_train, X_lasso_test, y_lasso_train, y_lasso_test = train_test_split(X_ml_lasso, y_ml_lasso, test_size=0.33, random_state=42)

# Inner loop: 3-fold cross-validation over the alpha grid, training rows only.
grid_search_cv = GridSearchCV(estimator=pipe, param_grid=grid, cv=3)
grid_search_cv.fit(X=X_lasso_train, y=y_lasso_train)

# Score the fitted search object on the training rows and on the held-out rows.
R_squared_train = grid_search_cv.score(X=X_lasso_train, y=y_lasso_train)
R_squared_test = grid_search_cv.score(X=X_lasso_test, y=y_lasso_test)

plt.bar(x=[0,1], height=[R_squared_train, R_squared_test], tick_label=['Train', 'Test'])
plt.ylabel(r'$R^2$')  # same y-axis label as the OLS train/test chart
plt.title('Lasso Test Score vs Train Score')
plt.show()

# Note if you wanted to do other types of CV such as nested or LOO
# see Week 5 code
# but essentially something like (instead of above code) for nestedCV

#....
# import matplotlib.pyplot as plt
# from sklearn.model_selection import cross_validate 
# scoring = ['r2', 'neg_mean_squared_error']

# grid_search_cv = GridSearchCV(estimator=pipe, param_grid=grid, cv=3, return_train_score=True, scoring=scoring, 
#                               refit='neg_mean_squared_error')
# cv_result_cv = cross_validate(estimator=grid_search_cv, X=X, y=y, cv=3, return_train_score=True, return_estimator=True,
#                            scoring=scoring)

# # R^2
# # Note: these are the average scores across the 3 outer folds..using the 'best' lambda from each outer fold

# test_score = cv_result_cv['test_r2']
# train_score = cv_result_cv['train_r2']
# plt.bar(x=range(2), 
#         height = [np.mean(train_score), np.mean(test_score)], 
#         tick_label=['Train', 'Test'], 
#         yerr=[np.std(train_score), np.std(test_score)])
# plt.ylabel(r'$R^2$')
# plt.title('"Outer" cross-validation scores ($R^2$)')
# plt.show()

# # MSE
# # Note: these are the average scores across the 3 outer folds..using the 'best'  lambda from each outer fold

# test_score = cv_result_cv['test_neg_mean_squared_error']
# train_score = cv_result_cv['train_neg_mean_squared_error']
# plt.bar(x=range(2), 
#         height = np.multiply(-1, [np.mean(train_score), np.mean(test_score)]), 
#         tick_label=['Train', 'Test'], 
#         yerr=[np.std(train_score), np.std(test_score)])
# plt.ylabel(r'MSE')
# plt.title('"Outer" cross-validation scores (MSE)')
# plt.show()

In [None]:
# Group 2: Run a LASSO

# step 6: Fit the whole dataset
# Note: this re-runs the entire pipeline on the full data set.

# The idea is that the cross-validation step shows that
# the results from the pipeline in question are generalisable, 
# after which you can repeat the pipeline on the full data set

# Same search set-up as the train/test check (same pipeline, alpha grid and
# 3-fold CV), this time fitted on every row.
# The fit result is bound to `_` so the cell does not echo the fitted object.
grid_search_cv = GridSearchCV(pipe, grid, cv=3)
_ = grid_search_cv.fit(X_ml_lasso, y_ml_lasso)

In [None]:
# Group 2: Run a LASSO

# step 7: look at the coefficients chosen
# Pull the fitted pieces out of the best pipeline once, instead of repeating
# the long attribute chain at every use.
best_pipe = grid_search_cv.best_estimator_
lasso_coef = best_pipe.named_steps.estimator.coef_
lasso_feature_names = best_pipe.named_steps.imputer.feature_names_in_

# Positions of the features whose Lasso coefficient is not zero.
# flatnonzero returns a plain 1-D integer array, which is a safer indexer than
# the tuple returned by ndarray.nonzero().
non_zero_coef = np.flatnonzero(lasso_coef)

non_zero_coef_df = pd.DataFrame(data=[lasso_coef[non_zero_coef]],
                                columns=lasso_feature_names[non_zero_coef], index=['Lasso'])
display(non_zero_coef_df)

# Bar chart of the same values; positions and labels come from the same index
# array so they always line up.
bar_positions = range(len(non_zero_coef))
fig, ax = plt.subplots()
ax.barh(y=bar_positions, width=lasso_coef[non_zero_coef])
ax.set_yticks(ticks=bar_positions, labels=lasso_feature_names[non_zero_coef])
ax.invert_yaxis()  # first feature at the top, matching the table order
ax.set_xlabel('Coefficient (standardised features)')
ax.set_title('Non-zero Lasso coefficients')
plt.tight_layout()
plt.show()

In [None]:
# save your coefficients
# Add the Lasso coefficients as a row next to the existing OLS row.
lasso_row = pd.DataFrame(data=[grid_search_cv.best_estimator_.named_steps.estimator.coef_],
                         columns=grid_search_cv.best_estimator_.named_steps.imputer.feature_names_in_,
                         index=['Lasso',])

# Drop any 'Lasso' row left over from an earlier run of this cell before
# appending, so re-running the cell replaces the row instead of stacking
# duplicates (a duplicated label would make .loc['Lasso'] return several rows).
# fillna(0): a feature that is missing from one model's row gets coefficient 0
# in that row.
coef_df = pd.concat([coef_df.drop(index='Lasso', errors='ignore'), lasso_row]).fillna(0)

In [None]:
# Step 8: traditionally for causal estimation, we can do a post-ols
# for this exercise, we will simplify and skip this step

In [None]:
# Compare the coefficients produced by OLS vs Lasso
# Keep only the features that have a non-zero coefficient in at least one row
# of coef_df (boolean mask over the columns).
non_zero_coef = (coef_df != 0).any()
ols_lasso_coef_compare_df = coef_df.loc[:, non_zero_coef].copy()

ind = np.arange(len(ols_lasso_coef_compare_df.columns))
width = 0.4  # thickness of each bar; the two bars of a feature sit side by side

fig, ax = plt.subplots()
fig.set_size_inches(10, 16)
ax.barh(ind, ols_lasso_coef_compare_df.loc['OLS'], width, label='OLS')
ax.barh(ind + width, ols_lasso_coef_compare_df.loc['Lasso'], width, label='Lasso')

ax.invert_yaxis()
# The bars of a pair are centred at ind and ind + width, so the label belongs
# at their midpoint, ind + width / 2 (not on the Lasso bar).
ax.set_yticks(ticks=ind + width / 2, labels=ols_lasso_coef_compare_df.columns)
ax.set_xlabel('Coefficient (standardised features)')
ax.set_title('OLS vs Lasso coefficients')
ax.legend()
plt.tight_layout()
plt.show()