### Import Library

In [28]:
import warnings
warnings.filterwarnings(action = 'ignore',category = FutureWarning)

import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.metrics import r2_score

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

### Getting the data

In [29]:
file_path = './data/communities.names'

attribute_names = []

# Open the file and read lines
with open(file_path, 'r') as file:
    for line in file:
        line = line.strip()
        if line.startswith('@attribute'):
            # Extract the attribute name
            parts = line.split()
            attribute_name = parts[1]
            attribute_names.append(attribute_name)
attribute_names

['state',
 'county',
 'community',
 'communityname',
 'fold',
 'population',
 'householdsize',
 'racepctblack',
 'racePctWhite',
 'racePctAsian',
 'racePctHisp',
 'agePct12t21',
 'agePct12t29',
 'agePct16t24',
 'agePct65up',
 'numbUrban',
 'pctUrban',
 'medIncome',
 'pctWWage',
 'pctWFarmSelf',
 'pctWInvInc',
 'pctWSocSec',
 'pctWPubAsst',
 'pctWRetire',
 'medFamInc',
 'perCapInc',
 'whitePerCap',
 'blackPerCap',
 'indianPerCap',
 'AsianPerCap',
 'OtherPerCap',
 'HispPerCap',
 'NumUnderPov',
 'PctPopUnderPov',
 'PctLess9thGrade',
 'PctNotHSGrad',
 'PctBSorMore',
 'PctUnemployed',
 'PctEmploy',
 'PctEmplManu',
 'PctEmplProfServ',
 'PctOccupManu',
 'PctOccupMgmtProf',
 'MalePctDivorce',
 'MalePctNevMarr',
 'FemalePctDiv',
 'TotalPctDiv',
 'PersPerFam',
 'PctFam2Par',
 'PctKids2Par',
 'PctYoungKids2Par',
 'PctTeen2Par',
 'PctWorkMomYoungKids',
 'PctWorkMom',
 'NumIlleg',
 'PctIlleg',
 'NumImmig',
 'PctImmigRecent',
 'PctImmigRec5',
 'PctImmigRec8',
 'PctImmigRec10',
 'PctRecentImmig',
 'P

In [30]:
data = pd.read_csv('./data/communities.data', header=None, na_values='?')

# adding column names
data.columns = attribute_names
data

Unnamed: 0,state,county,community,communityname,fold,population,householdsize,racepctblack,racePctWhite,racePctAsian,...,LandArea,PopDens,PctUsePubTrans,PolicCars,PolicOperBudg,LemasPctPolicOnPatr,LemasGangUnitDeploy,LemasPctOfficDrugUn,PolicBudgPerPop,ViolentCrimesPerPop
0,8,,,Lakewoodcity,1,0.19,0.33,0.02,0.90,0.12,...,0.12,0.26,0.20,0.06,0.04,0.90,0.5,0.32,0.14,0.20
1,53,,,Tukwilacity,1,0.00,0.16,0.12,0.74,0.45,...,0.02,0.12,0.45,,,,,0.00,,0.67
2,24,,,Aberdeentown,1,0.00,0.42,0.49,0.56,0.17,...,0.01,0.21,0.02,,,,,0.00,,0.43
3,34,5.0,81440.0,Willingborotownship,1,0.04,0.77,1.00,0.08,0.12,...,0.02,0.39,0.28,,,,,0.00,,0.12
4,42,95.0,6096.0,Bethlehemtownship,1,0.01,0.55,0.02,0.95,0.09,...,0.04,0.09,0.02,,,,,0.00,,0.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1989,12,,,TempleTerracecity,10,0.01,0.40,0.10,0.87,0.12,...,0.01,0.28,0.05,,,,,0.00,,0.09
1990,6,,,Seasidecity,10,0.05,0.96,0.46,0.28,0.83,...,0.02,0.37,0.20,,,,,0.00,,0.45
1991,9,9.0,80070.0,Waterburytown,10,0.16,0.37,0.25,0.69,0.04,...,0.08,0.32,0.18,0.08,0.06,0.78,0.0,0.91,0.28,0.23
1992,25,17.0,72600.0,Walthamcity,10,0.08,0.51,0.06,0.87,0.22,...,0.03,0.38,0.33,0.02,0.02,0.79,0.0,0.22,0.18,0.19


In [31]:
# create Y
y = data.iloc[:,127]
y

0       0.20
1       0.67
2       0.43
3       0.12
4       0.03
        ... 
1989    0.09
1990    0.45
1991    0.23
1992    0.19
1993    0.48
Name: ViolentCrimesPerPop, Length: 1994, dtype: float64

In [32]:
# delete first 5 columns
x = data.iloc[:, 5:127]

# check dataframe shape
x.shape

(1994, 122)

### Split Train/Test

In [33]:
# train test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=13)

# check x_train shape
x_train.shape

(1794, 122)

### Imputing NAs, EDA

In [34]:
# call x_train head & fix the pd.read_csv
x_train.head()

Unnamed: 0,population,householdsize,racepctblack,racePctWhite,racePctAsian,racePctHisp,agePct12t21,agePct12t29,agePct16t24,agePct65up,...,PolicAveOTWorked,LandArea,PopDens,PctUsePubTrans,PolicCars,PolicOperBudg,LemasPctPolicOnPatr,LemasGangUnitDeploy,LemasPctOfficDrugUn,PolicBudgPerPop
227,0.1,0.33,0.07,0.84,0.04,0.13,0.34,0.49,0.3,0.54,...,0.18,0.02,0.7,0.18,0.06,0.01,0.84,0.5,1.0,0.1
329,0.04,0.64,0.02,0.2,1.0,0.67,0.4,0.53,0.35,0.42,...,,0.01,0.75,0.27,,,,,0.0,
745,0.02,0.23,0.38,0.55,0.55,0.07,0.38,0.79,0.58,0.17,...,,0.01,0.3,0.53,,,,,0.0,
491,0.02,0.37,1.0,0.09,0.02,0.01,0.46,0.45,0.27,0.61,...,,0.04,0.13,0.04,,,,,0.0,
19,0.0,0.41,0.05,0.96,0.01,0.01,0.37,0.37,0.24,0.55,...,,0.09,0.03,0.05,,,,,0.0,


In [35]:
# Impute NAs with the median value of the column in the training data
imputer = SimpleImputer(strategy='median')
train_data_imputed = imputer.fit_transform(x_train)
test_data_imputed = imputer.transform(x_test)
data_imputed = imputer.transform(x)

# convert them to dataframe
x_train = pd.DataFrame(train_data_imputed, columns=x_train.columns)
x_test = pd.DataFrame(test_data_imputed, columns=x_test.columns)
x = pd.DataFrame(data_imputed, columns=x.columns)
    
# checking
x_train.head()


Unnamed: 0,population,householdsize,racepctblack,racePctWhite,racePctAsian,racePctHisp,agePct12t21,agePct12t29,agePct16t24,agePct65up,...,PolicAveOTWorked,LandArea,PopDens,PctUsePubTrans,PolicCars,PolicOperBudg,LemasPctPolicOnPatr,LemasGangUnitDeploy,LemasPctOfficDrugUn,PolicBudgPerPop
0,0.1,0.33,0.07,0.84,0.04,0.13,0.34,0.49,0.3,0.54,...,0.18,0.02,0.7,0.18,0.06,0.01,0.84,0.5,1.0,0.1
1,0.04,0.64,0.02,0.2,1.0,0.67,0.4,0.53,0.35,0.42,...,0.25,0.01,0.75,0.27,0.08,0.03,0.75,0.5,0.0,0.15
2,0.02,0.23,0.38,0.55,0.55,0.07,0.38,0.79,0.58,0.17,...,0.25,0.01,0.3,0.53,0.08,0.03,0.75,0.5,0.0,0.15
3,0.02,0.37,1.0,0.09,0.02,0.01,0.46,0.45,0.27,0.61,...,0.25,0.04,0.13,0.04,0.08,0.03,0.75,0.5,0.0,0.15
4,0.0,0.41,0.05,0.96,0.01,0.01,0.37,0.37,0.24,0.55,...,0.25,0.09,0.03,0.05,0.08,0.03,0.75,0.5,0.0,0.15


In Sections 3 and 4 we will be performing regularized regression. Do we need to scale our variables to use these models? Why or why not? If yes, scale them using the scaler you find appropriate. If not, say why and move on. (5pts)


Ans: I think we need to scale the varibles to make the linear model converge better and faster. We use StandardScaler() to standarize it.

In [36]:
# create scaler
scaler = StandardScaler()
scaler.fit(x_train)

# transform data
scale_x_train = scaler.transform(x_train)
scale_x_test = scaler.transform(x_test)
scale_x = scaler.transform(x)

# convert to dataframe
x_train = pd.DataFrame(scale_x_train, columns=x_train.columns)
x_test = pd.DataFrame(scale_x_test, columns=x_test.columns)
x = pd.DataFrame(scale_x, columns=x.columns)

Compute the univariate correlation between target and each numeric predictor variable.
What are the 5 features with the weakest univariate correlations? What are the 5 features with the strongest univariate correlations?

In [37]:
# create correlation dataframe
correlation_df = pd.DataFrame(columns=['Predictor', 'Correlation'])

# get correlation pair for each predictor
for col in x_train.columns:
    correlation = np.corrcoef(x_train[col], y_train)[0,1]
    correlation_df = pd.concat([correlation_df, pd.DataFrame([[col, correlation]],columns=['Predictor', 'Correlation'])], ignore_index=True, axis=0)

# find the absolute value of correlation
correlation_df['correlation_abs'] = correlation_df['Correlation'].abs()
correlation_df = correlation_df.sort_values(by='correlation_abs')

# top 5 strongest & weakest
print('Weakest 5:\n', correlation_df.head(5)['Predictor'])
print('Strongest 5:\n', correlation_df.tail(5)['Predictor'])
correlation_df


Weakest 5:
 119    LemasGangUnitDeploy
75          PctVacMore6Mos
95          PctSameState85
1            householdsize
111      NumKindsDrugsSeiz
Name: Predictor, dtype: object
Strongest 5:
 45    PctYoungKids2Par
3         racePctWhite
43          PctFam2Par
50            PctIlleg
44         PctKids2Par
Name: Predictor, dtype: object


Unnamed: 0,Predictor,Correlation,correlation_abs
119,LemasGangUnitDeploy,0.000143,0.000143
75,PctVacMore6Mos,0.012351,0.012351
95,PctSameState85,-0.023219,0.023219
1,householdsize,-0.031036,0.031036
111,NumKindsDrugsSeiz,0.033370,0.033370
...,...,...,...
45,PctYoungKids2Par,-0.661423,0.661423
3,racePctWhite,-0.678336,0.678336
43,PctFam2Par,-0.701110,0.701110
50,PctIlleg,0.731454,0.731454


### Train a regression model with L1 regularization

In [38]:
cv = KFold(n_splits = 10, random_state = 13, shuffle = True)

In [39]:
# Fit a Lasso model with cross-validation
# Use the following values of α: [1e-3, 1e-2, 1e-1, 1, 10]
lasso_cv = LassoCV(alphas=[1e-3, 1e-2, 1e-1, 1, 10], cv=10)

In [40]:
# fit the model and find best alpha
lasso_cv.fit(x_train, y_train)
selected_alpha = lasso_cv.alpha_

# What is the value of the best regularization constant?
selected_alpha

0.001

In [41]:
# get the model R2 score
y_pred = lasso_cv.predict(x_train)
r2 = r2_score(y_train, y_pred)

# What is the R2 returned by the selected model?
r2

0.677991958206577

Interpret it: R^2 score of the model is 0.68, it means that the model explains 68% of the variance in the target variable. The remaining 32% of the variance is unexplained and might be attributed to noise or factors not accounted for by the model.

In [42]:
# Lasso selects features by setting some of the coefficients to 0
lasso_cv.coef_

array([-0.        ,  0.00031644,  0.03956505, -0.00950632, -0.        ,
        0.        ,  0.        , -0.01768773,  0.        ,  0.        ,
       -0.00071802,  0.01286672,  0.        , -0.00579565,  0.00130288,
       -0.01930026,  0.00516705, -0.        , -0.01117275,  0.        ,
       -0.        , -0.00372088, -0.00146887, -0.00447634,  0.00389009,
        0.00747217,  0.00076424, -0.        , -0.01978345, -0.00252044,
        0.        , -0.        , -0.00137075,  0.00409538, -0.00576774,
        0.        ,  0.        , -0.        ,  0.01153099,  0.01009193,
       -0.        ,  0.        ,  0.        , -0.        , -0.05006087,
       -0.01224234, -0.        ,  0.        , -0.01606123, -0.0067497 ,
        0.03684423, -0.01422545, -0.        , -0.00096661, -0.        ,
       -0.        ,  0.        ,  0.        ,  0.00511439,  0.        ,
       -0.        , -0.00024845, -0.        , -0.        ,  0.0015301 ,
       -0.00174778,  0.        , -0.00238192,  0.02737815,  0.00

In [43]:
# find coefficient that is not 0
coef = [i for i, coe in enumerate(lasso_cv.coef_) if coe!=0]

# How many features were selected?
print(len(coef))

# Which features were they?
print(x_train.columns[coef])

64
Index(['householdsize', 'racepctblack', 'racePctWhite', 'agePct12t29',
       'numbUrban', 'pctUrban', 'pctWWage', 'pctWFarmSelf', 'pctWInvInc',
       'pctWSocSec', 'pctWRetire', 'whitePerCap', 'blackPerCap',
       'indianPerCap', 'AsianPerCap', 'OtherPerCap', 'HispPerCap',
       'PctPopUnderPov', 'PctLess9thGrade', 'PctUnemployed', 'PctEmploy',
       'PctEmplManu', 'MalePctDivorce', 'MalePctNevMarr', 'PctKids2Par',
       'PctYoungKids2Par', 'PctWorkMom', 'NumIlleg', 'PctIlleg', 'NumImmig',
       'PctImmigRec5', 'PctRecImmig8', 'PctNotSpeakEnglWell',
       'PersPerOccupHous', 'PersPerOwnOccHous', 'PctPersOwnOccup',
       'PctPersDenseHous', 'PctHousLess3BR', 'HousVacant', 'PctHousOccup',
       'PctVacantBoarded', 'PctVacMore6Mos', 'RentLowQ', 'MedRent',
       'MedRentPctHousInc', 'MedOwnCostPctIncNoMtg', 'NumInShelters',
       'NumStreet', 'PctForeignBorn', 'PctBornSameState', 'PctSameCity85',
       'LemasSwFTFieldPerPop', 'LemasTotalReq', 'PolicReqPerOffic',
       'Rac

'''
Write out the algebraic expression for the criterion that this model is minimizing.
Is this function differentiable in its parameters? If yes, provide the closed form of the estimator of the parameter vector. (5pts)
'''

- algebraic expression: (1 / (2 * n)) * Σ(yᵢ - ŷᵢ)² + α * Σ|βⱼ|
- Not differentiable

### Train a regression model with L2 regularization

In [44]:
# Fit a ridge regression model with cross-validation (RidgeCV).
# Use the following values of α: [1e-3, 1e-2, 1e-1, 1, 10].
ridge_cv = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1, 10], cv=10)

In [45]:
ridge_cv.fit(x_train, y_train)
selected_alpha = ridge_cv.alpha_

# What is the value of the best regularization constant?
selected_alpha

10.0

In [46]:
y_pred = ridge_cv.predict(x_train)
r2 = r2_score(y_train, y_pred)

# What is the R2 returned by the selected model?
r2

0.6936307407214163

'''Write out the algebraic expression for the criterion that this model is minimizing. Is this function differentiable in its parameters? If yes, provide the closed form of the estimator of the parameter vector.'''

- expression: (1 / (2 * n)) * Σ(yᵢ - ŷᵢ)² + α * Σ(βⱼ²)
- It is differentiable, parameter vector: β̂(ridge) = (XᵀX + αI)⁻¹Xᵀy

'''Between the L1 and L2-penalized models, which is simpler? Why is this the case?'''

L1 is simpler, since it has feature selections, simplifing the model

### Train a random forest

Train many random forest models using RandomizedSearchCV to find the best hyperparameter values. Use n_iter = 100

In [47]:
# create random forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# set the possible hyperparameter values
param_dist = {
    'min_samples_leaf': [int(x) for x in np.linspace(1, 61, num = 10)],
    'max_features': [float(x) for x in np.linspace(.1, 1, num = 10)],
    'max_depth': [int(x) for x in np.linspace(6, 20, num = 10)]
}

# train the model
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=100, cv=5, random_state=42, n_jobs=-1)
random_search.fit(x_train, y_train)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_

Which hyperparameter values were chosen? What do they tell us about our model?

In [48]:
# find the best hyperparameter
best_params = random_search.best_params_
best_params

{'min_samples_leaf': 7, 'max_features': 0.2, 'max_depth': 18}

- In each leaf node of the decision trees in the random forest, there should be at least 7 samples (data points).
- Only a random subset of 20% of the available features will be considered for splitting at each node
- Maximum depth of each tree in the random forest is limited to 18 levels

In [49]:
y_pred = random_search.predict(x_train)
r2 = r2_score(y_train, y_pred)

# What is the R2 returned by the selected model?
r2

0.8270238539156115

### Model Selection

What is the statistic/metric that the cross-validation function is using to evaluate models? Which is the best model based on only that cross-validation criterion?

- R square
- RandomForest has the highest r square score

Of the 3 CV-selected models, which is the best in terms of simplicity and interpretability?

- Lasso

Are the scores between these two models comparable? Make a final suggestion for which model to use and justify why you chose this model.

- Yes, I think they are comparable
- Choose Random Forest because it has highest r square

Does the model include the 5 features with the strongest univariate correlations with the target?

- Yes, they are all included

In [50]:
# find the features that best model includes
best_model = random_search.best_estimator_
best_model.feature_importances_

array([2.30837197e-03, 1.35163022e-03, 1.67172578e-02, 8.80041607e-02,
       1.59589655e-03, 9.26617900e-03, 2.53373905e-03, 1.89125057e-03,
       1.72776609e-03, 1.57625343e-03, 4.86836386e-03, 1.67897104e-03,
       2.01342209e-03, 2.30170229e-03, 2.00667123e-03, 2.04857434e-02,
       2.78485393e-03, 1.06335843e-02, 1.73120849e-03, 2.17786132e-03,
       1.50082241e-03, 2.43372495e-03, 2.68725261e-03, 1.67823716e-03,
       2.42727214e-03, 2.25646456e-03, 2.73066833e-03, 1.46924312e-02,
       1.02005561e-02, 2.84780324e-03, 2.73689059e-03, 2.03205653e-03,
       3.37711584e-03, 1.54438056e-03, 2.43857802e-03, 2.12262960e-03,
       2.80309478e-03, 3.04277181e-03, 8.63559689e-03, 2.68892217e-03,
       1.23606403e-02, 1.74983213e-02, 1.64851167e-03, 1.04766666e-01,
       1.26602360e-01, 3.24787664e-02, 2.95137815e-02, 2.66105949e-03,
       2.61376967e-03, 7.30498459e-02, 1.64981728e-01, 3.91446857e-03,
       2.30143805e-03, 2.01417981e-03, 2.38580289e-03, 1.89962643e-03,
      

In [51]:
# to see if the model includes the strongest 5 features
strong5 = list(correlation_df.tail(5).index)
for i in strong5:
    if best_model.feature_importances_[i]!= 0:
        print('included')

included
included
included
included
included


### Evaluation

Evaluate the final model on test data and give an estimate of R2. Do we think this estimate is optimistic or not? Why?

In [52]:
y_pred = random_search.predict(x_test)
r2 = r2_score(y_test, y_pred)

r2

0.7344689164731175

- I think it is not optimistic because it accounts for variations in the training and validation subsets.

Repeat the whole process of [ split / train / test ] 100 times with different seeds to get a confidence interval for the true value of R2

In [53]:
r2_scores = []

for _ in range(100):
    # split
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=np.random.randint(1, 1000))

    # train
    rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=18, min_samples_leaf=7, max_features=.2)
    rf_regressor.fit(X_train, y_train)

    # Make predictions
    y_pred = rf_regressor.predict(X_test)

    # calculate R2 and append it to the list
    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)

# mean and standard deviation of the R2 scores
mean_r2 = np.mean(r2_scores)
std_r2 = np.std(r2_scores)

# get the confidence interval
confidence_interval = (mean_r2 - 1.96 * std_r2, mean_r2 + 1.96 * std_r2)

In [54]:
confidence_interval

(0.6055328695895266, 0.7192385099463088)