In [2]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score

In [3]:

# Read the data
ames = pd.read_csv("AmesHousing.csv")

# Get rid of columns with mostly NaN values
good_cols = ames.isna().sum() < 100
ames = ames.loc[:,good_cols]

# Drop other NAs
ames = ames.dropna()

In [48]:
X = ames.drop(["SalePrice", "Order", "PID"], axis = 1)
y = ames["SalePrice"]


ct = ColumnTransformer(
  [
    ("dummify",
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize",
    StandardScaler(),
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

lr_pipeline_1 = Pipeline(
  [("preprocessing", ct),
  ("linear_regression", LinearRegression())]
)

In [5]:
cross_val_score(lr_pipeline_1, X, y, cv = 5, scoring = 'r2')

array([-2.59303720e+21, -1.13145211e+19, -7.57138616e+20, -4.47669752e+18,
       -2.55949915e+20])

In [50]:
#fit best model and find coeff

best_model1 = lr_pipeline_1.fit(X, y)
best_model1.named_steps['linear_regression'].coef_


array([-2.24179963e+13, -2.24179963e+13, -2.24179963e+13, -2.24179963e+13,
       -2.24179963e+13, -2.24179963e+13,  8.77445489e+16,  8.77445489e+16,
       -5.69655063e+16, -5.69655063e+16, -5.69655063e+16, -5.69655063e+16,
       -7.15659594e+16, -7.15659594e+16, -7.15659594e+16, -7.15659594e+16,
       -1.51538003e+15, -1.51538003e+15, -1.51538003e+15, -9.19114245e+16,
       -9.19114245e+16, -9.19114245e+16, -9.19114245e+16, -9.19114245e+16,
        1.15456556e+17,  1.15456556e+17,  1.15456556e+17,  7.79072920e+15,
        7.79072920e+15,  7.79072920e+15,  7.79072920e+15,  7.79072920e+15,
        7.79072920e+15,  7.79072920e+15,  7.79072920e+15,  7.79072920e+15,
        7.79072920e+15,  7.79072920e+15,  7.79072920e+15,  7.79072920e+15,
        7.79072920e+15,  7.79072920e+15,  7.79072920e+15,  7.79072920e+15,
        7.79072920e+15,  7.79072920e+15,  7.79072920e+15,  7.79072920e+15,
        7.79072920e+15,  7.79072920e+15,  7.79072920e+15,  7.79072920e+15,
        7.79072920e+15,  


Practice Activity

Make a pipeline that uses all the variables in the Ames dataset, and then fits Ridge Regression with

Cross-validate this pipeline and compare the results to the ordinary linear regression.

Then fit the model on the whole dataset and get the coefficients. Make a plot of these coefficients compared to the ones from ordinary linear regression

In [36]:
X = ames.drop(["SalePrice", "Order", "PID"], axis = 1)
y = ames["SalePrice"]


ct = ColumnTransformer(
  [
    ("dummify",
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize",
    StandardScaler(),
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

ridge_pipeline_1 = Pipeline(
  [("preprocessing", ct),
  ("ridge", Ridge(alpha=1))]
)

In [10]:
cross_val_score(ridge_pipeline_1, X, y, cv = 5, scoring = 'r2')

array([0.89815807, 0.91744024, 0.79493606, 0.78522563, 0.91389818])

TUNING

In [37]:
from sklearn.model_selection import GridSearchCV

lambdas = {"ridge__alpha": [0.01, 0.1, 1, 10, 100]}

grid = GridSearchCV(ridge_pipeline_1, lambdas, cv = 5, scoring = 'r2')

gscv_fitted = grid.fit(X, y)

In [38]:
gscv_fitted.cv_results_['params']
params_df = pd.DataFrame(gscv_fitted.cv_results_['params'])
results_df = params_df.assign(score = gscv_fitted.cv_results_['mean_test_score'])
results_df.sort_values(by = 'score', ascending = False)

Unnamed: 0,ridge__alpha,score
3,10.0,0.864272
2,1.0,0.861932
4,100.0,0.857773
1,0.1,0.856302
0,0.01,0.854186


In [54]:
ridge_pipeline_1 = Pipeline(
  [("preprocessing", ct),
  ("ridge", Ridge(alpha=10))]
)

best_model2 = ridge_pipeline_1.fit(X, y)

In [56]:
best_model2.named_steps['ridge'].coef_

array([-4.90649460e+03,  2.34694314e+03, -1.22917740e+03,  4.62091189e+03,
        2.20428500e+03, -3.03646802e+03, -6.17531422e+03,  6.17531422e+03,
        2.12575125e+03,  7.53147205e+03, -1.22865537e+04,  2.62933036e+03,
       -1.10363100e+04,  9.09887001e+03, -1.49378054e+03,  3.43122056e+03,
        2.93687893e+03, -2.42162664e+03, -5.15252289e+02,  1.43157683e+02,
        7.56730650e+03, -5.62761832e+03, -2.75896133e+03,  6.76115464e+02,
       -3.78227675e+02,  5.60816320e+03, -5.22993552e+03, -3.76824510e+03,
        2.45905383e+03,  5.70332014e+03, -6.25300540e+02, -4.25100049e+03,
       -8.51682634e+03,  9.07661139e+03, -1.55819272e+04, -1.11163614e+04,
        1.72302347e+03,  1.23832549e+04, -6.48871508e+03, -3.70052362e+02,
        1.59468710e+03, -1.01764141e+04, -1.10938115e+04,  4.18711022e+03,
       -1.32553889e+04,  2.75168983e+04,  2.20925274e+04, -9.78994038e+03,
       -7.01890866e+03, -6.59706874e+03, -8.97446647e+03,  9.31639420e+03,
        3.08242659e+04, -

## Lasso


In [41]:
X = ames.drop(["SalePrice", "Order", "PID"], axis = 1)
y = ames["SalePrice"]


ct = ColumnTransformer(
  [
    ("dummify",
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize",
    StandardScaler(),
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

lasso_pipeline_1 = Pipeline(
  [("preprocessing", ct),
  ("lasso", Lasso(alpha=1, max_iter = 10000))]
)

In [26]:
lambdas = {"lasso__alpha": [0.01, 0.1, 1, 10, 100]}

grid = GridSearchCV(lasso_pipeline_1, lambdas, cv = 5, scoring = 'r2')

grid_fitted = gscv_fitted = grid.fit(X, y)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [24]:
gscv_fitted.cv_results_['params']
params_df = pd.DataFrame(gscv_fitted.cv_results_['params'])
results_df = params_df.assign(score = gscv_fitted.cv_results_['mean_test_score'])
results_df.sort_values(by = 'score', ascending = False)

Unnamed: 0,lasso__alpha,score
4,100.0,0.866931
3,10.0,0.860632
2,1.0,0.857152
1,0.1,0.856618
0,0.01,0.855606


In [52]:
lasso_pipeline_1 = Pipeline(
  [("preprocessing", ct),
  ("lasso", Lasso(alpha=100, max_iter = 10000))]
)


best_model3 = lasso_pipeline_1.fit(X, y)

In [53]:
best_model3.named_steps['lasso'].coef_

array([-0.00000000e+00,  0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
        0.00000000e+00, -3.40179046e+03, -0.00000000e+00,  0.00000000e+00,
       -0.00000000e+00,  2.91561103e+03, -0.00000000e+00,  0.00000000e+00,
       -1.02299521e+04,  5.27128450e+03, -0.00000000e+00,  0.00000000e+00,
        0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
        5.35462663e+03, -2.63928711e+03, -0.00000000e+00,  1.77446540e+02,
       -0.00000000e+00,  2.11317738e+03, -0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  2.14107952e+03,  0.00000000e+00,
       -0.00000000e+00,  1.41381624e+04, -7.39480667e+03, -1.20939271e+03,
        0.00000000e+00,  0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
        0.00000000e+00, -6.88307419e+02, -3.10766828e+03,  0.00000000e+00,
       -4.43798790e+03,  3.63801042e+04,  2.61893814e+04, -2.59146809e+03,
       -0.00000000e+00,  0.00000000e+00, -0.00000000e+00,  1.40936035e+04,
        3.79963928e+04,  

## Elastic net

In [29]:
ct = ColumnTransformer(
  [
    ("dummify",
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'),
    make_column_selector(dtype_include=object)),
    ("standardize",
    StandardScaler(),
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

elastic_pipeline_1 = Pipeline(
  [("preprocessing", ct),
  ("elasticnet", ElasticNet(alpha=1, l1_ratio=0.5))]
)

In [32]:
#iterate through alphas and l1ratio

lambdas = {
    "elasticnet__alpha": [0.01, 0.1, 1, 10, 100],
    "elasticnet__l1_ratio": [0.10, 0.30, 0.5, 0.70,0.9]
}

grid = GridSearchCV(elastic_pipeline_1, lambdas, cv = 5, scoring = 'r2')

grid_fitted = gscv_fitted = grid.fit(X, y)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [33]:
gscv_fitted.cv_results_['params']
params_df = pd.DataFrame(gscv_fitted.cv_results_['params'])
results_df = params_df.assign(score = gscv_fitted.cv_results_['mean_test_score'])
results_df.sort_values(by = 'score', ascending = False)

Unnamed: 0,elasticnet__alpha,elasticnet__l1_ratio,score
2,0.01,0.5,0.864268
3,0.01,0.7,0.864157
1,0.01,0.3,0.864119
0,0.01,0.1,0.863848
9,0.1,0.9,0.863687
4,0.01,0.9,0.863194
8,0.1,0.7,0.860003
7,0.1,0.5,0.856987
6,0.1,0.3,0.854568
5,0.1,0.1,0.852539


In [57]:

elastic_pipeline_1 = Pipeline(
  [("preprocessing", ct),
  ("elasticnet", ElasticNet(alpha=.01, l1_ratio=0.5))]
)

best_model4 = elastic_pipeline_1.fit(X, y)

  model = cd_fast.enet_coordinate_descent(


In [58]:
best_model4.named_steps['elasticnet'].coef_

array([-4.36707316e+03,  2.58930223e+03, -9.19417357e+02,  3.85391906e+03,
        2.21693926e+03, -3.37367006e+03, -5.42672957e+03,  5.42672956e+03,
        1.86723305e+03,  6.98591112e+03, -1.10890057e+04,  2.23386196e+03,
       -1.09990714e+04,  8.99233117e+03, -1.25763948e+03,  3.26437977e+03,
        2.26595098e+03, -1.82550578e+03, -4.39445198e+02, -1.94736847e+01,
        7.32006891e+03, -5.43410203e+03, -2.44365652e+03,  5.78163411e+02,
       -6.16362027e+02,  5.18209052e+03, -4.56472859e+03, -3.87569981e+03,
        1.90088683e+03,  4.70492586e+03,  3.00238837e+02, -3.50912503e+03,
       -8.19981053e+03,  9.22438676e+03, -1.44898572e+04, -1.06699450e+04,
        1.14082112e+03,  9.02194540e+03, -5.36482423e+03, -2.84779674e+02,
        1.44588571e+03, -9.23421603e+03, -1.01670815e+04,  3.40782016e+03,
       -1.23392127e+04,  2.60565188e+04,  2.09499486e+04, -8.52307157e+03,
       -5.88863748e+03, -5.64447620e+03, -8.43567282e+03,  8.49411921e+03,
        2.82637493e+04, -