In [134]:
import pandas as pd
import numpy as np


In [135]:
# Load the housing data into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path, so the cell only runs where
# this exact directory exists; consider a path relative to the notebook.
housing_csv_path = '/Users/aoifeduna/AoifeRepo/aoiferepo/Lectures/Unit3/data/housing.csv'
df = pd.read_csv(housing_csv_path)

In [136]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression

In [137]:
# The target is the PRICE column; every other column is used as a feature.
y = df['PRICE']
X = df.drop(columns='PRICE')

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2020
)

# Plain (unregularized) linear regression, used as the baseline model.
lreg = LinearRegression()

In [138]:
# cross_val_score arguments:
#   estimator - the model object you initialized; it does not have to come from
#               scikit-learn, as long as it behaves like a predictor
#   X, y      - always the training set; the test set is never cross-validated
#   cv        - the number of folds to use (10 is a typical choice)
cross_val_score(lreg, X_train, y_train, cv=10)


array([0.49945536, 0.62090992, 0.68073943, 0.66394911, 0.3934969 ,
       0.72625023, 0.77370567, 0.79414459, 0.83785026, 0.78381453])

In [139]:
# The validation score is a proxy for how well the model will perform on future, unseen data

In [140]:
# Fit the linear regression on the training split only.
lreg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [141]:
# Inspect the coefficients learned by the fitted linear regression.
lreg.coef_

array([-1.06957244e-01,  4.45558546e-02,  2.37053116e-02,  2.23742166e+00,
       -1.68583938e+01,  3.87986730e+00,  5.46111202e-05, -1.37954394e+00,
        3.04718275e-01, -1.21659387e-02, -9.14169249e-01,  9.14771590e-03,
       -5.27828497e-01])

In [142]:
# Residuals on the training set: actual PRICE minus the model's prediction.
y_train - lreg.predict(X_train)

215    0.354143
191    0.036086
107   -0.393596
442   -0.591241
230   -0.212520
         ...   
195    9.118376
118    0.076777
323   -1.133564
392   -0.280273
352    1.134491
Name: PRICE, Length: 404, dtype: float64

In [143]:
# Sum of squared residuals on the training set.
train_residuals = y_train - lreg.predict(X_train)
np.sum(train_residuals**2)

9089.097809228135

In [144]:
# L1 penalty (before scaling by alpha): the sum of the absolute coefficient values.
np.sum(np.abs(lreg.coef_))

26.298529417494624

In [145]:
# L2 penalty (before scaling by alpha): the sum of the squared coefficient values.
np.sum(np.square(lreg.coef_))

307.38938992837956

In [146]:
# Regularization strength used to scale the penalty in the next cell.
alpha = 1000

In [147]:
alpha*(lreg.coef_**2).sum()
# The L2 penalty scaled by alpha. With alpha = 1000 this term (~307,389) is far larger
# than the training sum of squared errors computed above (~9,089), so it would dominate the cost.

307389.38992837956

In [148]:
# Contribution of a single coefficient of size 2.5 to the L2 penalty.
2.5**2

6.25

In [149]:
3.5**2-2.5**2
# Growing a coefficient from 2.5 to 3.5 adds 6.0 to the L2 penalty (a difference of 6, before scaling by alpha)

6.0

In [150]:
# With alpha = 100, growing a coefficient from 2.5 to 3.5 adds 100 * 6 = 600 to the
# L2 penalty, so the change only pays off if it cuts the squared error by more than 600.
# (The multiplication must be explicit: `100(...)` tries to call the int and raises TypeError.)
penalty_increase = 100 * ((3.5**2) - (2.5**2))
penalty_increase

600.0

In [151]:
# The bigger the value of alpha, the larger the improvement in fit a coefficient must deliver in order to grow
# In other words: the larger alpha is, the bigger the improvement you have to see in your model to justify a larger coefficient


In [152]:
# The larger alpha is, the bigger the penalty for increasing the size of your weights.
# If alpha were extremely large, the easiest way to reduce the model's cost would be to
# shrink the coefficients, so as alpha grows they naturally trend towards 0.
# If your model is overfitting, your coefficients are artificially high.
alpha = 10000
l2_norm_sq = np.sum(np.square(lreg.coef_))
alpha * l2_norm_sq

3073893.8992837956

In [153]:
from sklearn.linear_model import Ridge, Lasso

In [154]:
# One estimator per penalty type, both created with their default settings.
ridge = Ridge()
lasso = Lasso()

In [155]:
# Display the estimator to see its current hyperparameters.
ridge

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [156]:
# Use a very strong penalty; the trailing semicolon keeps the cell from echoing the estimator.
ridge.set_params(alpha=10000);

In [157]:
# Fit ridge regression on the training split with the alpha set in the previous cell.
ridge.fit(X_train, y_train)

Ridge(alpha=10000, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [158]:
ridge.coef_
# The bigger you make alpha, the smaller the coefficients become
# None of them is exactly 0, though

array([-7.05534222e-02,  5.74457939e-02, -5.59530041e-02,  1.09558337e-02,
       -4.21703719e-04,  8.85516842e-02,  2.06349886e-02, -8.96007975e-02,
        1.19815324e-01, -1.37115961e-02, -1.43617695e-01,  8.35050952e-03,
       -4.74676136e-01])

In [159]:
# Fit lasso on the training split; it is still at its default alpha.
lasso.fit(X_train, y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [160]:
lasso.coef_
# Unlike ridge, lasso has driven some coefficients to exactly 0
# The remaining coefficients have also gotten smaller

array([-0.06092354,  0.04545304, -0.        ,  0.        , -0.        ,
        1.05001025,  0.02287044, -0.56125924,  0.25591408, -0.01475951,
       -0.69776051,  0.00842163, -0.76068136])

In [162]:
# start and stop are both 3 here, so all 7 values equal 10**3; the start needs to be -3
# to span a range of alphas (see the next cell).
np.logspace(3,3,7)

array([1000., 1000., 1000., 1000., 1000., 1000., 1000.])

In [163]:
import numpy as np

# Seven candidate alphas spaced evenly on a log scale: 10**-3, 10**-2, ..., 10**3.
alphas = np.logspace(start=-3, stop=3, num=7)

In [164]:
# Score every candidate alpha with 10-fold cross-validation on the training set,
# recording (mean validation score, alpha) pairs.
# NOTE: `ridge` is updated in place, so after the loop it keeps the last alpha tried.
cv_scores = []

for alpha in alphas:
    # Point the shared ridge estimator at the current candidate value.
    ridge.set_params(alpha=alpha)
    fold_scores = cross_val_score(ridge, X_train, y_train, cv=10)
    cv_scores.append((fold_scores.mean(), alpha))

In [165]:
cv_scores
# These are the (mean validation score, alpha) pairs for each candidate alpha
# The differences between alphas of 0.001 and 100 are fairly minuscule

[(0.6774302782216554, 0.001),
 (0.677417648176563, 0.01),
 (0.6772318822455186, 0.1),
 (0.6743371883099584, 1.0),
 (0.6710963929835836, 10.0),
 (0.6677077597248793, 100.0),
 (0.6276714474010422, 1000.0)]

In [166]:
max(cv_scores)
# Tuples compare on their first element first, so max() returns the pair with the highest mean score
# The ideal value of alpha is 0.001, i.e. almost no regularization,
# which means we're pretty close to plain linear regression
# NOTE(review): original remark, reworded — on messier data sets, larger values of alpha can score higher

(0.6774302782216554, 0.001)

In [168]:
from sklearn.preprocessing import StandardScaler
# StandardScaler standardizes the features for you automatically:
# it subtracts the mean and divides by the standard deviation.

sc = StandardScaler()

In [172]:
sc.fit_transform(X_train)
# "fit" stores the mean and standard deviation values of the training set;
# "transform" then standardizes X_train with them. The result is only displayed here, not saved.

array([[-0.41174958, -0.48041482, -0.07436025, ...,  0.08173983,
         0.37599244, -0.44009701],
       [-0.42850904,  1.46175584, -1.10728653, ..., -1.47767444,
         0.32970292, -1.11493712],
       [-0.42044068, -0.48041482, -0.36762464, ...,  1.13663771,
         0.30584965,  0.21215431],
       ...,
       [-0.40058181, -0.48041482, -0.53809359, ...,  0.54039108,
         0.34647107, -0.11961855],
       [ 1.06773675, -0.48041482,  1.01057351, ...,  0.81558184,
         0.41460641,  1.84842981],
       [-0.42807611,  2.10914606, -1.36010066, ..., -0.05585555,
         0.36064133, -0.67727931]])

In [174]:
sc.transform(X_test)
# transform() scales using the values saved when `sc` was fit on X_train,
# so the test set is transformed based on the training set rather than on its own statistics.
# The preprocessing module gives you this behaviour built in.

array([[ 1.43961437, -0.48041482,  1.01057351, ...,  0.81558184,
        -2.15422575,  1.01546817],
       [-0.41193809,  0.46909083, -0.75768071, ...,  0.31106546,
         0.16946089, -0.34409465],
       [ 0.85174637, -0.48041482,  1.01057351, ...,  0.81558184,
        -0.27902787,  2.45409175],
       ...,
       [-0.41070171,  0.46909083, -0.75768071, ...,  0.31106546,
         0.12635968, -0.01232179],
       [ 3.28801671, -0.48041482,  1.01057351, ...,  0.81558184,
        -1.78095746,  1.05782215],
       [-0.43156555, -0.48041482,  0.40237496, ..., -0.92729293,
         0.36619135,  0.13026995]])

In [177]:
from sklearn.pipeline import make_pipeline

# Chain the scaler and the model: the model goes in as the very last step of the pipeline.
# NOTE(review): `ridge` is the same object that the alpha search updated in place, so it
# still holds the last alpha tried (1000.0, as the fitted pipeline's repr shows) rather
# than the best-scoring alpha — confirm this is intended.
pipe = make_pipeline(sc, ridge)

In [178]:
pipe.fit(X_train, y_train)
# You're not just fitting ridge here; every step in the pipeline (the scaler, then ridge) is fitted

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('ridge',
                 Ridge(alpha=1000.0, copy_X=True, fit_intercept=True,
                       max_iter=None, normalize=False, random_state=None,
                       solver='auto', tol=0.001))],
         verbose=False)

In [180]:
pipe.predict(X_test)
# All of the pipeline's preprocessing steps are applied to X_test before the model predicts
# Much of the data encoding you use can be encapsulated into a single call like this
# The end goal is a deployable pipeline that someone else can use

array([18.61264065, 23.89790735, 15.82072225, 28.75573419, 24.58108404,
       21.64373018, 25.77068315, 20.98322869, 17.58047018, 26.54314078,
       26.44676788, 18.5774517 , 24.03534795, 29.14314292, 17.03786231,
       22.40026083, 16.51113464, 24.27901034, 24.33089874, 20.40845662,
       20.0796294 , 27.94456949, 21.51710373, 26.56935474, 13.47772384,
       13.55908443, 24.76264396, 17.52142434, 27.26478773, 16.04421386,
       24.35386863, 26.65069337,  8.93148912, 23.99402339, 21.56455087,
       21.27218598, 22.69843435, 26.01496891, 20.6671905 , 19.93064201,
       24.21619577, 22.95094889, 18.48129859, 28.73518767, 23.5413658 ,
       20.55739241, 25.69356749, 17.02217011, 24.79941937, 28.51961884,
       27.91167077, 26.80403418, 21.51148124, 17.70720884, 21.04360456,
       20.38147824, 26.75038983, 16.77632644, 14.98816647, 26.99365064,
       26.16070614, 22.6521418 , 24.98754039, 31.35443201, 29.0090383 ,
       25.23111745, 20.87503055, 13.22944049, 16.59776031, 22.50