In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from os import sys
sys.path.append("../../../machine_learning_tools/")

In [3]:
import sklearn_utils as sklu
import pandas_ml as pdml
import seaborn_ml as sml
import sklearn_models as sklm
import feature_selection_utils as fsu
from tqdm.notebook import tqdm

In [4]:
import pandas as pd
pd.set_option("display.max_colwidth",1000)

# Loading the Dataset

In [5]:
#loading the dataset (alreaady drops nan falues)
target_name = "MEDV"
df_raw = sklu.load_boston()
df_raw

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


In [6]:
X,y = pdml.X_y(df_raw,target_name)

In [7]:
feature_names = pdml.feature_names(X)

# Part A


Empirically demonstrate that fitting linear regression with an intercept term is equivalent to (i)
fitting linear regression when centering Y and centering the columns of X, and (ii) fitting linear
regression when adding a column of ones to X


# i) Fitting linear regression with intercept

In [8]:
import sklearn.linear_model as linear_model

In [9]:
X,y = pdml.X_y(df_raw,target_name)
model = linear_model.LinearRegression()
model.fit(X,y)
sklm.coef_summary(feature_names,model=model)

Intercept: 36.45948838508978
CRIM:-0.10801135783679891
ZN:0.04642045836687701
INDUS:0.020558626367074447
CHAS:2.6867338193449433
NOX:-17.766611228300217
RM:3.809865206809205
AGE:0.00069222464035224
DIS:-1.4755668456002557
RAD:0.30604947898517204
TAX:-0.01233459391657523
PTRATIO:-0.9527472317072907
B:0.009311683273794072
LSTAT:-0.5247583778554871


# ii) Centering columns x and y

In [10]:
df = pdml.center_df(df_raw)
X,y = pdml.X_y(df,target_name)
model = linear_model.LinearRegression(fit_intercept=False)
model.fit(X,y)
sklm.coef_summary(feature_names,model=model)

Intercept: 0.0
CRIM:-0.10801135783679564
ZN:0.04642045836688165
INDUS:0.020558626367077348
CHAS:2.6867338193448913
NOX:-17.766611228300157
RM:3.809865206809213
AGE:0.000692224640342843
DIS:-1.4755668456002569
RAD:0.30604947898517065
TAX:-0.012334593916574356
PTRATIO:-0.952747231707288
B:0.009311683273793711
LSTAT:-0.5247583778554921


# iii) Adding all ones to X

In [11]:
df_ones = df_raw.copy()
df_ones["intercept_col"] = 1

X,y = pdml.X_y(df_ones,target_name)
model = linear_model.LinearRegression(fit_intercept=False)
model.fit(X,y)
sklm.coef_summary(list(X.columns),model=model)

Intercept: 0.0
CRIM:-0.10801135783679747
ZN:0.046420458366880774
INDUS:0.020558626367068834
CHAS:2.6867338193448447
NOX:-17.766611228300395
RM:3.8098652068092433
AGE:0.0006922246403427094
DIS:-1.4755668456002557
RAD:0.3060494789851776
TAX:-0.012334593916574252
PTRATIO:-0.952747231707294
B:0.009311683273794051
LSTAT:-0.524758377855485
intercept_col:36.459488385089884


Observation: The coefficients are all the same in each of the 3 scenarios

# Part B

Empirically demonstrate that the least squares solution has zero training error when p > n.


In [35]:
"""
Psueodocode: 
For n in [1:n_features)
    a) restrict the data table to n rows (so only n observations)
    b) Run the linear regression on the data
    c) Print the training error

"""

# n_features = pdml.n_features(
#     df,
#     target_name=target_name
# )

print(f"Training Error for values of n")
for n in range(1,10):
    df_restricted = df_raw.iloc[:n]
    model = linear_model.LinearRegression(fit_intercept=False)
    X,y = pdml.X_y(df_restricted,target_name)
    model.fit(X,y)
    training_error = sklu.MSE(y,model=model,X=X)
    print(f"{n}:{training_error} ")

Training Error for values of n
1:1.135959703518257e-28 
2:3.155443620884047e-29 
3:2.9030081312133234e-28 
4:2.8398992587956425e-28 
5:1.0546275131012465e-24 
6:2.751736164028142e-25 
7:3.1711306834564424e-26 
8:3.979077514807201e-25 
9:2.638077084803899e-25 


# demonstrate this for another dataset where p > n

In [36]:
X,y = sklu.random_regression_with_informative_features(
n_samples = 300,
n_features = 8000,
    return_true_coef = False
)

model.fit(X,y)
training_error = sklu.MSE(y,model=model,X=X)
training_error

1.5291887714346117e-24

# Part C

Empirically demonstrate the MSE Existence Theorem. Recall that the MSE Existence Theorem
states that there exists a value of λ for which MSE(XβˆRidge(λ)
) < MSE(XβˆLS).

In [37]:
import preprocessing_ml as preml
df_scaled = preml.scale_df(df,
            target_name=target_name,
            scaler = "StandardScaler",
            verbose = False)
df_scaled[target_name] = df[target_name]
df_scaled

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,-0.419782,0.284830,-1.287909,-0.272599,-0.144217,0.413672,-0.120013,0.140214,-0.982843,-0.666608,-1.459000,0.441052,-1.075562,1.467194
1,-0.417339,-0.487722,-0.593381,-0.272599,-0.740262,0.194274,0.367166,0.557160,-0.867883,-0.987329,-0.303094,0.441052,-0.492439,-0.932806
2,-0.417342,-0.487722,-0.593381,-0.272599,-0.740262,1.282714,-0.265812,0.557160,-0.867883,-0.987329,-0.303094,0.396427,-1.208727,12.167194
3,-0.416750,-0.487722,-1.306878,-0.272599,-0.835284,1.016303,-0.809889,1.077737,-0.752922,-1.106115,0.113032,0.416163,-1.361517,10.867194
4,-0.412482,-0.487722,-1.306878,-0.272599,-0.835284,1.228577,-0.511180,1.077737,-0.752922,-1.106115,0.113032,0.441052,-1.026501,13.667194
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,-0.413229,-0.487722,0.115738,-0.272599,0.158124,0.439316,0.018673,-0.625796,-0.982843,-0.803212,1.176466,0.387217,-0.418147,-0.132806
502,-0.415249,-0.487722,0.115738,-0.272599,0.158124,-0.234548,0.288933,-0.716639,-0.982843,-0.803212,1.176466,0.441052,-0.500850,-1.932806
503,-0.413447,-0.487722,0.115738,-0.272599,0.158124,0.984960,0.797449,-0.773684,-0.982843,-0.803212,1.176466,0.441052,-0.983048,1.367194
504,-0.407764,-0.487722,0.115738,-0.272599,0.158124,0.725672,0.736996,-0.668437,-0.982843,-0.803212,1.176466,0.403225,-0.865302,-0.532806


In [67]:
(X_train,
X_test,
y_train,
y_test) = sklu.train_val_test_split(X,y,test_size=0.2)

In [68]:
model = sklm.LinearRegression()
model.fit(X_train,y_train)
mse_least_squares= sklu.MSE(y_test,model = model , X = X_test )
print(f"Least squares solution = {mse_least_squares}")

Least squares solution = 22.37197051553544


In [48]:
from tqdm.notebook import tqdm
import numpy as np

In [69]:
"""
Purpose: Want to show that there is always
 a ridge regression solution that is better than
 the least squared solution 
 
1) 

"""
mse_error_lamda = []
for l in tqdm(np.linspace(0.0001,10,100000)):
    model_ridge = sklm.Ridge(alpha=l)
    model_ridge.fit(X_train,y_train)
    mse_error = sklu.MSE(y_test,model = model_ridge , X = X_test )
    if mse_error < mse_least_squares:
        print(f"lambda = {l} had smaller mse error = {mse_error}")
        break
    mse_error_lamda.append(mse_error)
#print(f"For ridge with lamda = {model_ridge.alpha_}, mse_error ={mse_error}")

  0%|          | 0/100000 [00:00<?, ?it/s]

lambda = 0.0001 had smaller mse error = 22.288084478793664


In [None]:
"""
Observations: There was an instance where the 
ridge regression had a lower MSE than least squares solution

"""