In [1]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import datasets ## imports datasets from scikit-learn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Load Datasets

In [None]:
# Load the Boston housing data into `data` (a Bunch with .data, .target and
# .feature_names), then build the feature frame, the target frame and their
# column-wise concatenation.
#
# `datasets.load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2.
# When the loader is unavailable, fall back to the original CMU copy of the
# file (the replacement recipe given in scikit-learn's deprecation notice).
# NOTE: the fallback path needs network access.
try:
    data = datasets.load_boston() ## loads Boston dataset from datasets library
except (ImportError, AttributeError):
    from sklearn.utils import Bunch

    raw_df = pd.read_csv("http://lib.stat.cmu.edu/datasets/boston",
                         sep=r"\s+", skiprows=22, header=None)
    # Each record is wrapped over two physical rows of the raw file:
    # 11 features on the even rows, then 2 more features and MEDV on the odd rows.
    data = Bunch(
        data=np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        target=raw_df.values[1::2, 2],
        feature_names=np.array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                                'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']),
    )

df = pd.DataFrame(data.data, columns=data.feature_names)
target = pd.DataFrame(data.target, columns=["MEDV"])
# Concatenate y in the dataframe
df_target = pd.concat([df, target], axis=1)

# Some Exploration

## 1. When variables are centered, the intercept term is always 0

In [None]:
# Demonstration: once the response and every regressor are mean-centered,
# the fitted intercept beta_0 is always 0.
column_means = df_target.mean(axis=0)
df_centered = df_target.sub(column_means, axis='columns')   # subtract each column's own mean
model = smf.ols(formula='MEDV ~ CHAS + RM + AGE', data=df_centered).fit()
model.summary()   # last expression -> rendered as the cell output

## 2. Standardization

In [None]:
from sklearn.preprocessing import StandardScaler

# Toy data: two identical columns holding the values 0, 0, 1, 1.
# NOTE: this rebinds `data`, which previously held the loaded Boston dataset.
data = pd.DataFrame([[0, 0], [0, 0], [1, 1], [1, 1]], dtype=np.float64)

# Create the scaler and fit it; fit() stores the per-column mean_ and var_.
scaler = StandardScaler().fit(data)
print('mean of data', scaler.mean_, 'variance of data', scaler.var_)

# transform() applies the fitted scaling to the data it was fitted on ...
data_scaled = scaler.transform(data)
print('scaled data is\n', data_scaled)

# ... and to any other data point, reusing the same fitted statistics.
new_point = [[2, 2]]
print(scaler.transform(new_point))

In [None]:
# Display `data` as the cell output. In top-to-bottom execution order this is
# the toy DataFrame built in the standardization cell, not the Boston dataset.
data

# Lasso and Ridge

## 1. Without Cross-validation

### 1.1 Lasso (use sklearn package) 

The objective for Lasso is: 
$\frac{1}{2n}||y - X\beta||^2_2 + \alpha * ||\beta||_1$

In [None]:
# Step 1: build the (reduced) design matrix and standardize both X and y.
from sklearn.preprocessing import StandardScaler

# Change categorical CHAS to one-hot, dropping the first level (drop_first=True).
df_dummy = pd.get_dummies(df, columns=['CHAS'], drop_first=True)
# Variables we don't include in this model.
excluded_columns = ['ZN', 'INDUS', 'NOX', 'RAD', 'AGE', 'PTRATIO', 'B']
X = df_dummy.drop(columns=excluded_columns)
y = target

# Separate scaler objects for X and y; fit_transform() fits each scaler and
# returns the scaled values in a single call.
scaler_x, scaler_y = StandardScaler(), StandardScaler()
X = scaler_x.fit_transform(X)
y = scaler_y.fit_transform(y)

In [None]:
# Lasso at a single, fixed penalty strength.
from sklearn.linear_model import Lasso      # Runs Lasso with a given parameter

lasso_alpha = 0.15                           # Here alpha is like lambda in our slide
model = Lasso(alpha=lasso_alpha).fit(X, y)   # fit() returns the fitted estimator
model                                        # show the fitted estimator as the cell output

In [None]:
# Inspect the fitted model: coefficients, intercept and in-sample R^2, in that order.
for fitted_quantity in (
    model.coef_,          # beta_1, beta_2,...,beta_6
    model.intercept_,     # beta_0 (we see that with the transformation, beta_0 = 0)
    model.score(X, y),    # R^2
):
    print(fitted_quantity)

# Predict y for the same X the model was fitted on; print only the first five.
y_pred = model.predict(X)
print(y_pred[:5])

### 1.2 Ridge (sklearn)

The objective for ridge is: 
$||y - X\beta||^2_2 + \alpha * ||\beta||^2_2$

In [None]:
# Ridge at a single, fixed penalty strength.
from sklearn.linear_model import Ridge      # Runs Ridge with a given parameter

# The Ridge objective above has no 1/(2n) factor in front of the squared error
# (unlike the Lasso one), so 0.15 is multiplied by 2n to put the penalty on
# the same scale.
ridge_alpha = 0.15 * (2 * len(y))            # Here alpha is like lambda in our slide
model = Ridge(alpha=ridge_alpha).fit(X, y)   # fit() returns the fitted estimator
model                                        # show the fitted estimator as the cell output

In [None]:
# Inspect the fitted Ridge model: coefficients, intercept and in-sample R^2.
print(model.coef_)         # beta_1, beta_2,...,beta_6
print(model.intercept_)    # beta_0
print(model.score(X,y))    # R^2
# Predict once (the original cell ran the identical predict call twice in a row).
y_pred = model.predict(X)  # Predicting y given X
print(y_pred[:5])

### 1.3 Lasso and Ridge using statsmodels package

#### Full model

In [None]:
import statsmodels.api as sm

# Unpenalized least squares on the standardized data: the "full model".
# X is passed as-is (no constant column is added in this cell).
ols_spec = sm.OLS(y, X)   ## sm.OLS(output, input)
model = ols_spec.fit()
model.summary()           # last expression -> rendered as the cell output

#### Lasso

The objective minimized by statsmodels' `fit_regularized` is:
$\frac{1}{2n}||y - X\beta||^2_2 + \alpha\left(0.5 * (1 - \text{L1\_wt}) * ||\beta||^2_2 + \text{L1\_wt} * ||\beta||_1\right)$

In [None]:
# Lasso via statsmodels: with L1_wt=1.0 the penalty in the objective above
# reduces to alpha * ||beta||_1.
lasso_spec = sm.OLS(y, X)
model = lasso_spec.fit_regularized(alpha=0.15, L1_wt=1.0)
print(model.params)

#### Ridge

In [None]:
# Ridge via statsmodels: with L1_wt=0 only the 0.5 * alpha * ||beta||_2^2 term
# of the built-in objective remains, so alpha is doubled to cancel that 0.5.
ridge_spec = sm.OLS(y, X)
model = ridge_spec.fit_regularized(alpha=0.15 * 2, L1_wt=0)
print(model.params)

## 2. Lasso and Ridge with Cross-Validation

### 2.1 Split data in Kfold

In [None]:
from sklearn.model_selection import KFold

# Five folds, explicitly without shuffling the rows.
kf = KFold(n_splits=5, shuffle=False)
print(kf)

In [None]:
# For every split yielded by `kf.split(X)`, print the index array used for
# training followed by the index array held out for testing.
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)

In [None]:
from sklearn.model_selection import KFold
# Shuffle the rows before splitting them into 5 folds. A fixed random_state
# makes the shuffle (and therefore the printed folds) identical on every run;
# without it, each execution of the notebook produces different splits.
kf = KFold(n_splits=5, shuffle=True, random_state=42) # shuffle
print(kf)

In [None]:
# Same loop as for the unshuffled splitter: print the training and test index
# arrays of every split yielded by the current `kf`.
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)

We don't need to call `KFold` ourselves to run cross-validation: the built-in helpers used below (e.g. `cross_val_score`) already do the splitting for us.

### 2.2 We can use the built-in function `cross_val_score` to evaluate the model

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of a Lasso with a fixed alpha.
model = Lasso(alpha=0.15)   # Here alpha is like lambda in our slide
fold_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
# One score per fold; with this scoring each value is the negated MSE,
# so values closer to 0 are better.
print(fold_scores)

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of a Ridge with a fixed alpha.
ridge_alpha = 0.15 * (2 * len(y))   # Here alpha is like lambda in our slide
model = Ridge(alpha=ridge_alpha)
fold_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
# One score per fold; with this scoring each value is the negated MSE,
# so values closer to 0 are better.
print(fold_scores)

In [None]:
### 2.3 Grid Search and CV search for the best lambda

In [None]:
# Step 1 again: standardize X and y, this time keeping every predictor.
from sklearn.preprocessing import StandardScaler

# Change categorical CHAS to one-hot, dropping the first level (drop_first=True).
df_dummy = pd.get_dummies(df, columns=['CHAS'], drop_first=True)
X, y = df_dummy, target

# Separate scaler objects for X and y; fit_transform() fits each scaler and
# returns the scaled values in a single call.
scaler_x, scaler_y = StandardScaler(), StandardScaler()
X = scaler_x.fit_transform(X)
y = scaler_y.fit_transform(y)

In [None]:
# Grid search over alpha for Lasso, scored by 5-fold cross-validated MSE,
# followed by a plot of the CV error curve and the selected alpha.
from sklearn.model_selection import GridSearchCV
model = Lasso()
alphas = np.logspace(-10, 10, 100)   # 100 candidates, log-spaced from 1e-10 to 1e10

tuned_parameters = [{'alpha': alphas}]
n_folds = 5

# refit=False: only the cross-validation results are used below, so no final
# model is refitted on the full data.
clf = GridSearchCV(model, tuned_parameters, cv=n_folds, refit=False, scoring='neg_mean_squared_error')
clf.fit(X, y)
scores = clf.cv_results_['mean_test_score']     # mean *negated* MSE per alpha (higher is better)
scores_std = clf.cv_results_['std_test_score']  # spread of the score across folds
# The scorer returns -MSE. Flip the sign so the curve matches its 'test MSE'
# axis label (the original plotted the negated values under that label).
mse = -scores

plt.figure().set_size_inches(8, 6)
plt.semilogx(alphas, mse)
plt.ylabel('test MSE')
plt.xlabel('alpha')
plt.axhline(np.min(mse), linestyle='--', color='.5')   # lowest (best) CV MSE
plt.xlim([alphas[0], alphas[-1]])
plt.show()

print(clf.best_params_)

In [None]:
# Grid search over alpha for Ridge, scored by 5-fold cross-validated MSE,
# followed by a plot of the CV error curve and the selected alpha.
from sklearn.model_selection import GridSearchCV
model = Ridge()
alphas = np.logspace(-10, 10, 100)   # 100 candidates, log-spaced from 1e-10 to 1e10

tuned_parameters = [{'alpha': alphas}]
n_folds = 5

# refit=False: only the cross-validation results are used below, so no final
# model is refitted on the full data.
clf = GridSearchCV(model, tuned_parameters, cv=n_folds, refit=False, scoring='neg_mean_squared_error')
clf.fit(X, y)
scores = clf.cv_results_['mean_test_score']     # mean *negated* MSE per alpha (higher is better)
scores_std = clf.cv_results_['std_test_score']  # spread of the score across folds
# The scorer returns -MSE. Flip the sign so the curve matches its 'test MSE'
# axis label (the original plotted the negated values under that label).
mse = -scores

plt.figure().set_size_inches(8, 6)
plt.semilogx(alphas, mse)
plt.ylabel('test MSE')
plt.xlabel('alpha')
plt.axhline(np.min(mse), linestyle='--', color='.5')   # lowest (best) CV MSE
plt.xlim([alphas[0], alphas[-1]])
plt.show()

print(clf.best_params_)