# Model Evaluation

Process 
1. Import data & set up dummy variables
    * drop 1 column for each grouping!
2. Split data: 
    * Train / Test: X, X_test, y, y_test (~20%)
    * Train / Val: X_train, X_val, y_train, y_val (val = 20% of the remaining 80%, so ~64% train / ~16% val overall)
3. Create 3 Models: Linear, Ridge, Lasso
    * SCALE: Ridge & Lasso
    * Keep default alphas
    * use KFolds
4. Evaluate basic models on scoring (r2 train, r2 val)
    * Use KFolds 
    * if models are really close to one another, tune alphas
    * Pick an alpha range to loop through and compare results 
5. Tune alphas for Ridge & Lasso
    * difference between r2 train and r2 val will be guide
    * if r2 train > r2 val --> overfitting --> increase alpha
6. Pick best version of each model to evaluate
    * linear
    * ridge at best alpha
    * lasso at best alpha
7. RETRAIN best model on entire training data (X)
8. Evaluate model against hold-out data (predict on X_test and compare with y_test)
    * r2 train versus r2 test
    * if difference is significant, more model tuning

In [1]:
# Necessary imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import statsmodels.api as sm
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV, lars_path
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# Global seaborn styling for every plot in this notebook: 'notebook' context,
# white-grid background, fonts scaled by 1.2.
sns.set(context='notebook', style='whitegrid', font_scale=1.2)

## Data Import & Setup

In [2]:
# Load the dataset from the project's data folder.
sf = pd.read_csv('data_files/sf_clean.csv')

# Reformat hood_district: cast to int first, then to str, so the column holds
# strings and pd.get_dummies later expands it into one indicator per district.
sf['hood_district'] = sf['hood_district'].astype(int).astype(str)

In [3]:
# One-hot encode every string/categorical column of the frame.
# dtype=int pins the indicator columns to 0/1 integers on every pandas
# version (pandas >= 2.0 would otherwise emit bool columns).
sf = pd.get_dummies(sf, dtype=int)

In [5]:
# Show the column names after encoding, to pick one column per group to drop.
sf.columns

Index(['price', 'sqft', 'beds', 'bath', 'laundry_(a) in-unit',
       'laundry_(b) on-site', 'laundry_(c) no laundry', 'pets_(a) both',
       'pets_(b) dogs', 'pets_(c) cats', 'pets_(d) no pets',
       'housing_type_(a) single', 'housing_type_(b) double',
       'housing_type_(c) multi', 'parking_(a) valet', 'parking_(b) protected',
       'parking_(c) off-street', 'parking_(d) no parking', 'hood_district_1',
       'hood_district_10', 'hood_district_2', 'hood_district_3',
       'hood_district_4', 'hood_district_5', 'hood_district_6',
       'hood_district_7', 'hood_district_8', 'hood_district_9'],
      dtype='object')

In [6]:
# One indicator column per categorical group is removed (the "drop 1 column
# for each grouping" step); the dropped level becomes the implicit baseline.
cols_to_drop = [
    'laundry_(c) no laundry',
    'pets_(d) no pets',
    'housing_type_(c) multi',
    'parking_(d) no parking',
    'hood_district_10',
]

sf = sf.drop(columns=cols_to_drop)

In [8]:
# Preview the first rows of the encoded frame after the column drop.
sf.head()

Unnamed: 0,price,sqft,beds,bath,laundry_(a) in-unit,laundry_(b) on-site,pets_(a) both,pets_(b) dogs,pets_(c) cats,housing_type_(a) single,...,parking_(c) off-street,hood_district_1,hood_district_2,hood_district_3,hood_district_4,hood_district_5,hood_district_6,hood_district_7,hood_district_8,hood_district_9
0,6800,1600.0,2.0,2.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,3500,550.0,1.0,1.0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,5100,1300.0,2.0,1.0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,9000,3500.0,3.0,2.5,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,3100,561.0,1.0,1.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


## Training & Hold-Out Splits

### Train / Test

In [9]:
# Features are every column except the target, 'price'.
X = sf.drop(columns='price')
y = sf['price']

# Hold out 20% of the rows for final testing; from here on X / y refer to
# the remaining 80%.
X, X_test, y, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=10,
)

### Train / Validation

In [10]:
# Split the non-test data again: 20% of X becomes the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=10,
)

## Models

### Linear

In [11]:
# Baseline: ordinary least squares on the unscaled training features.
lin = LinearRegression().fit(X_train, y_train)

# R^2 on the data the model was fitted on vs. on the validation split.
lin_train_r2 = lin.score(X_train, y_train)
lin_val_r2 = lin.score(X_val, y_val)
print(f'Linear Regression train R^2: {lin_train_r2:.5f}')
print(f'Linear Regression val R^2: {lin_val_r2:.5f}')

Linear Regression train R^2: 0.78184
Linear Regression val R^2: 0.67066


### Ridge

#### Scaling Features for Lasso / Ridge

In [12]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to the train, validation and test splits.
scaler = StandardScaler().fit(X_train.values)

X_train_scaled = scaler.transform(X_train.values)
X_val_scaled = scaler.transform(X_val.values)
X_test_scaled = scaler.transform(X_test.values)

#### Model Setup

In [13]:
# Ridge regression with the default alpha, fitted on the scaled training features.
rid = Ridge()

rid.fit(X_train_scaled, y_train)
# The first score is computed on the training split, so it is labelled
# "train" (it was previously mislabelled "test").
print(f'Ridge Regression train R^2: {rid.score(X_train_scaled, y_train):.5f}')
print(f'Ridge Regression val R^2: {rid.score(X_val_scaled, y_val):.5f}')

Ridge Regression test R^2: 0.78181
Ridge Regression val R^2: 0.67054


### Lasso

In [14]:
# Lasso regression with the default alpha, fitted on the scaled training features.
las = Lasso()

las.fit(X_train_scaled, y_train)
# These are Lasso scores on the training and validation splits; the labels
# previously read "Ridge Regression test/val".
print(f'Lasso Regression train R^2: {las.score(X_train_scaled, y_train):.5f}')
print(f'Lasso Regression val R^2: {las.score(X_val_scaled, y_val):.5f}')

Ridge Regression test R^2: 0.78172
Ridge Regression val R^2: 0.67067


### Model Evaluation using KFolds

In [20]:
sf2 = sf.copy()

# Features and target are both taken from the same frame (sf2) so the two
# can never get out of sync if sf2 is modified later.
X, y = sf2.drop('price', axis=1), sf2['price']

# NOTE(review): this hold-out split uses random_state=30 while the earlier
# train/test split used random_state=10, so the two test sets contain
# different rows -- confirm this is intended before reporting a final test score.
X, X_test, y, y_test = train_test_split(X, y, test_size=.2, random_state=30)

# Convert to NumPy arrays so the integer positions produced by kf.split can
# be used directly to index rows.
X, y = np.array(X), np.array(y)

In [23]:
kf = KFold(n_splits=5, shuffle=True, random_state=12)

# One list of per-fold validation R^2 values for each of the three models
cv_lin_r2s = []
cv_rid_r2s = []
cv_las_r2s = []

for train_ind, val_ind in kf.split(X, y):

    X_train, X_val = X[train_ind], X[val_ind]
    y_train, y_val = y[train_ind], y[val_ind]

    # Scale using statistics from this fold's training rows only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Simple linear regression on the unscaled features
    lin = LinearRegression().fit(X_train, y_train)
    cv_lin_r2s.append(lin.score(X_val, y_val))

    # Ridge (default alpha) on the scaled features
    rid = Ridge().fit(X_train_scaled, y_train)
    cv_rid_r2s.append(rid.score(X_val_scaled, y_val))

    # Lasso (default alpha) on the scaled features
    las = Lasso().fit(X_train_scaled, y_train)
    cv_las_r2s.append(las.score(X_val_scaled, y_val))

# Raw per-fold scores
print('Simple scores: ', cv_lin_r2s)
print('Ridge scores: ', cv_rid_r2s)
print('Lasso scores: ', cv_las_r2s, '\n')

# Mean and spread of the validation R^2 across folds
print(f'Simple mean cv r^2: {np.mean(cv_lin_r2s):.5f} +- {np.std(cv_lin_r2s):.5f}')
print(f'Ridge mean cv r^2: {np.mean(cv_rid_r2s):.5f} +- {np.std(cv_rid_r2s):.5f}')
print(f'Lasso mean cv r^2: {np.mean(cv_las_r2s):.5f} +- {np.std(cv_las_r2s):.5f}')

Simple scores:  [0.7902121151705227, 0.678231918470749, 0.6539934339757945, 0.7160366020304101, 0.8024668639219245]
Ridge scores:  [0.789965695329414, 0.6794760528664688, 0.6538530428786878, 0.7162344132034061, 0.8022393400128285]
Lasso scores:  [0.7904453069834163, 0.6802749348235684, 0.6541668011093027, 0.7158473766765832, 0.8020956988709493] 

Simple mean cv r^2: 0.72819 +- 0.05918
Ridge mean cv r^2: 0.72835 +- 0.05889
Lasso mean cv r^2: 0.72857 +- 0.05876


## Alpha Tuning - Ridge & Lasso

In [24]:
sf3 = sf.copy()

# Features and target are both taken from the same frame (sf3).
X, y = sf3.drop('price', axis=1), sf3['price']

X, X_test, y, y_test = train_test_split(X, y, test_size=.2, random_state=30)

# X and y deliberately stay pandas objects here (no conversion to NumPy):
# the scaling cell further down calls `.values` on the split frames.

# KFolds

In [25]:
# Set aside 20% of the non-test data as a validation split for the tuned models.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=10,
)

In [26]:
# Scaling features: statistics are learned from the training split only and
# then applied unchanged to the train, validation and test splits.
scaler = StandardScaler().fit(X_train.values)

X_train_scaled = scaler.transform(X_train.values)
X_val_scaled = scaler.transform(X_val.values)
X_test_scaled = scaler.transform(X_test.values)

### Lasso

In [36]:
# 10,000 candidate alphas, log-spaced from 1e-2 to 1e2.
alphavec = np.logspace(-2, 2, 10000)

# 5-fold cross-validated search over the alpha grid.
# NOTE(review): X_train_scaled was standardised once on the whole training
# split, so the scaler is not refit inside each CV fold.
lasso_model = LassoCV(cv=5, alphas=alphavec)
lasso_model.fit(X_train_scaled, y_train)

LassoCV(alphas=array([1.00000000e-02, 1.00092155e-02, 1.00184395e-02, ...,
       9.98159444e+01, 9.99079298e+01, 1.00000000e+02]),
        cv=5)

In [41]:
# Keep the alpha selected by the LassoCV search and display it.
las_alpha = lasso_model.alpha_
las_alpha

2.1253611770864973

In [42]:
# Refit a Lasso at the alpha selected by the cross-validated search.
las = Lasso(alpha=las_alpha)

las.fit(X_train_scaled, y_train)
# These are Lasso scores on the training and validation splits; the labels
# previously read "Ridge Regression test/val".
print(f'Lasso Regression train R^2: {las.score(X_train_scaled, y_train):.5f}')
print(f'Lasso Regression val R^2: {las.score(X_val_scaled, y_val):.5f}')

Ridge Regression test R^2: 0.73848
Ridge Regression val R^2: 0.78571


### Ridge

In [44]:
# 100 candidate alphas, log-spaced from 1e-2 to 1e2.
alphavec = np.logspace(-2, 2, 100)

# 5-fold cross-validated search over the alpha grid.
ridge_model = RidgeCV(cv=5, alphas=alphavec)
ridge_model.fit(X_train_scaled, y_train)

RidgeCV(alphas=array([1.00000000e-02, 1.09749877e-02, 1.20450354e-02, 1.32194115e-02,
       1.45082878e-02, 1.59228279e-02, 1.74752840e-02, 1.91791026e-02,
       2.10490414e-02, 2.31012970e-02, 2.53536449e-02, 2.78255940e-02,
       3.05385551e-02, 3.35160265e-02, 3.67837977e-02, 4.03701726e-02,
       4.43062146e-02, 4.86260158e-02, 5.33669923e-02, 5.85702082e-02,
       6.42807312e-02, 7.05480231e-0...
       1.17681195e+01, 1.29154967e+01, 1.41747416e+01, 1.55567614e+01,
       1.70735265e+01, 1.87381742e+01, 2.05651231e+01, 2.25701972e+01,
       2.47707636e+01, 2.71858824e+01, 2.98364724e+01, 3.27454916e+01,
       3.59381366e+01, 3.94420606e+01, 4.32876128e+01, 4.75081016e+01,
       5.21400829e+01, 5.72236766e+01, 6.28029144e+01, 6.89261210e+01,
       7.56463328e+01, 8.30217568e+01, 9.11162756e+01, 1.00000000e+02]),
        cv=5)

In [45]:
# Keep the alpha selected by the RidgeCV search and display it.
rid_alpha = ridge_model.alpha_
rid_alpha

2.656087782946687

In [None]:
# Refit a Ridge at the alpha selected by the cross-validated search.
rid = Ridge(alpha=rid_alpha)

rid.fit(X_train_scaled, y_train)
# The first score is computed on the training split, so it is labelled
# "train" (it was previously mislabelled "test").
print(f'Ridge Regression train R^2: {rid.score(X_train_scaled, y_train):.5f}')
print(f'Ridge Regression val R^2: {rid.score(X_val_scaled, y_val):.5f}')