## Models, Models, Models!

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline

### Train/Validation Split

In [3]:
#read in chosen clean data set and specify which columns to analyze
cols_of_interest = ['garage_area', 'total_bsmt_sf', 'bsmt_full_bath', 'full_bath', 'bsmt_half_bath', 'half_bath','overall_qual', 'gr_liv_area', 'year_built', 'year_remod/add']
data = pd.read_csv('../data/cleaned_train')
X = data[cols_of_interest]
y = data['saleprice']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.2, random_state=8)




### Null Model

Below is our RMSE score and percent variance to beat.

In [4]:
null_pred = y_train.mean()
null_train_rmse = mean_squared_error(y_train, np.full_like(y_train, null_pred), squared=False)
null_val_rmse = mean_squared_error(y_val, np.full_like(y_val, null_pred), squared=False)
print(f"Null Training RMSE: {round(null_train_rmse, 2)}")
print(f"Null Validation RMSE: {round(null_val_rmse, 2)}")
print(f"Null RMSE Variance: {round(((null_val_rmse - null_train_rmse)/null_train_rmse) * 100, 2)}%")

Null Training RMSE: 78287.72
Null Validation RMSE: 82946.12
Null RMSE Variance: 5.95%


### Iterative Modeling

#### Helper Functions

In [5]:
#could also use make_scorer
def rmse(estimator, X, y):
    '''
    Converts SkLearn's mean_squared_error function into one useable by cross_val_score
    Returns the Root Mean Squared Error of the given estimator's predictions on X, compared to y
    '''
    return mean_squared_error(y, estimator.predict(X), squared=False)

In [6]:
def print_rmse(model, y_train, y_train_pred, y_val, y_val_pred):
    '''
    Prints the training and validation Root Mean Squared Error for the given estimator with the given predictions
    Returns the Training RMSE, Validation RMSE, and Percent Variance between the two
    '''
    train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)
    val_rmse = mean_squared_error(y_val, y_val_pred, squared=False)
    print(f"{model} Training RMSE: {round(train_rmse, 2)}")
    print(f"{model} Validation RMSE: {round(val_rmse, 2)}")
    return train_rmse, val_rmse, ((val_rmse - train_rmse)/train_rmse) * 100

#### Linear Regression

In [7]:
#scale data
sc = StandardScaler()
X_train_sc =  sc.fit_transform(X_train)
X_val_sc = sc.transform(X_val)

#instantiate and fit
lr = LinearRegression()
lr.fit(X_train_sc, y_train)

#predict
lr_train_preds = lr.predict(X_train_sc)
lr_val_preds = lr.predict(X_val_sc)

#evaluate
print_rmse(lr, y_train, lr_train_preds, y_val, lr_val_preds)

LinearRegression() Training RMSE: 34452.87
LinearRegression() Validation RMSE: 40499.58


(34452.86503832578, 40499.576009698474, 17.550676742402306)

In [8]:
cross_val_score(lr, X_train, y_train, scoring=rmse)

array([33264.21823226, 32061.22055741, 35195.27559991, 44416.29260667,
       32164.73128547])

In [9]:
X_train._get_numeric_data().columns


Index(['garage_area', 'total_bsmt_sf', 'bsmt_full_bath', 'full_bath',
       'bsmt_half_bath', 'half_bath', 'overall_qual', 'gr_liv_area',
       'year_built', 'year_remod/add'],
      dtype='object')

#### Lasso

In [42]:
#adapted from class code extra-extras

ctx = ColumnTransformer(
    [('ss', StandardScaler(), X_train._get_numeric_data().columns),
    ('ohe', OneHotEncoder(handle_unknown = 'ignore'), [x for x in X_train.columns if x not in X_train._get_numeric_data().columns])
    ]
)

lasso_pipe = Pipeline([
    ('ctx', ctx),
    ('lasso', Lasso())
])

lasso_params = {
    'ctx__remainder' : ['passthrough'], # keep untransformed columns
    'lasso__alpha' : np.linspace(2000, 3000, num=1000)
}

In [43]:
lasso_gs = GridSearchCV(lasso_pipe, lasso_params, scoring=rmse).fit(X_train, y_train)

In [44]:
lasso_gs.best_score_, lasso_gs.best_params_, lasso_gs.best_estimator_.named_steps.lasso.coef_

(35778.6998781654,
 {'ctx__remainder': 'passthrough', 'lasso__alpha': 3000.0},
 array([ 9284.69762423,  9958.47419588,  5373.87258015,     0.        ,
            0.        ,     0.        , 27839.01757434, 24717.87644094,
         5409.21331499,  4847.20630276]))

In [38]:
lasso_gs.best_score_, lasso_gs.best_params_, lasso_gs.best_estimator_.named_steps.lasso.coef_

(35502.19265327981,
 {'ctx__remainder': 'passthrough', 'lasso__alpha': 829.0},
 array([ 9731.59928579, 10318.50364051,  7142.4197855 ,  -153.37274988,
         1123.96482022,    -0.        , 27581.99452329, 26194.96838186,
         6096.08158308,  5794.23440427]))

In [None]:
cross_val_score(lasso_gs, X_train, y_train, scoring=rmse)

#### Ridge

In [45]:
#adapted from class code extra-extras

ctx = ColumnTransformer(
    [('ss', StandardScaler(), X_train._get_numeric_data().columns),
    ('ohe', OneHotEncoder(handle_unknown = 'ignore'), [x for x in X_train.columns if x not in X_train._get_numeric_data().columns])
    ]
)

ridge_pipe = Pipeline([
    ('ctx', ctx),
    ('ridge', Ridge())
])

ridge_params = {
    'ctx__remainder' : ['passthrough'], # keep untransformed columns
    'ridge__alpha' : np.linspace(1, 3000, num=3000)
}

In [None]:
ridge_gs = GridSearchCV(ridge_pipe, ridge_params, scoring=rmse).fit(X_train, y_train)

In [None]:
ridge_gs.best_score_, ridge_gs.best_params_

In [None]:
cross_val_score(ridge_gs, X_train, y_train, scoring=rmse)