In [1]:
#imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import imblearn
import datetime as dt
import time
import warnings
from scipy import stats
from scipy.stats import norm, skew
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn import linear_model, ensemble
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV

warnings.filterwarnings('ignore')
%matplotlib inline
sns.set(style='whitegrid', font_scale=1.3)

In [2]:
train_raw = pd.read_csv('train.csv')

In [3]:
#set df to manipulate, leave raws accessible
train = train_raw.copy()

#lowercase column names
train.columns = [x.lower() for x in train.columns]

#find object datatypes and strip whitespace
to_strip = train.select_dtypes(include='object')
train[to_strip.columns] = to_strip.apply(lambda x: x.str.strip())

#grlivarea outliers
train = train[train.grlivarea < 4000]

In [4]:
#target distribution
print('un-transformed:', stats.normaltest(a=train['saleprice']))
print('log:', stats.normaltest(a=np.log(train['saleprice'])))
print('log1p:', stats.normaltest(a=np.log1p(train['saleprice'])))

un-transformed: NormaltestResult(statistic=460.3067960010808, pvalue=1.1108342380647651e-100)
log: NormaltestResult(statistic=17.417871145375766, pvalue=0.0001651039020773473)
log1p: NormaltestResult(statistic=17.41764459169927, pvalue=0.00016512260558467428)


In [5]:
#same dists, use log1p to play nice with O's
features_num = train.dtypes[train.dtypes != 'object'].index
train[features_num] = np.log1p(train[features_num])

In [6]:
#missing
missing_data = train.isnull().sum().sort_values(ascending=False)
missing_data = missing_data.reset_index()
missing_data.columns = ['variable','rows_missing']
missing_data = missing_data[missing_data['rows_missing'] > 0]

#drop
train.dropna(subset=['electrical'], inplace=True)

#categorical
train['poolqc'].fillna('no_pool', inplace=True)
train['miscfeature'].fillna('no_miscfeat', inplace=True)
train['alley'].fillna('no_access', inplace=True)
train['fence'].fillna('no_fence', inplace=True)
train['fireplacequ'].fillna('no_fireplace', inplace=True)
train['garagecond'].fillna('no_garage', inplace=True)
train['garagetype'].fillna('no_garage', inplace=True)
train['garagefinish'].fillna('no_garage', inplace=True)
train['garagequal'].fillna('no_garage', inplace=True)
train['bsmtexposure'].fillna('no_bsmt', inplace=True)
train['bsmtfintype1'].fillna('no_bsmt', inplace=True)
train['bsmtfintype2'].fillna('no_bsmt', inplace=True)
train['bsmtcond'].fillna('no_bsmt', inplace=True)
train['bsmtqual'].fillna('no_bsmt', inplace=True)
train['masvnrtype'].fillna('no_masvnr', inplace=True)

In [None]:
#train['masvnrarea'].value_counts()

#continuous
#train['lotfrontage'].fillna(0, inplace=True)
#train['garageyrblt'].fillna(0, inplace=True)
#train['masvnrarea'].fillna(0, inplace=True)

In [7]:
train = pd.get_dummies(train)
train = train.fillna(train.mean())

In [8]:
X = train.loc[:, ~(train.columns).isin(['saleprice'])]
y = train['saleprice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Linear Models

### Vanilla Linear Regression

In [9]:
lr = LinearRegression()
lr.fit(X_train, y_train)

cv=10
rmse_train = np.sqrt(-cross_val_score(lr, X_train, y_train, cv=cv,
                                      scoring='neg_mean_squared_error'))
rmse_test = np.sqrt(-cross_val_score(lr, X_test, y_test, cv=cv,
                                     scoring='neg_mean_squared_error'))

y_pred_train = lr.predict(X_train)
y_pred_test = lr.predict(X_test)

print('model: vanilla linear')
print('train rmse:', rmse_train.mean())
print('test rmse:', rmse_test.mean())
print('test - train = ', rmse_test.mean() - rmse_train.mean())

model: vanilla linear
train rmse: 365315666.2935695
test rmse: 14253894554.563004
test - train =  13888578888.269434


__Result:__ hahahahaha

### L2/Ridge Regularization

In [13]:
#broad alpha search
ridge = RidgeCV(alphas=[0.01, 0.025, 0.05, 0.075,
                0.1, 0.25, 0.5, 0.075,
                1, 2.5, 5, 7.5,
                10, 20, 30, 50])

ridge.fit(X_train, y_train)
rmse_train = np.sqrt(-cross_val_score(ridge, X_train, y_train, cv=cv,
                                      scoring='neg_mean_squared_error'))
rmse_test = np.sqrt(-cross_val_score(ridge, X_test, y_test, cv=cv,
                                     scoring='neg_mean_squared_error'))

print('model: ridge')
print('alpha:', ridge.alpha_)
print('train rmse:', rmse_train.mean())
print('test rmse:', rmse_test.mean())
print('test - train = ', rmse_test.mean() - rmse_train.mean())

model: ridge
alpha: 7.5
train rmse: 0.11942683397411594
test rmse: 0.13243063429904783
test - train =  0.013003800324931897


In [14]:
#tune alpha
ridge = RidgeCV(alphas=[7.5,
                        5, 5.5, 6, 6.5, 7,
                        8, 8.5, 9, 9.5, 10])

ridge.fit(X_train, y_train)
rmse_train = np.sqrt(-cross_val_score(ridge, X_train, y_train, cv=cv,
                                      scoring='neg_mean_squared_error'))
rmse_test = np.sqrt(-cross_val_score(ridge, X_test, y_test, cv=cv,
                                     scoring='neg_mean_squared_error'))

print('model: ridge')
print('alpha:', ridge.alpha_)
print('train rmse:', rmse_train.mean())
print('test rmse:', rmse_test.mean())
print('test - train = ', rmse_test.mean() - rmse_train.mean())

model: ridge
alpha: 7.0
train rmse: 0.1193172500709967
test rmse: 0.13154481842444155
test - train =  0.012227568353444851


__Result:__ this is much better, a lot more accurate but still demonstrates some overfitting

### L1/Lasso Regularization

In [16]:
lasso = LassoCV(alphas=[0.0001, 0.0005,
                        0.001, 0.005, 
                        0.01, 0.05, 
                        0.1, 0.5,
                        1, 5])

lasso.fit(X_train, y_train)
rmse_train = np.sqrt(-cross_val_score(lasso, X_train, y_train, cv=cv,
                                      scoring='neg_mean_squared_error'))
rmse_test = np.sqrt(-cross_val_score(lasso, X_test, y_test, cv=cv,
                                     scoring='neg_mean_squared_error'))

print('model: lasso')
print('alpha:', lasso.alpha_)
print('train rmse:', rmse_train.mean())
print('test rmse:', rmse_test.mean())
print('test - train = ', rmse_test.mean() - rmse_train.mean())

model: lasso
alpha: 0.0005
train rmse: 0.11723934548496298
test rmse: 0.11833346568890905
test - train =  0.0010941202039460723


In [17]:
lasso = LassoCV(alphas=[0.0005,
                        0.0001, 0.0002, 0.0003, 0.0004,
                        0.0006, 0.0007, 0.0008, 0.0009, 0.001])

lasso.fit(X_train, y_train)
rmse_train = np.sqrt(-cross_val_score(lasso, X_train, y_train, cv=cv,
                                      scoring='neg_mean_squared_error'))
rmse_test = np.sqrt(-cross_val_score(lasso, X_test, y_test, cv=cv,
                                     scoring='neg_mean_squared_error'))

print('model: lasso')
print('alpha:', lasso.alpha_)
print('train rmse:', rmse_train.mean())
print('test rmse:', rmse_test.mean())
print('test - train = ', rmse_test.mean() - rmse_train.mean())

model: lasso
alpha: 0.0005
train rmse: 0.11697161987987023
test rmse: 0.1184812211503045
test - train =  0.0015096012704342715


__Result:__ Best results so far, picked the same value for alpha in both searches, L1 test rmse's of 0.11833 and 0.11848 are the lowest yet, and where arrived at with a lot less overfitting than L2 LR model

## Ensemble Models

### Random Forest Regressor

In [21]:
#broad param search using GridSearchCV
rfr = ensemble.RandomForestRegressor()
params = [{'n_estimators':[10, 50, 75, 100, 200, 300, 400, 500],
           'max_features':[1, 303]}]

start_time = time.clock()
grid = GridSearchCV(estimator=rfr,
                    param_grid=params,
                    scoring='neg_mean_squared_error')

grid.fit(X_train, y_train)

print('params:', grid.best_params_)
print('runtime: ', time.clock() - start_time)

params: {'max_features': 303, 'n_estimators': 300}
runtime:  79.14090499999998


In [22]:
#fine param search
params = [{'n_estimators':[300,
                           225, 250, 275,
                           325, 350, 375],
           'max_features':[303]}]

start_time = time.clock()
grid = GridSearchCV(estimator=rfr,
                    param_grid=params,
                    scoring='neg_mean_squared_error')

grid.fit(X_train, y_train)

print('params:', grid.best_params_)
print('runtime: ', time.clock() - start_time)

params: {'max_features': 303, 'n_estimators': 350}
runtime:  87.14608899999996


In [23]:
#different n_estimators but the same results, use the lower n_estimators
rfr = ensemble.RandomForestRegressor(n_estimators=350, max_features=303)

start_time = time.clock()
rfr.fit(X_train, y_train)

cv=5
rmse_train = np.sqrt(-cross_val_score(rfr, X_train, y_train, cv=cv,
                                      scoring='neg_mean_squared_error'))
rmse_test = np.sqrt(-cross_val_score(rfr, X_test, y_test, cv=cv,
                                     scoring='neg_mean_squared_error'))

print('model: random forest')
print('train rmse:', rmse_train.mean())
print('test rmse:', rmse_test.mean())
print('test - train = ', rmse_test.mean() - rmse_train.mean())
print('runtime: ', time.clock() - start_time)

model: random forest
train rmse: 0.14591969608777441
test rmse: 0.14707621650774533
test - train =  0.0011565204199709134
runtime:  43.62875600000007


__Result:__ decent accuracy, not overfitting too much, but not as low rmse as L1 regr

### Gradient Boosting Regressor

In [28]:
gbr = ensemble.GradientBoostingRegressor()
params = [{'loss':['ls'],
           'learning_rate':[0.001, 0.01, 0.1, 1],
           'n_estimators':[250, 500, 750],
           'max_depth':[2, 3, 4],
           'subsample':[0.25, 0.5, 0.75, 1],
           'max_features':[1, 303]}]

start_time = time.clock()
grid = GridSearchCV(estimator=gbr, param_grid=params, scoring='neg_mean_squared_error')

grid.fit(X_train, y_train)
print('params:', grid.best_params_)
print('runtime: ', time.clock() - start_time)

params: {'learning_rate': 0.01, 'loss': 'ls', 'max_depth': 4, 'max_features': 303, 'n_estimators': 750, 'subsample': 0.25}
runtime:  952.3800260000003


In [29]:
#skip this for now it takes 15+ min to run
#refine params
#params = [{'loss':['ls'],
#           'learning_rate':[0.01,
#                            0.0075, 0.005,
#                            0.025, 0.05,],
#           'n_estimators':[750,
#                           700, 725, 775, 800],
#           'max_depth':[4],
#           'subsample':[0.25,
#                        0.1, 0.2,
#           'max_features':[303]}]

#start_time = time.clock()
#grid = GridSearchCV(estimator=gbr, param_grid=params, scoring='neg_mean_squared_error')

#grid.fit(X_train, y_train)
#print('params:', grid.best_params_)
#print('runtime: ', time.clock() - start_time)

In [30]:
gbr = ensemble.GradientBoostingRegressor(loss='ls',
                                        learning_rate=0.01,
                                        n_estimators=750,
                                        max_depth=4,
                                        subsample=0.25,
                                        max_features=303)

start_time = time.clock()
gbr.fit(X_train, y_train)

cv=5
rmse_train = np.sqrt(-cross_val_score(gbr, X_train, y_train, cv=cv,
                                      scoring='neg_mean_squared_error'))
rmse_test = np.sqrt(-cross_val_score(gbr, X_test, y_test, cv=cv,
                                     scoring='neg_mean_squared_error'))

print('model: gradient boosting')
print('train rmse:', rmse_train.mean())
print('test rmse:', rmse_test.mean())
print('test - train = ', rmse_test.mean() - rmse_train.mean())
print('runtime: ', time.clock() - start_time)

model: gradient boosting
train rmse: 0.12376588473757395
test rmse: 0.11890135869366411
test - train =  -0.004864526043909842
runtime:  29.048637999999755


__Result:__ not ideal, test set outperformed train set, probably needs further parameter tuning but a huge weak point here is the amount of time it takes to tune the model (sometimes 20+ minutes for a gridsearch). I'm not confident this will outperform L1 Regr, perhaps with further feature engineering. L1 Lasso is the winner here