# Modeling

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats

from sklearn import metrics
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold, train_test_split
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler, PowerTransformer, PolynomialFeatures

import warnings
warnings.filterwarnings("ignore")

In [2]:
lr = LinearRegression()
ridge = RidgeCV(alphas = np.logspace(0, 5, 200),cv = 5)
lasso = LassoCV(n_alphas = 500, cv = 5, random_state = 42)

In [3]:
train = pd.read_csv('./datasets/clean_train.csv')
test = pd.read_csv('./datasets/clean_test.csv')

In [4]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 879 entries, 0 to 878
Data columns (total 70 columns):
id                   879 non-null int64
pid                  879 non-null int64
ms_subclass          879 non-null int64
ms_zoning            879 non-null object
lot_frontage         879 non-null float64
lot_area             879 non-null int64
street               879 non-null object
lot_shape            879 non-null object
land_contour         879 non-null object
utilities            879 non-null object
lot_config           879 non-null object
land_slope           879 non-null object
neighborhood         879 non-null object
condition_1          879 non-null object
condition_2          879 non-null object
bldg_type            879 non-null object
house_style          879 non-null object
overall_qual         879 non-null int64
overall_cond         879 non-null int64
year_built           879 non-null int64
year_remod/add       879 non-null int64
roof_style           879 non-null object


In [5]:
train['ms_subclass'] = train['ms_subclass'].astype('object')
test['ms_subclass'] = test['ms_subclass'].astype('object')

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2025 entries, 0 to 2024
Data columns (total 71 columns):
id                   2025 non-null int64
pid                  2025 non-null int64
ms_subclass          2025 non-null object
ms_zoning            2025 non-null object
lot_frontage         2025 non-null float64
lot_area             2025 non-null int64
street               2025 non-null object
lot_shape            2025 non-null object
land_contour         2025 non-null object
utilities            2025 non-null object
lot_config           2025 non-null object
land_slope           2025 non-null object
neighborhood         2025 non-null object
condition_1          2025 non-null object
condition_2          2025 non-null object
bldg_type            2025 non-null object
house_style          2025 non-null object
overall_qual         2025 non-null int64
overall_cond         2025 non-null int64
year_built           2025 non-null int64
year_remod/add       2025 non-null int64
roof_style       

In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 879 entries, 0 to 878
Data columns (total 70 columns):
id                   879 non-null int64
pid                  879 non-null int64
ms_subclass          879 non-null object
ms_zoning            879 non-null object
lot_frontage         879 non-null float64
lot_area             879 non-null int64
street               879 non-null object
lot_shape            879 non-null object
land_contour         879 non-null object
utilities            879 non-null object
lot_config           879 non-null object
land_slope           879 non-null object
neighborhood         879 non-null object
condition_1          879 non-null object
condition_2          879 non-null object
bldg_type            879 non-null object
house_style          879 non-null object
overall_qual         879 non-null int64
overall_cond         879 non-null int64
year_built           879 non-null int64
year_remod/add       879 non-null int64
roof_style           879 non-null object

In [8]:
numeric_cols_train = [col for col in train.select_dtypes(exclude=['object'])]
numeric_cols_test = [col for col in test.select_dtypes(exclude=['object'])]

categorical_cols_train = [col for col in train.select_dtypes('object')]
categorical_cols_test = [col for col in test.select_dtypes('object')]

In [9]:
categorical_cols_test

['ms_subclass',
 'ms_zoning',
 'street',
 'lot_shape',
 'land_contour',
 'utilities',
 'lot_config',
 'land_slope',
 'neighborhood',
 'condition_1',
 'condition_2',
 'bldg_type',
 'house_style',
 'roof_style',
 'roof_matl',
 'exterior_1st',
 'exterior_2nd',
 'mas_vnr_type',
 'foundation',
 'bsmtfin_type_1',
 'bsmtfin_type_2',
 'heating',
 'central_air',
 'electrical',
 'garage_type',
 'paved_drive',
 'fence',
 'sale_type']

In [10]:
categorical_cols_train

['ms_subclass',
 'ms_zoning',
 'street',
 'lot_shape',
 'land_contour',
 'utilities',
 'lot_config',
 'land_slope',
 'neighborhood',
 'condition_1',
 'condition_2',
 'bldg_type',
 'house_style',
 'roof_style',
 'roof_matl',
 'exterior_1st',
 'exterior_2nd',
 'mas_vnr_type',
 'foundation',
 'bsmtfin_type_1',
 'bsmtfin_type_2',
 'heating',
 'central_air',
 'electrical',
 'garage_type',
 'paved_drive',
 'fence',
 'sale_type']

In [11]:
train = pd.get_dummies(data = train, columns = categorical_cols_train, drop_first = True)

In [12]:
train.shape

(2025, 218)

In [13]:
test = pd.get_dummies(data = test, columns = categorical_cols_test, drop_first = True)

In [14]:
test.shape

(879, 209)

In [15]:
missing_columns = set(train.columns) - set(test.columns)
missing_columns

{'condition_2_Feedr',
 'condition_2_RRAe',
 'condition_2_RRAn',
 'condition_2_RRNn',
 'electrical_Mix',
 'exterior_1st_CBlock',
 'exterior_1st_ImStucc',
 'exterior_1st_Stone',
 'exterior_2nd_Stone',
 'heating_OthW',
 'heating_Wall',
 'ms_subclass_150',
 'ms_zoning_C (all)',
 'neighborhood_GrnHill',
 'neighborhood_Landmrk',
 'roof_matl_Membran',
 'saleprice',
 'utilities_NoSeWa'}

In [16]:
for column in missing_columns:
    if column != 'saleprice':
        test[column] = 0

In [17]:
test.shape

(879, 226)

In [18]:
train.shape

(2025, 218)

In [19]:
missing_from_train = set(test.columns) - set(train.columns)

In [20]:
for column in missing_from_train:
    train[column] = 0

In [21]:
train.shape

(2025, 227)

In [22]:
test.shape

(879, 226)

In [23]:
train.columns

Index(['id', 'pid', 'lot_frontage', 'lot_area', 'overall_qual', 'overall_cond',
       'year_built', 'year_remod/add', 'mas_vnr_area', 'exter_qual',
       ...
       'sale_type_WD ', 'mas_vnr_type_CBlock', 'exterior_2nd_Other',
       'roof_matl_Metal', 'heating_GasA', 'roof_matl_Roll',
       'exterior_1st_PreCast', 'sale_type_VWD', 'electrical_None',
       'exterior_2nd_PreCast'],
      dtype='object', length=227)

In [24]:
train.to_csv('./datasets/train_with_dummies.csv')
test.to_csv('./datasets/test_with_dummies.csv')

In [25]:
features = [x for x in train.columns if x not in ['saleprice', 'id', 'pid']]

In [26]:
X = train[features]
y = train['saleprice']

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [28]:
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [29]:
lr.score(X_train, y_train)

0.9340477469838739

In [30]:
lr.score(X_test, y_test)

0.8939464078439503

In [31]:
cross_val_score(lr, X_train, y_train, cv = 5).mean()

0.8743410324079404

In [32]:
lr_preds = lr.predict(X)

In [33]:
lr_preds

array([178181.78446536, 234168.16818247, 140781.23662835, ...,
       183135.66710523, 121854.63458701, 192683.3994646 ])

In [34]:
lr_preds = lr.predict(X)
metrics.mean_squared_error(y, lr_preds)

rmse = np.sqrt(metrics.mean_squared_error(y, lr_preds))
rmse

22328.805731503966

# Ridge Model

In [35]:
ridge.fit(X_train, y_train)

RidgeCV(alphas=array([1.00000000e+00, 1.05956018e+00, 1.12266777e+00, 1.18953407e+00,
       1.26038293e+00, 1.33545156e+00, 1.41499130e+00, 1.49926843e+00,
       1.58856513e+00, 1.68318035e+00, 1.78343088e+00, 1.88965234e+00,
       2.00220037e+00, 2.12145178e+00, 2.24780583e+00, 2.38168555e+00,
       2.52353917e+00, 2.67384162e+00, 2.83309610e+00, 3.00183581e+00,
       3.18062569e+00, 3.37006433e+0...
       3.33129479e+04, 3.52970730e+04, 3.73993730e+04, 3.96268864e+04,
       4.19870708e+04, 4.44878283e+04, 4.71375313e+04, 4.99450512e+04,
       5.29197874e+04, 5.60716994e+04, 5.94113398e+04, 6.29498899e+04,
       6.66991966e+04, 7.06718127e+04, 7.48810386e+04, 7.93409667e+04,
       8.40665289e+04, 8.90735464e+04, 9.43787828e+04, 1.00000000e+05]),
        cv=5, fit_intercept=True, gcv_mode=None, normalize=False, scoring=None,
        store_cv_values=False)

In [36]:
ridge.score(X_train, y_train)

0.9241885435265851

In [37]:
ridge.score(X_test, y_test)

0.9082010856364274

In [38]:
cross_val_score(ridge, X_train, y_train, cv = 5).mean()

0.8972197156796623

In [39]:
# This is the same as ridge.score
metrics.r2_score(y_test, ridge.predict(X_test))

0.9082010856364274

In [40]:
ridge_preds = ridge.predict(X)
metrics.mean_squared_error(y, ridge_preds)

rmse = np.sqrt(metrics.mean_squared_error(y, ridge_preds))
rmse

22539.96289716028

In [41]:
ridge.alpha_

12.750512407130135

# LASSO model

In [42]:
lasso.fit(X_train, y_train)

LassoCV(alphas=None, copy_X=True, cv=5, eps=0.001, fit_intercept=True,
        max_iter=1000, n_alphas=500, n_jobs=None, normalize=False,
        positive=False, precompute='auto', random_state=42, selection='cyclic',
        tol=0.0001, verbose=False)

In [43]:
lasso.score(X_train, y_train)

0.8066180050197833

In [44]:
lasso.score(X_test, y_test)

0.80168847728729

In [45]:
lasso.alpha_

129866.29354370323

In [46]:
cross_val_score(lasso, X_train, y_train, cv = 5).mean()

0.7960153542056269

In [47]:
# This is the same as lasso.score
metrics.r2_score(y_test, lasso.predict(X_test))

0.80168847728729

In [48]:
lass_preds = lasso.predict(X)
metrics.mean_squared_error(y, lass_preds)

rmse = np.sqrt(metrics.mean_squared_error(y, lass_preds))
rmse

34906.7449263208

In [49]:
lasso.coef_

array([  0.        ,   0.76452131,   0.        ,   0.        ,
       289.55418534, 332.64610209,  52.81354453,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,  10.67114053,  -0.        ,
        -0.        ,   0.        ,  -0.        ,   0.        ,
         0.        ,   0.        ,  -1.14296832,   0.        ,
        57.67952183,   0.        ,   0.        ,  25.03602568,
        24.61228014,  -0.        ,   0.        ,  26.06686799,
        -0.        ,  -1.88002415,   0.        ,   0.        ,
         0.        ,  51.99778944,  -0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,  -0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,  -0.        ,   0.        ,  -0.        ,
        -0.        ,   0.        ,  -0.        ,  -0.        ,
         0.        ,   0.        ,  -0.        ,   0.        ,
        -0.        ,   0.        ,   0.        ,  -0.  

# Scaled Models

In [50]:
ss = StandardScaler()

In [51]:
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)
test_sc = ss.transform(test[features])

## Ridge model scaled

In [52]:
ridge_scaled = RidgeCV(alphas = np.logspace(0, 5, 200),cv = 5)

In [53]:
ridge_scaled.fit(X_train_sc, y_train)

RidgeCV(alphas=array([1.00000000e+00, 1.05956018e+00, 1.12266777e+00, 1.18953407e+00,
       1.26038293e+00, 1.33545156e+00, 1.41499130e+00, 1.49926843e+00,
       1.58856513e+00, 1.68318035e+00, 1.78343088e+00, 1.88965234e+00,
       2.00220037e+00, 2.12145178e+00, 2.24780583e+00, 2.38168555e+00,
       2.52353917e+00, 2.67384162e+00, 2.83309610e+00, 3.00183581e+00,
       3.18062569e+00, 3.37006433e+0...
       3.33129479e+04, 3.52970730e+04, 3.73993730e+04, 3.96268864e+04,
       4.19870708e+04, 4.44878283e+04, 4.71375313e+04, 4.99450512e+04,
       5.29197874e+04, 5.60716994e+04, 5.94113398e+04, 6.29498899e+04,
       6.66991966e+04, 7.06718127e+04, 7.48810386e+04, 7.93409667e+04,
       8.40665289e+04, 8.90735464e+04, 9.43787828e+04, 1.00000000e+05]),
        cv=5, fit_intercept=True, gcv_mode=None, normalize=False, scoring=None,
        store_cv_values=False)

In [54]:
ridge_scaled.score(X_train_sc, y_train)

0.9261308461456028

In [55]:
ridge_scaled.score(X_test_sc, y_test)

0.89942592320585

In [56]:
cross_val_score(ridge_scaled, X_train_sc, y_train, cv = 5).mean()

0.8921860422996994

In [57]:
cross_val_score(ridge_scaled, X_test_sc, y_test, cv = 5).mean()

0.8968058740591761

In [58]:
ridge_scaled.predict(X)

array([1.07499427e+08, 1.14265480e+08, 7.86764805e+07, ...,
       9.15215750e+07, 9.42558797e+07, 9.17642490e+07])

In [59]:
ridge_scaled.predict(test[features])

array([1.00824784e+08, 1.19515676e+08, 1.20167299e+08, 7.90407532e+07,
       9.99981017e+07, 4.64546738e+07, 7.91245213e+07, 9.19388213e+07,
       7.09762302e+07, 9.57815425e+07, 8.24023495e+07, 7.53162389e+07,
       1.04569177e+08, 1.17694480e+08, 1.06114211e+08, 7.31664038e+07,
       9.78879443e+07, 8.63354889e+07, 1.10878821e+08, 1.30970770e+08,
       8.08947610e+07, 7.67361504e+07, 9.99449482e+07, 1.33036165e+08,
       9.31839811e+07, 6.92246024e+07, 1.06087274e+08, 8.97534387e+07,
       8.23635817e+07, 4.40417132e+07, 7.12083250e+07, 7.14327601e+07,
       1.31019526e+08, 9.27541429e+07, 1.10706464e+08, 1.02884390e+08,
       8.11710252e+07, 6.62851812e+07, 6.60758333e+07, 8.87793849e+07,
       6.06254321e+07, 1.07538272e+08, 8.35545170e+07, 1.13643086e+08,
       8.16601626e+07, 5.12078848e+07, 1.07226357e+08, 7.35531170e+07,
       6.66279188e+07, 7.65502542e+07, 8.57487342e+07, 9.71956058e+07,
       1.08307443e+08, 7.78463974e+07, 6.55449731e+07, 7.88535029e+07,
      

In [60]:
r_sc_preds = ridge_scaled.predict(X)
metrics.mean_squared_error(y, r_sc_preds)

rmse = np.sqrt(metrics.mean_squared_error(y, r_sc_preds))
rmse

101609625.4707818

## LASSO model scaled

In [61]:
lasso_scaled = LassoCV(n_alphas = 500, cv = 5, random_state = 42)

In [62]:
lasso_scaled.fit(X_train_sc, y_train)

LassoCV(alphas=None, copy_X=True, cv=5, eps=0.001, fit_intercept=True,
        max_iter=1000, n_alphas=500, n_jobs=None, normalize=False,
        positive=False, precompute='auto', random_state=42, selection='cyclic',
        tol=0.0001, verbose=False)

In [63]:
lasso.alpha_

129866.29354370323

In [64]:
lasso_scaled.score(X_train_sc, y_train)

0.9250297768477768

In [65]:
lasso_scaled.score(X_test_sc, y_test)

0.9008778193451149

In [66]:
cross_val_score(lasso_scaled, X_train_sc, y_train, cv = 5).mean()

0.8958561864791239

In [67]:
cross_val_score(lasso_scaled, X_test_sc, y_test, cv = 5).mean()

0.8987519004427419

In [68]:
lasso_preds = lasso_scaled.predict(X)
metrics.mean_squared_error(y, lasso_preds)

rmse = np.sqrt(metrics.mean_squared_error(y, lasso_preds))
rmse

127421445.80501254

In [69]:
lasso_scaled.coef_

array([ 0.00000000e+00,  3.71459157e+03,  1.02888817e+04,  5.46939646e+03,
        4.31248729e+03,  2.13637725e+03,  5.47044250e+03,  5.76602251e+03,
        0.00000000e+00,  3.15136105e+03, -0.00000000e+00,  3.72895456e+03,
        0.00000000e+00,  0.00000000e+00,  8.76409864e+03, -2.59914231e+03,
       -2.68501521e+03,  5.25706769e+03,  0.00000000e+00,  2.79802838e+03,
        3.88269302e+03,  0.00000000e+00, -0.00000000e+00,  1.69040915e+03,
        4.93728877e+03,  0.00000000e+00, -0.00000000e+00,  8.91209897e+02,
        1.99665037e+01, -0.00000000e+00,  0.00000000e+00,  2.54613540e+03,
        0.00000000e+00,  0.00000000e+00, -0.00000000e+00,  1.89248524e+02,
        3.66826034e+03,  2.22506734e+04, -0.00000000e+00,  9.55344388e+02,
        0.00000000e+00, -0.00000000e+00,  0.00000000e+00, -0.00000000e+00,
        0.00000000e+00, -6.35882690e+01,  5.10306761e+02, -0.00000000e+00,
        0.00000000e+00, -2.16353924e+02, -1.18660656e+03, -6.02406755e+02,
       -0.00000000e+00, -

In [70]:
lasso_coefs = pd.DataFrame({'variable':X.columns,
                            'coef':lasso_scaled.coef_,
                            'abs_coef':np.abs(lasso_scaled.coef_)})

lasso_coefs.sort_values('abs_coef', inplace=True, ascending=False)

lasso_coefs.head(25)

Unnamed: 0,variable,coef,abs_coef
37,total_sf,22250.673441,22250.673441
2,overall_qual,10288.88167,10288.88167
14,gr_liv_area,8764.09864,8764.09864
7,exter_qual,5766.022505,5766.022505
6,mas_vnr_area,5470.442505,5470.442505
3,overall_cond,5469.39646,5469.39646
17,kitchen_qual,5257.067693,5257.067693
94,neighborhood_NridgHt,5029.531333,5029.531333
24,garage_area,4937.288774,4937.288774
177,bsmtfin_type_1_No Basement,4521.999965,4521.999965


In [71]:
print('Percent variables zeroed out:', np.sum((lasso_scaled.coef_ == 0))/float(X.shape[1]))

Percent variables zeroed out: 0.53125


In [72]:
X.shape

(2025, 224)