# Kaggle Model

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.preprocessing import OrdinalEncoder

In [2]:
eda_train = pd.read_csv('../datasets/feat_train.csv')
eda_test = pd.read_csv('../datasets/feat_test.csv')

In [3]:
eda_train.head()

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,...,neighborhood_SawyerW,neighborhood_Somerst,neighborhood_StoneBr,neighborhood_Timber,neighborhood_Veenker,foundation_CBlock,foundation_PConc,foundation_Slab,foundation_Stone,foundation_Wood
0,109,533352170,60,RL,0.0,13517.0,Pave,0,1,AllPub,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,544,531379050,60,RL,43.0,11492.0,Pave,0,1,AllPub,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,153,535304180,20,RL,68.0,7922.0,Pave,1,1,AllPub,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,318,916386060,60,RL,73.0,9802.0,Pave,1,1,AllPub,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,255,906425045,50,RL,82.0,14235.0,Pave,0,1,AllPub,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [4]:
eda_test.head()

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,...,neighborhood_SawyerW,neighborhood_Somerst,neighborhood_StoneBr,neighborhood_Timber,neighborhood_Veenker,foundation_CBlock,foundation_PConc,foundation_Slab,foundation_Stone,foundation_Wood
0,2658,902301120,190,RM,69.0,9142,Pave,1,1,AllPub,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2718,905108090,90,RL,0.0,9662,Pave,0,1,AllPub,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,2414,528218130,60,RL,58.0,17104,Pave,0,1,AllPub,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1989,902207150,30,RM,60.0,8520,Pave,1,1,AllPub,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,625,535105100,20,RL,0.0,9500,Pave,0,1,AllPub,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [5]:
features = ['lot_area', 'overall_qual', 'overall_cond', 'year_built', 'year_remod/add', 
            'exter_qual', 'exter_cond', 'bsmt_qual', 'bsmt_cond', 'heating_qc', 'kitchen_qual', 'garage_area', 
           'totrms_abvgrd', 'full_bath', '1st_flr_sf', 'gr_liv_area', 'garage_cars', 'land_contour', 
            'bldg_type', 'house_style', 'neighborhood_Blueste', 'lot_shape', 
       'neighborhood_BrDale', 'neighborhood_BrkSide', 'neighborhood_ClearCr',
       'neighborhood_CollgCr', 'neighborhood_Crawfor', 'neighborhood_Edwards',
       'neighborhood_Gilbert', 'neighborhood_Greens', 'neighborhood_GrnHill',
       'neighborhood_IDOTRR', 'neighborhood_Landmrk', 'neighborhood_MeadowV',
       'neighborhood_Mitchel', 'neighborhood_NAmes', 'neighborhood_NPkVill',
       'neighborhood_NWAmes', 'neighborhood_NoRidge', 'neighborhood_NridgHt',
       'neighborhood_OldTown', 'neighborhood_SWISU', 'neighborhood_Sawyer',
       'neighborhood_SawyerW', 'neighborhood_Somerst', 'neighborhood_StoneBr',
       'neighborhood_Timber', 'neighborhood_Veenker', 'foundation_CBlock',
       'foundation_PConc', 'foundation_Slab', 'foundation_Stone',
       'foundation_Wood']

In [6]:
X = eda_train[features]
y = eda_train['saleprice']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [8]:
lr = LinearRegression()

In [9]:
lr.fit(X_train, y_train)

In [10]:
print(f'Training R2: {lr.score(X_train, y_train)}')
print(f'Test R2: {lr.score(X_test, y_test)}')

Training R2: 0.8954655339259434
Test R2: 0.8754802323272379


In [11]:
preds = lr.predict(X_test)
preds = np.exp(preds)
print(f'MSE: {mean_squared_error(y_test, preds)}')
print(f'RMSE: {mean_squared_error(y_test, preds, squared = False)}')

MSE: 38454039392.18005
RMSE: 196097.01525566383


In [12]:
new_preds = np.exp(lr.predict(eda_test[features]))
new_preds

array([ 136104.76625966,  175506.80681057,  206329.15455658,
        108362.4847548 ,  162798.66474798,   91059.99320251,
        109563.0341803 ,  161642.70419571,  193409.57272452,
        158566.68669775,  152414.16514954,  113898.47939047,
        157345.28380286,  262765.64681264,  157936.40657528,
        120228.18227031,  127967.11445277,  111517.79678317,
        191229.47217975,  191414.61930111,  152392.3921854 ,
        122935.13827791,  185684.83060461,  175980.80325956,
        192648.31742502,  121834.28894401,  129533.74088121,
        116373.96108062,  165617.61413827,   73276.20125557,
        110911.71741648,   90404.68270017,  225646.13250013,
        150497.85765183,  212548.22821114,  189494.86568389,
        108206.18231753,   96133.78850432,  143900.85228909,
        203416.83948559,  159212.07479866,  211583.43701108,
        152373.95171777,  164231.38724148,  202159.13236711,
         97129.6327282 ,  213503.04430043,  110223.47353829,
        126592.24153114,

In [13]:
eda_test['SalePrice'] = new_preds 

In [14]:
kaggle = eda_test[['id', 'SalePrice']]

In [15]:
kaggle.to_csv('../datasets/ben_kaggle_sub.csv', index=False)