# Baseline Model

In [1]:
import pandas as pd
from statsmodels.formula.api import ols
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import numpy as np
import statsmodels.api as sm
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('kc_house_data_clean.csv')
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition,grade,sqft_above,sqft_basement,yr_built,zipcode,lat,long,sqft_living15,sqft_lot15
0,0,7129300520,2014-10-13,221900.0,3,1.0,1180,5650,1.0,0.0,3,7,1180,0.0,1955,98178,47.5112,-122.257,1340,5650
1,1,6414100192,2014-12-09,538000.0,3,2.25,2570,7242,2.0,0.0,3,7,2170,400.0,1951,98125,47.721,-122.319,1690,7639
2,2,5631500400,2015-02-25,180000.0,2,1.0,770,10000,1.0,0.0,3,6,770,0.0,1933,98028,47.7379,-122.233,2720,8062
3,3,2487200875,2014-12-09,604000.0,4,3.0,1960,5000,1.0,0.0,5,7,1050,910.0,1965,98136,47.5208,-122.393,1360,5000
4,4,1954400510,2015-02-18,510000.0,3,2.0,1680,8080,1.0,0.0,3,8,1680,0.0,1987,98074,47.6168,-122.045,1800,7503


In [4]:
df.drop(['Unnamed: 0'], axis = 1 , inplace=True)

In [5]:
predictors = df.drop(['price', 'date', 'id'], axis= 1)
predictors_int = sm.add_constant(predictors)
model = sm.OLS(df['price'],predictors_int).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.676
Model:,OLS,Adj. R-squared:,0.675
Method:,Least Squares,F-statistic:,2810.0
Date:,"Wed, 22 Jul 2020",Prob (F-statistic):,0.0
Time:,16:38:47,Log-Likelihood:,-295230.0
No. Observations:,21597,AIC:,590500.0
Df Residuals:,21580,BIC:,590600.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.144e+06,3.02e+06,-0.709,0.478,-8.07e+06,3.78e+06
bedrooms,-4.289e+04,1967.504,-21.797,0.000,-4.67e+04,-3.9e+04
bathrooms,4.67e+04,3367.375,13.869,0.000,4.01e+04,5.33e+04
sqft_living,123.6175,18.785,6.581,0.000,86.798,160.437
sqft_lot,0.1595,0.050,3.202,0.001,0.062,0.257
floors,1.13e+04,3739.250,3.022,0.003,3970.688,1.86e+04
waterfront,3.859e+05,1.23e+04,31.384,0.000,3.62e+05,4.1e+05
condition,2.612e+04,2410.463,10.835,0.000,2.14e+04,3.08e+04
grade,1.024e+05,2237.652,45.776,0.000,9.8e+04,1.07e+05

0,1,2,3
Omnibus:,18378.936,Durbin-Watson:,1.99
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1696427.707
Skew:,3.604,Prob(JB):,0.0
Kurtosis:,45.816,Cond. No.,214000000.0


In [6]:
# Baseline model

# Remove outliers

In [7]:
df2=df.copy()

In [8]:
df2['month'] = [df2['date'][i][5:7] for i in range(len(df2['date']))]

In [9]:
df2['month']= df2['month'].astype(int)

In [10]:
df2.drop(['id', 'date'], axis=1, inplace=True)

In [11]:
from scipy import stats
df2 = df2[(np.abs(stats.zscore(df2)) < 3).all(axis=1)]

In [12]:
predictors = df2.drop(['price'], axis= 1)
predictors_int = sm.add_constant(predictors)
model = sm.OLS(df2['price'],predictors_int).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.682
Model:,OLS,Adj. R-squared:,0.681
Method:,Least Squares,F-statistic:,2643.0
Date:,"Wed, 22 Jul 2020",Prob (F-statistic):,0.0
Time:,16:38:48,Log-Likelihood:,-261960.0
No. Observations:,19769,AIC:,524000.0
Df Residuals:,19752,BIC:,524100.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-7.219e+06,2.09e+06,-3.447,0.001,-1.13e+07,-3.11e+06
bedrooms,-1.816e+04,1494.597,-12.149,0.000,-2.11e+04,-1.52e+04
bathrooms,3.225e+04,2445.909,13.185,0.000,2.75e+04,3.7e+04
sqft_living,87.7888,13.837,6.345,0.000,60.667,114.910
sqft_lot,0.2698,0.146,1.845,0.065,-0.017,0.556
floors,3.106e+04,2666.448,11.647,0.000,2.58e+04,3.63e+04
waterfront,-1.233e-05,3.58e-06,-3.447,0.001,-1.93e-05,-5.32e-06
condition,2.806e+04,1680.628,16.698,0.000,2.48e+04,3.14e+04
grade,9.168e+04,1605.671,57.100,0.000,8.85e+04,9.48e+04

0,1,2,3
Omnibus:,4926.735,Durbin-Watson:,1.961
Prob(Omnibus):,0.0,Jarque-Bera (JB):,19224.122
Skew:,1.197,Prob(JB):,0.0
Kurtosis:,7.196,Cond. No.,2.57e+19


In [13]:
# Removed outliers to better represent the sample

# Dealing with catagorical variables

## Seperate houses based on location

In [14]:
df3= df2.copy()

In [15]:
df3.loc[(df3['lat'] > 47.5) & (df3['long'] < -122.25), 'NW'] =1

df3.loc[(df3['lat'] > 47.5) & (df3['long'] >= -122.25) & (df['long'] <= -122.1), 'N'] =1

df3.loc[(df3['lat'] > 47.5) & (df3['long'] > -122.1), 'NE'] =1

df3.loc[(df3['lat'] <= 47.5) & (df3['long'] > -122.25), 'SW'] =1

df3.loc[(df3['lat'] <= 47.5) & (df3['long'] >= -122.25) & (df['long'] <= -122.1), 'S'] =1

df3.loc[(df3['lat'] <= 47.5) & (df3['long'] > -122.1), 'SE'] =1

In [16]:
df3.drop(['lat', 'long', 'zipcode', 'SE'] ,axis=1, inplace=True)

In [17]:
df3 = df3.fillna(0)

In [18]:
df3

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition,grade,sqft_above,sqft_basement,yr_built,sqft_living15,sqft_lot15,month,NW,N,NE,SW,S
0,221900.0,3,1.00,1180,5650,1.0,0.0,3,7,1180,0.0,1955,1340,5650,10,1.0,0.0,0.0,0.0,0.0
1,538000.0,3,2.25,2570,7242,2.0,0.0,3,7,2170,400.0,1951,1690,7639,12,1.0,0.0,0.0,0.0,0.0
2,180000.0,2,1.00,770,10000,1.0,0.0,3,6,770,0.0,1933,2720,8062,2,0.0,1.0,0.0,0.0,0.0
3,604000.0,4,3.00,1960,5000,1.0,0.0,5,7,1050,910.0,1965,1360,5000,12,1.0,0.0,0.0,0.0,0.0
4,510000.0,3,2.00,1680,8080,1.0,0.0,3,8,1680,0.0,1987,1800,7503,2,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21592,360000.0,3,2.50,1530,1131,3.0,0.0,3,8,1530,0.0,2009,1530,1509,5,1.0,0.0,0.0,0.0,0.0
21593,400000.0,4,2.50,2310,5813,2.0,0.0,3,8,2310,0.0,2014,1830,7200,2,1.0,0.0,0.0,0.0,0.0
21594,402101.0,2,0.75,1020,1350,2.0,0.0,3,7,1020,0.0,2009,1020,2007,6,1.0,0.0,0.0,0.0,0.0
21595,400000.0,3,2.50,1600,2388,2.0,0.0,3,8,1600,0.0,2004,1410,1287,1,0.0,0.0,1.0,0.0,0.0


## Create dummy varaibale for catagorical variables

In [19]:
waterfront_dum = pd.get_dummies(df3.waterfront, prefix='waterfront', drop_first=True)
condition_dum = pd.get_dummies(df3.condition, prefix = 'condition', drop_first=True)
grade_dum = pd.get_dummies(df3.grade, prefix = 'grade', drop_first=True)

In [20]:
df3.drop(['waterfront', 'condition', 'grade'], axis=1, inplace= True)

In [21]:
df3 = pd.concat([df3, waterfront_dum, condition_dum, grade_dum], axis=1)

In [22]:
df3 = df3.rename(columns = {'waterfront_1.0':'waterfront_1'})

In [23]:
predictors = df3.drop(['price'], axis= 1)
predictors_int = sm.add_constant(predictors)
model = sm.OLS(df3['price'],predictors_int).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.71
Model:,OLS,Adj. R-squared:,0.71
Method:,Least Squares,F-statistic:,1934.0
Date:,"Wed, 22 Jul 2020",Prob (F-statistic):,0.0
Time:,16:38:49,Log-Likelihood:,-261040.0
No. Observations:,19769,AIC:,522100.0
Df Residuals:,19743,BIC:,522300.0
Df Model:,25,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.75e+06,9.71e+04,38.629,0.000,3.56e+06,3.94e+06
bedrooms,-1.26e+04,1451.942,-8.677,0.000,-1.54e+04,-9752.231
bathrooms,3.316e+04,2352.895,14.092,0.000,2.85e+04,3.78e+04
sqft_living,82.7308,13.211,6.262,0.000,56.837,108.625
sqft_lot,0.5269,0.140,3.770,0.000,0.253,0.801
floors,2.081e+04,2647.503,7.862,0.000,1.56e+04,2.6e+04
sqft_above,12.0168,13.192,0.911,0.362,-13.841,37.875
sqft_basement,16.0881,13.077,1.230,0.219,-9.543,41.719
yr_built,-1972.0621,49.878,-39.538,0.000,-2069.828,-1874.297

0,1,2,3
Omnibus:,5035.907,Durbin-Watson:,1.963
Prob(Omnibus):,0.0,Jarque-Bera (JB):,21214.92
Skew:,1.199,Prob(JB):,0.0
Kurtosis:,7.472,Cond. No.,2040000.0


In [24]:
# Took the catagorical predictors and sepereted them into binary dummy variables

# Drop non significant values

## Make sqft_basement a catagorical value

In [25]:
df4 = df3.copy()

In [26]:
df4.loc[df4['sqft_basement'] > 0, 'basement_present'] = 1

In [27]:
df4 = df4.fillna(0)

## Drop Values

In [28]:
df4.drop(['sqft_above', 'sqft_basement'], axis=1, inplace=True)

In [29]:
predictors = df4.drop(['price'], axis= 1)
predictors_int = sm.add_constant(predictors)
model = sm.OLS(df4['price'],predictors_int).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.71
Model:,OLS,Adj. R-squared:,0.71
Method:,Least Squares,F-statistic:,2017.0
Date:,"Wed, 22 Jul 2020",Prob (F-statistic):,0.0
Time:,16:38:49,Log-Likelihood:,-261030.0
No. Observations:,19769,AIC:,522100.0
Df Residuals:,19744,BIC:,522300.0
Df Model:,24,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.75e+06,9.7e+04,38.641,0.000,3.56e+06,3.94e+06
bedrooms,-1.241e+04,1452.265,-8.548,0.000,-1.53e+04,-9567.097
bathrooms,3.175e+04,2361.490,13.446,0.000,2.71e+04,3.64e+04
sqft_living,93.6927,2.731,34.311,0.000,88.340,99.045
sqft_lot,0.5292,0.140,3.788,0.000,0.255,0.803
floors,2.343e+04,2534.197,9.245,0.000,1.85e+04,2.84e+04
yr_built,-1972.9518,49.861,-39.569,0.000,-2070.684,-1875.220
sqft_living15,50.9278,2.612,19.495,0.000,45.807,56.048
sqft_lot15,-0.9728,0.184,-5.284,0.000,-1.334,-0.612

0,1,2,3
Omnibus:,5052.056,Durbin-Watson:,1.964
Prob(Omnibus):,0.0,Jarque-Bera (JB):,21374.664
Skew:,1.202,Prob(JB):,0.0
Kurtosis:,7.491,Cond. No.,2030000.0


In [30]:
# Turned basement into a a binary variable and dropped any predictor that had 
# a P-value < 0.05. This is the final iteration of the model which showed an 
# ovarall improvement of R2 from the baseline by 0.034

# Sepertate data for training and testing

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_predict

In [32]:
y = df4['price']
X = predictors

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

linreg = LinearRegression()
linreg.fit(X_train, y_train)

y_hat_train = linreg.predict(X_train)
y_hat_test = linreg.predict(X_test)

mse_train = np.sum((y_train-y_hat_train)**2)/len(y_train)
mse_test =np.sum((y_test-y_hat_test)**2)/len(y_test)
print('Train Mean Squarred Error:', mse_train)
print('Test Mean Squarred Error:', mse_test)

Train Mean Squarred Error: 17204996988.66767
Test Mean Squarred Error: 17380681046.677654
