In [1]:
# Import packages
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
import seaborn as sns

# Import scikit learn tools
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [2]:
data = pd.read_csv('data_cleaned.csv')

In [3]:
# Overview of final data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 28 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  21597 non-null  int64  
 1   date                21597 non-null  object 
 2   price               21597 non-null  float64
 3   bedrooms            21597 non-null  int64  
 4   bathrooms           21597 non-null  float64
 5   sqft_living         21597 non-null  int64  
 6   sqft_lot            21597 non-null  int64  
 7   floors              21597 non-null  float64
 8   waterfront          21597 non-null  float64
 9   view                21597 non-null  float64
 10  condition           21597 non-null  int64  
 11  grade               21597 non-null  int64  
 12  sqft_above          21597 non-null  int64  
 13  sqft_basement       21597 non-null  float64
 14  yr_built            21597 non-null  int64  
 15  yr_renovated        21597 non-null  int64  
 16  zipc

## First Shitty Model

In [33]:
feats_in_control = ['price','bedrooms', 'bathrooms', 'sqft_living', 'floors', 'condition', 'grade', 'grade_category',
                    'sqft_above', 'sqft_basement', 'has_been_viewed', 'has_basement', 'has_been_renovated']

In [34]:
# Define X and y
X_cols = [c for c in data[feats_in_control].columns.to_list() if c not in ['price', 'grade_category']]

X = data[X_cols]
y = data['price']

In [35]:
# Perform a train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [36]:
# Instantiate a scaler
scaler = StandardScaler()

In [37]:
# Train our scaler on training data, then fit to testing
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [38]:
# Instantiate a linear regression model
lr = LinearRegression()

In [39]:
# Fit our model on our scaled data
lr.fit(X_train_scaled, y_train)

LinearRegression()

In [40]:
# Evaluate
y_train_pred = lr.predict(X_train_scaled)
y_test_pred = lr.predict(X_test_scaled)

print("Training Scores:")
print(f"R2: {r2_score(y_train, y_train_pred)}") #can account for X amount of variance
print(f"Mean Absolute Error: {mean_absolute_error(y_train, y_train_pred)}") #about X amount off in predicting price
print("---")
print("Testing Scores:")
print(f"R2: {r2_score(y_test, y_test_pred)}")
print(f"Mean Absolute Error: {mean_absolute_error(y_test, y_test_pred)}")
print(f"Root Mean Squared Error: {np.sqrt(mean_absolute_error(y_test, y_test_pred))}")

Training Scores:
R2: 0.5740371260593715
Mean Absolute Error: 156829.740671793
---
Testing Scores:
R2: 0.5531210254966812
Mean Absolute Error: 155365.44598186543
Root Mean Squared Error: 394.16423731975664


In [41]:
data[feats_in_control]

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,floors,condition,grade,grade_category,sqft_above,sqft_basement,has_been_viewed,has_basement,has_been_renovated
0,325000.0,4,2.50,2240,2.0,4,8,Average,2240,0.0,1,0,0
1,540000.0,3,1.75,1630,2.0,3,8,Average,1020,610.0,1,1,0
2,344000.0,1,1.50,760,3.0,3,8,Average,760,0.0,1,0,0
3,565000.0,4,2.50,2500,1.0,3,9,Average,2500,0.0,1,0,0
4,320000.0,3,2.00,1250,1.0,5,7,Average,1250,0.0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21592,279000.0,3,1.00,1530,1.0,3,7,Average,970,560.0,1,1,0
21593,456000.0,3,2.50,2130,2.0,3,8,Average,2130,0.0,1,0,0
21594,280000.0,2,1.00,1260,1.0,3,6,Average,1100,160.0,1,1,0
21595,359950.0,3,1.75,1570,1.0,3,7,Average,1040,530.0,1,1,0


In [42]:
data[feats_in_control].corr().price.sort_values(ascending=False)

price                 1.000000
sqft_living           0.701917
grade                 0.667951
sqft_above            0.605368
bathrooms             0.525906
sqft_basement         0.322192
bedrooms              0.308787
floors                0.256804
has_basement          0.178264
has_been_renovated    0.117543
condition             0.036056
has_been_viewed      -0.012024
Name: price, dtype: float64

## Now try with statsmodels - OHE

In [43]:
model = sm.OLS(y, sm.add_constant(X))

In [44]:
results = model.fit()

results.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.569
Model:,OLS,Adj. R-squared:,0.569
Method:,Least Squares,F-statistic:,2594.0
Date:,"Tue, 12 Jan 2021",Prob (F-statistic):,0.0
Time:,09:27:58,Log-Likelihood:,-298290.0
No. Observations:,21597,AIC:,596600.0
Df Residuals:,21585,BIC:,596700.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-7.657e+05,3.55e+04,-21.589,0.000,-8.35e+05,-6.96e+05
bedrooms,-4.305e+04,2246.246,-19.164,0.000,-4.75e+04,-3.86e+04
bathrooms,-2.123e+04,3678.188,-5.771,0.000,-2.84e+04,-1.4e+04
sqft_living,169.5554,25.379,6.681,0.000,119.810,219.301
floors,2208.1053,4109.824,0.537,0.591,-5847.454,1.03e+04
condition,6.259e+04,2655.114,23.572,0.000,5.74e+04,6.78e+04
grade,1.124e+05,2375.633,47.296,0.000,1.08e+05,1.17e+05
sqft_above,31.0099,25.367,1.222,0.222,-18.711,80.730
sqft_basement,66.1367,25.991,2.545,0.011,15.193,117.081

0,1,2,3
Omnibus:,16818.21,Durbin-Watson:,1.983
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1004862.557
Skew:,3.256,Prob(JB):,0.0
Kurtosis:,35.776,Cond. No.,82700.0
