# Package & Data Imports

In [19]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

import pickle

import warnings
warnings.simplefilter("ignore")

In [20]:
# Load the cleaned training data; the path is relative to the working directory.
train_csv = '../data/train_clean.csv'
df = pd.read_csv(train_csv)

# Report (rows, columns), then preview the first five records.
print(df.shape)
df.head(5)

(2031, 265)


Unnamed: 0,Id,PID,MS SubClass,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,...,Misc Feature_Shed,Misc Feature_TenC,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_WD
0,109,533352170,60,79.5,9.511703,6,8,1976,2005,289.0,...,0,0,0,0,0,0,0,0,0,1
1,544,531379050,60,43.0,9.349406,7,5,1996,1997,132.0,...,0,0,0,0,0,0,0,0,0,1
2,153,535304180,20,68.0,8.977399,5,7,1953,2007,0.0,...,0,0,0,0,0,0,0,0,0,1
3,318,916386060,60,73.0,9.190342,5,5,2006,2007,0.0,...,0,0,0,0,0,0,0,0,0,1
4,255,906425045,50,82.0,9.563459,6,8,1900,1993,0.0,...,0,0,0,0,0,0,0,0,0,1


In [21]:
# Predictors: every column except Id, PID and the two sale-price columns
# (SalePriceLog is presumably the log-transformed target -- confirm upstream).
non_feature_cols = ['Id', 'PID', 'SalePrice', 'SalePriceLog']
X = df.drop(columns=non_feature_cols)

# Target: the raw (untransformed) sale price.
y = df['SalePrice']

# Hold out a test set using train_test_split's default split size;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Note: feature scaling gave poor test performance, so the features are left unscaled.

To decide which variables are best suited to a deployed model whose inputs users can adjust, we fit an OLS linear regression on all features and identify the variables with the largest coefficients.

In [22]:
# Fit an ordinary least squares model on the full feature set
# (fit() returns the estimator itself, so construction and fitting can be chained).
lr = LinearRegression().fit(X_train, y_train)

# R^2 on the training split and on the held-out split.
train_r2 = lr.score(X_train, y_train)
test_r2 = lr.score(X_test, y_test)
print(f'Train R^2 score: {train_r2}')
print(f'Test R^2 score: {test_r2}')

Train R^2 score: 0.9443347922641199
Test R^2 score: 0.9285722523160042


Here we select the 10 features with the largest coefficients to use in a simplified model, so that the model can sit behind a dashboard where the inputs can be manipulated to predict home prices.

In [23]:
# Pair each feature name with its fitted coefficient, then order the table from
# the largest coefficient to the smallest. The sort uses the signed value, so
# strongly negative coefficients end up at the bottom, not near the top.
lr_coef_df = pd.DataFrame(
    {'feature': list(X.columns), 'coef': lr.coef_}
).sort_values(by='coef', ascending=False)

In [24]:
# Show the first ten rows of the coefficient table.
lr_coef_df.iloc[:10]

Unnamed: 0,feature,coef
237,Garage Cond_Po,122647.093434
238,Garage Cond_TA,120008.30466
234,Garage Cond_Fa,115700.14138
235,Garage Cond_Gd,115399.883868
15,Gr Liv Area,70001.943294
70,Neighborhood_GrnHill,65973.834601
122,Roof Matl_WdShngl,56736.420495
12,1st Flr SF,53593.823222
98,Condition 2_PosA,38043.631505
254,Sale Type_Con,33084.741702


In [25]:
# Keep the first ten rows of the coefficient table as the feature set
# for the simplified model.
lr_top_coef = lr_coef_df.head(10)

In [26]:
# Restrict the predictors to the selected features; the target is unchanged.
features = lr_top_coef['feature'].tolist()
X2 = df.loc[:, features]
y2 = df['SalePrice']

# Split with the same random_state as the full model's split.
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, random_state=42)

# Fit the simplified linear model (fit() returns the estimator itself).
lr2 = LinearRegression().fit(X_train2, y_train2)

# R^2 on the training split and on the held-out split.
train_r2_simple = lr2.score(X_train2, y_train2)
test_r2_simple = lr2.score(X_test2, y_test2)
print(f'Train R^2 score: {train_r2_simple}')
print(f'Test R^2 score: {test_r2_simple}')

Train R^2 score: 0.5852075253392794
Test R^2 score: 0.5998663209050857


The simplified model is not great (test R² of about 0.60, versus about 0.93 for the full model), but the focus of this project is more on deployment and dashboarding.

In [27]:
# Serialize the fitted simplified model (lr2) to disk.
# The file is opened in binary write mode, as pickle requires.
model_path = '../models/lr_model.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(lr2, model_file)