In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import f_classif
from scipy import stats
import model
import data_clean as dc

In [2]:
# Load the training CSV into a DataFrame. The path is relative, so it is
# resolved against the kernel's current working directory.
TRAIN_CSV_PATH = '../data/kc_house_data_train.csv'
data = pd.read_csv(TRAIN_CSV_PATH)

In [3]:
# Clean the raw frame in one pass. Every step is idempotent, so the cell can be
# re-run against an already-cleaned `data` without raising or changing values.
data = (
    data
    # errors='ignore' keeps the cell re-runnable: on a second run the column is
    # already gone and a plain drop would raise KeyError.
    # NOTE(review): 'Unnamed: 0' is presumably a saved index column - confirm.
    .drop(columns='Unnamed: 0', errors='ignore')
    .assign(
        # Floor bathroom counts at 1 (values below 1 become 1).
        bathrooms=lambda df: df.bathrooms.clip(lower=1),
        # Recode the single value 33 to 3, then floor bedroom counts at 1.
        # NOTE(review): 33 looks like a data-entry outlier - confirm with the data.
        bedrooms=lambda df: df.bedrooms.replace(33, 3).clip(lower=1),
        # Collapse grades below 4 into grade 4.
        grade=lambda df: df.grade.clip(lower=4),
    )
)

# Derived target column computed by the project helper from the raw price.
# NOTE(review): the exact adjustment is defined in data_clean.price_adjust.
data['adjprice'] = dc.price_adjust(data.price)

In [4]:
# Hold out 25% of the rows for evaluation. The split is stratified on zipcode
# and uses a fixed seed so that it is repeatable across runs.
train_set, test_set = train_test_split(
    data,
    test_size=0.25,
    random_state=99,
    stratify=data['zipcode'],
)

In [6]:
# Add natural-log versions of sqft_lot and sqft_above to both splits.
# .assign returns a new frame, so we never write into the objects handed back
# by train_test_split (which triggers pandas' SettingWithCopyWarning).
train_set = train_set.assign(
    lotlog=np.log(train_set.sqft_lot),
    abovelog=np.log(train_set.sqft_above),
)
test_set = test_set.assign(
    lotlog=np.log(test_set.sqft_lot),
    abovelog=np.log(test_set.sqft_above),
)

# Dummy-encode the categorical columns with the project helpers.
# NOTE(review): the third argument to dc.dummy_list is presumably the baseline
# level that gets dropped - confirm in data_clean.
# NOTE(review): the train and test dummies are built independently; if a level
# is missing from one split the two column sets may not line up - verify that
# data_clean handles this before trusting test-set scores.

# Training split
zip_dummies = dc.dummy_list(train_set.zipcode, 'zip', 98024)
grade_dummies = dc.dummy_list(train_set.grade, 'grade', 4)
con_dummies = dc.dummy_list(train_set.condition, 'cond', 1)
view_dummies = dc.dummy_list(train_set.view, 'view', 1)
month_dummies = dc.month_dummies(train_set.date)

# Test split (same encoders, same arguments)
testzip_dummies = dc.dummy_list(test_set.zipcode, 'zip', 98024)
testgrade_dummies = dc.dummy_list(test_set.grade, 'grade', 4)
testcond_dummies = dc.dummy_list(test_set.condition, 'cond', 1)
testview_dummies = dc.dummy_list(test_set.view, 'view', 1)
testmonth_dummies = dc.month_dummies(test_set.date)

In [10]:
# Fit a linear regression on the training split and report the in-sample fit.
lr = LinearRegression()
y_train = train_set.adjprice

# Columns that go through the degree-2 expansion: five numeric features plus
# the condition, view and month dummies.
poly_columns = pd.concat(
    [
        train_set[['sqft_living', 'bedrooms', 'bathrooms', 'abovelog', 'lotlog']],
        con_dummies,
        view_dummies,
        month_dummies,
    ],
    axis=1,
)
X_train_poly = model.polynomialize(poly_columns, 2)

# Zip and grade dummies enter the model unexpanded. Their index is reset to
# 0..n-1 before the column-wise concat with the expanded block.
# NOTE(review): this assumes model.polynomialize returns a frame with a default
# 0..n-1 index in the same row order - confirm in model.py.
X_train_agg = pd.concat([zip_dummies, grade_dummies], axis=1).reset_index(drop=True)
X_train_agg = pd.concat([X_train_agg, X_train_poly], axis=1)

lr.fit(X_train_agg, y_train)
y_pred = lr.predict(X_train_agg)

r2 = lr.score(X_train_agg, y_train)
# `rsme` holds the mean squared error; the square root is taken when printing.
rsme = mean_squared_error(y_train, y_pred)
print('R Squared:' + str(r2), 'RSME:' + str(rsme ** 0.5))

R Squared:0.8751784223927895 RSME:101027.90208329496


In [12]:
# Score the fitted model on the held-out split, building the design matrix
# with the same steps that were used for training.
Y_test = test_set.adjprice

# Same five numeric features plus condition, view and month dummies, expanded
# to degree 2.
X_test_columns = pd.concat(
    [
        test_set[['sqft_living', 'bedrooms', 'bathrooms', 'abovelog', 'lotlog']],
        testcond_dummies,
        testview_dummies,
        testmonth_dummies,
    ],
    axis=1,
)
X_test_poly = model.polynomialize(X_test_columns, 2)

# Unexpanded zip and grade dummies, re-indexed to 0..n-1 before being joined
# column-wise with the expanded block.
# NOTE(review): the column order and count must match the training matrix for
# lr.predict to be meaningful - verify against X_train_agg.
X_test_agg = pd.concat([testzip_dummies, testgrade_dummies], axis=1).reset_index(drop=True)
X_test_agg = pd.concat([X_test_agg, X_test_poly], axis=1)

y_test_pred = lr.predict(X_test_agg)

r2 = lr.score(X_test_agg, Y_test)
# `rsme` holds the mean squared error; the square root is taken when printing.
rsme = mean_squared_error(Y_test, y_test_pred)
print('R Squared:' + str(r2), 'RSME:' + str(rsme ** 0.5))

R Squared:0.858694385206752 RSME:106734.89261938006


In [14]:
import pickle

# Persist the fitted regression to model.pickle in the working directory.
# The context manager closes the file even if pickle.dump raises, which the
# previous manual open()/close() pair did not guarantee.
# Only unpickle this file from a trusted location: pickle.load can execute
# arbitrary code.
with open("model.pickle", "wb") as pickle_out:
    pickle.dump(lr, pickle_out)