This section of the code is the modeling. Running it many times with different features, and varying the random_state of the train/test/select split, eventually yielded a somewhat stable model. Next steps are to collect more training data (500 observations are probably not enough) and to perform more categorical feature engineering on zip_code and/or legal_subdivision, as the location seems to have more variation than a linear latitude/longitude term can predict.

In [1]:
from bs4 import BeautifulSoup
import requests
import time, os
import seaborn as sns
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.metrics import r2_score
from sklearn.linear_model import lars_path
# Path to the chromedriver executable. It can be overridden with the
# CHROMEDRIVER_PATH environment variable so the notebook is not tied to one
# machine's filesystem layout; the default is the original location.
chromedriver = os.environ.get("CHROMEDRIVER_PATH", "/Applications/chromedriver")
os.environ["webdriver.chrome.driver"] = chromedriver

Read in dataset and create dummies for categories.

In [2]:
# Load the listings dataset. The path is relative, so the CSV must sit in the
# notebook's working directory.
property_df = pd.read_csv(filepath_or_buffer='best_final(5).csv')

FileNotFoundError: [Errno 2] No such file or directory: 'best_final(5).csv'

In [None]:
# Coerce Price to a numeric dtype; values that cannot be parsed become NaN.
# pd.to_numeric is vectorized, so it is applied to the whole column at once
# instead of calling it per element through apply().
property_df['Price'] = pd.to_numeric(property_df['Price'], errors='coerce')

In [None]:
# One-hot encode the two categorical columns. drop_first=True omits the first
# level of each so the indicator columns are not perfectly collinear.
structure_df, zip_df = (
    pd.get_dummies(property_df[category_col], drop_first=True)
    for category_col in ('Structure Type', 'Zip Code')
)

In [None]:
# Append both sets of indicator columns to the main frame (index-aligned joins).
# Re-running this cell without reloading property_df fails on the column overlap.
property_df = property_df.join(structure_df).join(zip_df)

Linear Regression, later augmented by polynomials including interaction terms.

In [None]:
# Columns carried into modelling: structure-type indicators, location,
# property attributes, the Price target and the listing Status.
modeling_columns = [
    'Zip Code',
    'Interior Row/Townhouse',
    'Single',
    'Twin/Semi-Detached',
    'Longitude',
    'Latitude',
    'Year Built',
    'Fireplaces Count',
    'Levels Count',
    'Total SqFt',
    'Garage Spaces',
    'Tax Total Finished SqFt',
    'Bedrooms',
    'Bathrooms Total',
    'Lot SqFt',
    'Price',
    'Status',
]
property_df_n = property_df[modeling_columns]

In [None]:
# Let's jump right in and try a model using all variables above 0.6 correlation

# Training rows are the listings whose Status matches Sold / OFF MARKET.
# na=False treats a missing Status as "no match"; without it the mask contains
# NaN and .loc refuses to index with it.
is_sold = property_df_n['Status'].str.contains('Sold|OFF MARKET', case=False, na=False)
property_df_ntr = property_df_n.loc[is_sold]

base_features = ['Zip Code','Longitude', 'Levels Count', 'Total SqFt','Bathrooms Total', 'Interior Row/Townhouse','Single','Twin/Semi-Detached']
X = property_df_ntr[base_features]
y = property_df_ntr['Price']

# Expand to all degree-2 terms (squares and pairwise interactions, plus bias).
poly = PolynomialFeatures(2)
Xp = poly.fit_transform(X)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and only fall back to the old method on versions that predate it.
poly_feature_namer = getattr(poly, 'get_feature_names_out', None) or poly.get_feature_names
names = list(poly_feature_namer(X.columns))

# Keep the original row labels so X2 stays index-aligned with y.
X2 = pd.DataFrame(Xp, columns=names, index=X.index)
X2.info()

In [None]:
# Polynomial / interaction terms retained for the model.
model_features = [
    'Total SqFt Twin/Semi-Detached',
    'Total SqFt Bathrooms Total',
    'Levels Count Total SqFt',
    'Longitude^2',
    'Bathrooms Total',
    'Total SqFt',
    'Total SqFt Interior Row/Townhouse',
    'Zip Code',
    'Levels Count Twin/Semi-Detached',
]
X = X2[model_features]

# Hold out 20% for test, then 25% of the remaining 80% for validation,
# giving a 60/20/20 train/validation/test split. Both splits share one seed.
SPLIT_SEED = 70
X_rest, X_te, y_rest, y_te = train_test_split(X, y, test_size=0.20, random_state=SPLIT_SEED)
X_tr, X_v, y_tr, y_v = train_test_split(X_rest, y_rest, test_size=0.25, random_state=SPLIT_SEED)

In [None]:
# Ordinary least squares on the (unscaled) training split.
lr = LinearRegression()

# fit() returns the estimator, so the cell still displays its repr.
lr.fit(X=X_tr, y=y_tr)

In [None]:
# Show the fitted coefficients, then the intercept, one per line.
print(lr.coef_, lr.intercept_, sep='\n')

Calculate errors and plot results

In [None]:
# Bar chart of three model terms. The heights are hard-coded literals, not read
# from a fitted estimator in this notebook.
# NOTE(review): presumably copied from an earlier run's coefficients -- confirm,
# and ideally derive them from the fitted model so the figure cannot go stale.
bar_labels = ['Longitude^2', 'Baths', 'Levels-SemiDetached']
bar_heights = [25084.4148, 144299.714, 87433.3611]

plt.figure(figsize=[8,5])
plt.title('Target Sales Price Model')
ticks_x = np.linspace(0, 2, 3)
plt.xticks(ticks_x, labels=bar_labels, rotation='vertical', fontsize='10');
# The bare `plt.legend` reference was dropped: it was never called, and there
# are no labelled artists for a legend to show.
plt.bar(height=bar_heights, x=bar_labels);

In [None]:
# R^2 of the linear model on the training split.
train_r2 = lr.score(X_tr, y_tr)
print(train_r2)

In [None]:
# R^2 of the linear model on the validation split.
validation_r2 = lr.score(X_v, y_v)
print(validation_r2)

In [None]:
## Learn the per-feature mean and standard deviation from the training split only,
## so that no information from the validation/test rows leaks into the scaling.

std = StandardScaler()
std.fit(X=X_tr)

In [None]:
## Standardize the training features with the statistics learned above:
## subtract each column's mean, then divide by its standard deviation.

X_trs = std.transform(X=X_tr)

In [None]:
# Lasso on the standardized training features with a fixed (not cross-validated)
# regularization strength.
lasso_model = Lasso(alpha=10000)
lasso_model.fit(X=X_trs, y=y_tr)

In [None]:
# Pair each feature name with its (standardized) Lasso coefficient.
# The alpha here is the fixed value passed to Lasso above; no alpha search
# or refit takes place.

list(zip(X_tr.columns, lasso_model.coef_))

In [None]:
# Predictions from the fitted linear model on each split.
preds_tr = lr.predict(X_tr)  # training split
preds_v = lr.predict(X_v)    # validation split
preds_te = lr.predict(X_te)  # test split

# Predicted vs. actual price on the validation split.
sns.jointplot(kind='reg', x=preds_v, y=y_v)

In [None]:
# Predicted vs. actual price on the training split.
sns.jointplot(kind='reg', x=preds_tr, y=y_tr)

In [None]:
# Mean absolute error on the training split.
print((y_tr - preds_tr).abs().mean())

In [None]:
# Mean absolute error on the validation split.
print((y_v - preds_v).abs().mean())

In [None]:
# Mean absolute error on the held-out test split.
print((y_te - preds_te).abs().mean())

In [None]:
# Predicted vs. actual price on the held-out test split.
sns.jointplot(kind='reg', x=preds_te, y=y_te)

In [None]:
# R^2 on the train, validation and test splits, in that order.
for split_features, split_target in ((X_tr, y_tr), (X_v, y_v), (X_te, y_te)):
    print(lr.score(split_features, split_target))

Run the prediction dataset through the Sales Price Model to get the Target Price.

In [None]:
# Rows to score: listings whose Status matches Contingent / Contract / For Sale.
# na=False treats a missing Status as "no match"; without it the mask contains
# NaN and .loc refuses to index with it.
is_active = property_df['Status'].str.contains('Contingent|Contract|For Sale', case=False, na=False)
property_df_predict = property_df.loc[is_active]

predict_base_features = ['Zip Code','Longitude', 'Levels Count', 'Total SqFt','Bathrooms Total', 'Interior Row/Townhouse','Single','Twin/Semi-Detached']
Xpr = property_df_predict[predict_base_features]
y_price = property_df_predict['Price']

# Reuse the transformer already fitted on the training features instead of
# fitting a new one, so the prediction rows get exactly the same expansion.
Xprp = poly.transform(Xpr)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and only fall back to the old method on versions that predate it.
poly_feature_namer = getattr(poly, 'get_feature_names_out', None) or poly.get_feature_names
names = list(poly_feature_namer(Xpr.columns))

# Keep the original row labels so predictions can be matched back to listings.
X2pr = pd.DataFrame(Xprp, columns=names, index=Xpr.index)
X_predict = X2pr[['Total SqFt Twin/Semi-Detached','Total SqFt Bathrooms Total','Levels Count Total SqFt','Longitude^2','Bathrooms Total','Total SqFt','Total SqFt Interior Row/Townhouse','Zip Code','Levels Count Twin/Semi-Detached']]

In [None]:
# Target price for each prediction row, from the fitted linear model.
y_predict = lr.predict(X=X_predict)

In [None]:
# Model target price vs. the Price recorded on the prediction rows.
sns.jointplot(kind='reg', x=y_predict, y=y_price)

In [None]:
# Build a new frame carrying the model output. assign() returns a copy, so the
# .loc slice property_df_predict is not mutated through an alias and pandas
# does not raise SettingWithCopyWarning.
property_df_predict_price = property_df_predict.assign(**{'Target Price': y_predict})
property_df_predict_price.to_csv('predictions.csv')