In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
from stargazer.stargazer import Stargazer


In [2]:
# Load the prepped Hunters Green home-sales data from the project's data directory
df = pd.read_csv('../data/HuntersGreenHomeSales_prepped.csv')

# Have a peek at the first few rows
df.head()


Unnamed: 0,slnoskm,Status,Address,Street_only,lat,long,Beds,bathsfull,bathshalf,bathstotal,...,datesold,ds_dom,ds_moy,ds_year,splsale,splsal_Auction,splsal_shortsale,splsale_BankOwnedREO,pricesold,days_on_market
0,1,Sold,17711 ESPRIT DR,ESPRIT DR,28.127921,-82.340184,3,2,0,2,...,10/6/2018,6,10,2018,Short Sale,0,1,0,141500,451
1,2,Sold,17769 ESPRIT DR,ESPRIT DR,28.130015,-82.339863,2,2,0,2,...,7/30/2015,30,7,2015,,0,0,0,160000,28
2,3,Sold,17622 ESPRIT DR,ESPRIT DR,28.129953,-82.341055,3,2,0,2,...,7/29/2015,29,7,2015,,0,0,0,161500,42
3,4,Sold,18111 ASHTON PARK WAY,ASHTON PARK WAY,28.138067,-82.333513,3,2,0,2,...,7/29/2016,29,7,2016,,0,0,0,170000,35
4,5,Sold,9203 CELEBRATION CT,CELEBRATION CT,28.129131,-82.340659,4,2,1,3,...,9/30/2016,30,9,2016,Short Sale,0,1,0,170000,417


In [3]:
# Count the missing values in each column
df.isna().sum()

slnoskm                     0
Status                      0
Address                     0
Street_only                 0
lat                         0
long                        0
Beds                        0
bathsfull                   0
bathshalf                   0
bathstotal                  0
sqft                        0
garages                     4
roof_Built-Up               0
roof_Concrete               0
roof_Shake                  0
roof_Shingle                0
roof_Slate                  0
roof_Tile                   0
roof_Other                  0
lotsqft                     0
yrblt                       0
pool_Community              0
pool_Private                0
pool_None                   0
spa                       313
subdivn                     0
adom_agentdaysonmarket      0
cdom_cumuldaysmls           0
listprice                   0
lppersqft                   0
PendingDate                 0
pd_dom                      0
pd_moy                      0
pd_year   

Let's drop the `spa` and `splsale` columns before we drop rows with nulls, so we don't lose a ton of data. `spa` is missing for 313 rows, and the information in `splsale` has already been recaptured by its breakout indicator columns.

In [4]:
# Remove the `spa` and `splsale` columns first, then drop any rows that still
# contain a null. Doing it in this order keeps dropna() from discarding every
# row where `spa` or `splsale` is missing.
#
# errors='ignore' makes the cell safe to re-run: `df` is overwritten here, so on
# a second execution the two columns are already gone and a plain drop() would
# raise KeyError.
df = df.drop(columns=['spa', 'splsale'], errors='ignore')
df = df.dropna()

In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for the cleaned frame
df.describe()


Unnamed: 0,slnoskm,lat,long,Beds,bathsfull,bathshalf,bathstotal,sqft,garages,roof_Built-Up,...,pd_year,sppersqft,ds_dom,ds_moy,ds_year,splsal_Auction,splsal_shortsale,splsale_BankOwnedREO,pricesold,days_on_market
count,478.0,478.0,478.0,478.0,478.0,478.0,478.0,478.0,478.0,478.0,...,478.0,478.0,478.0,478.0,478.0,478.0,478.0,478.0,478.0,478.0
mean,241.834728,28.131651,-82.34258,3.864017,2.656904,0.341004,2.997908,2639.857741,2.493724,0.002092,...,2016.930962,129.312594,17.543933,6.675732,2016.995816,0.006276,0.016736,0.041841,346653.2,42.104603
std,139.491364,0.004647,0.008928,0.703631,0.687919,0.496141,0.830496,827.257195,0.552261,0.045739,...,1.375681,21.024089,9.243342,3.10634,1.364411,0.079056,0.128416,0.200435,152369.6,32.869055
min,1.0,28.120558,-82.361649,2.0,2.0,0.0,2.0,1305.0,1.0,0.0,...,2015.0,73.85,1.0,1.0,2015.0,0.0,0.0,0.0,141500.0,0.0
25%,121.25,28.128696,-82.349495,3.0,2.0,0.0,3.0,2079.0,2.0,0.0,...,2016.0,114.9625,10.0,4.0,2016.0,0.0,0.0,0.0,247925.0,30.0
50%,242.5,28.130965,-82.340279,4.0,3.0,0.0,3.0,2529.0,2.0,0.0,...,2017.0,129.595,17.5,7.0,2017.0,0.0,0.0,0.0,327750.0,38.0
75%,361.75,28.13479,-82.33595,4.0,3.0,1.0,3.0,3009.5,3.0,0.0,...,2018.0,142.2275,26.0,9.0,2018.0,0.0,0.0,0.0,396500.0,48.0
max,482.0,28.141323,-82.327339,6.0,6.0,2.0,8.0,8398.0,5.0,1.0,...,2019.0,218.16,31.0,12.0,2020.0,1.0,1.0,1.0,1435000.0,451.0


In [6]:
# Look at the relationship between sqft and pricesold (scatter plot, currently commented out)
# plt.figure(figsize=(8, 5))
# sns.scatterplot(x=df['sqft'], y=df['pricesold'])
# plt.show()


In [7]:
# Add a log-transformed square-footage column. np.log1p computes log(1 + x),
# so this is log(1 + sqft) rather than log(sqft).
# NOTE(review): log_sqft is not referenced by the model formulas in the cells
# visible below — confirm whether it is used later or should be in a formula.
df['log_sqft'] = np.log1p(df['sqft'])

In [8]:
# Fit an ordinary least squares regression of pricesold on long, yrblt,
# splsal_shortsale and pool_None using the formula interface.
model1 = smf.ols(formula='pricesold ~ long + yrblt + splsal_shortsale + pool_None', data=df).fit()

# Print the plain-text regression summary table
print(model1.summary())

                            OLS Regression Results                            
Dep. Variable:              pricesold   R-squared:                       0.338
Model:                            OLS   Adj. R-squared:                  0.333
Method:                 Least Squares   F-statistic:                     60.48
Date:                Wed, 26 Feb 2025   Prob (F-statistic):           3.04e-41
Time:                        19:41:52   Log-Likelihood:                -6283.5
No. Observations:                 478   AIC:                         1.258e+04
Df Residuals:                     473   BIC:                         1.260e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept        -6.283e+08    6.1e+07  

In [9]:
# Fit an ordinary least squares regression of pricesold on long, yrblt,
# splsal_shortsale and pool_None using the formula interface.
# NOTE(review): this formula is identical to the one used for model1, so model2
# reproduces the same fit (the printed summary matches model1's). If a different
# specification was intended here, the formula needs updating — TODO confirm.
model2 = smf.ols(formula='pricesold ~ long + yrblt + splsal_shortsale + pool_None', data=df).fit()

# Print the plain-text regression summary table
print(model2.summary())

                            OLS Regression Results                            
Dep. Variable:              pricesold   R-squared:                       0.338
Model:                            OLS   Adj. R-squared:                  0.333
Method:                 Least Squares   F-statistic:                     60.48
Date:                Wed, 26 Feb 2025   Prob (F-statistic):           3.04e-41
Time:                        19:41:52   Log-Likelihood:                -6283.5
No. Observations:                 478   AIC:                         1.258e+04
Df Residuals:                     473   BIC:                         1.260e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept        -6.283e+08    6.1e+07  

In [10]:
# Fit an ordinary least squares regression of pricesold on long, yrblt,
# splsal_shortsale and pool_None using the formula interface.
# NOTE(review): this formula is identical to the ones used for model1 and
# model2, so model3 reproduces the same fit (the printed summary matches). If a
# different specification was intended here, the formula needs updating — TODO confirm.
model3 = smf.ols(formula='pricesold ~ long + yrblt + splsal_shortsale + pool_None', data=df).fit()

# Print the plain-text regression summary table
print(model3.summary())

                            OLS Regression Results                            
Dep. Variable:              pricesold   R-squared:                       0.338
Model:                            OLS   Adj. R-squared:                  0.333
Method:                 Least Squares   F-statistic:                     60.48
Date:                Wed, 26 Feb 2025   Prob (F-statistic):           3.04e-41
Time:                        19:41:52   Log-Likelihood:                -6283.5
No. Observations:                 478   AIC:                         1.258e+04
Df Residuals:                     473   BIC:                         1.260e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept        -6.283e+08    6.1e+07  