In [92]:
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import numpy as np

# Read the house-data CSV (path relative to the working directory) into the
# working frame `df` that the following cells transform.
raw_csv_path = 'data/kc_house_data.csv'
df = pd.read_csv(raw_csv_path)

In [93]:
# Named groups of column labels used by the preprocessing cells.

# Standardised and screened for outliers further down.
num_cont = [
    'sqft_living',
    'sqft_lot',
    'sqft_basement',
    'sqft_patio',
]

# Standardised further down ('age' is derived from the sale and build years
# in a later cell, it is not a raw CSV column).
num_disc = [
    'bedrooms',
    'floors',
    'age',
]

# NOTE(review): `cat` and `ignore` are not referenced by any cell visible in
# this notebook, and 'heat_source' is not among the columns kept for
# modelling — confirm whether they are still needed.
cat = [
    'garage',
    'waterfront',
    'greenbelt',
    'nuisance',
    'view',
    'condition',
    'grade',
    'heat_source',
]
ignore = [
    'id',
    'date',
    'lat',
    'long',
    'address',
]

# Columns replaced by their natural logarithm before standardisation.
log_vars = [
    'price',
    'sqft_lot',
]

In [94]:
# Derive the sale-year, age, renovation and garage features, then keep only
# the columns used for modelling.

# The last four characters of `date` are read as the sale year
# (assumes string dates that end in YYYY — TODO confirm against the CSV).
df['sale_year'] = df['date'].str[-4:].astype(int)
df['age'] = df['sale_year'] - df['yr_built']

# Years since the last renovation; rows whose `yr_renovated` is not positive
# fall back to the age of the house. Vectorised with np.where instead of a
# row-wise apply, which builds a Python object per row.
# NOTE(review): 'yrs_since_reno' is not part of the selection below, so it
# never reaches the models — add it to the list if it is meant to be tested.
df['yrs_since_reno'] = np.where(
    df['yr_renovated'] > 0,
    df['sale_year'] - df['yr_renovated'],
    df['age'],
)

# 1 when the house has any garage area, 0 otherwise.
df['garage'] = (df['sqft_garage'] > 0).astype(int)

# .copy() makes the modelling frame independent of the raw frame, so the
# column assignments and in-place drop in later cells do not raise
# SettingWithCopyWarning.
df = df[['price',
         'sqft_living', 'sqft_lot', 'sqft_basement', 'sqft_patio',
         'bedrooms', 'floors', 'age',
         'garage', 'waterfront', 'greenbelt', 'nuisance', 'view', 'condition', 'grade']].copy()

In [95]:
# Overwrite every column listed in `log_vars` with its natural logarithm,
# all columns in one vectorised assignment.
df[log_vars] = np.log(df[log_vars])

In [96]:
# Rescale each numeric column to a z-score: subtract the column mean and
# divide by the column standard deviation (pandas' default `.std()`).
# Both groups get the same treatment, so they share one pass.
for name in num_cont + num_disc:
    column = df[name]
    df[name] = (column - column.mean()) / column.std()

In [97]:
# Drop rows in which any column of `num_cont` lies above `threshold`. Those
# columns were standardised in an earlier cell, so the threshold is expressed
# in standard deviations. Only the upper tail is screened: values below
# -threshold are kept.

threshold = 7.5
above_threshold = (df[num_cont] > threshold).any(axis=1)
outliers = set(df.index[above_threshold])

df.drop(index=list(outliers), inplace=True)

In [100]:
# One-hot encode the categorical columns and remove one reference level per
# feature, so each remaining dummy is read relative to that level.
#
# dtype=int is explicit because pandas >= 2.0 produces boolean dummies by
# default; a frame mixing bool and float columns is cast to object dtype and
# statsmodels' OLS rejects it. Integer 0/1 indicators work on every version.
#
# Keys are the columns to encode, values the level used as the reference, so
# the encoded columns and the dropped columns cannot drift apart.
baseline_levels = {
    'waterfront': 'NO',
    'greenbelt': 'NO',
    'nuisance': 'NO',
    'view': 'NONE',
    'condition': 'Average',
    'grade': '7 Average',
}

df = pd.get_dummies(df, columns=list(baseline_levels), dtype=int)
df = df.drop(columns=[f'{feature}_{level}' for feature, level in baseline_levels.items()])

In [108]:
# Full model: regress `price` (log-transformed in an earlier cell) on every
# other column left in df.
X = df.drop(columns=['price'])
y = df['price']

# sm.OLS does not add an intercept itself; add_constant supplies the column.
model = sm.OLS(endog=y, exog=sm.add_constant(X))
results = model.fit()

print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.495
Model:                            OLS   Adj. R-squared:                  0.494
Method:                 Least Squares   F-statistic:                     951.0
Date:                Tue, 21 Feb 2023   Prob (F-statistic):               0.00
Time:                        09:49:50   Log-Likelihood:                -16102.
No. Observations:               30130   AIC:                         3.227e+04
Df Residuals:                   30098   BIC:                         3.253e+04
Df Model:                          31                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                  13.5522    

In [109]:
# Single-predictor model: `price` against `sqft_living` alone, as a point of
# comparison for the larger models.
y, X = df['price'], df['sqft_living']

# sm.OLS does not add an intercept itself; add_constant supplies the column.
model = sm.OLS(endog=y, exog=sm.add_constant(X))
results = model.fit()

print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.387
Model:                            OLS   Adj. R-squared:                  0.387
Method:                 Least Squares   F-statistic:                 1.899e+04
Date:                Tue, 21 Feb 2023   Prob (F-statistic):               0.00
Time:                        09:51:52   Log-Likelihood:                -19025.
No. Observations:               30130   AIC:                         3.805e+04
Df Residuals:                   30128   BIC:                         3.807e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const          13.7354      0.003   5240.011      

In [114]:
# Reduced model: same as the full model but without the garage flag and the
# two lowest grade dummies.
excluded = ['price', 'garage', 'grade_1 Cabin', 'grade_2 Substandard']
y = df['price']
X = df.drop(columns=excluded)

# sm.OLS does not add an intercept itself; add_constant supplies the column.
model = sm.OLS(endog=y, exog=sm.add_constant(X))
results = model.fit()

print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.495
Model:                            OLS   Adj. R-squared:                  0.494
Method:                 Least Squares   F-statistic:                     1053.
Date:                Tue, 21 Feb 2023   Prob (F-statistic):               0.00
Time:                        09:55:07   Log-Likelihood:                -16105.
No. Observations:               30130   AIC:                         3.227e+04
Df Residuals:                   30101   BIC:                         3.251e+04
Df Model:                          28                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                  13.5455    

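At a glance, the heat-source dummies look like the weakest predictors. The bottom two grade levels (`grade_1 Cabin`, `grade_2 Substandard`) also don't seem to matter, and years since renovation adds essentially nothing. (Note: neither `heat_source` nor `yrs_since_reno` is among the columns kept for the models shown above, so those two remarks presumably refer to an earlier run.)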