In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import numpy as np

# Load the CSV at data/kc_house_data.csv (path relative to the working directory)
# into the DataFrame `df` that every later cell reads and mutates.
df = pd.read_csv('data/kc_house_data.csv')

# feature creation

In [2]:
def _city_from_address(address):
    """Return the city component of a comma-separated address string.

    The earlier whitespace-based parse (``split(' ')[-5]``) could only ever
    return a single word, so a multi-word city name was reduced to its last
    word. Here the address is split on ``', '`` from the right and the city is
    taken as the third comma-separated field from the end.

    Assumes the layout ``<street>, <city>, <state> <zip>, <country>`` --
    TODO confirm against the raw data.

    Parameters
    ----------
    address : object
        Raw address value; it is converted with ``str`` before parsing.

    Returns
    -------
    str or None
        The city name, or ``None`` when the address has fewer than three
        comma-separated fields.
    """
    fields = str(address).rsplit(', ', 3)
    if len(fields) < 3:
        return None
    return fields[-3].strip()


df['city'] = df['address'].apply(_city_from_address)

In [3]:
def _state_token(address):
    """Return the fourth space-separated token from the end of the address text."""
    tokens = str(address).split(' ')
    return tokens[-4]


df['state'] = df['address'].apply(_state_token)

In [4]:
def _zip_token(address):
    """Return the third space-separated token from the end of the address, commas removed."""
    tokens = str(address).split(' ')
    return tokens[-3].replace(',', '')


df['zip'] = df['address'].apply(_zip_token)

In [6]:
# Remove, in place, every row whose parsed state is anything other than 'Washington'.
outside_washington = df['state'] != 'Washington'
df.drop(index=df.index[outside_washington], inplace=True)

In [None]:
# Display the column labels currently present on df (includes the city/state/zip columns added above).
df.columns

# make a baseline model as simply as possible

sort columns into groups

In [7]:
# Column groups used to assemble the baseline design matrix.

# Square-footage measurements; these get mean-centred later.
numeric_continuous = [
    'sqft_living',
    'sqft_lot',
    'sqft_above',
    'sqft_basement',
    'sqft_garage',
    'sqft_patio',
]

# Count / year columns that are used as-is.
numeric_discrete = [
    'bedrooms',
    'bathrooms',
    'floors',
    'yr_built',
    'yr_renovated',
]

# Label columns that get one-hot encoded.
categorical = [
    'waterfront',
    'greenbelt',
    'nuisance',
    'view',
    'condition',
    'grade',
    'heat_source',
    'sewer_system',
]

# Columns set aside from the baseline model.
specials = [
    'date',
    'address',
    'lat',
    'long',
]

create dataframes from groups

shift the mean to zero for continuous variables

one-hot encode categorical variables

In [8]:
# Continuous features: subtract each column's own mean so every column is centred on zero.
continuous_raw = df[numeric_continuous]
df_numeric_continuous = continuous_raw - continuous_raw.mean()

# Discrete numeric features are used unchanged (copied so later edits do not touch df).
df_numeric_discrete = df[numeric_discrete].copy()

# One-hot encode the categorical features.
# The dtype is pinned to an integer type: pandas >= 2.0 returns bool dummies by
# default, and bool columns concatenated with float columns give statsmodels an
# object-typed design matrix, which it rejects. uint8 matches the older default.
df_categorical = pd.get_dummies(df[categorical], columns=categorical, dtype=np.uint8)

# Drop one reference level per categorical so the dummies are not collinear with the constant.
reference_levels = [
    'waterfront_NO', 'greenbelt_NO', 'nuisance_NO',
    'view_NONE', 'condition_Average', 'grade_7 Average',
    'heat_source_Other', 'sewer_system_PUBLIC',
]
df_categorical = df_categorical.drop(columns=reference_levels)

run the model

In [9]:
# Target: sale price. Predictors: the three feature frames joined column-wise.
y = df['price']
feature_frames = [df_numeric_continuous, df_numeric_discrete, df_categorical]
X_baseline = pd.concat(feature_frames, axis=1)

# Ordinary least squares with an explicit intercept column.
exog_with_const = sm.add_constant(X_baseline)
baseline_model = sm.OLS(y, exog_with_const)
baseline_results = baseline_model.fit()

print(baseline_results.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.542
Model:                            OLS   Adj. R-squared:                  0.541
Method:                 Least Squares   F-statistic:                     804.1
Date:                Wed, 22 Feb 2023   Prob (F-statistic):               0.00
Time:                        18:33:58   Log-Likelihood:            -4.3087e+05
No. Observations:               29245   AIC:                         8.618e+05
Df Residuals:                   29201   BIC:                         8.622e+05
Df Model:                          43                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
const     

## comment on the baseline model

In [10]:
# Collect the index labels of rows whose price is more than `threshold`
# standard deviations above the mean price.

threshold = 6

price_cutoff = df['price'].mean() + threshold * df['price'].std()
outliers = set(df.index[df['price'] > price_cutoff])

len(outliers)

99

In [11]:
# Same baseline specification as before, with the flagged outlier rows removed
# from both the target and the predictors.
design_matrix = pd.concat(
    [df_numeric_continuous, df_numeric_discrete, df_categorical],
    axis=1,
)
y = df['price'].drop(outliers)
X_baseline = design_matrix.drop(outliers)

exog_with_const = sm.add_constant(X_baseline)
baseline_model = sm.OLS(y, exog_with_const)
baseline_results = baseline_model.fit()

print(baseline_results.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.570
Model:                            OLS   Adj. R-squared:                  0.569
Method:                 Least Squares   F-statistic:                     897.3
Date:                Wed, 22 Feb 2023   Prob (F-statistic):               0.00
Time:                        18:34:07   Log-Likelihood:            -4.2191e+05
No. Observations:               29146   AIC:                         8.439e+05
Df Residuals:                   29102   BIC:                         8.443e+05
Df Model:                          43                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
const     

# make basic adjustments

customize one-hot encoding for heat source, given that some entries have multiple heat sources.

In [12]:
# Build 0/1 indicator columns from the text of heat_source, then remove the original column.
heat_source_text = df['heat_source'].apply(str)

# The primary fuel is identified by the first three characters of the label.
for flag_name, prefix in (('gas', 'Gas'), ('electricity', 'Ele'), ('oil', 'Oil')):
    df[flag_name] = heat_source_text.apply(lambda text, p=prefix: int(text.startswith(p)))

# Solar is identified by the label ending in 'Solar'.
df['solar'] = heat_source_text.apply(lambda text: int(text.endswith('Solar')))

del df['heat_source']

customize label-encoding for view, condition, and grade

In [13]:
def view(x):
    """Map a view label to its ordinal rank.

    'NONE' -> 0, 'FAIR' -> 1, 'AVERAGE' -> 2, 'GOOD' -> 3, 'EXCELLENT' -> 4.
    Any other value yields ``None``.
    """
    ordered_labels = ('NONE', 'FAIR', 'AVERAGE', 'GOOD', 'EXCELLENT')
    for rank, label in enumerate(ordered_labels):
        if x == label:
            return rank
    return None

# Replace the view labels with their ordinal ranks.
df['view'] = df['view'].apply(view)

def condition(x):
    """Map a condition label to its ordinal rank.

    'Poor' -> 0, 'Fair' -> 1, 'Average' -> 2, 'Good' -> 3, 'Very Good' -> 4.
    Any other value yields ``None``.
    """
    ordered_labels = ('Poor', 'Fair', 'Average', 'Good', 'Very Good')
    for rank, label in enumerate(ordered_labels):
        if x == label:
            return rank
    return None

# Replace the condition labels with their ordinal ranks.
df['condition'] = df['condition'].apply(condition)


def _grade_number(label):
    """Return the integer held in the first two characters of a grade label."""
    leading = label[:2]
    return int(leading.strip())


# Replace each grade label with the number it starts with.
df['grade'] = df['grade'].apply(_grade_number)

reset the groups

In [14]:
numeric_continuous = ['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'sqft_garage', 'sqft_patio']
numeric_discrete = ['bedrooms', 'bathrooms', 'floors', 'view', 'condition', 'grade', 'yr_built', 'yr_renovated']
categorical = ['waterfront', 'greenbelt', 'nuisance', 'sewer_system']
heat = ['gas', 'electricity', 'oil', 'solar']
specials = ['date', 'address', 'lat', 'long']

create dataframes from groups

shift the mean to zero for continuous variables

one-hot encode categorical variables

In [15]:
# Continuous features: subtract each column's own mean so every column is centred on zero.
continuous_raw = df[numeric_continuous]
df_numeric_continuous = continuous_raw - continuous_raw.mean()

# Discrete numeric features are used unchanged (copied so later edits do not touch df).
df_numeric_discrete = df[numeric_discrete].copy()

# One-hot encode the remaining categorical features.
# The dtype is pinned to an integer type: pandas >= 2.0 returns bool dummies by
# default, and bool columns concatenated with float columns give statsmodels an
# object-typed design matrix, which it rejects. uint8 matches the older default.
df_categorical = pd.get_dummies(df[categorical], columns=categorical, dtype=np.uint8)

# Drop one reference level per categorical so the dummies are not collinear with the constant.
reference_levels = ['waterfront_NO', 'greenbelt_NO', 'nuisance_NO', 'sewer_system_PUBLIC']
df_categorical = df_categorical.drop(columns=reference_levels)

# Heat-source indicators are already 0/1 columns on df.
df_heat = df[heat].copy()

run the model

In [16]:
# Target: sale price. Predictors: the four feature frames joined column-wise.
y = df['price']
feature_frames = [df_numeric_continuous, df_numeric_discrete, df_categorical, df_heat]
X_baseline = pd.concat(feature_frames, axis=1)

# Ordinary least squares with an explicit intercept column.
exog_with_const = sm.add_constant(X_baseline)
baseline_model = sm.OLS(y, exog_with_const)
baseline_results = baseline_model.fit()

print(baseline_results.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.504
Model:                            OLS   Adj. R-squared:                  0.504
Method:                 Least Squares   F-statistic:                     1238.
Date:                Wed, 22 Feb 2023   Prob (F-statistic):               0.00
Time:                        18:34:16   Log-Likelihood:            -4.3204e+05
No. Observations:               29245   AIC:                         8.641e+05
Df Residuals:                   29220   BIC:                         8.643e+05
Df Model:                          24                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
const     

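removing bad correlations: greenbelt and (some) sewer categories. Heat source was also a candidate, but its columns (`df_heat`) are still included in the models below.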

In [17]:
# Remove three dummy columns from df_categorical, one at a time and in place.
for dummy_name in (
    'greenbelt_YES',
    'sewer_system_PRIVATE RESTRICTED',
    'sewer_system_PUBLIC RESTRICTED',
):
    del df_categorical[dummy_name]

run the model

In [18]:
# Refit on the reduced set of categorical dummies.
y = df['price']
feature_frames = [df_numeric_continuous, df_numeric_discrete, df_categorical, df_heat]
X_baseline = pd.concat(feature_frames, axis=1)

# Ordinary least squares with an explicit intercept column.
exog_with_const = sm.add_constant(X_baseline)
baseline_model = sm.OLS(y, exog_with_const)
baseline_results = baseline_model.fit()

print(baseline_results.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.504
Model:                            OLS   Adj. R-squared:                  0.504
Method:                 Least Squares   F-statistic:                     1415.
Date:                Wed, 22 Feb 2023   Prob (F-statistic):               0.00
Time:                        18:34:22   Log-Likelihood:            -4.3204e+05
No. Observations:               29245   AIC:                         8.641e+05
Df Residuals:                   29223   BIC:                         8.643e+05
Df Model:                          21                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                  7.84e+06 

removing outliers

In [19]:
# Collect the index labels of rows whose price is more than `threshold`
# standard deviations above the mean price.

threshold = 6

price_cutoff = df['price'].mean() + threshold * df['price'].std()
outliers = set(df.index[df['price'] > price_cutoff])

len(outliers)

99

run the model

In [20]:
# Same reduced specification, with the flagged outlier rows removed from both
# the target and the predictors.
design_matrix = pd.concat(
    [df_numeric_continuous, df_numeric_discrete, df_categorical, df_heat],
    axis=1,
)
y = df['price'].drop(outliers)
X_baseline = design_matrix.drop(outliers)

exog_with_const = sm.add_constant(X_baseline)
baseline_model = sm.OLS(y, exog_with_const)
baseline_results = baseline_model.fit()

print(baseline_results.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.545
Model:                            OLS   Adj. R-squared:                  0.545
Method:                 Least Squares   F-statistic:                     1662.
Date:                Wed, 22 Feb 2023   Prob (F-statistic):               0.00
Time:                        18:34:28   Log-Likelihood:            -4.2273e+05
No. Observations:               29146   AIC:                         8.455e+05
Df Residuals:                   29124   BIC:                         8.457e+05
Df Model:                          21                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                 7.779e+06 

Removing outliers from the numeric variables doesn't seem to change R-squared much at all (at best, it improves from 0.490 to 0.491 without changing Adj. R-squared), but removing about 100 price outliers (more than 6 standard deviations above the mean) improves R-squared to 0.545.

In [None]:
# Spot check: for the row labelled 2354, strip commas from the address and show the
# fifth space-separated token from the end (the same position the 'city' column is parsed from).
str(df['address'][2354]).replace(',', '').split(' ')[-5]

In [None]:
# Show how many rows fall under each parsed city value, most frequent first.
df['city'].value_counts()

In [None]:
def _zip_code_slice(address):
    """Return the five characters at positions -20 to -16 (inclusive) of the address text."""
    text = str(address)
    return text[-20:-15]


df['zip_code'] = df['address'].apply(_zip_code_slice)

In [None]:
# Show the rows whose sliced zip code is exactly '01541'.
df.loc[df['zip_code'] == '01541']

In [None]:
# Mean sale price per zip code.
# 'price' is selected before aggregating so only that column is averaged. Averaging
# the whole frame first also tries the string columns in df (address, city, ...),
# which raises on pandas >= 2.0.
df.groupby('zip_code')['price'].mean()

In [None]:
# Show only the zip codes that occur in more than 400 rows, with their counts.
zip_counts = df['zip_code'].value_counts()
zip_counts[zip_counts > 400]

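Reverse-geocoding example (Google Maps Geocoding API). The API key has been replaced with a placeholder; supply it at run time (e.g. from an environment variable) instead of committing it to the notebook: https://maps.googleapis.com/maps/api/geocode/json?latlng=40.714224,-73.961452&key=YOUR_API_KEY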