In [1]:
from sqlalchemy import create_engine
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import statsmodels.api as sm
import math
from sklearn.feature_selection import RFE 
from sklearn.linear_model import LinearRegression
import numpy as np
# from src.utilities.aa_kc_housing import *


In [2]:
def pullsqldata():
    """This function pulls the necessary columns and rows from the PostGRES DB into a Pandas Dataframe in order 
    to continue with our EDA """
    
    engine = create_engine("postgresql:///kc_housing")
    query = """
                SELECT *
                FROM rpsale AS s
                INNER JOIN resbldg AS b ON CONCAT(s.Major,s.Minor) = CONCAT(b.Major, b.Minor)
                INNER JOIN parcel AS p ON CONCAT(s.Major,s.Minor) = CONCAT(p.Major,p.Minor)
                WHERE EXTRACT(YEAR FROM CAST(documentdate AS DATE)) = 2018
                    AND p.proptype = 'R'
                ;"""
    kc_df = pd.read_sql(sql = query, con = engine)
    return kc_df
    

In [3]:
def clean_data_intial(df):
    """ This function cleans the housing data by removing outliers, sale price == 0, and irrelevant columns. 
    """
    #We chose a minimum sale vale of 10000 and a maximium sale value of 2 sigma
    df_clean = df[(df['saleprice']>100000) & (df['saleprice'] <  (2*df['saleprice'].std())+df['saleprice'].mean())]
    df_clean = df_clean[df_clean['sqftlot'] <  (2*df_clean['sqftlot'].std())+df_clean['sqftlot'].mean()]
    #These are irrelevant or highly covariant columns
    columns_to_drop = ['documentdate',
                       'excisetaxnbr',
                       'recordingnbr',
                       'volume',
                       'page',
                       'platnbr',
                       'plattype',
                       'platlot',
                       'platblock',
                        'sellername',
                        'buyername',
                        'streetname',
                        'streettype',
                        'directionsuffix',
                        'buildingnumber',
                        'major',
                        'minor',
                        'bldggradevar',
                        'sqfthalffloor',
                        'sqft2ndfloor',
                        'sqftupperfloor',
                        'sqftunfinfull',
                        'sqftunfinhalf',
                        'sqfttotbasement',
                        'sqftfinbasement',
                        'brickstone',
                        'viewutilization',
                        'propname',
                        'platname',
                        'platlot',
                        'platblock',
                        'range',
                        'township',
                        'section',
                        'quartersection',
                        'area',
                        'subarea',
                        'specarea',
                        'specsubarea',
                        'levycode',
                        'districtname',
                        'currentzoning',
                        'topography',
                        'currentusedesignation',
                        'salewarning',
                        'wetland',
                        'stream',
                        'seismichazard',
                        'landslidehazard',
                        'address',
                        'airportnoise',
                        'contamination',
                        'dnrlease',
                         'coalminehazard',
                         'criticaldrainage',
                         'erosionhazard',
                         'landfillbuffer',
                         'hundredyrfloodplain',
                         'steepslopehazard',
                         'speciesofconcern',
                         'sensitiveareatract',
                         'daylightbasement',
                         'fraction',
                        'directionprefix', 'proptype','unbuildable', 'bldgnbr', 'pcntcomplete']
    df_clean.drop(columns=columns_to_drop, inplace = True)
    #The columns with Y or N need to be 1 or 0 to model
    df_clean['othernuisances'] = [i.strip() for i in df_clean['othernuisances']]
    df_clean.replace(('Y', 'N'), (1, 0), inplace=True)
    
    #To model the houses that take up more space of thier plot (smaller yard) we need a ratio feature
    #We assume an acturate metric of the house's footprint is the first floor plus any attached garage. This 
    #unfortunatley may not account for detached garages
    df_clean['footprint_ratio']=(df_clean['sqft1stfloor']+df_clean['sqftgarageattached'])/df_clean['sqftlot']
    df_clean.drop(columns = 'sqft1stfloor', inplace = True)
    
    #nbrliving units is classified data telling us if it is a duplex. We want to remove triplexes and create a duplex 
    #flag column
    
    triplex = df_clean.loc[df_clean['nbrlivingunits'] == 3]
    df_clean.drop(triplex.index, inplace= True, axis=0)
    df_clean['duplex'] = df_clean['nbrlivingunits'] - 1
    df_clean.drop(columns = 'nbrlivingunits', inplace = True)

    return df_clean
    

In [4]:
def recursive_feature_selection(n_features,indep_variables_df, dep_var):
    """
    n_features = number of features to select
    indep_variables = pandas dataframe containing the features to select from
    dep_var = pandas dataframe containing the feature to model \
    returns a list of features to include in model to best fit line
    """
    lr = LinearRegression()
    select = RFE(lr, n_features_to_select=n_features)
    select = select.fit(indep_variables_df, y= dep_var.values.ravel())
    selected_columns = indep_variables_df.columns[select.support_]
    return selected_columns

In [5]:
def stepwise_selection(X, y, 
                       initial_list=[], 
                       threshold_in=0.01, 
                       threshold_out = 0.05, 
                       verbose=True):
    """ Perform a forward-backward feature selection 
    based on p-value from statsmodels.api.OLS
    Arguments:
        X - pandas.DataFrame with candidate features
        y - list-like with the target
        initial_list - list of features to start with (column names of X)
        threshold_in - include a feature if its p-value < threshold_in
        threshold_out - exclude a feature if its p-value > threshold_out
        verbose - whether to print the sequence of inclusions and exclusions
    Returns: list of selected features 
    Always set threshold_in < threshold_out to avoid infinite looping.
    See https://en.wikipedia.org/wiki/Stepwise_regression for the details
    """
    included = list(initial_list)
    while True:
        changed=False
        # forward step
        excluded = list(set(X.columns)-set(included))
        new_pval = pd.Series(index=excluded)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included+[new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed=True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # backward step
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        # use all coefs except intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max() # null if pvalues is empty
        if worst_pval > threshold_out:
            changed=True
            worst_feature = pvalues.argmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included

In [12]:
def zip_code_df(df):
    """
    This function produces a tuple with tuple[0] as a df with the one hot encoded zip code features and tuple[1] as 
    the list of zip code column names. 
    
    The df input should be the dataframe that is output by the "clean_data_initial" function (not a dataframe that 
    the "saleprice" column has been removed from.. this is because we drop rows that do not have a zipcode so we need to 
    keep the shape of the dependent and independent variable dataframes equal). 
    
    """
    #drop the sales that do not include a zip code. We use '98' here to find king county specific zip codes and 
    #we select only the first 5 digits of the zip code because some sales' zip codes have an extraneious 4 digits
    dropped_rows = df[df['zipcode'].str.contains ('98')]
    dropped_rows['zipcode'] = dropped_rows['zipcode'].map(lambda x: x[0:5])

    #use pd.Categorical and pd.get_dummies methods to one hot encode the zip codes
    dropped_rows['zipcode'] = pd.Categorical(dropped_rows['zipcode'])
    df_zip = pd.get_dummies(dropped_rows['zipcode'], prefix = 'zip')
    
    #drop one column from the zip code columns to address the inherent multicoliniearity
    df_zip.drop(columns = 'zip_98000', inplace = True) 
    
    #get a list of zipcode column names to include in model
    list_of_zips = df_zip.columns
    
    #join the zip code dataframe to the dataframe with the other predicitive features
    df_with_zip_cols = dropped_rows.join(df_zip, how = 'inner')
    df_with_zip_cols = df_with_zip_cols.drop(['zipcode'], axis=1)
    
    
    return df_with_zip_cols, list_of_zips

def make_zipcode_model(df_clean, list_of_baseline_features):
    #call zip_code_df function to produce zip code df and list of zipcodes
    zip_tuple = zip_code_df(df_clean)

    #add on total bath colum using previously used function
    df = engineer_total_baths(zip_tuple[0])
    
    #add on list of other baseline features to the zip code list to put into model
    list_of_features = list(zip_tuple[1])
    list_of_features.extend(list_of_baseline_features)
    
    #produce the model
    
    return make_housing_model(list_of_features, df, df['saleprice'])

In [6]:
df = pullsqldata()

In [7]:
df_clean = clean_data_intial(df)

## aaron stuff here

In [50]:
new_model = df_clean[['saleprice','sqfttotliving', 'sqftlot', 'footprint_ratio', 'duplex', 'wfntlocation', 'lakesammamish', 'seattleskyline', 'olympics', 'mtrainier', 'streetsurface', 'heatsource', 'zipcode']]

dropped_rows = new_model[new_model['zipcode'].str.contains ('98')]
dropped_rows['zipcode'] = dropped_rows['zipcode'].map(lambda x: x[0:5])

    #use pd.Categorical and pd.get_dummies methods to one hot encode the zip codes
dropped_rows['zipcode'] = pd.Categorical(dropped_rows['zipcode'])
df_zip = pd.get_dummies(dropped_rows['zipcode'], prefix = 'zip')
    
    #drop one column from the zip code columns to address the inherent multicoliniearity
df_zip.drop(columns = 'zip_98000', inplace = True)

df_with_zip_cols = dropped_rows.join(df_zip, how = 'inner')
df_with_zip_cols = df_with_zip_cols.drop(['zipcode'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [53]:
new_model_zips = df_with_zip_cols

In [54]:
Y = new_model_zips['saleprice']
X = new_model_zips.drop(['saleprice', 'sqftlot', 'duplex'], axis=1)
X_int = sm.add_constant(X)
model = sm.OLS(Y, X_int).fit()
model.summary()

0,1,2,3
Dep. Variable:,saleprice,R-squared:,0.716
Model:,OLS,Adj. R-squared:,0.714
Method:,Least Squares,F-statistic:,635.1
Date:,"Thu, 05 Dec 2019",Prob (F-statistic):,0.0
Time:,14:08:21,Log-Likelihood:,-309470.0
No. Observations:,22814,AIC:,619100.0
Df Residuals:,22723,BIC:,619800.0
Df Model:,90,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,7.347e+05,1.89e+05,3.888,0.000,3.64e+05,1.11e+06
sqfttotliving,188.8346,1.670,113.060,0.000,185.561,192.108
footprint_ratio,-1.095e+05,1.31e+04,-8.375,0.000,-1.35e+05,-8.38e+04
wfntlocation,2.118e+04,1524.659,13.895,0.000,1.82e+04,2.42e+04
lakesammamish,1.057e+05,8149.735,12.972,0.000,8.97e+04,1.22e+05
seattleskyline,3.599e+04,6980.470,5.156,0.000,2.23e+04,4.97e+04
olympics,6.576e+04,3430.496,19.169,0.000,5.9e+04,7.25e+04
mtrainier,2.287e+04,5702.765,4.011,0.000,1.17e+04,3.41e+04
streetsurface,8044.9473,7666.268,1.049,0.294,-6981.462,2.31e+04

0,1,2,3
Omnibus:,4026.043,Durbin-Watson:,1.532
Prob(Omnibus):,0.0,Jarque-Bera (JB):,96754.493
Skew:,0.1,Prob(JB):,0.0
Kurtosis:,13.087,Cond. No.,2940000.0


In [None]:
#engineer total baths
pi_1['bath_total_count']=pi_1['bathhalfcount']+pi_1['bath3qtrcount']+pi_1['bathfullcount']
pi_1.drop(columns = ['bathhalfcount','bath3qtrcount','bathfullcount'], inplace = True)

In [None]:
#engineer total porch space
pi_1['porch_sqft_total']=pi_1['sqftopenporch']+pi_1['sqftenclosedporch']
pi_1.drop(columns = ['sqftopenporch','sqftenclosedporch'], inplace = True)

In [None]:
#engineer age
pi_1['age']=2019 - pi_1['yrbuilt']
pi_1.drop(columns = ['yrbuilt'], inplace = True)

In [None]:
def make_housing_model(list_of_features, df, y):
    """
    
    """
    
    features = df[list_of_features]
    features = sm.add_constant(features)
    model = sm.OLS(y,features).fit()
    
    return model.summary() 
    

In [None]:
def check_feature_linearity(list_of_features, df, y):
    """
    """
    for column in list_of_features:
        plt.scatter(df[column],y, label=column, alpha = .05)
        plt.legend()
        plt.title(column)
        plt.show()

In [None]:


def check_feature_resid_dist(list_of_features, df, y):
    '''
    Visualizes the residiuals of a linear model in order to check the 
    assumptions. Shows both histogram of residual values and qq plot.
    
    !!!  Be sure to import scipy.stats as stats  !!!
    
    '''
    for feature in list_of_features:
        
        x = df[feature]
        x = sm.add_constant(x)
        model = sm.OLS(y,x).fit()
        pred_val = model.fittedvalues
        residuals = y.values - pred_val
        fig, ax = plt.subplots(1, 2, sharex=False, sharey=False)
        fig.set_size_inches(15,5)
        sns.distplot(residuals, ax = ax[0])
        sm.graphics.qqplot(residuals, dist=stats.norm, fit=True, line='45', ax = ax[1])
        fig.suptitle(feature)
        fig.show()
        
        

In [None]:
def check_feature_heteros(list_of_features, df, y):
    """
    Visualizes the heteroscedasticity of a linear model in order to check the 
    assumptions.
    """
    
    for feature in list_of_features:
        x = df[feature]
        x = sm.add_constant(x)
        model = sm.OLS(y,x).fit()
        fig = plt.figure(figsize=(15,8))

        fig = sm.graphics.plot_regress_exog(model, feature, fig=fig)
        plt.show()

# build a baseline model

# check baseline model assumptions

In [None]:
check_feature_linearity(baseline_features, x, y)

In [None]:
check_feature_resid_dist(baseline_features, x, y)

In [None]:
check_feature_heteros(baseline_features, x, y)

## Because The sqftlot seems to violoate assumption of linearity, lets remove it from the model

In [None]:
baseline_features = ['sqfttotliving','footprint_ratio','duplex']
make_housing_model(baseline_features, x, y)

## Include Zipcodes because they seem like a good predictor of house price

In [None]:
df_zip = df_clean[df_clean['zipcode'].str.contains ('98')]
num = '98075-8010'
df_zip['zipcode'] = df_clean['zipcode'].map(lambda x: x[0:5])

df_zip_test = df_zip[['saleprice','zipcode']]


df_zip['zipcode'] = pd.Categorical(df_zip['zipcode'])
df_zip_test = pd.get_dummies(df_zip['zipcode'], prefix = 'zip')

## What about total bathrooms?

In [None]:
#engineer total baths
x['bath_total_count']=x['bathhalfcount']+x['bath3qtrcount']+x['bathfullcount']
x.drop(columns = ['bathhalfcount','bath3qtrcount','bathfullcount'], inplace = True)

In [None]:
list_of_feat =['bath_total_count', 'sqfttotliving','footprint_ratio','duplex']
make_housing_model(list_of_feat,x,y)

In [None]:
check_feature_linearity(list_of_feat,x, y)
check_feature_resid_dist(list_of_feat,x, y)
check_feature_heteros(list_of_feat,x, y)

## What about age of house?

In [None]:
#engineer age:
x['age']=2019 - x['yrbuilt']
x.drop(columns = ['yrbuilt'], inplace = True)

In [None]:
list_of_feat =['bath_total_count','sqfttotliving','footprint_ratio','duplex', 'age']
make_housing_model(list_of_feat,x,y)

In [None]:
#age seems to add a notible boost in the model, lets test its assumptions:

In [None]:
check_feature_heteros(['age'],x,y)
check_feature_linearity(['age'],x,y)
check_feature_resid_dist(['age'],x,y)

## What about porches/decks?

In [None]:
#engineer total porch space
x['porch_sqft_total']=x['sqftopenporch']+x['sqftenclosedporch']
x.drop(columns = ['sqftopenporch','sqftenclosedporch'], inplace = True)

In [None]:
list_of_feat =['bath_total_count','sqfttotliving','footprint_ratio','duplex','age', 'porch_sqft_total']
make_housing_model(list_of_feat,x,y)

In [None]:
dropped_rows = x[x['zipcode'].str.contains ('98')]
dropped_rows['zipcode'] = dropped_rows['zipcode'].map(lambda x: x[0:5])


dropped_rows['zipcode'] = pd.Categorical(dropped_rows['zipcode'])
df_zip = pd.get_dummies(dropped_rows['zipcode'], prefix = 'zip')

df_with_zip_cols = dropped_rows.join(df_zip, how = 'inner')




In [None]:


#engineer total porch space
df_with_zip_cols['porch_sqft_total']=df_with_zip_cols['sqftopenporch']+df_with_zip_cols['sqftenclosedporch']
df_with_zip_cols.drop(columns = ['sqftopenporch','sqftenclosedporch'], inplace = True)

#engineer age:
df_with_zip_cols['age']=2019 - df_with_zip_cols['yrbuilt']
df_with_zip_cols.drop(columns = ['yrbuilt'], inplace = True)

#engineer total baths
df_with_zip_cols['bath_total_count']=df_with_zip_cols['bathhalfcount']+df_with_zip_cols['bath3qtrcount']+df_with_zip_cols['bathfullcount']
df_with_zip_cols.drop(columns = ['bathhalfcount','bath3qtrcount','bathfullcount'], inplace = True)



In [None]:
list_of_feat =['bath_total_count','sqfttotliving','footprint_ratio','duplex','age', 'porch_sqft_total']
list_of_feat.extend(df_zip.columns)

In [None]:
make_housing_model(list_of_feat,df_with_zip_cols,df_with_zip_cols['saleprice'])