In [167]:
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
engine = create_engine("postgresql:///kc_housing")
import statsmodels.api as sm
from statsmodels.formula.api import ols
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

In [168]:
def pullsqldata():
    """This function pulls the necessary columns and rows from the PostGRES DB into a Pandas Dataframe in order 
    to continue with our EDA """
    
    engine = create_engine("postgresql:///kc_housing")
    query = """
                SELECT *
                FROM rpsale AS s
                INNER JOIN resbldg AS b ON CONCAT(s.Major,s.Minor) = CONCAT(b.Major, b.Minor)
                INNER JOIN parcel AS p ON CONCAT(s.Major,s.Minor) = CONCAT(p.Major,p.Minor)
                WHERE EXTRACT(YEAR FROM CAST(documentdate AS DATE)) = 2018
                    AND p.proptype = 'R'
                ;"""
    kc_df = pd.read_sql(sql = query, con = engine)
    return kc_df

In [169]:
df = pullsqldata()

In [4]:
def clean_data_initial(df):
    """ This function cleans the housing data by removing outliers and sale price < 10000
    """
    df_clean = df[(df['saleprice']>10000) & (df['saleprice'] <  (2*df['saleprice'].std())+df['saleprice'].mean())]
    return df_clean

In [5]:
df_clean = clean_data_initial(df)

In [125]:
df_clean_drop = df_clean.drop(['documentdate', 
                               'excisetaxnbr', 
                               'recordingnbr', 
                               'volume',
                               'page',
                               'platnbr',
                               'plattype',
                               'platlot',
                               'platblock',
'sellername',
'buyername',
'streetname',
'streettype',
'directionsuffix',
'zipcode',
'buildingnumber',
'major',
'minor',
'bldggradevar',
'sqfthalffloor',
'sqft2ndfloor',
'sqftupperfloor',
'sqftunfinfull',
'sqftunfinhalf',
'sqfttotbasement',
'sqftfinbasement',
'brickstone',
'viewutilization',
'propname',
'platname',
'platlot',
'platblock',
'range',
'township',
'section',
'quartersection',
'area',
'subarea',
'specarea',
'specsubarea',
'levycode',
'districtname',
'currentzoning',
'topography',
'currentusedesignation',
'salewarning', 
'wetland', 
'stream',
'seismichazard',
'landslidehazard',
'address', 
'airportnoise',
'contamination',
'dnrlease',
 'coalminehazard',
 'criticaldrainage',
 'erosionhazard',
 'landfillbuffer',
 'hundredyrfloodplain',
 'steepslopehazard',
 'speciesofconcern',
 'sensitiveareatract',
 'daylightbasement',
 'fraction',
'directionprefix', 'proptype','unbuildable'], axis=1)

In [None]:
# strips whitespace from othernuisances column, then converts Y to 1 and N to 0
df_clean['othernuisances'] = [x.strip() for x in df_clean['othernuisances']]
df_clean.replace(('Y', 'N'), (1, 0), inplace=True)


In [77]:
Y = df_clean['saleprice']
X = df_clean.drop(['saleprice'], axis=1)


In [9]:
lr = LinearRegression()

In [107]:
# worked to strip whitespace
# X['othernuisances'] = [x.strip() for x in X['othernuisances']]

# I dont think the below worked
# pd.Series(map(lambda x: dict(Y=1, N=0)[x],
#               X.othernuisances.values.tolist()), X.index)


In [117]:
select = RFE(lr, n_features_to_select=50)
select = select.fit(X, y= Y.values.ravel())
selected_columns = X.columns[select.support_]

In [119]:
selected_columns

Index(['principaluse', 'saleinstrument', 'afforestland', 'afcurrentuseland',
       'afnonprofituse', 'afhistoricproperty', 'bldgnbr', 'nbrlivingunits',
       'stories', 'bldggrade', 'finbasementgrade', 'heatsource',
       'bathhalfcount', 'bath3qtrcount', 'bathfullcount', 'fpsinglestory',
       'fpmultistory', 'pcntnetcondition', 'hbuasifvacant', 'hbuasimproved',
       'watersystem', 'sewersystem', 'restrictiveszshape', 'inadequateparking',
       'mtrainier', 'olympics', 'territorial', 'seattleskyline', 'pugetsound',
       'lakewashington', 'lakesammamish', 'smalllakerivercreek', 'wfntbank',
       'wfntrestrictedaccess', 'wfntaccessrights', 'wfntproximityinfluence',
       'powerlines', 'othernuisances', 'nbrbldgsites', 'adjacentgolffairway',
       'adjacentgreenbelt', 'historicsite', 'nativegrowthprotesmt',
       'easements', 'otherdesignation', 'deedrestrictions',
       'developmentrightspurch', 'waterproblems', 'transpconcurrency',
       'otherproblems'],
      dtype='ob

In [134]:
X_int = sm.add_constant(X)
model = sm.OLS(Y, X_int).fit()
model.summary()

0,1,2,3
Dep. Variable:,saleprice,R-squared:,0.514
Model:,OLS,Adj. R-squared:,0.513
Method:,Least Squares,F-statistic:,365.4
Date:,"Tue, 03 Dec 2019",Prob (F-statistic):,0.0
Time:,16:39:31,Log-Likelihood:,-398760.0
No. Observations:,28747,AIC:,797700.0
Df Residuals:,28663,BIC:,798400.0
Df Model:,83,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.417e+06,2.15e+05,25.168,0.000,4.99e+06,5.84e+06
propertytype,-1851.1482,233.065,-7.943,0.000,-2307.966,-1394.330
principaluse,-1.035e+05,2.01e+04,-5.158,0.000,-1.43e+05,-6.42e+04
saleinstrument,-1.198e+04,470.436,-25.474,0.000,-1.29e+04,-1.11e+04
afforestland,4.827e+04,9.78e+04,0.493,0.622,-1.44e+05,2.4e+05
afcurrentuseland,-7.967e+04,5.74e+04,-1.389,0.165,-1.92e+05,3.28e+04
afnonprofituse,-1.893e+05,1.48e+05,-1.277,0.201,-4.8e+05,1.01e+05
afhistoricproperty,-1.408e+05,2.56e+05,-0.549,0.583,-6.43e+05,3.62e+05
salereason,-4014.9325,440.683,-9.111,0.000,-4878.693,-3151.172

0,1,2,3
Omnibus:,3541.003,Durbin-Watson:,1.108
Prob(Omnibus):,0.0,Jarque-Bera (JB):,12954.898
Skew:,0.598,Prob(JB):,0.0
Kurtosis:,6.064,Cond. No.,10200000.0


In [166]:
# sns.heatmap(X.corr())
# X.corr() > .75
## looks for things with a correlation of > 0.5
corr = X.corr() > .55
corr_list = []
for col in corr.columns:
    if corr[col].sum() > 1:
        print(col)
        corr_list.append(col)

bldggrade
sqft1stfloor
sqfttotliving
bedrooms
bathfullcount
fpmultistory
fpadditional
olympics
pugetsound
smalllakerivercreek
wfntlocation
wfntfootage
wfntbank
wfntrestrictedaccess
tidelandshoreland


In [160]:
corr_list

['bldggrade',
 'sqft1stfloor',
 'sqfttotliving',
 'bedrooms',
 'bathfullcount',
 'fpmultistory',
 'fpadditional',
 'yrbuilt',
 'olympics',
 'territorial',
 'pugetsound',
 'smalllakerivercreek',
 'wfntlocation',
 'wfntfootage',
 'wfntbank',
 'wfntrestrictedaccess',
 'tidelandshoreland']

In [165]:
# select = RFE(lr, n_features_to_select=50)
# select = select.fit(X, y= Y.values.ravel())
# selected_columns = X.columns[select.support_]

X_corr_list = X[corr_list]

X_int = sm.add_constant(X_corr_list)
model = sm.OLS(Y, X_int).fit()
model.summary()

0,1,2,3
Dep. Variable:,saleprice,R-squared:,0.002
Model:,OLS,Adj. R-squared:,0.002
Method:,Least Squares,F-statistic:,24.11
Date:,"Tue, 03 Dec 2019",Prob (F-statistic):,3.44e-11
Time:,16:52:31,Log-Likelihood:,-409110.0
No. Observations:,28747,AIC:,818200.0
Df Residuals:,28744,BIC:,818300.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,7.147e+05,2177.072,328.268,0.000,7.1e+05,7.19e+05
wfntlocation,-8907.6095,4155.786,-2.143,0.032,-1.71e+04,-762.076
wfntbank,1.01e+05,1.72e+04,5.885,0.000,6.74e+04,1.35e+05

0,1,2,3
Omnibus:,4214.19,Durbin-Watson:,0.83
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6504.918
Skew:,1.045,Prob(JB):,0.0
Kurtosis:,4.03,Cond. No.,8.17
