In [25]:
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
engine = create_engine("postgresql:///kc_housing")
import statsmodels.api as sm
from statsmodels.formula.api import ols
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from scipy.stats import pearsonr

In [2]:
def pullsqldata():
    """This function pulls the necessary columns and rows from the PostGRES DB into a Pandas Dataframe in order 
    to continue with our EDA """
    
    engine = create_engine("postgresql:///kc_housing")
    query = """
                SELECT *
                FROM rpsale AS s
                INNER JOIN resbldg AS b ON CONCAT(s.Major,s.Minor) = CONCAT(b.Major, b.Minor)
                INNER JOIN parcel AS p ON CONCAT(s.Major,s.Minor) = CONCAT(p.Major,p.Minor)
                WHERE EXTRACT(YEAR FROM CAST(documentdate AS DATE)) = 2018
                    AND p.proptype = 'R'
                ;"""
    kc_df = pd.read_sql(sql = query, con = engine)
    return kc_df

In [3]:
def clean_data_initial(df):
    """ This function cleans the housing data by removing outliers and sale price < 10000
    """
    df_clean = df[(df['saleprice']>10000) & (df['saleprice'] <  (2*df['saleprice'].std())+df['saleprice'].mean())]
    return df_clean

In [4]:
df = pullsqldata()
df = clean_data_initial(df)

In [5]:
df_clean = df.drop(['documentdate', 
                               'excisetaxnbr', 
                               'recordingnbr', 
                               'volume',
                               'page',
                               'platnbr',
                               'plattype',
                               'platlot',
                               'platblock',
'sellername',
'buyername',
'streetname',
'streettype',
'directionsuffix',
'zipcode',
'buildingnumber',
'major',
'minor',
'bldggradevar',
'sqfthalffloor',
'sqft2ndfloor',
'sqftupperfloor',
'sqftunfinfull',
'sqftunfinhalf',
'sqfttotbasement',
'sqftfinbasement',
'brickstone',
'viewutilization',
'propname',
'platname',
'platlot',
'platblock',
'range',
'township',
'section',
                               
'quartersection',
'area',
'subarea',
'specarea',
'specsubarea',
'levycode',
'districtname',
'currentzoning',
'topography',
'currentusedesignation',
'salewarning', 
'wetland', 
'stream',
'seismichazard',
'landslidehazard',
'address', 
'airportnoise',
'contamination',
'dnrlease',
 'coalminehazard',
 'criticaldrainage',
 'erosionhazard',
 'landfillbuffer',
 'hundredyrfloodplain',
 'steepslopehazard',
 'speciesofconcern',
 'sensitiveareatract',
 'daylightbasement',
 'fraction',
'directionprefix', 'proptype','unbuildable'], axis=1)

In [6]:
df_clean['othernuisances'] = [x.strip() for x in df_clean['othernuisances']]
df_clean.replace(('Y', 'N'), (1, 0), inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


In [7]:
df_clean['logprice'] = np.log(df_clean['saleprice'])

Y = df_clean['logprice']
X = df_clean.drop(['logprice','saleprice'], axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [8]:
mlr = LinearRegression()

In [10]:
select = RFE(mlr, n_features_to_select=50)
select = select.fit(X, y= Y.values.ravel())
selected_columns = X.columns[select.support_]

In [11]:
X_int = sm.add_constant(X)
model = sm.OLS(Y, X_int).fit()
model.summary()


  return ptp(axis=axis, out=out, **kwargs)


0,1,2,3
Dep. Variable:,logprice,R-squared:,0.458
Model:,OLS,Adj. R-squared:,0.456
Method:,Least Squares,F-statistic:,291.3
Date:,"Wed, 04 Dec 2019",Prob (F-statistic):,0.0
Time:,07:56:56,Log-Likelihood:,-15019.0
No. Observations:,28747,AIC:,30210.0
Df Residuals:,28663,BIC:,30900.0
Df Model:,83,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,18.2833,0.343,53.278,0.000,17.611,18.956
propertytype,-0.0037,0.000,-9.935,0.000,-0.004,-0.003
principaluse,-0.1208,0.032,-3.775,0.000,-0.184,-0.058
saleinstrument,-0.0300,0.001,-39.954,0.000,-0.031,-0.029
afforestland,-0.0114,0.156,-0.073,0.942,-0.317,0.294
afcurrentuseland,0.0246,0.091,0.269,0.788,-0.155,0.204
afnonprofituse,-0.2032,0.236,-0.860,0.390,-0.666,0.260
afhistoricproperty,-0.3174,0.409,-0.777,0.437,-1.119,0.484
salereason,-0.0147,0.001,-20.864,0.000,-0.016,-0.013

0,1,2,3
Omnibus:,9173.412,Durbin-Watson:,1.208
Prob(Omnibus):,0.0,Jarque-Bera (JB):,98901.743
Skew:,-1.222,Prob(JB):,0.0
Kurtosis:,11.752,Cond. No.,10200000.0


already deleted


In [17]:
#this will return a list of columns which correlate to something other than themselves, saleprice can be ignored

corr = df_clean.corr() > .50
corr_list = []
for col in corr.columns:
    if corr[col].sum() > 1:
        print(col)
        corr_list.append(col)

saleprice
bldggrade
sqft1stfloor
sqfttotliving
bedrooms
bathfullcount
fpmultistory
fpadditional
yrbuilt
olympics
territorial
pugetsound
smalllakerivercreek
wfntlocation
wfntfootage
wfntbank
wfntrestrictedaccess
tidelandshoreland


In [35]:

print("how dropping each element would affect the r_squared values" )
for element in corr_list:
    Y1 = df_clean['saleprice']
    X1 = df_clean.drop(['saleprice', element], axis=1)
    reg = LinearRegression().fit(X1, Y1)
    print(f"dropped {element}:", reg.score(X1, Y1))



how dropping each element would affect the r_squared values
dropped saleprice: 0.514141153463175


Unnamed: 0,propertytype,principaluse,saleinstrument,afforestland,afcurrentuseland,afnonprofituse,afhistoricproperty,salereason,propertyclass,bldgnbr,...,adjacentgreenbelt,historicsite,nativegrowthprotesmt,easements,otherdesignation,deedrestrictions,developmentrightspurch,waterproblems,transpconcurrency,otherproblems
0,3.0,6.0,3.0,0,0,0,0,1.0,8.0,1.0,...,0,0.0,0,0,0,0,0,0,0,0
3,3.0,6.0,3.0,0,0,0,0,18.0,8.0,1.0,...,0,0.0,0,0,0,0,0,0,0,0
6,11.0,6.0,3.0,0,0,0,0,1.0,8.0,1.0,...,0,0.0,0,0,0,0,0,0,0,0
7,11.0,6.0,3.0,0,0,0,0,1.0,8.0,1.0,...,0,0.0,0,0,0,0,0,0,0,0
8,1.0,6.0,3.0,0,0,0,0,1.0,7.0,1.0,...,0,0.0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43732,3.0,6.0,3.0,0,0,0,0,1.0,8.0,1.0,...,0,0.0,0,0,0,0,0,0,0,0
43735,12.0,6.0,3.0,0,0,0,0,1.0,8.0,1.0,...,0,0.0,0,0,0,0,0,0,0,0
43736,12.0,6.0,3.0,0,0,0,0,1.0,8.0,2.0,...,0,0.0,0,0,0,0,0,0,0,0
43737,2.0,6.0,3.0,0,0,0,0,1.0,7.0,1.0,...,0,0.0,0,0,0,0,0,0,0,0


In [79]:
# determine highest correlation factor
from scipy.stats import pearsonr



[310000.0,
 500000.0,
 456800.0,
 461000.0,
 80000.0,
 795000.0,
 440000.0,
 520000.0,
 166166.0,
 630000.0,
 97500.0,
 300000.0,
 599000.0,
 345000.0,
 355000.0,
 393500.0,
 817500.0,
 1115000.0,
 679000.0,
 705000.0,
 669000.0,
 680000.0,
 722100.0,
 785000.0,
 835000.0,
 730000.0,
 1200000.0,
 1120000.0,
 1064000.0,
 1191000.0,
 1148500.0,
 1183000.0,
 950000.0,
 1190000.0,
 409950.0,
 440000.0,
 455000.0,
 365000.0,
 370000.0,
 306000.0,
 302000.0,
 395000.0,
 259000.0,
 171000.0,
 855360.0,
 860000.0,
 833000.0,
 747000.0,
 800000.0,
 772000.0,
 815000.0,
 911000.0,
 1019000.0,
 1208000.0,
 1200000.0,
 1000000.0,
 755000.0,
 453500.0,
 767000.0,
 499000.0,
 605000.0,
 495500.0,
 490000.0,
 155000.0,
 455000.0,
 360000.0,
 173000.0,
 385000.0,
 286250.0,
 290000.0,
 470000.0,
 492000.0,
 300000.0,
 440000.0,
 490000.0,
 988800.0,
 920000.0,
 588000.0,
 1714340.0,
 850000.0,
 772000.0,
 625000.0,
 659500.0,
 520000.0,
 865000.0,
 710000.0,
 365000.0,
 649950.0,
 599950.0,
 699950.0,

In [30]:



def find_highest_correlation(df, dep ='saleprice'):
    try:
        X = df.drop([dep], axis = 1)
    except:
        X = df
    Y = df[dep]
    
    num = 0
    for element in X.columns:
        a = pearsonr(list(X[element]),list(Y))
        if np.abs(a[0]) > num:
            num = a[0]
    for element in X.columns:
        a = pearsonr(list(X[element]),list(Y))
        if np.abs(a[0]) == num:
            return element
        


        

In [29]:
find_highest_correlation(df_clean)

'bldggrade'

In [84]:
def check_log_corr(df, dependent = 'saleprice'):
    """this function will take in a dataframe and return a list of independent variables where the log of the function
    returns a higher r_squared value than the function itself"""
    dep = df[dependent]
    Xreg = df.drop(dependent, axis = 1)
    log_list = []
    for col in Xreg.columns:
        if all(value > 1 for value in df[col]):
            print(col)
#             X = np.array(df[element]).reshape(-1,1)      
#             reg1 = LinearRegression().fit(X, dep)
            
#             Xlog = np.log(X)
#             reg2 = LinearRegression().fit(Xlog, dep)
#             print(reg1, reg2)
            
        
df_clean['saleprice']
check_log_corr(df_clean)


principaluse
bldggrade
sqfttotliving
yrbuilt
presentuse
sqftlot


In [87]:
df_logtester = df_clean[['saleprice', 'sqftlot', 'sqfttotliving', 'bldggrade']]
df_logtester['loglot'] = np.log(df['sqftlot'])
df_logtester['loglot'] = np.log(df['sqfttotliving'])

Y = df_logtester['saleprice']

reg = LinearRegression().fit(X1, Y)
r_squared_list.append((reg.score(X1, Y1), element))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [61]:
x1 = find_highest_correlation(df_clean)

r_squared_list = []
for element in df_clean:
    X1 = np.array(df_clean[element]).reshape(-1,1)      
    reg = LinearRegression().fit(X1, Y1)
    r_squared_list.append((reg.score(X1, Y1), element))


In [88]:
r_squared_list.sort()
r_sorted = r_squared_list[::-1]
r_sorted

[(1.0, 'saleprice'),
 (0.34215893473348286, 'bldggrade'),
 (0.2943929991946812, 'sqfttotliving'),
 (0.09513145060772099, 'stories'),
 (0.08804165718586376, 'bathfullcount'),
 (0.07465208896328733, 'bedrooms'),
 (0.07325784430673132, 'sqft1stfloor'),
 (0.06048072202771837, 'sqftopenporch'),
 (0.046115911064835036, 'territorial'),
 (0.04356462197492705, 'finbasementgrade'),
 (0.03838073310780421, 'bath3qtrcount'),
 (0.03046862813178075, 'heatsystem'),
 (0.027362973055430336, 'bathhalfcount'),
 (0.026011840734576186, 'fpmultistory'),
 (0.021603966125533502, 'olympics'),
 (0.021467886821468607, 'lakewashington'),
 (0.01966010592978429, 'sqftgarageattached'),
 (0.019394854556952112, 'saleinstrument'),
 (0.018273238424683536, 'addnlcost'),
 (0.017576911209486235, 'yrbuilt'),
 (0.014650397458494169, 'fpsinglestory'),
 (0.013476416368119581, 'pugetsound'),
 (0.011268307432011238, 'salereason'),
 (0.01118679181430493, 'lakesammamish'),
 (0.010079084772140257, 'seattleskyline'),
 (0.007180968850

In [74]:
y1 = 'saleprice'
x1 = 'bldggrade'

run_list = r_sorted[2:]

for item in run_list[1]:
    
    

[(0.2943929991946812, 'sqfttotliving'),
 (0.09513145060772099, 'stories'),
 (0.08804165718586376, 'bathfullcount'),
 (0.07465208896328733, 'bedrooms'),
 (0.07325784430673132, 'sqft1stfloor'),
 (0.06048072202771837, 'sqftopenporch'),
 (0.046115911064835036, 'territorial'),
 (0.04356462197492705, 'finbasementgrade'),
 (0.03838073310780421, 'bath3qtrcount'),
 (0.03046862813178075, 'heatsystem'),
 (0.027362973055430336, 'bathhalfcount'),
 (0.026011840734576186, 'fpmultistory'),
 (0.021603966125533502, 'olympics'),
 (0.021467886821468607, 'lakewashington'),
 (0.01966010592978429, 'sqftgarageattached'),
 (0.019394854556952112, 'saleinstrument'),
 (0.018273238424683536, 'addnlcost'),
 (0.017576911209486235, 'yrbuilt'),
 (0.014650397458494169, 'fpsinglestory'),
 (0.013476416368119581, 'pugetsound'),
 (0.011268307432011238, 'salereason'),
 (0.01118679181430493, 'lakesammamish'),
 (0.010079084772140257, 'seattleskyline'),
 (0.007180968850249547, 'adjacentgreenbelt'),
 (0.0070250580191529854, 'ca