# IMPORT PACKAGES AND DATA

In [158]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats

In [159]:
# Read the training and test tables from CSV files in the working directory.
trn, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# VIEW MISSINGNESS

In [160]:
# Missing-value summary for the training frame.
# 'Total' counts the nulls in each column; 'Percent' is that count divided by
# the number of rows, i.e. a fraction rather than a value scaled to 100.
total = trn.isnull().sum().sort_values(ascending=False)
percent = (
    trn.isnull().sum()
    .div(trn.isnull().count())
    .sort_values(ascending=False)
)
missing_data = pd.concat([total, percent], keys=['Total', 'Percent'], axis=1)
# Show only the columns that contain at least one null.
missing_data.loc[missing_data['Total'] > 0]

Unnamed: 0,Total,Percent
PoolQC,1453,0.995205
MiscFeature,1406,0.963014
Alley,1369,0.937671
Fence,1179,0.807534
FireplaceQu,690,0.472603
LotFrontage,259,0.177397
GarageCond,81,0.055479
GarageType,81,0.055479
GarageYrBlt,81,0.055479
GarageFinish,81,0.055479


# DATA CLEANING

In [163]:
# Drop the Utilities column from both frames.
trn, test = (frame.drop('Utilities', axis=1) for frame in (trn, test))

In [164]:
# Drop the PoolQC column from both frames.
trn, test = (frame.drop('PoolQC', axis=1) for frame in (trn, test))

In [165]:
# Drop MiscFeature from both frames; the author considers MiscVal the better
# carrier of that information.
trn, test = (frame.drop('MiscFeature', axis=1) for frame in (trn, test))

In [166]:
# Alley: turn a missing value into its own separate category, 'None'.
for frame in (trn, test):
    frame.loc[frame['Alley'].isnull(), 'Alley'] = 'None'

In [167]:
# Drop the Fence column from both frames.
trn, test = (frame.drop('Fence', axis=1) for frame in (trn, test))

In [168]:
# HeatingQC: recode the quality labels as an ordinal integer scale
# (Po=1, Fa=2, TA=3, Gd=4, Ex=5).
heating_scale = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
trn = trn.replace({"HeatingQC": heating_scale})
test = test.replace({"HeatingQC": heating_scale})
# replace() on a text column can leave an object-dtype column that merely holds
# integers (recent pandas no longer downcasts silently). The later
# pd.get_dummies call would then one-hot encode it instead of keeping it as a
# numeric feature, so convert explicitly. This is a no-op when the column is
# already numeric.
trn["HeatingQC"] = trn["HeatingQC"].infer_objects()
test["HeatingQC"] = test["HeatingQC"].infer_objects()

In [169]:
# ExterQual and ExterCond: recode the quality labels as an ordinal integer
# scale (Po=1, Fa=2, TA=3, Gd=4, Ex=5).
exterior_scale = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
for exterior_col in ("ExterQual", "ExterCond"):
    trn = trn.replace({exterior_col: exterior_scale})
    test = test.replace({exterior_col: exterior_scale})
    # replace() on a text column can leave an object-dtype column that merely
    # holds integers (recent pandas no longer downcasts silently); the later
    # pd.get_dummies call would then one-hot encode it. Convert explicitly;
    # this is a no-op when the column is already numeric.
    trn[exterior_col] = trn[exterior_col].infer_objects()
    test[exterior_col] = test[exterior_col].infer_objects()

In [170]:
# FireplaceQu: ordinal integer scale (Po=1 ... Ex=5), with missing values
# recorded as 0.
fireplace_scale = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
trn = trn.replace({"FireplaceQu": fireplace_scale})
test = test.replace({"FireplaceQu": fireplace_scale})
# infer_objects() makes the numeric dtype explicit: recent pandas no longer
# downcasts object columns inside replace()/fillna(), and an object-dtype
# column would be one-hot encoded by the later pd.get_dummies call. It is a
# no-op when the column is already numeric.
trn['FireplaceQu'] = trn['FireplaceQu'].fillna(0).infer_objects()
test['FireplaceQu'] = test['FireplaceQu'].fillna(0).infer_objects()

In [171]:
# KitchenQual: ordinal integer scale (Po=1 ... Ex=5), with missing values
# recorded as 0.
kitchen_scale = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
trn = trn.replace({"KitchenQual": kitchen_scale})
test = test.replace({"KitchenQual": kitchen_scale})
# infer_objects() makes the numeric dtype explicit: recent pandas no longer
# downcasts object columns inside replace()/fillna(), and an object-dtype
# column would be one-hot encoded by the later pd.get_dummies call. It is a
# no-op when the column is already numeric.
trn['KitchenQual'] = trn['KitchenQual'].fillna(0).infer_objects()
test['KitchenQual'] = test['KitchenQual'].fillna(0).infer_objects()

In [172]:
# LotFrontage: a missing frontage is imputed as 0 in both frames.
for frame in (trn, test):
    frame['LotFrontage'] = frame['LotFrontage'].fillna(0)

In [173]:
# MSSubClass and MoSold are stored as integers but are used as categories:
# relabel them with strings ("SC<code>" and three-letter month names).
subclass_codes = (20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 150, 160, 180, 190)
month_labels = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
category_labels = {
    "MSSubClass": {code: "SC%d" % code for code in subclass_codes},
    # month number (1-12) -> month abbreviation
    "MoSold": dict(enumerate(month_labels, start=1)),
}

# The same mapping is applied to both frames.
trn = trn.replace(category_labels)
test = test.replace(category_labels)

In [174]:
# Garage variables

# GarageType: a missing value becomes the explicit category 'None'.
for frame in (trn, test):
    frame.loc[frame['GarageType'].isnull(), 'GarageType'] = 'None'

# GarageFinish, GarageCond and GarageQual become ordinal integers, with
# missing values recorded as 0.
garage_quality_scale = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
garage_scales = (
    ("GarageFinish", {"Unf": 1, "RFn": 2, "Fin": 3}),
    ("GarageCond", garage_quality_scale),
    ("GarageQual", garage_quality_scale),
)
for garage_col, scale in garage_scales:
    trn = trn.replace({garage_col: scale})
    test = test.replace({garage_col: scale})
    # infer_objects() makes the numeric dtype explicit: recent pandas no longer
    # downcasts object columns inside replace()/fillna(), and an object-dtype
    # column would be one-hot encoded by the later pd.get_dummies call. It is
    # a no-op when the column is already numeric.
    trn[garage_col] = trn[garage_col].fillna(0).infer_objects()
    test[garage_col] = test[garage_col].fillna(0).infer_objects()

# Drop GarageYrBlt; the author assumes it is reflected in GarageQual.
trn = trn.drop(['GarageYrBlt'], axis=1)
test = test.drop(['GarageYrBlt'], axis=1)


In [175]:
# Basement variables:

# BsmtFinType1 / BsmtFinType2 are dropped: no encoding was settled on for them.
trn = trn.drop(['BsmtFinType1', 'BsmtFinType2'], axis=1)
test = test.drop(['BsmtFinType1', 'BsmtFinType2'], axis=1)

# BsmtQual, BsmtCond and BsmtExposure become ordinal integers, with missing
# values recorded as 0.
basement_quality_scale = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
basement_scales = (
    ("BsmtQual", basement_quality_scale),
    ("BsmtCond", basement_quality_scale),
    ("BsmtExposure", {"No": 1, "Mn": 2, "Av": 3, "Gd": 4}),
)
for basement_col, scale in basement_scales:
    trn = trn.replace({basement_col: scale})
    test = test.replace({basement_col: scale})
    # infer_objects() makes the numeric dtype explicit: recent pandas no longer
    # downcasts object columns inside replace()/fillna(), and an object-dtype
    # column would be one-hot encoded by the later pd.get_dummies call. It is
    # a no-op when the column is already numeric.
    trn[basement_col] = trn[basement_col].fillna(0).infer_objects()
    test[basement_col] = test[basement_col].fillna(0).infer_objects()

In [176]:
# Masonry veneer variables:
# a missing MasVnrType becomes the category 'None' and a missing MasVnrArea
# becomes 0, in both frames.
for frame in (trn, test):
    frame.loc[frame['MasVnrType'].isnull(), 'MasVnrType'] = 'None'
    frame.loc[frame['MasVnrArea'].isnull(), 'MasVnrArea'] = 0

In [177]:
# Electrical: fill the missing training value(s) with 'SBrkr', which the author
# identified as the most common category by a comfortable margin.
trn['Electrical'] = trn['Electrical'].fillna('SBrkr')

# MISSING ONLY IN TEST SET

In [180]:
# Null count per column of the test frame, largest first.
total = test.isnull().sum().sort_values(ascending=False)
missing_data = total.to_frame(name='Total')
# Show only the columns that still contain at least one null.
missing_data.loc[missing_data['Total'] > 0]

Unnamed: 0,Total
MSZoning,4
Functional,2
BsmtFullBath,2
BsmtHalfBath,2
GarageArea,1
BsmtFinSF2,1
BsmtUnfSF,1
TotalBsmtSF,1
SaleType,1
Exterior2nd,1


In [181]:
# ASSUMPTION: a missing MSZoning in the test frame is 'RL'.
test['MSZoning'] = test['MSZoning'].fillna('RL')

In [182]:
# ASSUMPTION: a missing Functional in the test frame is 'Typ'.
test['Functional'] = test['Functional'].fillna('Typ')

In [183]:
# ASSUMPTION: a missing basement bath count in the test frame means 0 baths.
for bath_col in ('BsmtFullBath', 'BsmtHalfBath'):
    test[bath_col] = test[bath_col].fillna(0.0)

In [184]:
# Basement square-footage columns: fill missing test values with 0.
# The author's finding was that these should be 0, based on the BsmtQual of
# the affected rows. TODO: re-verify that finding if the input data changes.
# The BsmtQual lookups that used to sit between the fills were mid-cell
# expressions whose results were discarded (never displayed), so they are
# omitted here.
for sf_col in ('TotalBsmtSF', 'BsmtFinSF2', 'BsmtFinSF1', 'BsmtUnfSF'):
    test.loc[test[sf_col].isnull(), sf_col] = 0

In [185]:
# ASSUMPTION: a missing SaleType in the test frame is 'WD', which the author
# takes to be the most common value.
test['SaleType'] = test['SaleType'].fillna('WD')

In [186]:
# Garage capacity and area: the author's finding is that the missing test
# values should be 0.
for garage_col in ('GarageCars', 'GarageArea'):
    test[garage_col] = test[garage_col].fillna(0)

In [187]:
# ASSUMPTION: a missing exterior covering in the test frame is 'VinylSd',
# which the author takes to be the most common value.
# The value_counts() call that preceded the fills was a mid-cell expression
# whose result was discarded (never displayed), so it is omitted here. Run
# test.Exterior2nd.value_counts() in its own cell to inspect the distribution.
for exterior_col in ('Exterior1st', 'Exterior2nd'):
    test.loc[test[exterior_col].isnull(), exterior_col] = 'VinylSd'

# FEATURE ENGINEERING

In [190]:
# TotalSF: basement + first-floor + second-floor square footage.
for frame in (trn, test):
    frame['TotalSF'] = (
        frame['TotalBsmtSF']
        .add(frame['1stFlrSF'])
        .add(frame['2ndFlrSF'])
    )

In [193]:
# One-hot encode the remaining categorical columns. Each frame is encoded on
# its own, so the two results can end up with different dummy columns.
trn, test = (pd.get_dummies(frame) for frame in (trn, test))

In [211]:
# Remove the dummy variables that appear in only one of the two frames, so that
# both end up with the same feature columns. SalePrice exists only in trn
# (presumably the prediction target) and is deliberately kept.
trn_only = set(trn.columns) - set(test.columns) - {'SalePrice'}
test_only = set(test.columns) - set(trn.columns)
# Build the drop lists in column order rather than set order: set iteration
# order over strings varies between interpreter runs, which made the contents
# of dropvars1/dropvars2 print in a different order on every run.
dropvars1 = [col for col in trn.columns if col in trn_only]
dropvars2 = [col for col in test.columns if col in test_only]
trn = trn.drop(dropvars1, axis=1)
test = test.drop(dropvars2, axis=1)
# Put the test columns in the same order as the training columns, so that
# positional consumers (e.g. arrays built from .values) see aligned features.
# This is a no-op when the order already matches.
test_columns = set(test.columns)
test = test[[col for col in trn.columns if col in test_columns]]

255
241
14


# WRITE TO NEW FILES

In [214]:
# Persist the cleaned frames. to_csv is called with its defaults, so the row
# index is written as an extra leading column with an empty header.
for frame, out_path in ((trn, 'cleantrain.csv'), (test, 'cleantest.csv')):
    frame.to_csv(out_path)