In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from pylab import *
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import itertools
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score, precision_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

path_to_train_data = "./train.csv"

In [4]:
df = pd.read_csv(path_to_train_data)

print('Number of houses:', format(df.shape[0]))
print ('Number of features:', format(df.shape[1]-2))



Number of houses: 1460
Number of features: 79


In [5]:
for i in df.columns:
    if df.dtypes[i] == int64:
        pass
    elif df.dtypes[i] == float64:
        pass
    else:
        print(i)

MSZoning
Street
Alley
LotShape
LandContour
Utilities
LotConfig
LandSlope
Neighborhood
Condition1
Condition2
BldgType
HouseStyle
RoofStyle
RoofMatl
Exterior1st
Exterior2nd
MasVnrType
ExterQual
ExterCond
Foundation
BsmtQual
BsmtCond
BsmtExposure
BsmtFinType1
BsmtFinType2
Heating
HeatingQC
CentralAir
Electrical
KitchenQual
Functional
FireplaceQu
GarageType
GarageFinish
GarageQual
GarageCond
PavedDrive
PoolQC
Fence
MiscFeature
SaleType
SaleCondition


In [6]:
df.LotFrontage.fillna(df.LotFrontage.median(), inplace=True)
df.Alley.fillna("None",inplace=True)

In [7]:
df.MasVnrType.fillna("None",inplace=True)
df.MasVnrArea.fillna(0.0, inplace=True)

In [8]:
df.BsmtQual.fillna("None",inplace=True)
df.BsmtCond.fillna("None",inplace=True)
df.BsmtExposure.fillna("None",inplace=True)

In [9]:
df.BsmtFinType1.fillna("None",inplace=True)
df.BsmtFinType2.fillna("None",inplace=True)

In [10]:
df.Electrical.fillna("SBrkr",inplace=True)

In [11]:
df.FireplaceQu.fillna("None",inplace=True)

In [12]:
df.GarageType.fillna("None",inplace=True)
df.GarageYrBlt.fillna("None",inplace=True)
df.GarageFinish.fillna("None",inplace=True)
df.GarageQual.fillna("None",inplace=True)
df.GarageCond.fillna("None",inplace=True)

In [13]:
df.PoolQC.fillna("None",inplace=True)

In [14]:
df.Fence.fillna("None",inplace=True)

In [15]:
df.MiscFeature.fillna("None",inplace=True)

In [16]:
null_cols = df.columns[df.isnull().any()]
print(null_cols)
df[null_cols].isnull().sum()

Index([], dtype='object')


Series([], dtype: float64)

In [17]:
quality_dictionary = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
df['ExterQual']   = df['ExterQual'].map(quality_dictionary).astype(int)
df['ExterCond']   = df['ExterCond'].map(quality_dictionary).astype(int)
df['BsmtQual']    = df['BsmtQual'].map(quality_dictionary).astype(int)
df['BsmtCond']    = df['BsmtCond'].map(quality_dictionary).astype(int)
df['HeatingQC']   = df['HeatingQC'].map(quality_dictionary).astype(int)
df['KitchenQual'] = df['KitchenQual'].map(quality_dictionary).astype(int)
df['FireplaceQu'] = df['FireplaceQu'].map(quality_dictionary).astype(int)
df['GarageQual']  = df['GarageQual'].map(quality_dictionary).astype(int)
df['GarageCond']  = df['GarageCond'].map(quality_dictionary).astype(int)
df['PoolQC']      = df['PoolQC'].map(quality_dictionary).astype(int)

#saving memory
del quality_dictionary

In [18]:
df['BsmtExposure'] = df['BsmtExposure'].map({'None': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4}).astype(int)

bsmtdict = {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}
df['BsmtFinType1'] = df['BsmtFinType1'].map(bsmtdict).astype(int)
df['BsmtFinType2'] = df['BsmtFinType2'].map(bsmtdict).astype(int)

df['Functional'] = df['Functional'].map({'None': 0, 'Sal': 1, 'Sev': 2, 'Maj2': 3, 'Maj1': 4, 
                                         'Mod': 5, 'Min2': 6, 'Min1': 7, 'Typ': 8}).astype(int)

df['GarageFinish'] = df['GarageFinish'].map({'None': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3}).astype(int)
df['Fence'] = df['Fence'].map({'None': 0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4}).astype(int)

del bsmtdict

In [19]:
dfcorr = df.corr()
dfcorr = dfcorr['SalePrice']

def get_redundant_pairs(df):
    '''Get diagonal and lower triangular pairs of correlation matrix'''
    pairs_to_drop = set()
    cols = df.columns
    for i in range(0, df.shape[1]):
        for j in range(0, i+1):
            pairs_to_drop.add((cols[i], cols[j]))
    return pairs_to_drop

def get_top_abs_correlations(df, n=5):
    au_corr = df.corr().abs().unstack()
    labels_to_drop = get_redundant_pairs(df)
    au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
    return au_corr[0:n]

print("Top Absolute Correlations")
print(get_top_abs_correlations(df.corr(), 20))

#print(dfcorr)
#df.corr()

Top Absolute Correlations
GarageQual    GarageCond      0.998768
PoolArea      PoolQC          0.994259
GarageCars    GarageArea      0.988244
ExterQual     KitchenQual     0.970237
Fireplaces    FireplaceQu     0.967941
OverallQual   SalePrice       0.967327
              ExterQual       0.964200
BsmtFinType2  BsmtFinSF2      0.956578
GrLivArea     TotRmsAbvGrd    0.947504
OverallQual   KitchenQual     0.947185
TotalBsmtSF   1stFlrSF        0.939843
ExterQual     SalePrice       0.924657
YearRemodAdd  KitchenQual     0.924206
KitchenQual   SalePrice       0.915048
GarageFinish  GarageCars      0.913889
YearRemodAdd  ExterQual       0.912896
YearBuilt     GarageFinish    0.909283
YearRemodAdd  HeatingQC       0.908748
YearBuilt     ExterQual       0.906710
BsmtFinType1  BsmtFinSF1      0.904124
dtype: float64


# Decision Tree Model

Feature Engineering and Baseline

In [20]:
print ('Neighborhood:    {0}'.format(df['Neighborhood'].unique()))
print('Number of Neighborhoods:    {0}'.format(len(df['Neighborhood'].unique())))

Neighborhood:    ['CollgCr' 'Veenker' 'Crawfor' 'NoRidge' 'Mitchel' 'Somerst' 'NWAmes'
 'OldTown' 'BrkSide' 'Sawyer' 'NridgHt' 'NAmes' 'SawyerW' 'IDOTRR'
 'MeadowV' 'Edwards' 'Timber' 'Gilbert' 'StoneBr' 'ClearCr' 'NPkVill'
 'Blmngtn' 'BrDale' 'SWISU' 'Blueste']
Number of Neighborhoods:    25


In [21]:
df.groupby('Neighborhood').count();
df['Neighborhood'].value_counts()

NAmes      225
CollgCr    150
OldTown    113
Edwards    100
Somerst     86
Gilbert     79
NridgHt     77
Sawyer      74
NWAmes      73
SawyerW     59
BrkSide     58
Crawfor     51
Mitchel     49
NoRidge     41
Timber      38
IDOTRR      37
ClearCr     28
StoneBr     25
SWISU       25
Blmngtn     17
MeadowV     17
BrDale      16
Veenker     11
NPkVill      9
Blueste      2
Name: Neighborhood, dtype: int64

In [22]:
colso2 = pd.crosstab(index=df["Neighborhood"], 
                           columns=df['MasVnrArea'])
colso2.index= ['CollgCr','Veenker','Crawfor', 'NoRidge', 'Mitchel', 'Somerst', 
               'NWAmes','OldTown', 'BrkSide', 'Sawyer', 'NridgHt', 'NAmes', 'SawyerW', 'IDOTRR',
               'MeadowV', 'Edwards', 'Timber', 'Gilbert', 'StoneBr', 'ClearCr', 'NPkVill',
               'Blmngtn', 'BrDale', 'SWISU', 'Blueste']

colso2

MasVnrArea,0.0,1.0,11.0,14.0,16.0,18.0,22.0,24.0,27.0,28.0,...,921.0,922.0,975.0,1031.0,1047.0,1115.0,1129.0,1170.0,1378.0,1600.0
CollgCr,1,0,1,1,5,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Veenker,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Crawfor,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NoRidge,55,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Mitchel,20,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Somerst,75,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
NWAmes,37,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OldTown,75,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BrkSide,58,0,0,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
Sawyer,36,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Building the Variables and Training the Model

In [23]:
df2 = df.select_dtypes(include=['float64'])

In [24]:
df_sex = pd.get_dummies(df['Neighborhood'])

In [25]:
df2 = pd.concat([df2, df_sex], axis=1)

In [26]:
df2;

In [27]:
print("Top Absolute Correlations")
print(get_top_abs_correlations(df2.corr(), 50))

Top Absolute Correlations
MasVnrArea   NridgHt       0.509055
             NoRidge       0.504374
LotFrontage  BrDale        0.397918
             MeadowV       0.343165
             MasVnrArea    0.342901
             NoRidge       0.264936
             NridgHt       0.255192
MasVnrArea   OldTown       0.245603
LotFrontage  NPkVill       0.209757
             NAmes         0.202739
MasVnrArea   BrkSide       0.200142
LotFrontage  BrkSide       0.197008
             Blmngtn       0.183339
             OldTown       0.174249
MasVnrArea   NWAmes        0.173188
CollgCr      NAmes         0.165788
LotFrontage  Blueste       0.162201
MasVnrArea   BrDale        0.155218
NAmes        OldTown       0.150621
MasVnrArea   IDOTRR        0.150315
             MeadowV       0.141637
             SWISU         0.140267
LotFrontage  NWAmes        0.136275
MasVnrArea   Sawyer        0.132716
Edwards      NAmes         0.131450
MasVnrArea   Edwards       0.131425
             Gilbert       0.130771
Lo

In [28]:
# target is stored in y
Y = df['Neighborhood']

# X contains all other features, which we will use to predict target
X = df2

## Split 1

In [None]:
# train/test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,random_state=618)

# build model and fit on train set
dTree = DecisionTreeClassifier(max_leaf_nodes=25)
dTree.fit(X_train, Y_train);

In [None]:
dTree_pred = dTree.predict(X_test)
dTree_pred;

In [None]:
score = accuracy_score(y_true = Y_test, y_pred = dTree_pred)
score

In [None]:
baslline = np.full((438,1),'NAmes')

In [None]:
base = accuracy_score(y_true = Y_test, y_pred = baslline)
base

In [None]:
(score/base)-1

## Split 2

In [None]:
# train/test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,random_state=658)

# build model and fit on train set
dTree = DecisionTreeClassifier(max_leaf_nodes=25)
dTree.fit(X_train, Y_train);

In [None]:
dTree_pred = dTree.predict(X_test)
dTree_pred;

In [None]:
score = accuracy_score(y_true = Y_test, y_pred = dTree_pred)
score

In [None]:
baslline = np.full((438,1),'NAmes')

In [None]:
base = accuracy_score(y_true = Y_test, y_pred = baslline)
base

In [None]:
(score/base)-1

## Split 3

In [None]:
# train/test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,random_state=398)

# build model and fit on train set
dTree = DecisionTreeClassifier(max_leaf_nodes=25)
dTree.fit(X_train, Y_train);

In [None]:
dTree_pred = dTree.predict(X_test)
dTree_pred;

In [None]:
score = accuracy_score(y_true = Y_test, y_pred = dTree_pred)
score

In [None]:
baslline = np.full((438,1),'NAmes')

In [None]:
base = accuracy_score(y_true = Y_test, y_pred = baslline)
base

In [None]:
(score/base)-1

## Split 4

In [None]:
# train/test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,random_state=661)

# build model and fit on train set
dTree = DecisionTreeClassifier(max_leaf_nodes=25)
dTree.fit(X_train, Y_train);

In [None]:
dTree_pred = dTree.predict(X_test)
dTree_pred;

In [None]:
score = accuracy_score(y_true = Y_test, y_pred = dTree_pred)
score

In [None]:
baslline = np.full((438,1),'NAmes')

In [None]:
base = accuracy_score(y_true = Y_test, y_pred = baslline)
base

In [None]:
(score/base)-1

## Split 5

In [None]:
# train/test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,random_state=9)

# build model and fit on train set
dTree = DecisionTreeClassifier(max_leaf_nodes=25)
dTree.fit(X_train, Y_train);

In [None]:
dTree_pred = dTree.predict(X_test)
dTree_pred;

In [None]:
score = accuracy_score(y_true = Y_test, y_pred = dTree_pred)
score

In [None]:
baslline = np.full((438,1),'NAmes')

In [None]:
base = accuracy_score(y_true = Y_test, y_pred = baslline)
base

In [None]:
(score/base)-1

## Hyperparameter Tuning through For-Loop

In [None]:
accval = []
for i in range(1000):
    # train/test split
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i)

    # build model and fit on train set
    dTree = DecisionTreeClassifier(max_leaf_nodes=25)
    dTree.fit(X_train, Y_train)
    dTree_pred = dTree.predict(X_test)
    a = accuracy_score(y_true = Y_test, y_pred = dTree_pred)
    b = accuracy_score(y_true = Y_test, y_pred = baslline)
    accval.append((a/b)-1)
    
print('Maximum Accuracy above Baseline: {:.2f}%'.format(100*max(accval)))
print('Minimum Accuracy above Baseline: {:.2f}%'.format(100*min(accval)))

In [None]:
accval.index(max(accval))
sorted(range(len(accval)), key=lambda x: accval[x])[-5:]

In [None]:
accval = []
for i in range(999):
    # train/test split
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=9)

    # build model and fit on train set
    dTree = DecisionTreeClassifier(max_leaf_nodes=i+2)
    dTree.fit(X_train, Y_train)
    dTree_pred = dTree.predict(X_test)
    a = accuracy_score(y_true = Y_test, y_pred = dTree_pred)
    b = accuracy_score(y_true = Y_test, y_pred = baslline)
    accval.append((a/b)-1)
    
print('Maximum Accuracy above Baseline: {:.2f}%'.format(100*max(accval)))
print('Minimum Accuracy above Baseline: {:.2f}%'.format(100*min(accval)))
#Print which i Value actually gives the best model based on the data

In [None]:
accval.index(max(accval))

# Logistic Regression

In [None]:
df = pd.read_csv(path_to_train_data)

def cleanup():
    df.LotFrontage.fillna(df.LotFrontage.median(), inplace=True)
    df.Alley.fillna("None",inplace=True)
    df.MasVnrType.fillna("None",inplace=True)
    df.MasVnrArea.fillna(0.0, inplace=True)
    df.BsmtQual.fillna("None",inplace=True)
    df.BsmtCond.fillna("None",inplace=True)
    df.BsmtExposure.fillna("None",inplace=True)
    df.BsmtFinType1.fillna("None",inplace=True)
    df.BsmtFinType2.fillna("None",inplace=True)
    df.Electrical.fillna("SBrkr",inplace=True)
    df.FireplaceQu.fillna("None",inplace=True)
    df.GarageType.fillna("None",inplace=True)
    df.GarageYrBlt.fillna("None",inplace=True)
    df.GarageFinish.fillna("None",inplace=True)
    df.GarageQual.fillna("None",inplace=True)
    df.GarageCond.fillna("None",inplace=True)
    df.PoolQC.fillna("None",inplace=True)
    df.Fence.fillna("None",inplace=True)
    df.MiscFeature.fillna("None",inplace=True)
    quality_dictionary = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
    df['ExterQual']   = df['ExterQual'].map(quality_dictionary).astype(int)
    df['ExterCond']   = df['ExterCond'].map(quality_dictionary).astype(int)
    df['BsmtQual']    = df['BsmtQual'].map(quality_dictionary).astype(int)
    df['BsmtCond']    = df['BsmtCond'].map(quality_dictionary).astype(int)
    df['HeatingQC']   = df['HeatingQC'].map(quality_dictionary).astype(int)
    df['KitchenQual'] = df['KitchenQual'].map(quality_dictionary).astype(int)
    df['FireplaceQu'] = df['FireplaceQu'].map(quality_dictionary).astype(int)
    df['GarageQual']  = df['GarageQual'].map(quality_dictionary).astype(int)
    df['GarageCond']  = df['GarageCond'].map(quality_dictionary).astype(int)
    df['PoolQC']      = df['PoolQC'].map(quality_dictionary).astype(int)

    #saving memory
    del quality_dictionary

    df['BsmtExposure'] = df['BsmtExposure'].map({'None': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4}).astype(int)

    bsmtdict = {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}
    df['BsmtFinType1'] = df['BsmtFinType1'].map(bsmtdict).astype(int)
    df['BsmtFinType2'] = df['BsmtFinType2'].map(bsmtdict).astype(int)

    df['Functional'] = df['Functional'].map({'None': 0, 'Sal': 1, 'Sev': 2, 'Maj2': 3, 'Maj1': 4, 
                                             'Mod': 5, 'Min2': 6, 'Min1': 7, 'Typ': 8}).astype(int)

    df['GarageFinish'] = df['GarageFinish'].map({'None': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3}).astype(int)
    df['Fence'] = df['Fence'].map({'None': 0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4}).astype(int)

    del bsmtdict

cleanup()

Feature Engineering and Baseline

In [None]:
df3 = df.copy()
df3['QualAbove5'] = df['OverallQual']

for i in range(df3['QualAbove5'].count()):
    #print(df['YearBuilt'][i],type(df['YearBuilt'][i]))
    if df3['QualAbove5'][i] > 5:
        df3['QualAbove5'][i] = 1
    elif df3['QualAbove5'][i] <= 5:
        df3['QualAbove5'][i] = 0

df3.drop('OverallQual', axis=1, inplace=True)

In [None]:
lyst=[]
for i in range(df3.shape[0]):
    if df3['QualAbove5'][i] == 1:
        pass
    if df3['QualAbove5'][i] == 0:
        pass
    else:
        lyst.append(df3['QualAbove5'][i])
lyst;

In [None]:
df3.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)

In [None]:
corrmat = df3.corr()
indices = corrmat.loc['QualAbove5'].copy()
a = pd.DataFrame(indices.sort_values(ascending=False))
plt.figure(figsize=(1,10))
sns.heatmap(a, annot=True)

In [None]:
rowcol = []
for x in range(len(indices)):
    if indices[x] > 0.45:
        rowcol.append(x)
rowcol

In [None]:
rowcol.sort()
indices[rowcol].sort_values(ascending=False)

In [None]:
df2 = df3.copy()
df2.drop('GarageYrBlt', axis=1, inplace=True);

In [None]:
enc = LabelEncoder()
for i in df2.columns:
    df2[i] = enc.fit_transform(df2[i])

In [None]:
Y = df2['QualAbove5']

# X contains all other features, which we will use to predict target
X = df2[['SalePrice','ExterQual','BsmtQual', 'FullBath','YearBuilt','GarageFinish','GarageCars','KitchenQual']]

In [None]:
Y.value_counts()

## Split 1

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25,random_state=107)

In [None]:
Y_test.value_counts()

In [None]:
base = np.full((Y_test.shape[0],1),1)

In [None]:
X_train;

In [None]:
# build model and fit on train set
logit = LogisticRegression()
logit.fit(X_train, Y_train)

In [None]:
pred_logit = logit.predict(X_test)
pred_logit

In [None]:
# measure accuracy
score = accuracy_score(y_true = Y_test, y_pred = pred_logit)
score

In [None]:
base2 = accuracy_score(y_true = Y_test, y_pred = base)
base2

In [None]:
(score-base2)/base2

## Split 2

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25,random_state=177)

In [None]:
Y_test.value_counts()

In [None]:
base = np.full((Y_test.shape[0],1),1)

In [None]:
X_train;

In [None]:
# build model and fit on train set
logit = LogisticRegression()
logit.fit(X_train, Y_train)

In [None]:
pred_logit = logit.predict(X_test)
pred_logit

In [None]:
# measure accuracy
score = accuracy_score(y_true = Y_test, y_pred = pred_logit)
score

In [None]:
base2 = accuracy_score(y_true = Y_test, y_pred = base)
base2

In [None]:
(score-base2)/base2

## Split 3

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25,random_state=804)

In [None]:
Y_test.value_counts()

In [None]:
base = np.full((Y_test.shape[0],1),1)

In [None]:
X_train;

In [None]:
# build model and fit on train set
logit = LogisticRegression()
logit.fit(X_train, Y_train)

In [None]:
pred_logit = logit.predict(X_test)
pred_logit

In [None]:
# measure accuracy
score = accuracy_score(y_true = Y_test, y_pred = pred_logit)
score

In [None]:
base2 = accuracy_score(y_true = Y_test, y_pred = base)
base2

In [None]:
(score-base2)/base2

## Split 4

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25,random_state=195)

In [None]:
Y_test.value_counts()

In [None]:
base = np.full((Y_test.shape[0],1),1)

In [None]:
X_train;

In [None]:
# build model and fit on train set
logit = LogisticRegression()
logit.fit(X_train, Y_train)

In [None]:
pred_logit = logit.predict(X_test)
pred_logit

In [None]:
# measure accuracy
score = accuracy_score(y_true = Y_test, y_pred = pred_logit)
score

In [None]:
base2 = accuracy_score(y_true = Y_test, y_pred = base)
base2

In [None]:
(score-base2)/base2

## Split 5

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25,random_state=838)

In [None]:
Y_test.value_counts()

In [None]:
base = np.full((Y_test.shape[0],1),1)

In [None]:
X_train;

In [None]:
# build model and fit on train set
logit = LogisticRegression()
logit.fit(X_train, Y_train)

In [None]:
pred_logit = logit.predict(X_test)
pred_logit

In [None]:
# measure accuracy
score = accuracy_score(y_true = Y_test, y_pred = pred_logit)
score

In [None]:
base2 = accuracy_score(y_true = Y_test, y_pred = base)
base2

In [None]:
(score-base2)/base2

## Tuning

In [None]:
yag = []
for i in range(1000):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,random_state=i)
    base = np.full((Y_test.shape[0],1),1)
    logit = LogisticRegression()
    logit.fit(X_train, Y_train)
    pred_logit = logit.predict(X_test)
    score = accuracy_score(y_true = Y_test, y_pred = pred_logit)
    base2 = accuracy_score(y_true = Y_test, y_pred = base)
    yag.append((score-base2)/base2)

In [None]:
print('The Best Accuracy is: {}%'.format(max(yag)*100))
print('The Worst Accuracy is: {}%'.format(min(yag)*100))

In [None]:
sorted(range(len(yag)), key=lambda x: yag[x])[-5:]

Building the Variables and Model

# Support Vector Machine (SVM)

In [2]:
df = pd.read_csv(path_to_train_data)

def cleanup():
    df.LotFrontage.fillna(df.LotFrontage.median(), inplace=True)
    df.Alley.fillna("None",inplace=True)
    df.MasVnrType.fillna("None",inplace=True)
    df.MasVnrArea.fillna(0.0, inplace=True)
    df.BsmtQual.fillna("None",inplace=True)
    df.BsmtCond.fillna("None",inplace=True)
    df.BsmtExposure.fillna("None",inplace=True)
    df.BsmtFinType1.fillna("None",inplace=True)
    df.BsmtFinType2.fillna("None",inplace=True)
    df.Electrical.fillna("SBrkr",inplace=True)
    df.FireplaceQu.fillna("None",inplace=True)
    df.GarageType.fillna("None",inplace=True)
    df.GarageYrBlt.fillna("None",inplace=True)
    df.GarageFinish.fillna("None",inplace=True)
    df.GarageQual.fillna("None",inplace=True)
    df.GarageCond.fillna("None",inplace=True)
    df.PoolQC.fillna("None",inplace=True)
    df.Fence.fillna("None",inplace=True)
    df.MiscFeature.fillna("None",inplace=True)
    quality_dictionary = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
    df['ExterQual']   = df['ExterQual'].map(quality_dictionary).astype(int)
    df['ExterCond']   = df['ExterCond'].map(quality_dictionary).astype(int)
    df['BsmtQual']    = df['BsmtQual'].map(quality_dictionary).astype(int)
    df['BsmtCond']    = df['BsmtCond'].map(quality_dictionary).astype(int)
    df['HeatingQC']   = df['HeatingQC'].map(quality_dictionary).astype(int)
    df['KitchenQual'] = df['KitchenQual'].map(quality_dictionary).astype(int)
    df['FireplaceQu'] = df['FireplaceQu'].map(quality_dictionary).astype(int)
    df['GarageQual']  = df['GarageQual'].map(quality_dictionary).astype(int)
    df['GarageCond']  = df['GarageCond'].map(quality_dictionary).astype(int)
    df['PoolQC']      = df['PoolQC'].map(quality_dictionary).astype(int)

    #saving memory
    del quality_dictionary

    df['BsmtExposure'] = df['BsmtExposure'].map({'None': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4}).astype(int)

    bsmtdict = {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}
    df['BsmtFinType1'] = df['BsmtFinType1'].map(bsmtdict).astype(int)
    df['BsmtFinType2'] = df['BsmtFinType2'].map(bsmtdict).astype(int)

    df['Functional'] = df['Functional'].map({'None': 0, 'Sal': 1, 'Sev': 2, 'Maj2': 3, 'Maj1': 4, 
                                             'Mod': 5, 'Min2': 6, 'Min1': 7, 'Typ': 8}).astype(int)

    df['GarageFinish'] = df['GarageFinish'].map({'None': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3}).astype(int)
    df['Fence'] = df['Fence'].map({'None': 0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4}).astype(int)

    del bsmtdict

cleanup()

Feature Engineering and Baseline

Building the Variables and Model

In [3]:
df['HouseStyle'].value_counts()

1Story    726
2Story    445
1.5Fin    154
SLvl       65
SFoyer     37
1.5Unf     14
2.5Unf     11
2.5Fin      8
Name: HouseStyle, dtype: int64

In [4]:
svm = SVC(kernel='linear', C=2, gamma=0.000001)

In [5]:
df2 = df.select_dtypes(include=['float64','int64'])

In [6]:
df2.columns

Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'ExterQual',
       'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
       'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
       'HeatingQC', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
       'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
       'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional',
       'Fireplaces', 'FireplaceQu', 'GarageFinish', 'GarageCars', 'GarageArea',
       'GarageQual', 'GarageCond', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
       'Fence', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')

In [7]:
# Create x, where x the 'scores' column's values as floats
x = df[df2.columns].values.astype(int64)

# Create a minimum and maximum processor object
min_max_scaler = MinMaxScaler()

# Create an object to transform the data to fit minmax processor
x_scaled = min_max_scaler.fit_transform(x)

# Run the normalizer on the dataframe
df_normalized = pd.DataFrame(x_scaled)

In [8]:
df_sex = pd.get_dummies(df['HouseStyle'])
df_new = pd.concat([df2, df_sex], axis=1)
df_new;

In [9]:
df_new.corr();

In [None]:
corrmat = df_new.corr()
indices = corrmat.loc['1Story'].copy()
a = pd.DataFrame(indices.sort_values(ascending=False))
plt.figure(figsize=(1,10))
sns.heatmap(a, annot=True)

In [None]:
corrmat = df_new.corr()
indices = corrmat.loc['2Story'].copy()
a = pd.DataFrame(indices.sort_values(ascending=False))
plt.figure(figsize=(1,10))
sns.heatmap(a, annot=True)

In [10]:
Y = df['HouseStyle']

# X contains all other features, which we will use to predict target
X = df_new[['2ndFlrSF', '1stFlrSF', 'HalfBath','LowQualFinSF','BsmtExposure']]#df2.drop(['HouseStyle'], axis=1)

## Split1

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=30)

In [16]:
svm.fit(X_train, y_train)

SVC(C=2, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=1e-06, kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [17]:
predictions = svm.predict(X_test)

In [18]:
print(confusion_matrix(y_test, predictions))

[[ 32   0   1   0   0  13   0   0]
 [  0   0   4   0   0   0   0   0]
 [  0   0 215   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   3   0   0]
 [  7   0   0   2   0 138   0   0]
 [  0   0  10   0   0   0   0   0]
 [  0   0  11   0   0   2   0   0]]


In [19]:
print(classification_report(y_test, predictions))

             precision    recall  f1-score   support

     1.5Fin       0.82      0.70      0.75        46
     1.5Unf       0.00      0.00      0.00         4
     1Story       0.89      1.00      0.94       215
     2.5Fin       0.00      0.00      0.00         0
     2.5Unf       0.00      0.00      0.00         3
     2Story       0.88      0.94      0.91       147
     SFoyer       0.00      0.00      0.00        10
       SLvl       0.00      0.00      0.00        13

avg / total       0.82      0.88      0.85       438



In [20]:
svm.score(X_test,y_test)

0.87899543378995437

In [21]:
base = np.full((438,1),'1Story')
base = accuracy_score(y_true = y_test, y_pred = base)

In [22]:
score = accuracy_score(y_true = y_test, y_pred = predictions)

In [23]:
print(base)
print(score)
print((score/base)-1)

0.490867579909
0.87899543379
0.790697674419


## Split 2

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=39)

In [25]:
svm.fit(X_train, y_train)

SVC(C=2, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=1e-06, kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [26]:
predictions = svm.predict(X_test)

In [27]:
print(confusion_matrix(y_test, predictions))

[[ 34   0   1   0   0   9   0   0]
 [  0   0   4   0   0   0   0   0]
 [  0   0 221   0   0   0   0   0]
 [  0   0   0   2   0   0   0   0]
 [  1   0   0   0   0   1   0   0]
 [  9   0   0   1   0 128   0   0]
 [  0   0   8   0   0   0   0   0]
 [  1   0  13   0   0   5   0   0]]


In [28]:
print(classification_report(y_test, predictions))

             precision    recall  f1-score   support

     1.5Fin       0.76      0.77      0.76        44
     1.5Unf       0.00      0.00      0.00         4
     1Story       0.89      1.00      0.94       221
     2.5Fin       0.67      1.00      0.80         2
     2.5Unf       0.00      0.00      0.00         2
     2Story       0.90      0.93      0.91       138
     SFoyer       0.00      0.00      0.00         8
       SLvl       0.00      0.00      0.00        19

avg / total       0.81      0.88      0.84       438



In [29]:
svm.score(X_test,y_test)

0.87899543378995437

In [30]:
base = np.full((438,1),'1Story')
base = accuracy_score(y_true = y_test, y_pred = base)

In [31]:
score = accuracy_score(y_true = y_test, y_pred = predictions)

In [32]:
print(base)
print(score)
print((score/base)-1)

0.504566210046
0.87899543379
0.742081447964


## Split 3

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=6)

In [34]:
svm.fit(X_train, y_train)

SVC(C=2, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=1e-06, kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [35]:
predictions = svm.predict(X_test)

In [36]:
print(confusion_matrix(y_test, predictions))

[[ 42   0   0   0   0   9   0   0]
 [  0   0   3   0   0   0   0   0]
 [  0   0 217   0   0   0   0   0]
 [  0   0   0   1   0   2   0   0]
 [  0   0   0   0   0   5   0   0]
 [  7   0   0   0   0 127   0   0]
 [  0   0   9   0   0   0   0   0]
 [  0   0  12   0   0   4   0   0]]


In [37]:
print(classification_report(y_test, predictions))

             precision    recall  f1-score   support

     1.5Fin       0.86      0.82      0.84        51
     1.5Unf       0.00      0.00      0.00         3
     1Story       0.90      1.00      0.95       217
     2.5Fin       1.00      0.33      0.50         3
     2.5Unf       0.00      0.00      0.00         5
     2Story       0.86      0.95      0.90       134
     SFoyer       0.00      0.00      0.00         9
       SLvl       0.00      0.00      0.00        16

avg / total       0.82      0.88      0.85       438



In [38]:
svm.score(X_test,y_test)

0.88356164383561642

In [39]:
base = np.full((438,1),'1Story')
base = accuracy_score(y_true = y_test, y_pred = base)

In [40]:
score = accuracy_score(y_true = y_test, y_pred = predictions)

In [41]:
print(base)
print(score)
print((score/base)-1)

0.495433789954
0.883561643836
0.783410138249


## Split 4

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=17)

In [43]:
svm.fit(X_train, y_train)

SVC(C=2, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=1e-06, kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [44]:
predictions = svm.predict(X_test)

In [45]:
print(confusion_matrix(y_test, predictions))

[[ 27   0   1   1   0  11   0   0]
 [  0   0   5   0   0   0   0   0]
 [  0   0 230   0   0   0   0   0]
 [  0   0   0   3   0   0   0   0]
 [  1   0   0   0   0   0   0   0]
 [  8   0   0   0   0 127   0   0]
 [  0   0  10   0   0   0   0   0]
 [  1   0  10   0   0   3   0   0]]


In [46]:
print(classification_report(y_test, predictions))

             precision    recall  f1-score   support

     1.5Fin       0.73      0.68      0.70        40
     1.5Unf       0.00      0.00      0.00         5
     1Story       0.90      1.00      0.95       230
     2.5Fin       0.75      1.00      0.86         3
     2.5Unf       0.00      0.00      0.00         1
     2Story       0.90      0.94      0.92       135
     SFoyer       0.00      0.00      0.00        10
       SLvl       0.00      0.00      0.00        14

avg / total       0.82      0.88      0.85       438



In [47]:
svm.score(X_test,y_test)

0.88356164383561642

In [48]:
base = np.full((438,1),'1Story')
base = accuracy_score(y_true = y_test, y_pred = base)

In [49]:
score = accuracy_score(y_true = y_test, y_pred = predictions)

In [50]:
print(base)
print(score)
print((score/base)-1)

0.525114155251
0.883561643836
0.682608695652


## Split 5

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=32)

In [52]:
svm.fit(X_train, y_train)

SVC(C=2, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=1e-06, kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [53]:
predictions = svm.predict(X_test)

In [54]:
print(confusion_matrix(y_test, predictions))

[[ 26   0   0   0   0  21   0   0]
 [  0   0   2   0   0   0   0   0]
 [  0   0 224   0   0   0   0   0]
 [  0   0   0   1   0   0   0   0]
 [  0   0   0   0   0   2   0   0]
 [  4   0   0   0   0 137   0   0]
 [  0   0   8   0   0   0   0   0]
 [  1   0   9   0   0   3   0   0]]


In [55]:
print(classification_report(y_test, predictions))

             precision    recall  f1-score   support

     1.5Fin       0.84      0.55      0.67        47
     1.5Unf       0.00      0.00      0.00         2
     1Story       0.92      1.00      0.96       224
     2.5Fin       1.00      1.00      1.00         1
     2.5Unf       0.00      0.00      0.00         2
     2Story       0.84      0.97      0.90       141
     SFoyer       0.00      0.00      0.00         8
       SLvl       0.00      0.00      0.00        13

avg / total       0.83      0.89      0.85       438



In [56]:
svm.score(X_test,y_test)

0.88584474885844744

In [57]:
base = np.full((438,1),'1Story')
base = accuracy_score(y_true = y_test, y_pred = base)

In [58]:
score = accuracy_score(y_true = y_test, y_pred = predictions)

In [59]:
print(base)
print(score)
print((score/base)-1)

0.511415525114
0.885844748858
0.732142857143


# Paragraphs

## Logistic Regression
For our logistic regression, we first filled in all our categorical values in the dataset with numerical equivalents.  Then we created a copy of the original data set, and added a new feature called QualAbove5.  This became a feature that took all values in OverallQual that were greater than 5 and assigned a value of 1, and took all values less than or equal to 5 and assigned a value of 0.  Next, a correlation heatmap was created between QualAbove5 and the other features, and the correlations above 0.45 were used as parameters for the test. This resulted in 8 features, as well as QualAbove5, which was removed due to redundancy.  We then created a split of 0.75 and 0.25, and ran the prediction to find the accuracy score based on a random seed, as well as the baseline.  The accuracy score minus the baseline divided by the baseline shows what percentage more accurate we were over the baseline.


## DecisionTrees
For decision tree, we first created a count of how many of each neighborhood appears in the dataset.  Then the top correlations between the neighborhoods and other features in the float64 datatype were charted. The counts for amount of neighborhood values for each MasVnrArea value was graphed in a chart, as it was the highest correlation.  Finally, the decision trees were created with the Y set being neighborhoods, and the X set being all values with the float64 datatype, with a split of 0.7 and 0.3.


## SVM
For the SVM test, the count of each HouseStyle in the data set is printed.  Then, a copy of the dataset is created with only the features with int64 and float64 values. Dummy variables were generated in a new dataframe to determine the correlation per housestyle with different features. Then a correlation map was generated using the dataframe which allowed us to choose the features ('2ndFlrSF', '1stFlrSF', 'HalfBath','LowQualFinSF','BsmtExposure') that best correlated under a normalization. The test split was generated with 0.3 in the test set and 0.7 in the training set with a C parameter of 2 and gamma value of 0.000001. 

## Tuning

In [None]:
def float_range(start,stop,step):
    x = start
    my_list = []
    if step > 0:
        while x < stop:
            my_list.append(x)
            x += step
    else: # should really be if step < 0 with an extra check for step == 0 
        while x > stop:
            my_list.append(x)
            x += step
    return my_list 