In [57]:
#Import packages
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
lm = LinearRegression()

import statsmodels.api as sm

from statsmodels.stats.outliers_influence import variance_inflation_factor

#Load our two Data Frames
AmesDummies = pd.read_csv('AmesDummies.csv')
AmesDummiesOrdinal = pd.read_csv('AmesDummiesOrdinal.csv')

## First, let's remove obvious features that will not affect our analysis

In [58]:
#Let's try the most basic linear regression, just to get a sense of what the results look like:
AmesDummiesOrdinalX = AmesDummiesOrdinal.drop('SalePrice', axis=1)
AmesDummiesOrdinalY = AmesDummiesOrdinal['SalePrice']

X = AmesDummiesOrdinalX
Y = AmesDummiesOrdinalY

X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.923
Model:                            OLS   Adj. R-squared:                  0.912
Method:                 Least Squares   F-statistic:                     82.48
Date:                Tue, 13 Nov 2018   Prob (F-statistic):               0.00
Time:                        13:10:14   Log-Likelihood:                -16650.
No. Observations:                1458   AIC:                         3.367e+04
Df Residuals:                    1272   BIC:                         3.465e+04
Df Model:                         185                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                 -1.598e+

In [59]:
#Let's get rid of "Id" (the index from the Processing DF), and any features with 5 or fewer observations
AmesDummiesOrdinal = AmesDummiesOrdinal.drop('Id', axis=1)

AmesDummiesOrdinalX = AmesDummiesOrdinal.drop('SalePrice', axis=1)
AmesDummiesOrdinalY = AmesDummiesOrdinal['SalePrice']

import statsmodels.api as sm
X = AmesDummiesOrdinalX
Y = AmesDummiesOrdinalY

X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.923
Model:                            OLS   Adj. R-squared:                  0.912
Method:                 Least Squares   F-statistic:                     82.99
Date:                Tue, 13 Nov 2018   Prob (F-statistic):               0.00
Time:                        13:10:14   Log-Likelihood:                -16650.
No. Observations:                1458   AIC:                         3.367e+04
Df Residuals:                    1273   BIC:                         3.465e+04
Df Model:                         184                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                 -1.598e+

strong multicollinearity problems or that the design matrix is singular.


In [60]:
#Find the features with 5 or less, remove those columns:
#We see that 256 features have this few, and can be removed (not enough information contained within them)
np.sum(AmesDummiesOrdinal).sort_values()

RoofMatl_Membran         1.000000e+00
Utilities_NoSeWa         1.000000e+00
RoofMatl_Metal           1.000000e+00
RoofMatl_Roll            1.000000e+00
Electrical_Mix           1.000000e+00
Heating_Floor            1.000000e+00
Exterior_Other           1.000000e+00
Exterior_CBlock          1.000000e+00
SaleType_Con             2.000000e+00
Heating_OthW             2.000000e+00
Neighborhood_Blueste     2.000000e+00
Condition_RRNe           2.000000e+00
RoofStyle_Shed           2.000000e+00
Electrical_FuseP         3.000000e+00
SaleType_Oth             3.000000e+00
Foundation_Wood          3.000000e+00
Exterior_AsphShn         3.000000e+00
SaleType_CWD             4.000000e+00
MSSubClass_40            4.000000e+00
LotConfig_FR3            4.000000e+00
Heating_Wall             4.000000e+00
SaleCondition_AdjLand    4.000000e+00
SaleType_ConLI           5.000000e+00
RoofMatl_WdShake         5.000000e+00
SaleType_ConLw           5.000000e+00
Foundation_Stone         6.000000e+00
GarageType_2

In [61]:
#Remove these 26 features:
AmesDummiesOrdinal = AmesDummiesOrdinal.drop(list(np.sum(AmesDummiesOrdinal).sort_values()[0:26].index), axis=1)

AmesDummiesOrdinalX = AmesDummiesOrdinal.drop('SalePrice', axis=1)
AmesDummiesOrdinalY = AmesDummiesOrdinal['SalePrice']

import statsmodels.api as sm
X = AmesDummiesOrdinalX
Y = AmesDummiesOrdinalY

X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.921
Model:                            OLS   Adj. R-squared:                  0.912
Method:                 Least Squares   F-statistic:                     96.17
Date:                Tue, 13 Nov 2018   Prob (F-statistic):               0.00
Time:                        13:10:15   Log-Likelihood:                -16667.
No. Observations:                1458   AIC:                         3.365e+04
Df Residuals:                    1299   BIC:                         3.449e+04
Df Model:                         158                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                 -1.556e+

In [62]:
'''Finally, we want to eliminate GarageAge, and GarageType_No. This information should be covered be other factors like
garage car size and garage quality. And the GarageAge is highly confounding, since there is no way to quantify the age
of a garage that is not built. Removing these will help to clairfy these issues.'''
AmesDummiesOrdinal = AmesDummiesOrdinal.drop(['GarageAge', 'GarageType_No'], axis=1)

AmesDummiesOrdinalX = AmesDummiesOrdinal.drop('SalePrice', axis=1)
AmesDummiesOrdinalY = AmesDummiesOrdinal['SalePrice']

import statsmodels.api as sm
X = AmesDummiesOrdinalX
Y = AmesDummiesOrdinalY

X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.921
Model:                            OLS   Adj. R-squared:                  0.911
Method:                 Least Squares   F-statistic:                     97.01
Date:                Tue, 13 Nov 2018   Prob (F-statistic):               0.00
Time:                        13:10:15   Log-Likelihood:                -16671.
No. Observations:                1458   AIC:                         3.366e+04
Df Residuals:                    1301   BIC:                         3.448e+04
Df Model:                         156                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                 -1.245e+

In [63]:
#At this point, we have 158 possible features and one dependent variable ('SalePrice')
AmesDummiesOrdinal.shape

(1458, 158)

## Now, let's look for features with high multicollinearity and address them. We could manually do this by removing features with high VIF one at a time, or could manually search through the correlation of each variable with others first. Let's compare the two methods

In [64]:
#This is an imported function, found online, to check VIF for a given DF and remove features with too high a VIF:

def calculate_vif_(X, thresh=100):
    X2 = X.copy()
    cols = X2.columns
    variables = np.arange(X2.shape[1])
    dropped=True
    while dropped:
        dropped=False
        c = X2[cols[variables]].values
        vif = [variance_inflation_factor(c, ix) for ix in np.arange(c.shape[1])]

        maxloc = vif.index(max(vif))
        if max(vif) > thresh:
            print('dropping \'' + X2[cols[variables]].columns[maxloc] + '\' at index: ' + str(maxloc) + " -- VIF: " + str(max(vif)))
            variables = np.delete(variables, maxloc)
            dropped=True

    print('Remaining variables:')
    print(X2.columns[variables])

    return X2[cols[variables]]

In [65]:
#Create an AmesDummies DF with VIF's under 10, removing one at a time, using the above function:
AmesDummiesVIFUnder10 = calculate_vif_(AmesDummiesOrdinal.drop('SalePrice', axis=1), thresh=10)

  vif = 1. / (1. - r_squared_i)


dropping 'MSSubClass_90' at index: 29 -- VIF: inf
dropping 'GarageCond' at index: 149 -- VIF: 457.68839837237175
dropping 'BsmtCond' at index: 141 -- VIF: 173.27490137498856
dropping 'Functional' at index: 144 -- VIF: 162.2540059001739
dropping 'PoolQC' at index: 148 -- VIF: 149.02973576303435
dropping 'ExterQual' at index: 138 -- VIF: 128.70531537718702
dropping 'TotRmsAbvGrd' at index: 7 -- VIF: 103.61613865667212
dropping 'BsmtQual' at index: 138 -- VIF: 103.04604962542909
dropping 'ExterCond' at index: 137 -- VIF: 99.18429395723699
dropping 'OverallQual' at index: 135 -- VIF: 89.88959415150568
dropping 'GarageQual' at index: 141 -- VIF: 77.77691213552646
dropping 'TotalSF' at index: 18 -- VIF: 74.96718196538951
dropping 'KitchenAbvGr' at index: 6 -- VIF: 73.83218038227011
dropping 'KitchenQual' at index: 136 -- VIF: 68.8077909021263
dropping 'SaleType_New' at index: 103 -- VIF: 48.85029187910767
dropping 'BldgType_TwnhsE' at index: 71 -- VIF: 46.96866790622756
dropping 'GarageCars'

In [66]:
'''This method removed a number of variables that one would think would be important for our model -- age of house, TotalSF,
size of garage, number of rooms/baths, basement and garage quality, etc. It may not know which of two similar features to 
discriminate on, and be choosing more obscure ones in favor of clearer and more descriptive variables. Let's try doing this 
manually.'''

"This method removed a number of variables that one would think would be important for our model -- age of house, TotalSF,\nsize of garage, number of rooms/baths, basement and garage quality, etc. It may not know which of two similar features to \ndiscriminate on, and be choosing more obscure ones in favor of clearer and more descriptive variables. Let's try doing this \nmanually."

In [67]:
#Devise a function to produce a correlation matrix for our feature DF, then go manually
def CreateCorrelationMatrix(df, dependent):
    df2 = df.drop(dependent, axis=1)
    for i in range(len(df2.columns)):
        corrarray = []
        indexarray = []
        for j in range(len(df2.columns)):
            corr12 = df2[df2.columns[i]].corr(df2[df2.columns[j]])
            corrarray.append(corr12)
            indexarray.append(df2.columns[j])
        seriesi = pd.Series(corrarray, index=indexarray)
        
        if i > 0:
            corrDF = pd.concat([corrDF, seriesi], axis=1)
        
        else:
            corrDF = pd.DataFrame(seriesi)
     
    #Rename the columns to be the same as the indices (a self matrix)
    corrDF.columns = corrDF.index
    
    #reset all self-covariances to 0
    for var in corrDF.columns:
        corrDF.loc[var, var] = 0
    
    return corrDF   

In [68]:
#Create the overall correlation matrix, first with all features:
corrDF0 = CreateCorrelationMatrix(AmesDummiesOrdinal, 'SalePrice')

In [69]:
#Check the matrix out, eliminate a variable then rerun the process:
np.max(corrDF0).sort_values(ascending=False)
corrDF0['BldgType_Duplex'].sort_values(ascending=False) #Correlation=1 with MSSubClass_90. Keep Duplex, more descriptive

MSSubClass_90            1.000000
KitchenAbvGr             0.680826
SaleCondition_Alloca     0.269022
Foundation_Slab          0.265779
HouseStyle_SFoyer        0.180613
GarageType_CarPort       0.173716
BedroomAbvGr             0.162948
TotRmsAbvGrd             0.140644
Exterior_Plywood         0.138467
YearsSinceRemodel        0.135742
CentralAir_N             0.129047
Neighborhood_Mitchel     0.128309
Foundation_CBlock        0.122261
Condition_Feedr          0.109990
Condition_RRAe           0.105284
Street_Grvl              0.103178
Exterior_Stone           0.103178
GarageType_2Types        0.103178
MSZoning_RH              0.086241
Electrical_FuseF         0.083313
SaleCondition_Family     0.072707
GarageType_Detchd        0.068659
TotalBath                0.067289
Neighborhood_Edwards     0.066536
Neighborhood_Sawyer      0.056626
Neighborhood_SawyerW     0.054349
MiscVal                  0.053173
MasVnrType_BrkFace       0.041191
RoofStyle_Mansard        0.040146
Condition_RRNn

In [70]:
#Remove MSSubClass_90, rerun the analysis, and create a list of removed features
RemovedFeatures = ['MSSubClass_90']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [71]:
#Check the matrix out, eliminate a variable then rerun the process:
np.max(corrDF0).sort_values(ascending=False)
corrDF0['SaleType_New'].sort_values(ascending=False) #SaleType_New is essentially SaleCondition_Partial (mentioned in the txt also)

SaleCondition_Partial    0.986622
ExterQual                0.383292
KitchenQual              0.331326
OverallQual              0.318733
BsmtQual                 0.318201
MasVnrType_Stone         0.316456
GarageArea               0.286702
Neighborhood_NridgHt     0.286350
GarageCars               0.284252
Neighborhood_Somerst     0.263993
GarageFinish             0.255536
HeatingQC                0.246404
TotalBsmtSF              0.243780
MSZoning_FV              0.201338
TotalBath                0.160086
FireplaceQu              0.159025
OpenPorchSF              0.154176
MasVnrArea               0.151289
TotalSF                  0.147546
BsmtExposure             0.141295
TotRmsAbvGrd             0.135468
YearsSinceSale           0.133281
Exterior_CemntBd         0.124389
Neighborhood_StoneBr     0.114237
GarageType_BuiltIn       0.114201
MSSubClass_60            0.096390
LandContour_HLS          0.094415
BsmtCond                 0.094066
LotFrontage              0.093117
Neighborhood_B

In [72]:
#Remove SaleCondition_Partial, rerun the analysis, and create a list of removed features
RemovedFeatures = ['MSSubClass_90', 'SaleCondition_Partial']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [73]:
#Check the matrix out, eliminate a variable then rerun the process:
np.max(corrDF0).sort_values(ascending=False)
corrDF0['BldgType_2fmCon'].sort_values(ascending=False) #2FmCon and MS190 virtually identical, remove MS190

MSSubClass_190          0.983394
KitchenAbvGr            0.357100
CentralAir_N            0.250054
Neighborhood_OldTown    0.206217
YearsAgoBuilt           0.206000
SaleType_ConLD          0.170493
Heating_GasW            0.112693
LandContour_Bnk         0.111681
YearsSinceRemodel       0.098584
MSZoning_RM             0.098195
HouseStyle_2.5Unf       0.097041
Foundation_BrkTil       0.093381
Neighborhood_SWISU      0.090406
BedroomAbvGr            0.088246
Exterior_MetalSd        0.083105
LotArea                 0.079635
Alley_Grvl              0.076730
MSZoning_RH             0.075750
HouseStyle_1.5Fin       0.073102
EnclosedPorch           0.066921
TotRmsAbvGrd            0.065225
GarageType_2Types       0.064794
Street_Grvl             0.064794
Alley_Pave              0.061209
MoSold_Winter           0.059386
Heating_Grav            0.058546
Electrical_FuseA        0.058105
Exterior_AsbShng        0.057654
LowQualFinSF            0.055481
Exterior_Wd Sdng        0.055108
          

In [74]:
#Remove MSSubClass_190, rerun the analysis, and create a list of removed features
RemovedFeatures = ['MSSubClass_90', 'SaleCondition_Partial', 'MSSubClass_190']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [75]:
#Check the matrix out, eliminate a variable then rerun the process:
np.max(corrDF0).sort_values(ascending=False)
corrDF0['GarageQual'].sort_values(ascending=False) #Garage Quality and Condition are highly correlated. Remove Cond

GarageCond               0.959168
GarageCars               0.576854
GarageArea               0.562323
GarageFinish             0.482590
PavedDrive               0.364906
OverallQual              0.293001
KitchenQual              0.238181
FireplaceQu              0.221594
ExterQual                0.219204
Fireplaces               0.208801
TotalSF                  0.182337
BsmtQual                 0.181995
TotalBsmtSF              0.180119
TotalBath                0.177261
HeatingQC                0.145242
MasVnrType_BrkFace       0.143242
BsmtScore                0.140359
MSSubClass_60            0.139951
MasVnrArea               0.134908
BsmtCond                 0.128709
WoodDeckSF               0.119304
LotFrontage              0.107317
ExterCond                0.095018
TotRmsAbvGrd             0.090744
GarageType_Detchd        0.087049
LotShape_IR1             0.084162
RoofStyle_Hip            0.081324
Functional               0.081108
MasVnrType_Stone         0.080807
LotArea       

In [76]:
#Remove GarageCond, rerun the analysis, and create a list of removed features
RemovedFeatures = ['MSSubClass_90', 'SaleCondition_Partial', 'MSSubClass_190', 'GarageCond']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [77]:
#Check the matrix out, eliminate a variable then rerun the process:
np.max(corrDF0).sort_values(ascending=False)
corrDF0['MSSubClass_80'].sort_values(ascending=False) #MS80 is basically split-level by the description, remove it

HouseStyle_SLvl         0.942256
BsmtExposure            0.207097
Fence                   0.134409
MasVnrType_BrkFace      0.124200
Exterior_HdBoard        0.120161
Foundation_CBlock       0.118779
GarageType_BuiltIn      0.111692
PoolArea                0.107756
GarageType_Basment      0.100387
PoolQC                  0.099835
Exterior_Plywood        0.078483
LotShape_IR1            0.072964
BsmtScore               0.069965
RoofMatl_Tar&Grv        0.063364
Neighborhood_Veenker    0.063364
GarageQual              0.063183
LotConfig_FR2           0.062195
Neighborhood_Gilbert    0.059796
Neighborhood_Mitchel    0.059406
PavedDrive              0.059019
OverallCond             0.058598
Fireplaces              0.057720
RoofStyle_Flat          0.055357
WoodDeckSF              0.053886
LotFrontage             0.052958
Neighborhood_NWAmes     0.049819
BsmtCond                0.049425
Neighborhood_ClearCr    0.048229
RoofMatl_WdShngl        0.041734
ExterCond               0.041441
          

In [78]:
#Remove MSSubClass_80, rerun the analysis, and create a list of removed features
RemovedFeatures = ['MSSubClass_90', 'SaleCondition_Partial', 'MSSubClass_190', 'GarageCond', 'MSSubClass_80']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [79]:
#Check the matrix out, eliminate a variable then rerun the process:
np.max(corrDF0).sort_values(ascending=False)
corrDF0['MSSubClass_50'].sort_values(ascending=False) #MS50 is basically a 1.5 story, can be removed

HouseStyle_1.5Fin       0.940862
YearsAgoBuilt           0.388089
Foundation_BrkTil       0.287803
GarageType_Detchd       0.264364
YearsSinceRemodel       0.240322
Neighborhood_BrkSide    0.226683
Condition_Artery        0.209483
MSZoning_RM             0.189980
Exterior_Wd Sdng        0.182173
Neighborhood_OldTown    0.170573
Neighborhood_SWISU      0.168779
Alley_Grvl              0.165003
EnclosedPorch           0.153434
Neighborhood_IDOTRR     0.151231
OverallCond             0.130206
Exterior_MetalSd        0.128448
LowQualFinSF            0.126585
LandContour_Bnk         0.114536
Heating_GasW            0.108722
Exterior_AsbShng        0.105687
Electrical_FuseA        0.100310
Neighborhood_Edwards    0.094758
Electrical_FuseF        0.090944
MSZoning_C (all)        0.083907
Fence                   0.082856
BedroomAbvGr            0.079610
RoofStyle_Gambrel       0.077406
Exterior_WdShing        0.069738
CentralAir_N            0.061638
Condition_Feedr         0.054993
          

In [80]:
#Remove MSSubClass_50, rerun the analysis, and create a list of removed features
RemovedFeatures = ['MSSubClass_90', 'SaleCondition_Partial', 'MSSubClass_190', 'GarageCond', 'MSSubClass_80', 'MSSubClass_50']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [81]:
#Check the matrix out, eliminate a variable then rerun the process:
np.max(corrDF0).sort_values(ascending=False)
corrDF0['MSSubClass_45'].sort_values(ascending=False) #MS45 is basically a 1.5 story unfinished, can be removed

HouseStyle_1.5Unf       0.925180
Neighborhood_BrkSide    0.136827
Foundation_BrkTil       0.121348
MSZoning_RM             0.110822
YearsAgoBuilt           0.110569
Heating_Grav            0.103497
Electrical_FuseF        0.100106
YearsSinceRemodel       0.091978
EnclosedPorch           0.090051
Exterior_MetalSd        0.088172
Exterior_Wd Sdng        0.086839
GarageType_Detchd       0.082777
Neighborhood_IDOTRR     0.081842
Condition_RRAe          0.075726
Electrical_FuseA        0.068818
CentralAir_N            0.068227
MSZoning_RH             0.063273
LandContour_Bnk         0.056796
OverallCond             0.048335
Fence                   0.045381
Neighborhood_OldTown    0.030377
YearsSinceSale          0.027398
Exterior_WdShing        0.026354
Condition_Artery        0.025737
LandContour_HLS         0.024549
Alley_Grvl              0.024549
Neighborhood_Crawfor    0.023975
Functional              0.021610
Neighborhood_SawyerW    0.019818
LandSlope_Mod           0.017105
          

In [82]:
#Remove MSSubClass_45, rerun the analysis, and create a list of removed features
RemovedFeatures = ['MSSubClass_90', 'SaleCondition_Partial', 'MSSubClass_190', 'GarageCond', 'MSSubClass_80', 'MSSubClass_50',
                  'MSSubClass_45']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [83]:
#Check the matrix out, eliminate a variable then rerun the process:
np.max(corrDF0).sort_values(ascending=False)
corrDF0['PoolQC'].sort_values(ascending=False) #We are now in correlation values below 0.9, though many rough guides offer 0.7-0.75 as the target range. For PoolQC and PoolArea, 0.9 is too high, remove PoolQC

PoolArea                 0.889372
Exterior_ImStucc         0.171477
Fence                    0.150793
LotFrontage              0.137671
TotalSF                  0.135635
MSSubClass_75            0.133547
RoofMatl_Tar&Grv         0.121050
SaleCondition_Alloca     0.115495
RoofStyle_Flat           0.110578
EnclosedPorch            0.110460
LowQualFinSF             0.110273
SaleCondition_Abnorml    0.098738
Exterior_Stucco          0.094069
HouseStyle_SLvl          0.093189
Neighborhood_NoRidge     0.078122
TotalBath                0.073033
Condition_Artery         0.070814
BedroomAbvGr             0.067494
Fireplaces               0.063614
Exterior_Plywood         0.062850
OverallQual              0.062126
TotRmsAbvGrd             0.060412
ExterCond                0.058648
TotalBsmtSF              0.052568
WoodDeckSF               0.052567
YearsSinceSale           0.052196
MoSold_Summer            0.051464
Neighborhood_Mitchel     0.049704
KitchenQual              0.048637
LotConfig_Corn

In [84]:
#Remove PoolQC, rerun the analysis, and create a list of removed features
RemovedFeatures = ['MSSubClass_90', 'SaleCondition_Partial', 'MSSubClass_190', 'GarageCond', 'MSSubClass_80', 'MSSubClass_50',
                  'MSSubClass_45', 'PoolQC']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [85]:
#Check the matrix out, eliminate a variable then rerun the process:
np.max(corrDF0).sort_values(ascending=False)
corrDF0['GarageArea'].sort_values(ascending=False) #Garage cars and area are natrually highly correlated. I think SF is a more precise measure, remove GarageCars

GarageCars               0.887304
GarageQual               0.562323
OverallQual              0.557230
GarageFinish             0.515014
ExterQual                0.490152
KitchenQual              0.485394
TotalBsmtSF              0.475069
TotalSF                  0.467003
TotalBath                0.445022
BsmtQual                 0.402643
MasVnrArea               0.360786
TotRmsAbvGrd             0.328714
FireplaceQu              0.325672
Neighborhood_NridgHt     0.304615
HeatingQC                0.294097
SaleType_New             0.286702
PavedDrive               0.285929
MasVnrType_Stone         0.276093
LotFrontage              0.273354
Fireplaces               0.260455
MSSubClass_60            0.259451
BsmtExposure             0.243954
OpenPorchSF              0.228246
WoodDeckSF               0.222482
BsmtScore                0.211759
MasVnrType_BrkFace       0.197647
Neighborhood_NoRidge     0.187507
Neighborhood_Somerst     0.187128
LotArea                  0.163680
RoofStyle_Hip 

In [86]:
#Remove GarageCars, rerun the analysis, and create a list of removed features
RemovedFeatures = ['MSSubClass_90', 'SaleCondition_Partial', 'MSSubClass_190', 'GarageCond', 'MSSubClass_80', 'MSSubClass_50',
                  'MSSubClass_45', 'PoolQC', 'GarageCars']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [87]:
#Check the matrix out, eliminate a variable then rerun the process:
np.max(corrDF0).sort_values(ascending=False)
corrDF0['FireplaceQu'].sort_values(ascending=False) 
''''FireplaceQu is related to Number of fireplaces. Reading through a detailed description of why this is, the conclusion is that
FireplaceQu would be the slighly better indicator. Often, the "2nd" fireplace would be a small pre=fab fireplace or Franklin
Stove in the basement. FireplaceQu lists the quality of the best fireplace in the house. We should keep that.'''

'\'FireplaceQu is related to Number of fireplaces. Reading through a detailed description of why this is, the conclusion is that\nFireplaceQu would be the slighly better indicator. Often, the "2nd" fireplace would be a small pre=fab fireplace or Franklin\nStove in the basement. FireplaceQu lists the quality of the best fireplace in the house. We should keep that.'

In [88]:
#Remove Fireplaces, rerun the analysis, and create a list of removed features
RemovedFeatures = ['MSSubClass_90', 'SaleCondition_Partial', 'MSSubClass_190', 'GarageCond', 'MSSubClass_80', 'MSSubClass_50',
                  'MSSubClass_45', 'PoolQC', 'GarageCars', 'Fireplaces']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [89]:
#Check the matrix out, eliminate a variable then rerun the process:
np.max(corrDF0).sort_values(ascending=False)
corrDF0['Neighborhood_Somerst'].sort_values(ascending=False) #Neighborhood_Somerset may be a largely "floating village" residenial neighborhood. The actual neighborhood should contain more value, so eliminate MSZoning_FV

MSZoning_FV             0.862798
Alley_Pave              0.380048
SaleType_New            0.263993
MSSubClass_160          0.261784
ExterQual               0.255474
OverallQual             0.226226
KitchenQual             0.211949
OpenPorchSF             0.210631
HeatingQC               0.205186
MasVnrType_Stone        0.202739
BsmtQual                0.193376
GarageArea              0.187128
HouseStyle_2Story       0.144763
TotalBath               0.123583
BldgType_TwnhsE         0.122271
GarageFinish            0.119458
BldgType_Twnhs          0.111223
Exterior_CemntBd        0.107625
Exterior_MetalSd        0.080263
PavedDrive              0.072596
MSSubClass_60           0.068535
MasVnrArea              0.067308
Condition_RRNn          0.066843
GarageQual              0.065779
BsmtCond                0.061158
Functional              0.059390
Condition_RRAn          0.051985
TotalBsmtSF             0.048589
TotalSF                 0.044407
LotConfig_FR2           0.036718
          

In [90]:
#Remove MSZoning_FV, rerun the analysis, and create a list of removed features
RemovedFeatures = ['MSSubClass_90', 'SaleCondition_Partial', 'MSSubClass_190', 'GarageCond', 'MSSubClass_80', 'MSSubClass_50',
                  'MSSubClass_45', 'PoolQC', 'GarageCars', 'Fireplaces', 'MSZoning_FV']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [91]:
#Check the matrix out, eliminate a variable then rerun the process:
np.max(corrDF0).sort_values(ascending=False)
corrDF0['RoofMatl_Tar&Grv'].sort_values(ascending=False) #These two must be mostly related. "Flat" seems a better descriptor than Tar%Grv, however

RoofStyle_Flat           0.834912
Neighborhood_ClearCr     0.161061
LandSlope_Sev            0.160367
Exterior_Plywood         0.150557
PoolArea                 0.147589
BsmtExposure             0.143992
LandContour_Low          0.139358
Exterior_BrkComm         0.108613
LotConfig_CulDSac        0.106208
LotFrontage              0.096534
LandSlope_Mod            0.096383
Condition_PosA           0.094326
GarageType_CarPort       0.094326
LotArea                  0.090877
LotShape_IR2             0.081061
SaleCondition_Alloca     0.079788
ScreenPorch              0.073772
Neighborhood_Edwards     0.071561
MasVnrType_BrkCmn        0.069661
ExterCond                0.069511
Heating_GasW             0.062033
OpenPorchSF              0.060008
GarageType_Basment       0.059872
HouseStyle_SLvl          0.057977
LotShape_IR1             0.056516
Foundation_CBlock        0.051432
Foundation_Slab          0.051014
MoSold_Autumn            0.050262
Electrical_FuseF         0.046817
SaleCondition_

In [92]:
#Remove RoofMatl_Tar&Grv, rerun the analysis, and create a list of removed features
RemovedFeatures = ['MSSubClass_90', 'SaleCondition_Partial', 'MSSubClass_190', 'GarageCond', 'MSSubClass_80', 'MSSubClass_50',
                  'MSSubClass_45', 'PoolQC', 'GarageCars', 'Fireplaces', 'MSZoning_FV', 'RoofMatl_Tar&Grv']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [93]:
#Check the matrix out, eliminate a variable then rerun the process:
np.max(corrDF0).sort_values(ascending=False)
corrDF0['TotRmsAbvGrd'].sort_values(ascending=False) #Total Rooms should naturally correlate with SF, and SF seems a much better descriptor

TotalSF                  0.824153
BedroomAbvGr             0.680390
TotalBath                0.453608
HouseStyle_2Story        0.428040
MSSubClass_60            0.427227
OverallQual              0.420621
FireplaceQu              0.353191
GarageArea               0.328714
ExterQual                0.289587
LotFrontage              0.283025
KitchenQual              0.280642
MasVnrArea               0.268116
TotalBsmtSF              0.266146
GarageType_BuiltIn       0.262684
KitchenAbvGr             0.258633
GarageFinish             0.237125
OpenPorchSF              0.220052
HouseStyle_2.5Fin        0.189177
Neighborhood_NoRidge     0.187622
BsmtQual                 0.180135
LotArea                  0.175425
Neighborhood_NridgHt     0.170205
WoodDeckSF               0.163218
HeatingQC                0.162304
MSSubClass_75            0.150084
BldgType_Duplex          0.140644
MSSubClass_70            0.139669
SaleType_New             0.135468
LowQualFinSF             0.132558
MasVnrType_Sto

In [94]:
#Remove RoofMatl_Tar&Grv, rerun the analysis, and create a list of removed features
RemovedFeatures = ['MSSubClass_90', 'SaleCondition_Partial', 'MSSubClass_190', 'GarageCond', 'MSSubClass_80', 'MSSubClass_50',
                  'MSSubClass_45', 'PoolQC', 'GarageCars', 'Fireplaces', 'MSZoning_FV', 'RoofMatl_Tar&Grv', 'TotRmsAbvGrd']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [95]:
#Check the matrix out, eliminate a variable then rerun the process:
np.max(corrDF0).sort_values(ascending=False)
#The next 3 are all MSSubClasses, which we should eliminate

MSSubClass_120          0.778663
BldgType_TwnhsE         0.778663
HouseStyle_2Story       0.761882
MSSubClass_60           0.761882
HouseStyle_SFoyer       0.730856
MSSubClass_85           0.730856
ExterQual               0.723286
OverallQual             0.723286
KitchenQual             0.713812
KitchenAbvGr            0.680826
BldgType_Duplex         0.680826
MSSubClass_75           0.675558
HouseStyle_2.5Unf       0.675558
BsmtQual                0.634737
BsmtCond                0.634737
Exterior_BrkComm        0.627943
Neighborhood_NPkVill    0.627943
BldgType_Twnhs          0.620917
MSSubClass_160          0.620917
TotalBath               0.600230
TotalSF                 0.600230
YearsSinceRemodel       0.592105
YearsAgoBuilt           0.592105
LowQualFinSF            0.590458
HouseStyle_2.5Fin       0.590458
MasVnrArea              0.580502
MasVnrType_BrkFace      0.580502
TotalBsmtSF             0.577795
GarageArea              0.562323
GarageQual              0.562323
          

In [96]:
#Remove additional MSSubclasses, rerun the analysis, and create a list of removed features
RemovedFeatures = ['MSSubClass_90', 'SaleCondition_Partial', 'MSSubClass_190', 'GarageCond', 'MSSubClass_80', 'MSSubClass_50',
                  'MSSubClass_45', 'PoolQC', 'GarageCars', 'Fireplaces', 'MSZoning_FV', 'RoofMatl_Tar&Grv', 'TotRmsAbvGrd',
                  'MSSubClass_120', 'MSSubClass_85', 'MSSubClass_60']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [97]:
#Check the matrix out, eliminate a variable then rerun the process:
print(np.max(corrDF0).sort_values(ascending=False))
'''Exterior, Kitchen, and Overall quality are significantly correlated with each other (~0.7), but there is not enough
information here to decide for sure whether to eliminate any. They should be kept for now and moved to the Feature Selection
part of the analysis'''

ExterQual               0.723286
OverallQual             0.723286
KitchenQual             0.713812
KitchenAbvGr            0.680826
BldgType_Duplex         0.680826
MSSubClass_75           0.675558
HouseStyle_2.5Unf       0.675558
BsmtCond                0.634737
BsmtQual                0.634737
Exterior_BrkComm        0.627943
Neighborhood_NPkVill    0.627943
MSSubClass_160          0.620917
BldgType_Twnhs          0.620917
TotalBath               0.600230
TotalSF                 0.600230
YearsSinceRemodel       0.592105
YearsAgoBuilt           0.592105
LowQualFinSF            0.590458
HouseStyle_2.5Fin       0.590458
MasVnrArea              0.580502
MasVnrType_BrkFace      0.580502
TotalBsmtSF             0.577795
GarageQual              0.562323
GarageArea              0.562323
MSZoning_RM             0.561814
Neighborhood_OldTown    0.561814
GarageFinish            0.555118
Foundation_BrkTil       0.554784
LandSlope_Sev           0.547996
LotArea                 0.547996
          

'Exterior, Kitchen, and Overall quality are significantly correlated with each other (~0.7), but there is not enough\ninformation here to decide for sure whether to eliminate any. They should be kept for now and moved to the Feature Selection\npart of the analysis'

In [98]:
AmesDummiesMultiReduction.shape

(1458, 142)

In [99]:
'''After removing obvious features, our DF went from 187 to 158 feature columns. Applying a correlation analysis, this number
was reduced to 142. These remaining columns can bow be subjected to either Backward or Forward feature selection techniques
to decide which ones contribute meaningfully to our model'''

'After removing obvious features, our DF went from 187 to 158 feature columns. Applying a correlation analysis, this number\nwas reduced to 142. These remaining columns can bow be subjected to either Backward or Forward feature selection techniques\nto decide which ones contribute meaningfully to our model'

In [100]:
#We see that the DF we have created, eliminating >40 sparse and co-linear features, still retains a high R^2 (0.883) and lowered AIC
AmesDummiesOrdinalX = AmesDummiesMultiReduction.drop('SalePrice', axis=1)
AmesDummiesOrdinalY = AmesDummiesOrdinal['SalePrice']

X = AmesDummiesOrdinalX
Y = AmesDummiesOrdinalY

X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.918
Model:                            OLS   Adj. R-squared:                  0.909
Method:                 Least Squares   F-statistic:                     103.9
Date:                Tue, 13 Nov 2018   Prob (F-statistic):               0.00
Time:                        13:13:58   Log-Likelihood:                -16700.
No. Observations:                1458   AIC:                         3.368e+04
Df Residuals:                    1316   BIC:                         3.443e+04
Df Model:                         141                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                 -8.926e+

## Now, we need to select features to use in our analysis. Let's first try a method of Forward selection, where we add features that add the most to AIC, until we have no more features that will add value.

In [101]:
#Define method to add features one at a time based on which subtract the most from AIC:
def AddFeatureListbyAIC(df, dependent):
    AnyPositive = True
    df2 = df.copy()
    df2X = df2.drop(dependent, axis=1)
    df2Y = df2[dependent]
    ListofPossibleFeatures = list(df2X.columns)
    StartingFeatureList = []
    CreatedFeatureList = []
    AICEvolutionList = []
    
    while AnyPositive == True:
        ListOfTriedValues = []
        if len(CreatedFeatureList) > 0:
            X2 = sm.add_constant(df2X[CreatedFeatureList])
            est = sm.OLS(df2Y, X2)
            est2 = est.fit()
            AICBase = est2.aic
            AICList = []
        else:
            AICBase = 1000000
            AICList = []
        
        for i in ListofPossibleFeatures:
            if i in CreatedFeatureList:
                continue
            tempDFX = df2X[CreatedFeatureList]
            tempDFX = pd.concat([tempDFX, df2X[[i]]], axis=1)
            tempX2 = sm.add_constant(tempDFX)
            est = sm.OLS(df2Y, tempX2)
            est2 = est.fit()
            AICList.append(est2.aic)
            AICListN = np.array(AICList)
            ListOfTriedValues.append(i)
            
        if any(AICListN-AICBase < 0) == False:
            break
            
        else:
            index = AICList.index(min(AICList))
            AddedValue = ListOfTriedValues[index]
        CreatedFeatureList.append(AddedValue)
        
        AICEvolutionList.append(AICList[index])
        #df2X = df2X.drop(AddedValue, axis=1)
        StartingFeatureList = list(df2X.columns)
    
    resultDF = pd.DataFrame({'CreatedFeatures': np.array(CreatedFeatureList), 'NewFScore': np.array(AICEvolutionList)})
        
    return resultDF

In [102]:
AmesDummiesForwardAICList = AddFeatureListbyAIC(AmesDummiesMultiReduction, 'SalePrice')

In [103]:
AmesDummiesForwardAICList

Unnamed: 0,CreatedFeatures,NewFScore
0,OverallQual,35580.655191
1,TotalSF,35050.594689
2,TotalBsmtSF,34745.984888
3,KitchenQual,34607.918603
4,BsmtExposure,34492.775386
5,SaleType_New,34429.856225
6,BsmtScore,34364.663709
7,LotArea,34313.134833
8,MasVnrArea,34264.390052
9,BedroomAbvGr,34221.173659


In [104]:
#This could be a promising method to use, with an R^2 of still 0.88 despite eliminating more than half the features
AmesDummiesForwardAIC = pd.concat([AmesDummiesMultiReduction[list(AmesDummiesForwardAICList['CreatedFeatures'])],
                                              AmesDummiesMultiReduction['SalePrice']], axis=1)

AmesDummiesOrdinalX = AmesDummiesForwardAIC.drop('SalePrice', axis=1)
AmesDummiesOrdinalY = AmesDummiesForwardAIC['SalePrice']

import statsmodels.api as sm
X = AmesDummiesOrdinalX
Y = AmesDummiesOrdinalY

X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.915
Model:                            OLS   Adj. R-squared:                  0.911
Method:                 Least Squares   F-statistic:                     223.7
Date:                Tue, 13 Nov 2018   Prob (F-statistic):               0.00
Time:                        13:15:35   Log-Likelihood:                -16721.
No. Observations:                1458   AIC:                         3.358e+04
Df Residuals:                    1390   BIC:                         3.394e+04
Df Model:                          67                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                 -8.089e+

In [105]:
'''Looking at the above statsmodels report, MSZoning_RM, though added early, must have been superseded by a later feature
as it is completely insignificant. Let's remove this.'''

"Looking at the above statsmodels report 1)MSZoning_RM, though added early, must have been superseded by a later feature\nas it is completely insignificant, and 2) The final 10 feaures are also insignificant and likely contributing just as much\nnoise as value to the model. Let's remove these 11 features and use this as our final feature list."

In [107]:
AmesDummiesSelect = AmesDummiesForwardAIC.drop(['MSZoning_RM'], axis=1)

AmesDummiesSelectX = AmesDummiesSelect.drop('SalePrice', axis=1)
AmesDummiesSelectY = AmesDummiesSelect['SalePrice']

import statsmodels.api as sm
X = AmesDummiesSelectX
Y = AmesDummiesSelectY

X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.915
Model:                            OLS   Adj. R-squared:                  0.911
Method:                 Least Squares   F-statistic:                     227.2
Date:                Tue, 13 Nov 2018   Prob (F-statistic):               0.00
Time:                        13:41:46   Log-Likelihood:                -16722.
No. Observations:                1458   AIC:                         3.358e+04
Df Residuals:                    1391   BIC:                         3.393e+04
Df Model:                          66                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                 -8.046e+

In [108]:
'''Having Central Air would seem to be a significant boost to a home, yet it vanishes in nearly every method used. We 
wanted to verify that adding it here would serve to lower the quality of the analysis, as somewhat of a test. Again, it
did. This suggests that other co-linear factors absorb Central Air and render that metric insignificant, which is surprising.'''
AmesDummiesSelecttest = pd.concat([AmesDummiesSelect, AmesDummiesOrdinal['CentralAir_N']], axis=1)

AmesDummiesSelectX = AmesDummiesSelecttest.drop('SalePrice', axis=1)
AmesDummiesSelectY = AmesDummiesSelect['SalePrice']

import statsmodels.api as sm
X = AmesDummiesSelectX
Y = AmesDummiesSelectY

X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.915
Model:                            OLS   Adj. R-squared:                  0.911
Method:                 Least Squares   F-statistic:                     223.7
Date:                Tue, 13 Nov 2018   Prob (F-statistic):               0.00
Time:                        13:42:10   Log-Likelihood:                -16721.
No. Observations:                1458   AIC:                         3.358e+04
Df Residuals:                    1390   BIC:                         3.394e+04
Df Model:                          67                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                 -7.981e+

## Would our selected features differ drastically if we selected using Backwards selection?  If so, we should take any differences into account

In [109]:
#Define method to remove features based on those that most increase AIC:
def TrimFeatureListByAIC(df, dependent):
    AnyPositive = True
    df2 = df.copy()
    df2X = df2.drop(dependent, axis=1)
    df2Y = df2[dependent]
    StartingFeatureList = list(df2X.columns)
    RemovedFeatureList = []
    AICEvolutionList = []
    
    while AnyPositive == True:
        X2 = sm.add_constant(df2X)
        est = sm.OLS(df2Y, X2)
        est2 = est.fit()
        AICBase = est2.aic
        AICList = []
        
        for i in StartingFeatureList:
            tempDFX = df2X.drop(labels=i, axis=1)
            tempX2 = sm.add_constant(tempDFX)
            est = sm.OLS(df2Y, tempX2)
            est2 = est.fit()
            AICList.append(est2.aic)
            AICListN = np.array(AICList)
            
        if any(AICListN-AICBase < 0) == False:
            break
            
        else:
            index = AICList.index(min(AICList))
            RemovedValue = StartingFeatureList[index]
        RemovedFeatureList.append(RemovedValue)
        AICEvolutionList.append(AICList[index])
        df2X = df2X.drop(RemovedValue, axis=1)
        StartingFeatureList = list(df2X.columns)
    
    resultDF = pd.DataFrame({'RemovedFeatures': np.array(RemovedFeatureList), 'NewFScore': np.array(AICEvolutionList)})
        
    return resultDF

In [110]:
AmesDummiesBackwardAICList = TrimFeatureListByAIC(AmesDummiesMultiReduction, 'SalePrice')

In [111]:
AmesDummiesBackwardAICList

Unnamed: 0,RemovedFeatures,NewFScore
0,LandContour_Bnk,33681.630846
1,FireplaceQu,33679.631015
2,HouseStyle_1.5Unf,33677.632220
3,Alley_Pave,33675.636524
4,SaleCondition_Alloca,33673.641198
5,Electrical_FuseF,33671.647179
6,Exterior_BrkComm,33669.655899
7,LotConfig_Corner,33667.669677
8,Exterior_HdBoard,33665.684523
9,Alley_Grvl,33663.701359


In [112]:
AmesDummiesForwardAICList.to_csv('AmesDummiesForwardAICList.csv')
AmesDummiesBackwardAICList.to_csv('AmesDummiesBackwardAICRemovalList.csv')