In [20]:
#Import packages
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
lm = LinearRegression()

import statsmodels.api as sm

from statsmodels.stats.outliers_influence import variance_inflation_factor

#Load our two Data Frames (paths are relative, so both CSVs must sit in the notebook's working directory)
AmesDummies = pd.read_csv('AmesDummies.csv')
AmesDummiesOrdinal = pd.read_csv('AmesDummiesOrdinal.csv')

## First, let's remove obvious features that will not affect our analysis

In [21]:
#Let's try the most basic linear regression, just to get a sense of what the results look like.
#Note: "Id" has not been dropped yet at this point, so this first fit still includes it as a regressor.
AmesDummiesOrdinalX = AmesDummiesOrdinal.drop('SalePrice', axis=1)
AmesDummiesOrdinalY = AmesDummiesOrdinal['SalePrice']

X = AmesDummiesOrdinalX
Y = AmesDummiesOrdinalY

#add_constant supplies the intercept column (the "const" row of the summary)
X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.909
Model:                            OLS   Adj. R-squared:                  0.896
Method:                 Least Squares   F-statistic:                     68.53
Date:                Mon, 12 Nov 2018   Prob (F-statistic):               0.00
Time:                        15:14:35   Log-Likelihood:                -16793.
No. Observations:                1460   AIC:                         3.396e+04
Df Residuals:                    1273   BIC:                         3.495e+04
Df Model:                         186                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                 -1.695e+

In [22]:
#Let's get rid of "Id" (the index from the Processing DF). Features with 5 or fewer observations are handled in later cells.
#errors='ignore' keeps this cell re-runnable: once "Id" is gone, running it again is a no-op instead of a KeyError.
AmesDummiesOrdinal = AmesDummiesOrdinal.drop('Id', axis=1, errors='ignore')

AmesDummiesOrdinalX = AmesDummiesOrdinal.drop('SalePrice', axis=1)
AmesDummiesOrdinalY = AmesDummiesOrdinal['SalePrice']

X = AmesDummiesOrdinalX
Y = AmesDummiesOrdinalY

#Refit the OLS (with an intercept column) on the frame without "Id"; statsmodels is already imported as sm at the top
X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.909
Model:                            OLS   Adj. R-squared:                  0.896
Method:                 Least Squares   F-statistic:                     68.96
Date:                Mon, 12 Nov 2018   Prob (F-statistic):               0.00
Time:                        15:14:38   Log-Likelihood:                -16793.
No. Observations:                1460   AIC:                         3.396e+04
Df Residuals:                    1274   BIC:                         3.494e+04
Df Model:                         185                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                 -1.694e+

In [23]:
#Find the features with 5 or fewer observations by sorting the column sums, so those columns can be removed:
#We see that 26 features have this few, and can be removed (not enough information contained within them)
np.sum(AmesDummiesOrdinal).sort_values()

RoofMatl_Membran         1.000000e+00
Exterior_Other           1.000000e+00
Exterior_CBlock          1.000000e+00
Utilities_NoSeWa         1.000000e+00
Electrical_Mix           1.000000e+00
RoofMatl_ClyTile         1.000000e+00
Heating_Floor            1.000000e+00
RoofMatl_Metal           1.000000e+00
RoofMatl_Roll            1.000000e+00
Condition_RRNe           2.000000e+00
Neighborhood_Blueste     2.000000e+00
SaleType_Con             2.000000e+00
RoofStyle_Shed           2.000000e+00
Heating_OthW             2.000000e+00
SaleType_Oth             3.000000e+00
Foundation_Wood          3.000000e+00
Electrical_FuseP         3.000000e+00
Exterior_AsphShn         3.000000e+00
MSSubClass_40            4.000000e+00
SaleCondition_AdjLand    4.000000e+00
Heating_Wall             4.000000e+00
SaleType_CWD             4.000000e+00
LotConfig_FR3            4.000000e+00
SaleType_ConLI           5.000000e+00
RoofMatl_WdShake         5.000000e+00
SaleType_ConLw           5.000000e+00
GarageType_2

In [24]:
#Remove the sparse features found above (column sum of 5 or fewer; 26 of them on the first run).
#They are selected by threshold rather than by position ([0:26]) so that re-running this cell
#does not silently drop the next 26 columns as well.
column_sums = AmesDummiesOrdinal.sum()
sparse_features = list(column_sums[column_sums <= 5].index)
AmesDummiesOrdinal = AmesDummiesOrdinal.drop(sparse_features, axis=1)

AmesDummiesOrdinalX = AmesDummiesOrdinal.drop('SalePrice', axis=1)
AmesDummiesOrdinalY = AmesDummiesOrdinal['SalePrice']

X = AmesDummiesOrdinalX
Y = AmesDummiesOrdinalY

#Refit the OLS (with an intercept column) on the reduced frame; statsmodels is already imported as sm at the top
X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.888
Model:                            OLS   Adj. R-squared:                  0.874
Method:                 Least Squares   F-statistic:                     64.79
Date:                Mon, 12 Nov 2018   Prob (F-statistic):               0.00
Time:                        15:15:02   Log-Likelihood:                -16946.
No. Observations:                1460   AIC:                         3.421e+04
Df Residuals:                    1300   BIC:                         3.506e+04
Df Model:                         159                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                 -1.555e+

In [25]:
#Finally, we want to eliminate GarageAge and GarageType_No. This information should be covered by other factors like
#garage car size and garage quality. And GarageAge is highly confounding, since there is no way to quantify the age
#of a garage that is not built. Removing these will help to clarify these issues.
#errors='ignore' keeps this cell re-runnable: a second run is a no-op instead of a KeyError.
AmesDummiesOrdinal = AmesDummiesOrdinal.drop(['GarageAge', 'GarageType_No'], axis=1, errors='ignore')

AmesDummiesOrdinalX = AmesDummiesOrdinal.drop('SalePrice', axis=1)
AmesDummiesOrdinalY = AmesDummiesOrdinal['SalePrice']

X = AmesDummiesOrdinalX
Y = AmesDummiesOrdinalY

#Refit the OLS (with an intercept column) on the reduced frame; statsmodels is already imported as sm at the top
X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.887
Model:                            OLS   Adj. R-squared:                  0.874
Method:                 Least Squares   F-statistic:                     65.28
Date:                Mon, 12 Nov 2018   Prob (F-statistic):               0.00
Time:                        15:15:05   Log-Likelihood:                -16951.
No. Observations:                1460   AIC:                         3.422e+04
Df Residuals:                    1302   BIC:                         3.505e+04
Df Model:                         157                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                 -1.105e+

In [26]:
#At this point, we have 158 possible features and one dependent variable ('SalePrice'), i.e. 159 columns in total
AmesDummiesOrdinal.shape

(1460, 159)

## Now, let's look for features with high multicollinearity and address them. We could do this automatically, by repeatedly removing the feature with the highest VIF one at a time, or manually, by first searching through the correlation of each variable with the others. Let's compare the two methods.

In [27]:
#This is an imported function, found online, to check VIF for a given DF and remove features with too high a VIF:

def calculate_vif_(X, thresh=100):
    """Drop the highest-VIF column, one at a time, until no VIF exceeds ``thresh``.

    On every pass the variance inflation factor of each remaining column is
    recomputed against all other remaining columns. If the largest VIF is
    above ``thresh`` that column is dropped (and reported via ``print``) and
    the pass is repeated; otherwise the loop ends.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate feature columns. The frame itself is not modified.
    thresh : float, default 100
        Largest VIF a retained column may have.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving columns of ``X`` in their original order.

    Notes
    -----
    NOTE(review): no intercept column is added here, so every VIF is computed
    from exactly the columns supplied in ``X`` -- confirm this is intended.
    """
    cols = X.columns
    variables = np.arange(X.shape[1])

    # A VIF needs at least one other regressor, so stop once a single column is left
    # (this also covers an empty frame, where max() used to raise).
    while len(variables) > 1:
        design = X[cols[variables]].values
        vif = np.array(
            [variance_inflation_factor(design, ix) for ix in range(design.shape[1])],
            dtype=float,
        )

        # NaN VIFs carry no ranking information; ignore them so that a NaN
        # cannot hide a genuinely large VIF from the comparison below.
        if np.isnan(vif).all():
            break
        maxloc = int(np.nanargmax(vif))  # first position of the maximum, like list.index(max(...))
        max_vif = vif[maxloc]
        if not max_vif > thresh:
            break

        # .format() instead of '+' so non-string column labels do not raise TypeError
        print("dropping '{}' at index: {} -- VIF: {}".format(cols[variables[maxloc]], maxloc, max_vif))
        variables = np.delete(variables, maxloc)

    print('Remaining variables:')
    print(cols[variables])

    return X[cols[variables]]

In [28]:
#Create an AmesDummies DF with VIF's under 10, removing one feature at a time, using the above function
#(SalePrice is dropped first so that only the features are screened):
AmesDummiesVIFUnder10 = calculate_vif_(AmesDummiesOrdinal.drop('SalePrice', axis=1), thresh=10)

  vif = 1. / (1. - r_squared_i)


dropping 'MSSubClass_90' at index: 29 -- VIF: inf
dropping 'GarageCond' at index: 150 -- VIF: 459.03712838358035
dropping 'BsmtCond' at index: 142 -- VIF: 172.67617520112313
dropping 'Functional' at index: 145 -- VIF: 162.06411104628316
dropping 'PoolQC' at index: 149 -- VIF: 146.61485859584639
dropping 'ExterQual' at index: 139 -- VIF: 129.98461820685057
dropping 'TotRmsAbvGrd' at index: 7 -- VIF: 103.03441716372268
dropping 'BsmtQual' at index: 139 -- VIF: 102.33741433709656
dropping 'ExterCond' at index: 138 -- VIF: 100.07895642202665
dropping 'OverallQual' at index: 136 -- VIF: 89.93011480373025
dropping 'GarageQual' at index: 142 -- VIF: 77.83730769324787
dropping 'TotalSF' at index: 18 -- VIF: 74.8522834418384
dropping 'KitchenAbvGr' at index: 6 -- VIF: 74.21238095627282
dropping 'KitchenQual' at index: 137 -- VIF: 68.98984514578132
dropping 'SaleType_New' at index: 104 -- VIF: 49.68163726957637
dropping 'BldgType_TwnhsE' at index: 71 -- VIF: 47.22031254353584
dropping 'PavedDriv

In [None]:
'''This method removed a number of variables that one would expect to be important for our model -- age of house, TotalSF,
size of garage, number of rooms/baths, basement and garage quality, etc. It has no way of knowing which of two similar
features is the better one to keep, and may be dropping clearer, more descriptive variables while keeping more obscure ones.
Let's try doing this manually.'''

In [29]:
#Devise a function to produce a correlation matrix for our feature DF, then go manually
def CreateCorrelationMatrix(df, dependent):
    """Return the feature-by-feature correlation matrix with a zeroed diagonal.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the features and the dependent variable. Not modified.
    dependent : label or list of labels
        Column(s) to leave out of the matrix (the dependent variable).

    Returns
    -------
    pandas.DataFrame
        Square frame indexed and labelled by the remaining column names, holding
        the pairwise Pearson correlations. Each self-correlation on the diagonal
        is set to 0 so that a column-wise maximum reports the strongest
        correlation with a *different* feature.
    """
    features = df.drop(dependent, axis=1)

    # Nothing to correlate: return an empty matrix rather than failing
    if features.shape[1] == 0:
        return pd.DataFrame(index=features.columns, columns=features.columns, dtype=float)

    # One vectorised call replaces the nested Series.corr loop and the
    # repeated pd.concat that grew the matrix one column at a time.
    corr = features.corr()

    # Work on a copy of the values so the diagonal can be overwritten safely
    values = corr.values.copy()
    np.fill_diagonal(values, 0)

    return pd.DataFrame(values, index=corr.index, columns=corr.columns)

In [30]:
#Create the overall correlation matrix, first with all features (SalePrice itself is excluded):
corrDF0 = CreateCorrelationMatrix(AmesDummiesOrdinal, 'SalePrice')

In [32]:
#Check the matrix out, eliminate a variable then rerun the process.
#Note: a cell displays only its last expression, so the np.max(...) ranking is shown only when that line is run on its own.
np.max(corrDF0).sort_values(ascending=False)
#BldgType_Duplex has correlation=1 with MSSubClass_90. Keep Duplex, it is more descriptive
corrDF0['BldgType_Duplex'].sort_values(ascending=False)

MSSubClass_90            1.000000
KitchenAbvGr             0.680844
SaleCondition_Alloca     0.269037
Foundation_Slab          0.265803
HouseStyle_SFoyer        0.180647
GarageType_CarPort       0.173731
BedroomAbvGr             0.162898
TotRmsAbvGrd             0.138913
Exterior_Plywood         0.138543
YearsSinceRemodel        0.135917
CentralAir_N             0.129107
Neighborhood_Mitchel     0.128352
Foundation_CBlock        0.122424
Condition_Feedr          0.108888
Condition_RRAe           0.105305
Street_Grvl              0.103192
GarageType_2Types        0.103192
Exterior_Stone           0.103192
MSZoning_RH              0.086265
Electrical_FuseF         0.083346
SaleCondition_Family     0.072736
GarageType_Detchd        0.068798
TotalBath                0.066125
Neighborhood_Edwards     0.064939
Neighborhood_Sawyer      0.056684
Neighborhood_SawyerW     0.054400
MiscVal                  0.053195
MasVnrType_BrkFace       0.041353
Condition_RRNn           0.040163
RoofStyle_Mans

In [33]:
#Start the list of removed features with MSSubClass_90, drop it from the full feature frame,
#and rebuild the correlation matrix without it
RemovedFeatures = ['MSSubClass_90']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [35]:
#Check the matrix out, eliminate a variable then rerun the process:
np.max(corrDF0).sort_values(ascending=False)
#SaleType_New is essentially SaleCondition_Partial (mentioned in the txt also)
corrDF0['SaleType_New'].sort_values(ascending=False)

SaleCondition_Partial    0.986819
ExterQual                0.391048
KitchenQual              0.337878
OverallQual              0.327412
MasVnrType_Stone         0.326471
BsmtQual                 0.322988
GarageArea               0.296671
GarageCars               0.286290
Neighborhood_NridgHt     0.283105
TotalBsmtSF              0.265644
Neighborhood_Somerst     0.260852
GarageFinish             0.259782
HeatingQC                0.248458
MSZoning_FV              0.198831
TotalSF                  0.173150
OpenPorchSF              0.171467
TotalBath                0.171194
MasVnrArea               0.170709
FireplaceQu              0.163126
BsmtExposure             0.149988
TotRmsAbvGrd             0.147496
YearsSinceSale           0.133337
Exterior_CemntBd         0.132797
GarageType_BuiltIn       0.121122
LotFrontage              0.116530
Neighborhood_StoneBr     0.112773
MSSubClass_60            0.104358
BsmtCond                 0.093890
LandContour_HLS          0.092845
Neighborhood_B

In [36]:
#Add SaleCondition_Partial to the removed features, drop the whole list from the original frame,
#and rebuild the correlation matrix
RemovedFeatures = ['MSSubClass_90', 'SaleCondition_Partial']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [40]:
#Check the matrix out, eliminate a variable then rerun the process:
np.max(corrDF0).sort_values(ascending=False)
#BldgType_2fmCon and MSSubClass_190 are virtually identical, remove MSSubClass_190
corrDF0['BldgType_2fmCon'].sort_values(ascending=False)

MSSubClass_190          0.983395
KitchenAbvGr            0.357126
CentralAir_N            0.250091
Neighborhood_OldTown    0.206261
YearsAgoBuilt           0.206036
SaleType_ConLD          0.170505
Heating_GasW            0.112713
LandContour_Bnk         0.109016
YearsSinceRemodel       0.098724
MSZoning_RM             0.098267
HouseStyle_2.5Unf       0.097056
Foundation_BrkTil       0.093440
Neighborhood_SWISU      0.090430
BedroomAbvGr            0.088210
Exterior_MetalSd        0.083179
LotArea                 0.077767
Alley_Grvl              0.076765
MSZoning_RH             0.075769
HouseStyle_1.5Fin       0.073164
EnclosedPorch           0.066987
GarageType_2Types       0.064806
Foundation_Stone        0.064806
Street_Grvl             0.064806
TotRmsAbvGrd            0.064178
Alley_Pave              0.061241
MoSold_Winter           0.059064
Heating_Grav            0.058559
Electrical_FuseA        0.058154
Exterior_AsbShng        0.057678
LowQualFinSF            0.055503
          

In [41]:
#Add MSSubClass_190 to the removed features, drop the whole list from the original frame,
#and rebuild the correlation matrix
RemovedFeatures = ['MSSubClass_90', 'SaleCondition_Partial', 'MSSubClass_190']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [42]:
#Check the matrix out, eliminate a variable then rerun the process.
#Note: a cell displays only its last expression, so the np.max(...) ranking is shown only when that line is run on its own.
np.max(corrDF0).sort_values(ascending=False)
#Garage Quality and Condition are highly correlated. Remove GarageCond
corrDF0['GarageQual'].sort_values(ascending=False)

GarageQual              0.959172
GarageCond              0.959172
MSSubClass_80           0.942259
HouseStyle_SLvl         0.942259
HouseStyle_1.5Fin       0.940871
MSSubClass_50           0.940871
HouseStyle_1.5Unf       0.925181
MSSubClass_45           0.925181
PoolQC                  0.899924
PoolArea                0.899924
GarageArea              0.882475
GarageCars              0.882475
FireplaceQu             0.863241
Fireplaces              0.863241
Neighborhood_Somerst    0.862807
MSZoning_FV             0.862807
RoofMatl_Tar&Grv        0.834914
RoofStyle_Flat          0.834914
TotRmsAbvGrd            0.820088
TotalSF                 0.820088
BldgType_TwnhsE         0.778684
MSSubClass_120          0.778684
HouseStyle_2Story       0.762743
MSSubClass_60           0.762743
HouseStyle_SFoyer       0.730862
MSSubClass_85           0.730862
ExterQual               0.726278
OverallQual             0.726278
KitchenQual             0.716122
KitchenAbvGr            0.680844
          

In [43]:
#Add GarageCond to the removed features, drop the whole list from the original frame,
#and rebuild the correlation matrix
RemovedFeatures = ['MSSubClass_90', 'SaleCondition_Partial', 'MSSubClass_190', 'GarageCond']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [44]:
#Check the matrix out, eliminate a variable then rerun the process:
np.max(corrDF0).sort_values(ascending=False)
#MSSubClass_80 is basically split-level by the description, remove it
corrDF0['MSSubClass_80'].sort_values(ascending=False)

MSSubClass_80           0.942259
HouseStyle_SLvl         0.942259
MSSubClass_50           0.940871
HouseStyle_1.5Fin       0.940871
HouseStyle_1.5Unf       0.925181
MSSubClass_45           0.925181
PoolQC                  0.899924
PoolArea                0.899924
GarageArea              0.882475
GarageCars              0.882475
FireplaceQu             0.863241
Fireplaces              0.863241
Neighborhood_Somerst    0.862807
MSZoning_FV             0.862807
RoofMatl_Tar&Grv        0.834914
RoofStyle_Flat          0.834914
TotRmsAbvGrd            0.820088
TotalSF                 0.820088
BldgType_TwnhsE         0.778684
MSSubClass_120          0.778684
HouseStyle_2Story       0.762743
MSSubClass_60           0.762743
HouseStyle_SFoyer       0.730862
MSSubClass_85           0.730862
ExterQual               0.726278
OverallQual             0.726278
KitchenQual             0.716122
BldgType_Duplex         0.680844
KitchenAbvGr            0.680844
BedroomAbvGr            0.676620
          

In [45]:
#Add MSSubClass_80 to the removed features, drop the whole list from the original frame,
#and rebuild the correlation matrix
RemovedFeatures = ['MSSubClass_90', 'SaleCondition_Partial', 'MSSubClass_190', 'GarageCond', 'MSSubClass_80']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [46]:
#Check the matrix out, eliminate a variable then rerun the process:
np.max(corrDF0).sort_values(ascending=False)
#MSSubClass_50 is basically a 1.5 story, can be removed
corrDF0['MSSubClass_50'].sort_values(ascending=False)

HouseStyle_1.5Fin       0.940871
MSSubClass_50           0.940871
HouseStyle_1.5Unf       0.925181
MSSubClass_45           0.925181
PoolQC                  0.899924
PoolArea                0.899924
GarageArea              0.882475
GarageCars              0.882475
FireplaceQu             0.863241
Fireplaces              0.863241
Neighborhood_Somerst    0.862807
MSZoning_FV             0.862807
RoofMatl_Tar&Grv        0.834914
RoofStyle_Flat          0.834914
TotalSF                 0.820088
TotRmsAbvGrd            0.820088
MSSubClass_120          0.778684
BldgType_TwnhsE         0.778684
HouseStyle_2Story       0.762743
MSSubClass_60           0.762743
HouseStyle_SFoyer       0.730862
MSSubClass_85           0.730862
ExterQual               0.726278
OverallQual             0.726278
KitchenQual             0.716122
KitchenAbvGr            0.680844
BldgType_Duplex         0.680844
BedroomAbvGr            0.676620
HouseStyle_2.5Unf       0.675562
MSSubClass_75           0.675562
          

In [47]:
#Add MSSubClass_50 to the removed features, drop the whole list from the original frame,
#and rebuild the correlation matrix
RemovedFeatures = ['MSSubClass_90', 'SaleCondition_Partial', 'MSSubClass_190', 'GarageCond', 'MSSubClass_80', 'MSSubClass_50']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [48]:
#Check the matrix out, eliminate a variable then rerun the process:
np.max(corrDF0).sort_values(ascending=False)
#MSSubClass_45 is basically a 1.5 story unfinished, can be removed
corrDF0['MSSubClass_45'].sort_values(ascending=False)

HouseStyle_1.5Unf       0.925181
MSSubClass_45           0.925181
PoolQC                  0.899924
PoolArea                0.899924
GarageArea              0.882475
GarageCars              0.882475
FireplaceQu             0.863241
Fireplaces              0.863241
Neighborhood_Somerst    0.862807
MSZoning_FV             0.862807
RoofMatl_Tar&Grv        0.834914
RoofStyle_Flat          0.834914
TotRmsAbvGrd            0.820088
TotalSF                 0.820088
MSSubClass_120          0.778684
BldgType_TwnhsE         0.778684
HouseStyle_2Story       0.762743
MSSubClass_60           0.762743
HouseStyle_SFoyer       0.730862
MSSubClass_85           0.730862
ExterQual               0.726278
OverallQual             0.726278
KitchenQual             0.716122
KitchenAbvGr            0.680844
BldgType_Duplex         0.680844
BedroomAbvGr            0.676620
HouseStyle_2.5Unf       0.675562
MSSubClass_75           0.675562
BsmtCond                0.633713
BsmtQual                0.633713
          

In [49]:
#Add MSSubClass_45 to the removed features, drop the whole list from the original frame,
#and rebuild the correlation matrix
RemovedFeatures = ['MSSubClass_90', 'SaleCondition_Partial', 'MSSubClass_190', 'GarageCond', 'MSSubClass_80', 'MSSubClass_50',
                  'MSSubClass_45']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [50]:
#Check the matrix out, eliminate a variable then rerun the process:
np.max(corrDF0).sort_values(ascending=False)
#We are now at correlation values below 0.9, though many rough guides offer 0.7-0.75 as the target range.
#For PoolQC and PoolArea, 0.9 is too high, remove PoolQC
corrDF0['PoolQC'].sort_values(ascending=False)

PoolQC                  0.899924
PoolArea                0.899924
GarageArea              0.882475
GarageCars              0.882475
FireplaceQu             0.863241
Fireplaces              0.863241
Neighborhood_Somerst    0.862807
MSZoning_FV             0.862807
RoofMatl_Tar&Grv        0.834914
RoofStyle_Flat          0.834914
TotRmsAbvGrd            0.820088
TotalSF                 0.820088
BldgType_TwnhsE         0.778684
MSSubClass_120          0.778684
HouseStyle_2Story       0.762743
MSSubClass_60           0.762743
HouseStyle_SFoyer       0.730862
MSSubClass_85           0.730862
ExterQual               0.726278
OverallQual             0.726278
KitchenQual             0.716122
BldgType_Duplex         0.680844
KitchenAbvGr            0.680844
BedroomAbvGr            0.676620
MSSubClass_75           0.675562
HouseStyle_2.5Unf       0.675562
BsmtCond                0.633713
BsmtQual                0.633713
Exterior_BrkComm        0.627945
Neighborhood_NPkVill    0.627945
          

In [51]:
#Add PoolQC to the removed features, drop the whole list from the original frame,
#and rebuild the correlation matrix
RemovedFeatures = ['MSSubClass_90', 'SaleCondition_Partial', 'MSSubClass_190', 'GarageCond', 'MSSubClass_80', 'MSSubClass_50',
                  'MSSubClass_45', 'PoolQC']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [52]:
#Check the matrix out, eliminate a variable then rerun the process:
np.max(corrDF0).sort_values(ascending=False)
#Garage cars and area are naturally highly correlated. I think SF is a more precise measure, remove GarageCars
corrDF0['GarageArea'].sort_values(ascending=False)

GarageArea              0.882475
GarageCars              0.882475
FireplaceQu             0.863241
Fireplaces              0.863241
Neighborhood_Somerst    0.862807
MSZoning_FV             0.862807
RoofMatl_Tar&Grv        0.834914
RoofStyle_Flat          0.834914
TotRmsAbvGrd            0.820088
TotalSF                 0.820088
BldgType_TwnhsE         0.778684
MSSubClass_120          0.778684
HouseStyle_2Story       0.762743
MSSubClass_60           0.762743
HouseStyle_SFoyer       0.730862
MSSubClass_85           0.730862
ExterQual               0.726278
OverallQual             0.726278
KitchenQual             0.716122
BldgType_Duplex         0.680844
KitchenAbvGr            0.680844
BedroomAbvGr            0.676620
HouseStyle_2.5Unf       0.675562
MSSubClass_75           0.675562
BsmtCond                0.633713
BsmtQual                0.633713
Exterior_BrkComm        0.627945
Neighborhood_NPkVill    0.627945
BldgType_Twnhs          0.620936
MSSubClass_160          0.620936
          

In [53]:
#Add GarageCars to the removed features, drop the whole list from the original frame,
#and rebuild the correlation matrix
RemovedFeatures = ['MSSubClass_90', 'SaleCondition_Partial', 'MSSubClass_190', 'GarageCond', 'MSSubClass_80', 'MSSubClass_50',
                  'MSSubClass_45', 'PoolQC', 'GarageCars']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [54]:
#Check the matrix out, eliminate a variable then rerun the process:
#FireplaceQu is related to the number of fireplaces. Reading through a detailed description of why this is, the conclusion is that
#FireplaceQu would be the slightly better indicator. Often, the "2nd" fireplace would be a small pre-fab fireplace or Franklin
#Stove in the basement. FireplaceQu lists the quality of the best fireplace in the house. We should keep that.
#(These notes are comments rather than a trailing string literal, so the cell's displayed value is the
#correlation series below and not the text of the note.)
np.max(corrDF0).sort_values(ascending=False)
corrDF0['FireplaceQu'].sort_values(ascending=False)

FireplaceQu             0.863241
Fireplaces              0.863241
Neighborhood_Somerst    0.862807
MSZoning_FV             0.862807
RoofMatl_Tar&Grv        0.834914
RoofStyle_Flat          0.834914
TotRmsAbvGrd            0.820088
TotalSF                 0.820088
BldgType_TwnhsE         0.778684
MSSubClass_120          0.778684
HouseStyle_2Story       0.762743
MSSubClass_60           0.762743
HouseStyle_SFoyer       0.730862
MSSubClass_85           0.730862
ExterQual               0.726278
OverallQual             0.726278
KitchenQual             0.716122
KitchenAbvGr            0.680844
BldgType_Duplex         0.680844
BedroomAbvGr            0.676620
MSSubClass_75           0.675562
HouseStyle_2.5Unf       0.675562
BsmtCond                0.633713
BsmtQual                0.633713
Exterior_BrkComm        0.627945
Neighborhood_NPkVill    0.627945
BldgType_Twnhs          0.620936
MSSubClass_160          0.620936
TotalBath               0.603942
YearsAgoBuilt           0.592855
          

In [55]:
#Add Fireplaces to the removed features, drop the whole list from the original frame,
#and rebuild the correlation matrix
RemovedFeatures = ['MSSubClass_90', 'SaleCondition_Partial', 'MSSubClass_190', 'GarageCond', 'MSSubClass_80', 'MSSubClass_50',
                  'MSSubClass_45', 'PoolQC', 'GarageCars', 'Fireplaces']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [57]:
#Check the matrix out, eliminate a variable then rerun the process:
np.max(corrDF0).sort_values(ascending=False)
#Neighborhood_Somerst may be a largely "floating village" residential neighborhood.
#The actual neighborhood should contain more value, so eliminate MSZoning_FV
corrDF0['Neighborhood_Somerst'].sort_values(ascending=False)

Neighborhood_Somerst    0.862807
MSZoning_FV             0.862807
RoofMatl_Tar&Grv        0.834914
RoofStyle_Flat          0.834914
TotRmsAbvGrd            0.820088
TotalSF                 0.820088
MSSubClass_120          0.778684
BldgType_TwnhsE         0.778684
HouseStyle_2Story       0.762743
MSSubClass_60           0.762743
HouseStyle_SFoyer       0.730862
MSSubClass_85           0.730862
ExterQual               0.726278
OverallQual             0.726278
KitchenQual             0.716122
KitchenAbvGr            0.680844
BldgType_Duplex         0.680844
BedroomAbvGr            0.676620
HouseStyle_2.5Unf       0.675562
MSSubClass_75           0.675562
BsmtCond                0.633713
BsmtQual                0.633713
Exterior_BrkComm        0.627945
Neighborhood_NPkVill    0.627945
BldgType_Twnhs          0.620936
MSSubClass_160          0.620936
TotalBath               0.603942
YearsSinceRemodel       0.592855
YearsAgoBuilt           0.592855
LowQualFinSF            0.590462
          

In [58]:
#Add MSZoning_FV to the removed features, drop the whole list from the original frame,
#and rebuild the correlation matrix
RemovedFeatures = ['MSSubClass_90', 'SaleCondition_Partial', 'MSSubClass_190', 'GarageCond', 'MSSubClass_80', 'MSSubClass_50',
                  'MSSubClass_45', 'PoolQC', 'GarageCars', 'Fireplaces', 'MSZoning_FV']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [59]:
#Check the matrix out, eliminate a variable then rerun the process:
np.max(corrDF0).sort_values(ascending=False)
#These two must be mostly related. "Flat" seems a better descriptor than Tar&Grv, however
corrDF0['RoofMatl_Tar&Grv'].sort_values(ascending=False)

RoofMatl_Tar&Grv        0.834914
RoofStyle_Flat          0.834914
TotRmsAbvGrd            0.820088
TotalSF                 0.820088
MSSubClass_120          0.778684
BldgType_TwnhsE         0.778684
HouseStyle_2Story       0.762743
MSSubClass_60           0.762743
HouseStyle_SFoyer       0.730862
MSSubClass_85           0.730862
ExterQual               0.726278
OverallQual             0.726278
KitchenQual             0.716122
KitchenAbvGr            0.680844
BldgType_Duplex         0.680844
BedroomAbvGr            0.676620
MSSubClass_75           0.675562
HouseStyle_2.5Unf       0.675562
BsmtCond                0.633713
BsmtQual                0.633713
Exterior_BrkComm        0.627945
Neighborhood_NPkVill    0.627945
BldgType_Twnhs          0.620936
MSSubClass_160          0.620936
TotalBath               0.603942
YearsAgoBuilt           0.592855
YearsSinceRemodel       0.592855
HouseStyle_2.5Fin       0.590462
LowQualFinSF            0.590462
MasVnrArea              0.569474
          

In [60]:
#Add RoofMatl_Tar&Grv to the removed features, drop the whole list from the original frame,
#and rebuild the correlation matrix
RemovedFeatures = ['MSSubClass_90', 'SaleCondition_Partial', 'MSSubClass_190', 'GarageCond', 'MSSubClass_80', 'MSSubClass_50',
                  'MSSubClass_45', 'PoolQC', 'GarageCars', 'Fireplaces', 'MSZoning_FV', 'RoofMatl_Tar&Grv']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [61]:
#Check the matrix out, eliminate a variable then rerun the process:
# DataFrame.max() is used instead of np.max(corrDF0): NumPy forwards axis=None, which
# pandas >= 2.0 reduces to one scalar (so .sort_values would fail); .max() is column-wise
# on every pandas version. The ranking is printed because a notebook cell only echoes
# its last expression.
print(corrDF0.max().sort_values(ascending=False))
corrDF0['TotRmsAbvGrd'].sort_values(ascending=False) #Total Rooms should naturally correlate with SF, and SF seems a much better descriptor

TotRmsAbvGrd            0.820088
TotalSF                 0.820088
BldgType_TwnhsE         0.778684
MSSubClass_120          0.778684
HouseStyle_2Story       0.762743
MSSubClass_60           0.762743
HouseStyle_SFoyer       0.730862
MSSubClass_85           0.730862
ExterQual               0.726278
OverallQual             0.726278
KitchenQual             0.716122
BldgType_Duplex         0.680844
KitchenAbvGr            0.680844
BedroomAbvGr            0.676620
HouseStyle_2.5Unf       0.675562
MSSubClass_75           0.675562
BsmtCond                0.633713
BsmtQual                0.633713
Exterior_BrkComm        0.627945
Neighborhood_NPkVill    0.627945
BldgType_Twnhs          0.620936
MSSubClass_160          0.620936
TotalBath               0.603942
YearsSinceRemodel       0.592855
YearsAgoBuilt           0.592855
LowQualFinSF            0.590462
HouseStyle_2.5Fin       0.590462
MasVnrType_BrkFace      0.569474
MasVnrArea              0.569474
TotalBsmtSF             0.564299
          

In [62]:
# Add TotRmsAbvGrd to the running list of removed features, then rerun
# CreateCorrelationMatrix on the frame without those columns.
RemovedFeatures = [
    'MSSubClass_90', 'SaleCondition_Partial', 'MSSubClass_190', 'GarageCond',
    'MSSubClass_80', 'MSSubClass_50', 'MSSubClass_45', 'PoolQC', 'GarageCars',
    'Fireplaces', 'MSZoning_FV', 'RoofMatl_Tar&Grv', 'TotRmsAbvGrd',
]
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(columns=RemovedFeatures)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [64]:
#Check the matrix out, eliminate a variable then rerun the process:
# DataFrame.max() instead of np.max(corrDF0): NumPy forwards axis=None, which pandas >= 2.0
# reduces to one scalar (so .sort_values would fail); .max() is column-wise on every version.
corrDF0.max().sort_values(ascending=False)
#The next 3 are all MSSubClasses, which we should eliminate

BldgType_TwnhsE         0.778684
MSSubClass_120          0.778684
HouseStyle_2Story       0.762743
MSSubClass_60           0.762743
HouseStyle_SFoyer       0.730862
MSSubClass_85           0.730862
ExterQual               0.726278
OverallQual             0.726278
KitchenQual             0.716122
KitchenAbvGr            0.680844
BldgType_Duplex         0.680844
HouseStyle_2.5Unf       0.675562
MSSubClass_75           0.675562
BsmtCond                0.633713
BsmtQual                0.633713
Exterior_BrkComm        0.627945
Neighborhood_NPkVill    0.627945
BldgType_Twnhs          0.620936
MSSubClass_160          0.620936
TotalBath               0.603942
TotalSF                 0.603942
YearsSinceRemodel       0.592855
YearsAgoBuilt           0.592855
LowQualFinSF            0.590462
HouseStyle_2.5Fin       0.590462
MasVnrType_BrkFace      0.569474
MasVnrArea              0.569474
TotalBsmtSF             0.564299
GarageArea              0.562022
Neighborhood_OldTown    0.561881
          

In [65]:
# Add three more MSSubClass dummies (120, 85, 60) to the running list of removed
# features, then rerun CreateCorrelationMatrix on the frame without those columns.
RemovedFeatures = [
    'MSSubClass_90', 'SaleCondition_Partial', 'MSSubClass_190', 'GarageCond',
    'MSSubClass_80', 'MSSubClass_50', 'MSSubClass_45', 'PoolQC', 'GarageCars',
    'Fireplaces', 'MSZoning_FV', 'RoofMatl_Tar&Grv', 'TotRmsAbvGrd',
    'MSSubClass_120', 'MSSubClass_85', 'MSSubClass_60',
]
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(columns=RemovedFeatures)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [68]:
#Check the matrix out, eliminate a variable then rerun the process:
# DataFrame.max() instead of np.max(corrDF0): NumPy forwards axis=None, which pandas >= 2.0
# reduces to one scalar (so .sort_values would fail); .max() is column-wise on every version.
print(corrDF0.max().sort_values(ascending=False))
'''Exterior, Kitchen, and Overall quality are significantly correlated with each other (~0.7), but there is not enough
information here to decide for sure whether to eliminate any. They should be kept for now and moved to the Feature Selection
part of the analysis'''

ExterQual               0.726278
OverallQual             0.726278
KitchenQual             0.716122
BldgType_Duplex         0.680844
KitchenAbvGr            0.680844
MSSubClass_75           0.675562
HouseStyle_2.5Unf       0.675562
BsmtCond                0.633713
BsmtQual                0.633713
Exterior_BrkComm        0.627945
Neighborhood_NPkVill    0.627945
BldgType_Twnhs          0.620936
MSSubClass_160          0.620936
TotalSF                 0.603942
TotalBath               0.603942
YearsSinceRemodel       0.592855
YearsAgoBuilt           0.592855
HouseStyle_2.5Fin       0.590462
LowQualFinSF            0.590462
MasVnrArea              0.569474
MasVnrType_BrkFace      0.569474
TotalBsmtSF             0.564299
GarageArea              0.562022
Neighborhood_OldTown    0.561881
MSZoning_RM             0.561881
GarageQual              0.558938
GarageFinish            0.556863
Foundation_BrkTil       0.554742
LotArea                 0.540380
LandSlope_Sev           0.540380
          

'Exterior, Kitchen, and Overall quality are significantly correlated with each other (~0.7), but there is not enough\ninformation here to decide for sure whether to eliminate any. They should be kept for now and moved to the Feature Selection\npart of the analysis'

In [69]:
# (rows, columns) of the correlation-reduced frame; the column count still includes
# the 'SalePrice' target, which is only dropped when the design matrix is built.
AmesDummiesMultiReduction.shape

(1460, 143)

In [70]:
'''After removing obvious features, our DF went from 187 to 158 feature columns. Applying a correlation analysis, this number
was reduced to 142. These remaining columns can now be subjected to either Backward or Forward feature selection techniques
to decide which ones contribute meaningfully to our model'''

'After removing obvious features, our DF went from 187 to 158 feature columns. Applying a correlation analysis, this number\nwas reduced to 142. These remaining columns can bow be subjected to either Backward or Forward feature selection techniques\nto decide which ones contribute meaningfully to our model'

In [74]:
#We see that the DF we have created, eliminating >40 sparse and co-linear features, still retains a high R^2 (0.883) and lowered AIC
# NOTE(review): the summary printed by this cell reports AIC 3.424e+04, while the first
# full model at the top of the notebook reports 3.396e+04 -- confirm which model the
# "lowered AIC" claim is measured against.
# Predictors and response are both taken from the reduced frame so their rows are
# guaranteed to correspond (the response previously came from AmesDummiesOrdinal).
AmesDummiesOrdinalX = AmesDummiesMultiReduction.drop('SalePrice', axis=1)
AmesDummiesOrdinalY = AmesDummiesMultiReduction['SalePrice']

X = AmesDummiesOrdinalX
Y = AmesDummiesOrdinalY

# Fit OLS with an explicit intercept column and print the regression report.
X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.883
Model:                            OLS   Adj. R-squared:                  0.871
Method:                 Least Squares   F-statistic:                     70.10
Date:                Mon, 12 Nov 2018   Prob (F-statistic):               0.00
Time:                        16:01:17   Log-Likelihood:                -16977.
No. Observations:                1460   AIC:                         3.424e+04
Df Residuals:                    1317   BIC:                         3.500e+04
Df Model:                         142                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                   -9.7e+

## Now, we need to select features to use in our analysis. Let's first try a method of Forward selection, where we repeatedly add the feature that lowers AIC the most, until no remaining feature lowers it any further.

In [75]:
#Define method to add features one at a time based on which subtract the most from AIC:
def AddFeatureListbyAIC(df, dependent):
    """Greedy forward feature selection for an OLS model, driven by AIC.

    Starting from an intercept-only model, each round fits one OLS model per
    not-yet-selected column (selected columns + that column + intercept) and
    adds the column whose model has the lowest AIC. Selection stops when no
    candidate gives an AIC strictly lower than the current model, or when no
    candidates are left.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the response column and every candidate feature column.
        It is not modified.
    dependent : str
        Name of the response column in ``df``.

    Returns
    -------
    pandas.DataFrame
        One row per added feature, in the order added, with columns
        ``'CreatedFeatures'`` (feature name) and ``'NewFScore'`` (the model AIC
        right after that feature was added; the column name is historical and
        kept for existing callers). Empty when nothing improves on the
        intercept-only model.
    """
    df2X = df.drop(dependent, axis=1)
    df2Y = df[dependent]
    candidates = list(df2X.columns)
    selected = []
    aic_trace = []

    while True:
        remaining = [name for name in candidates if name not in selected]
        if not remaining:
            # Every column is already in the model (or there were none at all).
            break

        # AIC of the current model. For the first round this is the intercept-only
        # model, so a feature must genuinely beat "no features" to be added
        # (previously a hard-coded 1000000 stood in for this baseline).
        if selected:
            base_design = sm.add_constant(df2X[selected])
        else:
            base_design = pd.DataFrame({'const': np.ones(len(df2Y))}, index=df2Y.index)
        base_aic = sm.OLS(df2Y, base_design).fit().aic

        # AIC of the current model extended by each remaining candidate in turn.
        trial_aics = []
        for name in remaining:
            trial_design = sm.add_constant(df2X[selected + [name]])
            trial_aics.append(sm.OLS(df2Y, trial_design).fit().aic)

        best_aic = min(trial_aics)
        if not best_aic < base_aic:
            # No candidate strictly lowers AIC: selection is finished.
            break

        # First candidate achieving the minimum wins (same tie-break as list.index).
        selected.append(remaining[trial_aics.index(best_aic)])
        aic_trace.append(best_aic)

    resultDF = pd.DataFrame({'CreatedFeatures': np.array(selected), 'NewFScore': np.array(aic_trace)})

    return resultDF

In [76]:
# Run the forward AIC selection on the correlation-reduced frame, with 'SalePrice' as the response.
AmesDummiesForwardAICList = AddFeatureListbyAIC(AmesDummiesMultiReduction, 'SalePrice')

In [77]:
# Display the selected features in the order they were added; 'NewFScore' holds the
# model AIC recorded right after each addition.
AmesDummiesForwardAICList

Unnamed: 0,CreatedFeatures,NewFScore
0,OverallQual,35657.492532
1,TotalSF,35250.778438
2,BsmtExposure,35105.287270
3,KitchenQual,34980.031887
4,TotalBsmtSF,34896.740611
5,Neighborhood_NridgHt,34838.120854
6,Neighborhood_NoRidge,34765.791609
7,MSZoning_RM,34718.798580
8,Neighborhood_StoneBr,34679.858775
9,GarageArea,34641.847707


In [78]:
#This could be a promising method to use, with an R^2 of still 0.88 despite eliminating more than half the features
# Keep only the forward-selected feature columns plus the SalePrice response.
AmesDummiesForwardAIC = pd.concat([AmesDummiesMultiReduction[list(AmesDummiesForwardAICList['CreatedFeatures'])],
                                              AmesDummiesMultiReduction['SalePrice']], axis=1)

AmesDummiesOrdinalX = AmesDummiesForwardAIC.drop('SalePrice', axis=1)
AmesDummiesOrdinalY = AmesDummiesForwardAIC['SalePrice']

# statsmodels is already imported as `sm` in the first cell, so it is not re-imported here.
X = AmesDummiesOrdinalX
Y = AmesDummiesOrdinalY

# Fit OLS with an explicit intercept column and print the regression report.
X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.880
Model:                            OLS   Adj. R-squared:                  0.874
Method:                 Least Squares   F-statistic:                     152.2
Date:                Mon, 12 Nov 2018   Prob (F-statistic):               0.00
Time:                        16:11:42   Log-Likelihood:                -16997.
No. Observations:                1460   AIC:                         3.413e+04
Df Residuals:                    1392   BIC:                         3.449e+04
Df Model:                          67                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                -1.061e+05 

In [80]:
'''Looking at the above statsmodels report 1)MSZoning_RM, though added early, must have been superseded by a later feature
as it is completely insignificant, and 2) The final 10 features are also insignificant and likely contributing just as much
noise as value to the model. Let's remove these 11 features and use this as our final feature list.'''

"Looking at the above statsmodels report 1)MSZoning_RM, though added early, must have been superseded by a later feature\nas it is completely insignificant, and 2) The final 10 feaures are also insignificant and likely contributing just as much\nnoise as value to the model. Let's remove these 11 features and use this as our final feature list."

In [82]:
# Drop the features judged insignificant in the previous regression report.
# NOTE(review): the narrative cell above speaks of removing 11 features, but this list
# has 10 entries (and Df Model goes from 67 to 57) -- confirm whether one is missing.
AmesDummiesSelect = AmesDummiesForwardAIC.drop(['MSZoning_RM', 'MasVnrType_BrkCmn', 'Exterior_WdShing', 'LotShape_IR2',
                                                   'LotConfig_FR2', 'GarageType_2Types', 'LandContour_HLS', 'MSSubClass_75',
                                                   'HouseStyle_2.5Fin', 'BedroomAbvGr'], axis=1)

AmesDummiesSelectX = AmesDummiesSelect.drop('SalePrice', axis=1)
AmesDummiesSelectY = AmesDummiesSelect['SalePrice']

# statsmodels is already imported as `sm` in the first cell, so it is not re-imported here.
X = AmesDummiesSelectX
Y = AmesDummiesSelectY

# Fit OLS with an explicit intercept column and print the regression report.
X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.878
Model:                            OLS   Adj. R-squared:                  0.873
Method:                 Least Squares   F-statistic:                     176.7
Date:                Mon, 12 Nov 2018   Prob (F-statistic):               0.00
Time:                        16:21:01   Log-Likelihood:                -17009.
No. Observations:                1460   AIC:                         3.413e+04
Df Residuals:                    1402   BIC:                         3.444e+04
Df Model:                          57                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                -1.053e+05 

In [83]:
'''Having Central Air would seem to be a significant boost to a home, yet it vanishes in nearly every method used. We 
wanted to verify that adding it here would serve to lower the quality of the analysis, as somewhat of a test. Again, it
did. This suggests that other co-linear factors absorb Central Air and render that metric insignificant, which is surprising.'''
# Selected features plus the CentralAir_N dummy, used only for this check.
AmesDummiesSelecttest = pd.concat([AmesDummiesSelect, AmesDummiesOrdinal['CentralAir_N']], axis=1)

# NOTE(review): this rebinds AmesDummiesSelectX to the frame that includes CentralAir_N,
# so any later cell reading AmesDummiesSelectX sees the test column too -- confirm intended.
AmesDummiesSelectX = AmesDummiesSelecttest.drop('SalePrice', axis=1)
AmesDummiesSelectY = AmesDummiesSelect['SalePrice']

# statsmodels is already imported as `sm` in the first cell, so it is not re-imported here.
X = AmesDummiesSelectX
Y = AmesDummiesSelectY

# Fit OLS with an explicit intercept column and print the regression report.
X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.878
Model:                            OLS   Adj. R-squared:                  0.873
Method:                 Least Squares   F-statistic:                     173.6
Date:                Mon, 12 Nov 2018   Prob (F-statistic):               0.00
Time:                        16:22:56   Log-Likelihood:                -17009.
No. Observations:                1460   AIC:                         3.414e+04
Df Residuals:                    1401   BIC:                         3.445e+04
Df Model:                          58                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                -1.059e+05 

## Would our selected features differ drastically if we selected using Backwards selection?  If so, we should take any differences into account

In [84]:
#Define method to remove features one at a time, each time dropping the one whose removal lowers AIC the most:
def TrimFeatureListByAIC(df, dependent):
    """Greedy backward feature elimination for an OLS model, driven by AIC.

    Starting from the model with every feature column (plus intercept), each
    round refits the model once per remaining column with that column left
    out, and removes the column whose omission yields the lowest AIC.
    Elimination stops when no single removal gives an AIC strictly lower than
    the current model, or when no feature columns remain.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the response column and every feature column. It is not
        modified.
    dependent : str
        Name of the response column in ``df``.

    Returns
    -------
    pandas.DataFrame
        One row per removed feature, in the order removed, with columns
        ``'RemovedFeatures'`` (feature name) and ``'NewFScore'`` (the model AIC
        right after that removal; the column name is historical and kept for
        existing callers). Empty when no removal lowers AIC.
    """
    df2X = df.drop(dependent, axis=1)
    df2Y = df[dependent]
    removed = []
    aic_trace = []

    # Looping on the column count also covers a frame with no feature columns,
    # which previously reached the comparison with no trial AICs computed.
    while df2X.shape[1] > 0:
        base_aic = sm.OLS(df2Y, sm.add_constant(df2X)).fit().aic

        # AIC of the current model with each remaining feature left out in turn.
        candidates = list(df2X.columns)
        trial_aics = []
        for name in candidates:
            trial_design = sm.add_constant(df2X.drop(labels=name, axis=1))
            trial_aics.append(sm.OLS(df2Y, trial_design).fit().aic)

        best_aic = min(trial_aics)
        if not best_aic < base_aic:
            # No single removal strictly lowers AIC: elimination is finished.
            break

        # First candidate achieving the minimum wins (same tie-break as list.index).
        dropped = candidates[trial_aics.index(best_aic)]
        removed.append(dropped)
        aic_trace.append(best_aic)
        df2X = df2X.drop(dropped, axis=1)

    resultDF = pd.DataFrame({'RemovedFeatures': np.array(removed), 'NewFScore': np.array(aic_trace)})

    return resultDF

In [86]:
# Run the backward AIC elimination on the same correlation-reduced frame, with 'SalePrice' as the response.
AmesDummiesBackwardAICList = TrimFeatureListByAIC(AmesDummiesMultiReduction, 'SalePrice')

In [87]:
# Display the eliminated features in the order they were removed; 'NewFScore' holds the
# model AIC recorded right after each removal.
AmesDummiesBackwardAICList

Unnamed: 0,RemovedFeatures,NewFScore
0,EnclosedPorch,34237.524033
1,MSZoning_RM,34235.524270
2,Neighborhood_SWISU,34233.524832
3,Exterior_BrkComm,34231.525616
4,Foundation_Stone,34229.528812
5,CentralAir_N,34227.533086
6,Fence,34225.537305
7,Foundation_CBlock,34223.542765
8,MiscVal,34221.549659
9,Condition_RRNn,34219.558993
