In [1]:
#Import packages
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
lm = LinearRegression()

import statsmodels.api as sm

from statsmodels.stats.outliers_influence import variance_inflation_factor

#Load our two Data Frames
AmesDummiesOrdinal = pd.read_csv('AmesOrdinalManualFeatures.csv')

In [2]:
np.random.seed(19)
testIdxes = np.random.choice(range(1458), size= 292, replace=False)
trainIdxes = list(set(range(1458))-set(testIdxes))

In [3]:
#Take out the train dataset from the overall AmesDummiesOrdinalDF:
AmesDummiesOrdinal = AmesDummiesOrdinal.iloc[0:1458,]

## First, let's remove obvious features that will not affect our analysis

In [4]:
#Let's try the most basic linear regression, just to get a sense of what the results look like:
#THIS IS WITH THE WHOLE DATA SET -- WE WILL CUT TO OUR TRAIN SET NEXT
AmesDummiesOrdinalX = AmesDummiesOrdinal.drop('SalePrice', axis=1)
AmesDummiesOrdinalY = AmesDummiesOrdinal['SalePrice']

X = AmesDummiesOrdinalX
Y = AmesDummiesOrdinalY

X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.920
Model:                            OLS   Adj. R-squared:                  0.911
Method:                 Least Squares   F-statistic:                     94.91
Date:                Sat, 17 Nov 2018   Prob (F-statistic):               0.00
Time:                        12:02:03   Log-Likelihood:                -16676.
No. Observations:                1458   AIC:                         3.367e+04
Df Residuals:                    1299   BIC:                         3.451e+04
Df Model:                         158                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                 -1.866e+

In [5]:
#Let's get rid of "Id" (the index from the Processing DF), and any features with 5 or fewer observations
AmesDummiesOrdinal = AmesDummiesOrdinal.drop('Id', axis=1)

AmesDummiesOrdinalX = AmesDummiesOrdinal.drop('SalePrice', axis=1)
AmesDummiesOrdinalY = AmesDummiesOrdinal['SalePrice']

import statsmodels.api as sm
X = AmesDummiesOrdinalX
Y = AmesDummiesOrdinalY

X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.920
Model:                            OLS   Adj. R-squared:                  0.911
Method:                 Least Squares   F-statistic:                     95.59
Date:                Sat, 17 Nov 2018   Prob (F-statistic):               0.00
Time:                        12:02:05   Log-Likelihood:                -16676.
No. Observations:                1458   AIC:                         3.367e+04
Df Residuals:                    1300   BIC:                         3.450e+04
Df Model:                         157                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                 -1.864e+

In [6]:
#Now, to Feature Select, let's trim our AmesDummiesOrdinal to the Train subset:
AmesDummiesOrdinalAll = AmesDummiesOrdinal.iloc[trainIdxes,]
AmesDummiesOrdinal = AmesDummiesOrdinalAll.copy()

In [7]:
#Start Trimming Values from DFSelect:
np.sum(AmesDummiesOrdinal).sort_values()

Exterior_Other           0.000000e+00
Electrical_Mix           0.000000e+00
Exterior_CBlock          0.000000e+00
RoofMatl_Metal           0.000000e+00
RoofMatl_ClyTile         0.000000e+00
Neighborhood_Blueste     1.000000e+00
RoofMatl_Roll            1.000000e+00
RoofMatl_Membran         1.000000e+00
Heating_Floor            1.000000e+00
SaleCondition_AdjLand    2.000000e+00
Heating_OthW             2.000000e+00
Heating_Wall             2.000000e+00
RoofStyle_Shed           2.000000e+00
Foundation_Wood          3.000000e+00
LotConfig_FR3            3.000000e+00
Electrical_FuseP         3.000000e+00
Exterior_AsphShn         3.000000e+00
SaleType_Oth             3.000000e+00
SaleType_CWD             3.000000e+00
Street_Grvl              4.000000e+00
Exterior_Stone           4.000000e+00
GarageType_2Types        5.000000e+00
RoofMatl_WdShake         5.000000e+00
Foundation_Stone         5.000000e+00
Heating_Grav             5.000000e+00
Neighborhood_NPkVill     6.000000e+00
RoofMatl_WdS

In [8]:
#Remove 19 lowest features with three or fewer entries:
#No change at all from our initial R^2 score(0.921)
AmesDummiesOrdinal = AmesDummiesOrdinal.drop(list(np.sum(AmesDummiesOrdinal).sort_values()[0:19].index), axis=1)

AmesDummiesOrdinalX = AmesDummiesOrdinal.drop('SalePrice', axis=1)
AmesDummiesOrdinalY = AmesDummiesOrdinal['SalePrice']

import statsmodels.api as sm
X = AmesDummiesOrdinalX
Y = AmesDummiesOrdinalY

X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.921
Model:                            OLS   Adj. R-squared:                  0.910
Method:                 Least Squares   F-statistic:                     85.90
Date:                Sat, 17 Nov 2018   Prob (F-statistic):               0.00
Time:                        12:02:08   Log-Likelihood:                -13368.
No. Observations:                1166   AIC:                         2.702e+04
Df Residuals:                    1026   BIC:                         2.772e+04
Df Model:                         139                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                 -1.799e+

In [9]:
#At this point, we have 140 possible features for the trimmed dataset, 159 features for the All dataset.
print(AmesDummiesOrdinal.shape)
print(AmesDummiesOrdinalAll.shape)

(1166, 140)
(1166, 159)


## Now, let's look for features with high multicollinearity and address them. We could manually do this by removing features with high VIF one at a time, or could manually search through the correlation of each variable with others first. Let's compare the two methods

In [10]:
#This is an imported function, found online, to check VIF for a given DF and remove features with too high a VIF:

def calculate_vif_(X, thresh=10):
    X2 = X.copy()
    cols = X2.columns
    variables = np.arange(X2.shape[1])
    dropped=True
    while dropped:
        dropped=False
        c = X2[cols[variables]].values
        vif = [variance_inflation_factor(c, ix) for ix in np.arange(c.shape[1])]

        maxloc = vif.index(max(vif))
        if max(vif) > thresh:
            print('dropping \'' + X2[cols[variables]].columns[maxloc] + '\' at index: ' + str(maxloc) + " -- VIF: " + str(max(vif)))
            variables = np.delete(variables, maxloc)
            dropped=True

    print('Remaining variables:')
    print(X2.columns[variables])

    return X2[cols[variables]]

In [11]:
#Create an AmesDummies DF with VIF's under 10, removing one at a time, using the above function:
AmesDummiesVIFUnder10 = calculate_vif_(AmesDummiesOrdinal.drop('SalePrice', axis=1), thresh=10)

dropping 'Utilities' at index: 3 -- VIF: 799.8574542479943
dropping 'Functional' at index: 19 -- VIF: 160.04660937781313
dropping 'GarageScore' at index: 30 -- VIF: 130.7361558624787
dropping 'ExterQual' at index: 7 -- VIF: 125.69861239118016
dropping 'TotRmsAbvGrd' at index: 17 -- VIF: 109.64161499897851
dropping 'ExterCond' at index: 7 -- VIF: 101.75529992900287
dropping 'BsmtCond' at index: 8 -- VIF: 97.3173282458821
dropping 'OverallQual' at index: 4 -- VIF: 92.9846722282848
dropping 'KitchenAbvGr' at index: 12 -- VIF: 80.13945739032549
dropping 'TotalSF' at index: 25 -- VIF: 80.00294891113893
dropping 'KitchenQual' at index: 12 -- VIF: 68.2822819026303
dropping 'BsmtQual' at index: 6 -- VIF: 65.6339848805433
dropping 'SaleCondition_Partial' at index: 102 -- VIF: 58.581472829636496
dropping 'OverallCond' at index: 4 -- VIF: 42.85906372995171
dropping 'HeatingQC' at index: 7 -- VIF: 33.141888114020745
dropping 'LandSlope' at index: 3 -- VIF: 31.572448463235332
dropping 'TotalBath' a

In [12]:
'''This method removed a number of variables that one would think would be important for our model -- age of house, TotalSF,
size of garage, number of rooms/baths, basement and garage quality, etc. It may not know which of two similar features to 
discriminate on, and be choosing more obscure ones in favor of clearer and more descriptive variables. Let's try doing this 
manually.'''

"This method removed a number of variables that one would think would be important for our model -- age of house, TotalSF,\nsize of garage, number of rooms/baths, basement and garage quality, etc. It may not know which of two similar features to \ndiscriminate on, and be choosing more obscure ones in favor of clearer and more descriptive variables. Let's try doing this \nmanually."

In [13]:
#Devise a function to produce a correlation matrix for our feature DF, then go manually
def CreateCorrelationMatrix(df, dependent):
    df2 = df.drop(dependent, axis=1)
    for i in range(len(df2.columns)):
        corrarray = []
        indexarray = []
        for j in range(len(df2.columns)):
            corr12 = df2[df2.columns[i]].corr(df2[df2.columns[j]])
            corrarray.append(corr12)
            indexarray.append(df2.columns[j])
        seriesi = pd.Series(corrarray, index=indexarray)
        
        if i > 0:
            corrDF = pd.concat([corrDF, seriesi], axis=1)
        
        else:
            corrDF = pd.DataFrame(seriesi)
     
    #Rename the columns to be the same as the indices (a self matrix)
    corrDF.columns = corrDF.index
    
    #reset all self-covariances to 0
    for var in corrDF.columns:
        corrDF.loc[var, var] = 0
    
    return corrDF   

In [14]:
#Create the overall correlation matrix, first with all features:
corrDF0 = CreateCorrelationMatrix(AmesDummiesOrdinal, 'SalePrice')

In [15]:
#Check the matrix out, eliminate a variable then rerun the process:
np.max(corrDF0).sort_values(ascending=False)
corrDF0['SaleCondition_Partial'].sort_values(ascending=False) #ConditionPartial and SaleTtpe_New are identical, remove PArtial

SaleType_New             0.989498
ExterQual                0.386580
MasVnrType_Stone         0.345491
KitchenQual              0.334770
BsmtQual                 0.329907
OverallQual              0.320584
Neighborhood_Somerst     0.308288
GarageArea               0.306883
Neighborhood_NridgHt     0.273002
HeatingQC                0.256347
TotalBsmtSF              0.249306
MSZoning_FV              0.233403
OpenPorchSF              0.178578
TotalBath                0.166290
FireplaceQu              0.162773
TotalSF                  0.159605
BsmtExposure             0.156860
MasVnrArea               0.155437
GarageScore              0.152904
TotRmsAbvGrd             0.140202
Exterior_CemntBd         0.120722
Neighborhood_StoneBr     0.119965
YearsSinceSale           0.119783
Neighborhood_Blmngtn     0.109265
BsmtCond                 0.105650
LandContour_HLS          0.105544
GarageType_BuiltIn       0.104210
LotFrontage              0.089330
MoSold_Autumn            0.083060
PavedDrive    

In [16]:
#Remove MSSubClass_90, rerun the analysis, and create a list of removed features
RemovedFeatures = ['SaleCondition_Partial']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [17]:
#Check the matrix out, eliminate a variable then rerun the process:
np.max(corrDF0).sort_values(ascending=False)
corrDF0['PoolQC'].sort_values(ascending=False) #PoolQC is highly correlated with area. Remove one

PoolArea                 0.865471
Exterior_ImStucc         0.227122
Fence_GdPrv              0.170041
LotFrontage              0.158238
TotalSF                  0.145239
LowQualFinSF             0.131141
SaleCondition_Abnorml    0.126722
EnclosedPorch            0.122838
Exterior_Stucco          0.114928
Neighborhood_NoRidge     0.097979
TotalBath                0.092250
Condition_Artery         0.088702
OverallQual              0.074093
TotalBsmtSF              0.074083
Fireplaces               0.068902
LotConfig_Corner         0.066639
BedroomAbvGr             0.063978
Neighborhood_Mitchel     0.062773
TotRmsAbvGrd             0.060665
HouseStyle_SLvl          0.054707
Fence_Minimal            0.052148
GarageArea               0.050147
KitchenQual              0.048052
HouseStyle_2Story        0.047995
BsmtScore                0.047967
YearsSinceSale           0.045212
Exterior_Plywood         0.041435
LotArea                  0.041058
FireplaceQu              0.038629
ExterCond     

In [18]:
#Remove SaleCondition_Partial, rerun the analysis, and create a list of removed features
RemovedFeatures = ['SaleCondition_Partial', 'PoolArea']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [19]:
#Check the matrix out, eliminate a variable then rerun the process:
np.max(corrDF0).sort_values(ascending=False)
corrDF0['Fireplaces'].sort_values(ascending=False) #Fireplaces are highly correlated with FireplaceQu. Remove fireplaces

FireplaceQu              0.860687
TotalSF                  0.471399
OverallQual              0.403248
TotalBath                0.336740
TotRmsAbvGrd             0.332021
TotalBsmtSF              0.324257
GarageScore              0.292265
GarageArea               0.272657
ExterQual                0.262464
KitchenQual              0.258422
MasVnrArea               0.246046
LotArea                  0.244124
BsmtQual                 0.230774
LotFrontage              0.219505
Neighborhood_Crawfor     0.199797
WoodDeckSF               0.190946
ScreenPorch              0.172727
LotShape                 0.166882
BsmtExposure             0.166410
OpenPorchSF              0.166013
Neighborhood_NridgHt     0.161327
PavedDrive               0.149161
BsmtScore                0.148121
MasVnrType_BrkFace       0.145348
RoofStyle_Hip            0.140405
HeatingQC                0.139617
Neighborhood_NoRidge     0.132460
GarageType_BuiltIn       0.130016
Neighborhood_NWAmes      0.124932
BedroomAbvGr  

In [20]:
#Remove MSSubClass_190, rerun the analysis, and create a list of removed features
RemovedFeatures = ['SaleCondition_Partial', 'PoolArea', 'Fireplaces']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [21]:
#Check the matrix out, eliminate a variable then rerun the process:
np.max(corrDF0).sort_values(ascending=False)
#One other cause for concern is Kitchen Abv Gr, which looks like, and should, highly correlate with duplexes"
RemovedFeatures = ['SaleCondition_Partial', 'PoolArea', 'Fireplaces', 'KitchenAbvGr']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
AmesDummiesOrdinal = AmesDummiesMultiReduction.copy()

In [22]:
print(AmesDummiesOrdinal.shape)
print(AmesDummiesOrdinalAll.shape)

(1166, 136)
(1166, 159)


In [23]:
'''After removing obvious features, our DF went from 159 to 136 feature columns. These remaining columns can bow be subjected to either Backward or Forward feature selection techniques
to decide which ones contribute meaningfully to our model'''

'After removing obvious features, our DF went from 159 to 136 feature columns. These remaining columns can bow be subjected to either Backward or Forward feature selection techniques\nto decide which ones contribute meaningfully to our model'

In [24]:
#We see that the DF we have created, eliminating >40 sparse and co-linear features, still retains a high R^2 (0.923 to 0.920) and lowered AIC (33670 to 27040)
#Basically no R^2 has been lost, even though we've decreased AIC significantly
AmesDummiesOrdinalX = AmesDummiesOrdinal.drop('SalePrice', axis=1)
AmesDummiesOrdinalY = AmesDummiesOrdinal['SalePrice']

X = AmesDummiesOrdinalX
Y = AmesDummiesOrdinalY

X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.920
Model:                            OLS   Adj. R-squared:                  0.910
Method:                 Least Squares   F-statistic:                     87.81
Date:                Sat, 17 Nov 2018   Prob (F-statistic):               0.00
Time:                        12:25:10   Log-Likelihood:                -13374.
No. Observations:                1166   AIC:                         2.702e+04
Df Residuals:                    1030   BIC:                         2.771e+04
Df Model:                         135                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                 -1.975e+

## Now, we need to select features to use in our analysis. Let's first try a method of Forward selection, where we add features that add the most to AIC, until we have no more features that will add value.

In [25]:
#Define method to add features one at a time based on which subtract the most from AIC:
def AddFeatureListbyAIC(df, dependent):
    AnyPositive = True
    df2 = df.copy()
    df2X = df2.drop(dependent, axis=1)
    df2Y = df2[dependent]
    ListofPossibleFeatures = list(df2X.columns)
    StartingFeatureList = []
    CreatedFeatureList = []
    AICEvolutionList = []
    
    while AnyPositive == True:
        ListOfTriedValues = []
        if len(CreatedFeatureList) > 0:
            X2 = sm.add_constant(df2X[CreatedFeatureList])
            est = sm.OLS(df2Y, X2)
            est2 = est.fit()
            AICBase = est2.aic
            AICList = []
        else:
            AICBase = 1000000
            AICList = []
        
        for i in ListofPossibleFeatures:
            if i in CreatedFeatureList:
                continue
            tempDFX = df2X[CreatedFeatureList]
            tempDFX = pd.concat([tempDFX, df2X[[i]]], axis=1)
            tempX2 = sm.add_constant(tempDFX)
            est = sm.OLS(df2Y, tempX2)
            est2 = est.fit()
            AICList.append(est2.aic)
            AICListN = np.array(AICList)
            ListOfTriedValues.append(i)
            
        if any(AICListN-AICBase < 0) == False:
            break
            
        else:
            index = AICList.index(min(AICList))
            AddedValue = ListOfTriedValues[index]
        CreatedFeatureList.append(AddedValue)
        
        AICEvolutionList.append(AICList[index])
        #df2X = df2X.drop(AddedValue, axis=1)
        StartingFeatureList = list(df2X.columns)
    
    resultDF = pd.DataFrame({'CreatedFeatures': np.array(CreatedFeatureList), 'NewFScore': np.array(AICEvolutionList)})
        
    return resultDF

In [26]:
#Define a method to search for the lowest ideal AIC through patterns of feature addition/substraction:
def FindLowestAICNonLogBackward(df, dependent):
    '''Input: DF to AIC-modify and the dependent variable. WILL RETURN: A tuple: [0] is the modified DF (with dependent)
    and tuple[1] will give you the summary DF'''
    df2 = df.copy()
    df2X = df2.drop(dependent, axis=1)
    df2Y = df2[dependent]
    FeatureList = list(df2X.columns)
    X2 = sm.add_constant(df2X)
    est = sm.OLS(df2Y, X2)
    CurrentAIC = est.fit().aic
    CanBeBetter = True
    ModList = []
    AddedSubtracted = []
    AIC = []
    TriesSinceReset = 0
    
    tempColumnList = list(df2X.columns)
    tempDF2X = df2X[tempColumnList]
    
    while CanBeBetter == True:
        Choice = np.random.choice(list(df2X.columns))
        
        HeadsTails = np.random.randint(2)
        if Choice in tempColumnList:
            HeadsTails = 0
        if Choice not in tempColumnList:
            HeadsTails = 1
        
        if HeadsTails == 1:
            tempColumnList.append(Choice)
            tempDF2X[Choice] = df2X[Choice]
        
        if HeadsTails == 0:

            tempColumnList.remove(Choice)
            tempDF2X = tempDF2X[tempColumnList]
            
        est = sm.OLS(df2Y, sm.add_constant(tempDF2X))
        NewAIC = est.fit().aic
        
        if NewAIC < CurrentAIC:
            TriesSinceReset = 0
            CurrentAIC = NewAIC
            
            if HeadsTails == 1:
                print(Choice + " added: New AIC = " + str(CurrentAIC))
                ModList.append(Choice)
                AddedSubtracted.append('Added')
                AIC.append(CurrentAIC)
                
            if HeadsTails == 0:
                print(Choice + " removed: New AIC = " + str(CurrentAIC))
                ModList.append(Choice)
                AddedSubtracted.append('Subtracted')
                AIC.append(CurrentAIC)
            continue
            
        else:
            TriesSinceReset += 1
            
            if HeadsTails == 1:
                tempColumnList.remove(Choice)
                tempDF2X = tempDF2X[tempColumnList]
                
            if HeadsTails == 0:
                tempColumnList.append(Choice)
                tempDF2X[Choice] = df2X[Choice]
                
            if TriesSinceReset > 250:
                CanBeBetter = False
                
    SummaryDF = pd.DataFrame({'Feature': ModList, 'AddOrSubtract': AddedSubtracted, 'AIC': AIC})
    NewDF = pd.concat([tempDF2X, df2[[dependent]]], axis=1)
    
    return NewDF, SummaryDF  
    

In [27]:
AmesManualBackwardAIC = FindLowestAICNonLogBackward(AmesDummiesOrdinal, 'SalePrice')

FireplaceQu removed: New AIC = 27017.386978578845
SaleCondition_Alloca removed: New AIC = 27016.16462749133
Utilities removed: New AIC = 27015.897022335346
LotShape removed: New AIC = 27013.90848003075
HouseStyle_2.5Unf removed: New AIC = 27013.271110613707
LowQualFinSF removed: New AIC = 27011.377547794367
RoofStyle_Flat removed: New AIC = 27010.697889884963
HouseStyle_1.5Unf removed: New AIC = 27008.711732237454
RoofMatl_Tar&Grv removed: New AIC = 27006.72032626987
Exterior_Plywood removed: New AIC = 27006.117792153946
MSZoning_RM removed: New AIC = 27004.188007009903
Heating_GasW removed: New AIC = 27002.24615634221
MSZoning_RH removed: New AIC = 27000.641331188246
GarageType_Basment removed: New AIC = 26999.25458881549
MiscVal removed: New AIC = 26997.919425613465
Neighborhood_ClearCr removed: New AIC = 26995.91942874648
Utilities added: New AIC = 26995.901048343632
Heating_Grav removed: New AIC = 26994.22674274812
LotFrontage removed: New AIC = 26992.49868233201
Neighborhood_Edwar

In [37]:
AmesManualBackwardAIC[0].to_csv('AmesManualAICBackward.csv')

In [29]:
AmesManualBackwardAIC2 = FindLowestAICNonLogBackward(AmesDummiesOrdinal, 'SalePrice')

LowQualFinSF removed: New AIC = 27017.478168067893
LandSlope removed: New AIC = 27015.48462509706
Fence_GdWo removed: New AIC = 27014.462093945967
LandContour_Bnk removed: New AIC = 27012.93351944234
SaleCondition_Alloca removed: New AIC = 27011.87831169054
Neighborhood_Edwards removed: New AIC = 27009.878994344814
Neighborhood_MeadowV removed: New AIC = 27009.190270246003
LotConfig_Corner removed: New AIC = 27007.444195700555
MoSold_Winter removed: New AIC = 27005.616571633273
PavedDrive removed: New AIC = 27003.817565248184
Alley_Grvl removed: New AIC = 27001.992548565348
Alley_Pave removed: New AIC = 27000.095706603857
MasVnrType_BrkCmn removed: New AIC = 26998.90901065124
GarageType_Basment removed: New AIC = 26996.965056737914
RoofStyle_Gambrel removed: New AIC = 26995.613591987094
HouseStyle_1.5Unf removed: New AIC = 26993.63852671917
LotFrontage removed: New AIC = 26991.778933848753
Heating_Grav removed: New AIC = 26990.087130837117
RoofMatl_Tar&Grv removed: New AIC = 26989.6215

In [30]:
AmesManualBackwardAIC2[0].to_csv('AmesManualAICBackward2.csv')

In [31]:
AmesManualBackwardAIC3 = FindLowestAICNonLogBackward(AmesDummiesOrdinal, 'SalePrice')

WoodDeckSF removed: New AIC = 27018.290796702215
MSZoning_RM removed: New AIC = 27016.314893273833
GarageType_CarPort removed: New AIC = 27015.303666214564
LandContour_Low removed: New AIC = 27014.816753798335
MiscVal removed: New AIC = 27013.324876001392
Fence_GdWo removed: New AIC = 27012.561537660415
MSZoning_FV removed: New AIC = 27012.32465566271
MSZoning_RH removed: New AIC = 27010.576311986202
LotConfig_Corner removed: New AIC = 27008.70311483237
Neighborhood_Veenker removed: New AIC = 27008.376824721337
LotConfig_FR2 removed: New AIC = 27006.652147233424
Exterior_HdBoard removed: New AIC = 27004.73570849426
Heating_GasW removed: New AIC = 27002.816210815716
LandContour_Bnk removed: New AIC = 27001.514234811923
Exterior_Plywood removed: New AIC = 27001.026475369825
MasVnrType_BrkCmn removed: New AIC = 27000.366519846448
RoofStyle_Mansard removed: New AIC = 26999.3928267367
Foundation_BrkTil removed: New AIC = 26998.701348141123
Neighborhood_SawyerW removed: New AIC = 26996.74454

In [32]:
AmesManualBackwardAIC3[0].to_csv('AmesManualAICBackward3.csv')

In [33]:
FeatureIntersect = set(AmesManualBackwardAIC[0].columns) & set(AmesManualBackwardAIC2[0].columns) & set(AmesManualBackwardAIC3[0].columns)
FeatureIntersect = list(FeatureIntersect)
print(len(FeatureIntersect))
FeatureIntersect

62


['Neighborhood_Blmngtn',
 'BldgType_Duplex',
 'GarageType_No',
 'MoSold_Spring',
 'Neighborhood_NPkVill',
 'Neighborhood_Crawfor',
 'ExterCond',
 'BsmtCond',
 'BedroomAbvGr',
 'HouseStyle_SLvl',
 'BldgType_2fmCon',
 'Fence_GdPrv',
 'HouseStyle_1.5Fin',
 'Neighborhood_NridgHt',
 'PoolQC',
 'LotArea',
 'MasVnrArea',
 'OverallQual',
 'ScreenPorch',
 'Exterior_BrkFace',
 'RoofStyle_Hip',
 'Exterior_MetalSd',
 'BsmtExposure',
 'GarageType_Detchd',
 'HouseStyle_2Story',
 'MasVnrType_Stone',
 'GarageScore',
 'Condition_Feedr',
 'TotalBath',
 'Neighborhood_NoRidge',
 'BsmtScore',
 'Functional',
 'TotalSF',
 'Neighborhood_Sawyer',
 'Exterior_CemntBd',
 'RoofMatl_WdShngl',
 'SaleType_New',
 'Neighborhood_BrDale',
 'KitchenQual',
 'YearsAgoBuilt',
 'OverallCond',
 'TotalBsmtSF',
 'Street_Grvl',
 'Condition_Artery',
 'Neighborhood_NWAmes',
 'SaleCondition_Abnorml',
 'BldgType_TwnhsE',
 'BldgType_Twnhs',
 'Neighborhood_StoneBr',
 'Exterior_ImStucc',
 'Neighborhood_BrkSide',
 'MSZoning_FV',
 'MasVnr

In [34]:
manFeatList = ['BldgType_2fmCon',
 'Exterior_Stucco',
 'TotalBath',
 'PoolQC',
 'MasVnrArea',
 'OverallCond',
 'Exterior_BrkFace',
 'Foundation_Slab',
 'BedroomAbvGr',
 'RoofMatl_WdShngl',
 'BsmtScore',
 'BldgType_Duplex',
 'HouseStyle_1.5Fin',
 'HouseStyle_SLvl',
 'Neighborhood_NridgHt',
 'GarageType_BuiltIn',
 'GarageType_No',
 'GarageScore',
 'Neighborhood_StoneBr',
 'Neighborhood_NWAmes',
 'BsmtExposure',
 'Neighborhood_Sawyer',
 'LotArea',
 'ExterCond',
 'Street_Grvl',
 'Neighborhood_BrDale',
 'Exterior_CemntBd',
 'Neighborhood_Blmngtn',
 'LotConfig_CulDSac',
 'BldgType_Twnhs',
 'GarageArea',
 'MoSold_Spring',
 'TotalSF',
 'GarageType_Detchd',
 'MSZoning_FV',
 'OverallQual',
 'KitchenQual',
 'ScreenPorch',
 'RoofStyle_Hip',
 'BsmtCond',
 'Exterior_MetalSd',
 'MasVnrType_BrkFace',
 'Neighborhood_NPkVill',
 'Functional',
 'BldgType_TwnhsE',
 'SalePrice',
 'Exterior_ImStucc',
 'Fence_GdPrv',
 'Neighborhood_BrkSide',
 'SaleType_New',
 'YearsAgoBuilt',
 'HouseStyle_2Story',
 'Neighborhood_Crawfor',
 'TotRmsAbvGrd',
 'TotalBsmtSF',
 'Condition_Artery',
 'Condition_Feedr',
 'Neighborhood_NoRidge',
 'MasVnrType_Stone',
 'SaleCondition_Abnorml',
 'BsmtQual',
 'HouseStyle_SFoyer']

In [35]:
#Create final DF with only these 62 features:
AmesOrdinalManualAICFinal = AmesDummiesOrdinal[FeatureIntersect]
AmesOrdinalManualAICFinal.to_csv('AmesOrdinalManualAICFinal.csv')

In [36]:
#Define a method to search for the lowest ideal AIC through patterns of feature addition/substraction:
def FindLowestAICLogForward(df, dependent, topchoice):
    '''Input: DF to AIC-modify and the dependent variable. WILL RETURN: A tuple: [0] is the modified DF (with dependent)
    and tuple[1] will give you the summary DF'''
    df2 = df.copy()
    df2X = df2.drop(dependent, axis=1)
    df2Y = np.log(df2[dependent])
    FeatureList = list(df2X.columns)
    X2 = sm.add_constant(df2X)
    est = sm.OLS(df2Y, X2)
    CurrentAIC = 1000000
    CanBeBetter = True
    ModList = []
    AddedSubtracted = []
    AIC = []
    TriesSinceReset = 0
    
    tempColumnList = [topchoice]
    tempDF2X = df2X[tempColumnList]
    
    while CanBeBetter == True:
        Choice = np.random.choice(list(df2X.columns))
        
        
        if Choice in tempColumnList:
            HeadsTails = 0
        if Choice not in tempColumnList:
            HeadsTails = 1
        
        if HeadsTails == 1:
            tempColumnList.append(Choice)
            tempDF2X[Choice] = df2X[Choice]
        
        if HeadsTails == 0:

            tempColumnList.remove(Choice)
            tempDF2X = tempDF2X[tempColumnList]
            
        est = sm.OLS(df2Y, sm.add_constant(tempDF2X))
        NewAIC = est.fit().aic
        
        if NewAIC < CurrentAIC:
            TriesSinceReset = 0
            CurrentAIC = NewAIC
            
            if HeadsTails == 1:
                print(Choice + " added: New AIC = " + str(CurrentAIC))
                ModList.append(Choice)
                AddedSubtracted.append('Added')
                AIC.append(CurrentAIC)
                
            if HeadsTails == 0:
                print(Choice + " removed: New AIC = " + str(CurrentAIC))
                ModList.append(Choice)
                AddedSubtracted.append('Subtracted')
                AIC.append(CurrentAIC)
            continue
            
        else:
            TriesSinceReset += 1
            
            if HeadsTails == 1:
                tempColumnList.remove(Choice)
                tempDF2X = tempDF2X[tempColumnList]
                
            if HeadsTails == 0:
                tempColumnList.append(Choice)
                tempDF2X[Choice] = df2X[Choice]
                
            if TriesSinceReset > 250:
                CanBeBetter = False
                
    SummaryDF = pd.DataFrame({'Feature': ModList, 'AddOrSubtract': AddedSubtracted, 'AIC': AIC})
    NewDF = pd.concat([tempDF2X, df2[[dependent]]], axis=1)
    
    return NewDF, SummaryDF  

In [62]:
#This could be a promising method to use, with an R^2 of still 0.88 despite eliminating more than half the features
AmesDummiesForwardAIC = pd.concat([AmesDummiesMultiReduction[list(AmesDummiesForwardAICList['CreatedFeatures'])],
                                              AmesDummiesMultiReduction['SalePrice']], axis=1)

AmesDummiesOrdinalX = AmesDummiesForwardAIC.drop('SalePrice', axis=1)
AmesDummiesOrdinalY = AmesDummiesForwardAIC['SalePrice']

import statsmodels.api as sm
X = AmesDummiesOrdinalX
Y = AmesDummiesOrdinalY

X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.916
Model:                            OLS   Adj. R-squared:                  0.911
Method:                 Least Squares   F-statistic:                     187.3
Date:                Wed, 14 Nov 2018   Prob (F-statistic):               0.00
Time:                        17:29:24   Log-Likelihood:                -13403.
No. Observations:                1166   AIC:                         2.694e+04
Df Residuals:                    1101   BIC:                         2.727e+04
Df Model:                          64                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                 -1.065e+

## Would our selected features differ drastically if we selected using Backwards selection?  If so, we should take any differences into account

In [63]:
#Define method to remove features based on those that most increase AIC:
def TrimFeatureListByAIC(df, dependent):
    AnyPositive = True
    df2 = df.copy()
    df2X = df2.drop(dependent, axis=1)
    df2Y = df2[dependent]
    StartingFeatureList = list(df2X.columns)
    RemovedFeatureList = []
    AICEvolutionList = []
    
    while AnyPositive == True:
        X2 = sm.add_constant(df2X)
        est = sm.OLS(df2Y, X2)
        est2 = est.fit()
        AICBase = est2.aic
        AICList = []
        
        for i in StartingFeatureList:
            tempDFX = df2X.drop(labels=i, axis=1)
            tempX2 = sm.add_constant(tempDFX)
            est = sm.OLS(df2Y, tempX2)
            est2 = est.fit()
            AICList.append(est2.aic)
            AICListN = np.array(AICList)
            
        if any(AICListN-AICBase < 0) == False:
            break
            
        else:
            index = AICList.index(min(AICList))
            RemovedValue = StartingFeatureList[index]
        RemovedFeatureList.append(RemovedValue)
        AICEvolutionList.append(AICList[index])
        df2X = df2X.drop(RemovedValue, axis=1)
        StartingFeatureList = list(df2X.columns)
    
    resultDF = pd.DataFrame({'RemovedFeatures': np.array(RemovedFeatureList), 'NewFScore': np.array(AICEvolutionList)})
        
    return resultDF

In [64]:
AmesDummiesBackwardAICList = TrimFeatureListByAIC(AmesDummiesMultiReduction, 'SalePrice')

In [65]:
AmesDummiesBackwardAICList

Unnamed: 0,RemovedFeatures,NewFScore
0,Neighborhood_Edwards,27049.819357
1,Heating_GasW,27047.822050
2,MSSubClass_160,27045.826337
3,GarageFinish,27043.831954
4,SaleType_ConLw,27041.839705
5,Neighborhood_Mitchel,27039.847851
6,Alley_Grvl,27037.858082
7,Neighborhood_ClearCr,27035.869946
8,MSZoning_RM,27033.884151
9,MoSold_Winter,27031.899149
