In [1]:
#Import packages
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
lm = LinearRegression()

import statsmodels.api as sm

from statsmodels.stats.outliers_influence import variance_inflation_factor

#Load our Data Frame
AmesDummiesOrdinal = pd.read_csv('AmesOrdinalManualFeatures.csv')

In [2]:
#Reproducible hold-out split: 292 of the 1458 rows are sampled (without replacement) for testing
np.random.seed(19)
testIdxes = np.random.choice(range(1458), size= 292, replace=False)
#Everything not sampled is training data. Sort explicitly so the row order does not depend on set iteration order.
trainIdxes = sorted(set(range(1458)) - set(testIdxes))

In [3]:
#Take out the train dataset from the overall AmesDummiesOrdinalDF:
AmesDummiesOrdinal = AmesDummiesOrdinal.iloc[0:1458,]

## First, let's remove obvious features that will not affect our analysis

In [11]:
#Let's try the most basic linear regression, just to get a sense of what the results look like:
#THIS IS WITH THE WHOLE DATA SET -- WE WILL CUT TO OUR TRAIN SET NEXT
AmesDummiesOrdinalX = AmesDummiesOrdinal.drop('SalePrice', axis=1)
AmesDummiesOrdinalY = AmesDummiesOrdinal['SalePrice']

X = AmesDummiesOrdinalX
Y = AmesDummiesOrdinalY

X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.921
Model:                            OLS   Adj. R-squared:                  0.911
Method:                 Least Squares   F-statistic:                     95.24
Date:                Fri, 16 Nov 2018   Prob (F-statistic):               0.00
Time:                        16:33:50   Log-Likelihood:                -16673.
No. Observations:                1458   AIC:                         3.366e+04
Df Residuals:                    1299   BIC:                         3.451e+04
Df Model:                         158                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                 -1.856e+

In [4]:
#Drop "Id" (just the row index carried over from the Processing DF) and refit the baseline OLS.
#Sparse features are trimmed in a later cell, once we are down to the Train subset.
AmesDummiesOrdinal = AmesDummiesOrdinal.drop(columns='Id')

#Split into response and design matrix
AmesDummiesOrdinalY = AmesDummiesOrdinal['SalePrice']
AmesDummiesOrdinalX = AmesDummiesOrdinal.drop(columns='SalePrice')
X, Y = AmesDummiesOrdinalX, AmesDummiesOrdinalY

#sm.OLS does not add an intercept by itself, so append the constant column explicitly
X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.921
Model:                            OLS   Adj. R-squared:                  0.911
Method:                 Least Squares   F-statistic:                     96.02
Date:                Sat, 17 Nov 2018   Prob (F-statistic):               0.00
Time:                        12:55:58   Log-Likelihood:                -16673.
No. Observations:                1458   AIC:                         3.366e+04
Df Residuals:                    1300   BIC:                         3.450e+04
Df Model:                         157                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                 -1.877e+

In [6]:
#Now, to Feature Select, let's trim our AmesDummiesOrdinal to the Train subset:
AmesDummiesOrdinalAll = AmesDummiesOrdinal.iloc[trainIdxes,]
AmesDummiesOrdinal = AmesDummiesOrdinalAll.copy()

In [7]:
#Start Trimming Values from DFSelect: per-column totals, smallest first, so sparse dummies show up at the top.
#axis=0 is spelled out because np.sum(DataFrame) reduces with axis=None, which newer pandas no longer treats as "per column".
AmesDummiesOrdinal.sum(axis=0).sort_values()

Exterior_Other           0.000000e+00
Electrical_Mix           0.000000e+00
Exterior_CBlock          0.000000e+00
RoofMatl_Metal           0.000000e+00
RoofMatl_ClyTile         0.000000e+00
Neighborhood_Blueste     1.000000e+00
RoofMatl_Roll            1.000000e+00
RoofMatl_Membran         1.000000e+00
Heating_Floor            1.000000e+00
SaleCondition_AdjLand    2.000000e+00
Heating_OthW             2.000000e+00
Heating_Wall             2.000000e+00
RoofStyle_Shed           2.000000e+00
Foundation_Wood          3.000000e+00
LotConfig_FR3            3.000000e+00
Electrical_FuseP         3.000000e+00
Exterior_AsphShn         3.000000e+00
SaleType_Oth             3.000000e+00
SaleType_CWD             3.000000e+00
Street_Grvl              4.000000e+00
Exterior_Stone           4.000000e+00
GarageType_2Types        5.000000e+00
RoofMatl_WdShake         5.000000e+00
Foundation_Stone         5.000000e+00
Heating_Grav             5.000000e+00
Neighborhood_NPkVill     6.000000e+00
RoofMatl_WdS

In [8]:
#Remove the sparse features: every column whose total over the Train subset is three or fewer (19 columns here).
#Selecting by threshold instead of by position keeps this cell idempotent -- re-running it does not drop a further 19 columns.
#No change at all from our initial R^2 score(0.921)
featureTotals = AmesDummiesOrdinal.sum(axis=0)
sparseFeatures = list(featureTotals[featureTotals <= 3].index)
AmesDummiesOrdinal = AmesDummiesOrdinal.drop(sparseFeatures, axis=1)

AmesDummiesOrdinalX = AmesDummiesOrdinal.drop('SalePrice', axis=1)
AmesDummiesOrdinalY = AmesDummiesOrdinal['SalePrice']

X = AmesDummiesOrdinalX
Y = AmesDummiesOrdinalY

#Refit the OLS (with intercept) on the trimmed Train subset
X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.921
Model:                            OLS   Adj. R-squared:                  0.911
Method:                 Least Squares   F-statistic:                     86.28
Date:                Sat, 17 Nov 2018   Prob (F-statistic):               0.00
Time:                        12:56:20   Log-Likelihood:                -13365.
No. Observations:                1166   AIC:                         2.701e+04
Df Residuals:                    1026   BIC:                         2.772e+04
Df Model:                         139                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                 -1.837e+

In [9]:
#At this point, we have 140 possible features for the trimmed dataset, 159 features for the All dataset.
print(AmesDummiesOrdinal.shape)
print(AmesDummiesOrdinalAll.shape)

(1166, 140)
(1166, 159)


## Now, let's look for features with high multicollinearity and address them. We could manually do this by removing features with high VIF one at a time, or could manually search through the correlation of each variable with others first. Let's compare the two methods

In [18]:
#This is an imported function, found online, to check VIF for a given DF and remove features with too high a VIF:

def calculate_vif_(X, thresh=10):
    """Iteratively drop the column with the largest VIF until none exceeds ``thresh``.

    Each pass computes the variance inflation factor of every remaining column
    against the other remaining columns. If the largest one is above ``thresh``
    that column is dropped (and reported via ``print``) and the pass repeats.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate feature columns.
    thresh : number, default 10
        Largest VIF a column may have and still be kept.

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` restricted to the surviving columns.
    """
    frame = X.copy()
    all_columns = frame.columns
    keep = np.arange(frame.shape[1])  # positions (into all_columns) still in play

    while True:
        current = frame[all_columns[keep]]
        values = current.values
        vif = [variance_inflation_factor(values, position) for position in np.arange(values.shape[1])]

        worst_vif = max(vif)
        worst_position = vif.index(worst_vif)
        if not worst_vif > thresh:
            # Nothing left above the threshold: stop dropping
            break

        print('dropping \'' + current.columns[worst_position] + '\' at index: ' + str(worst_position) + " -- VIF: " + str(worst_vif))
        keep = np.delete(keep, worst_position)

    print('Remaining variables:')
    print(frame.columns[keep])

    return frame[all_columns[keep]]

In [19]:
#Create an AmesDummies DF with VIF's under 10, removing one at a time, using the above function:
AmesDummiesVIFUnder10 = calculate_vif_(AmesDummiesOrdinal.drop('SalePrice', axis=1), thresh=10)

dropping 'Utilities' at index: 3 -- VIF: 799.7053097271129
dropping 'Functional' at index: 19 -- VIF: 160.1284214210369
dropping 'GarageScore' at index: 30 -- VIF: 130.28664246884142
dropping 'ExterQual' at index: 7 -- VIF: 125.38258316439152
dropping 'TotRmsAbvGrd' at index: 17 -- VIF: 109.6888094010334
dropping 'ExterCond' at index: 7 -- VIF: 101.77547528075166
dropping 'BsmtCond' at index: 8 -- VIF: 97.36602905721142
dropping 'OverallQual' at index: 4 -- VIF: 93.35180797338427
dropping 'KitchenAbvGr' at index: 12 -- VIF: 80.01699691372804
dropping 'TotalSF' at index: 25 -- VIF: 79.85615813636402
dropping 'KitchenQual' at index: 12 -- VIF: 68.48306696448792
dropping 'BsmtQual' at index: 6 -- VIF: 65.6763001779422
dropping 'SaleCondition_Partial' at index: 102 -- VIF: 58.57894640250374
dropping 'OverallCond' at index: 4 -- VIF: 42.764085767112334
dropping 'HeatingQC' at index: 7 -- VIF: 33.007785662779305
dropping 'LandSlope' at index: 3 -- VIF: 31.67214094468473
dropping 'TotalBath' 

In [12]:
'''This method removed a number of variables that one would think would be important for our model -- age of house, TotalSF,
size of garage, number of rooms/baths, basement and garage quality, etc. It may not know which of two similar features to 
discriminate on, and be choosing more obscure ones in favor of clearer and more descriptive variables. Let's try doing this 
manually.'''

"This method removed a number of variables that one would think would be important for our model -- age of house, TotalSF,\nsize of garage, number of rooms/baths, basement and garage quality, etc. It may not know which of two similar features to \ndiscriminate on, and be choosing more obscure ones in favor of clearer and more descriptive variables. Let's try doing this \nmanually."

In [20]:
#Devise a function to produce a correlation matrix for our feature DF, then go manually
def CreateCorrelationMatrix(df, dependent):
    """Return the pairwise Pearson correlation matrix of the feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the feature columns plus the ``dependent`` column.
        Feature columns are expected to be numeric.
    dependent : str
        Name of the response column; it is dropped before correlating.

    Returns
    -------
    pandas.DataFrame
        Square frame whose index and columns are the feature names. The
        diagonal (each feature's correlation with itself) is forced to 0 so
        that a column-wise maximum shows each feature's strongest correlation
        with a *different* feature.
    """
    features = df.drop(dependent, axis=1)

    # No feature columns: return an empty square frame instead of failing
    if features.shape[1] == 0:
        return pd.DataFrame(index=features.columns, columns=features.columns, dtype=float)

    # One vectorised call replaces the pairwise Series.corr loop and the
    # column-by-column concat that grew the result quadratically.
    corrDF = features.corr()

    # Zero the self-correlations on a private copy of the values
    values = corrDF.values.copy()
    np.fill_diagonal(values, 0.0)

    return pd.DataFrame(values, index=corrDF.index, columns=corrDF.columns)

In [21]:
#Create the overall correlation matrix, first with all features:
corrDF0 = CreateCorrelationMatrix(AmesDummiesOrdinal, 'SalePrice')

In [22]:
#Check the matrix out, eliminate a variable then rerun the process:
#axis=0 gives the per-feature maxima; np.max(DataFrame) collapses to a single scalar on pandas >= 2.0
corrDF0.max(axis=0).sort_values(ascending=False)
corrDF0['SaleCondition_Partial'].sort_values(ascending=False) #SaleCondition_Partial and SaleType_New are nearly identical (r = 0.99), remove Partial

SaleCondition_Partial    0.989498
SaleType_New             0.989498
PoolQC                   0.865471
PoolArea                 0.865471
FireplaceQu              0.860687
Fireplaces               0.860687
MSZoning_FV              0.856975
Neighborhood_Somerst     0.856975
RoofMatl_Tar&Grv         0.856280
RoofStyle_Flat           0.856280
TotalSF                  0.825145
TotRmsAbvGrd             0.825145
KitchenAbvGr             0.728572
BldgType_Duplex          0.728572
ExterQual                0.722220
OverallQual              0.722220
KitchenQual              0.715134
BedroomAbvGr             0.685216
Exterior_BrkComm         0.664943
Neighborhood_NPkVill     0.664943
GarageScore              0.639730
GarageArea               0.639730
HouseStyle_2.5Fin        0.632350
LowQualFinSF             0.632350
BsmtQual                 0.630755
BsmtCond                 0.629026
TotalBath                0.605381
YearsSinceRemodel        0.595028
YearsAgoBuilt            0.595028
TotalBsmtSF   

In [23]:
#Remove SaleCondition_Partial, rerun the analysis, and start a list of removed features
RemovedFeatures = ['SaleCondition_Partial']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [24]:
#Check the matrix out, eliminate a variable then rerun the process:
#axis=0 gives the per-feature maxima; np.max(DataFrame) collapses to a single scalar on pandas >= 2.0
corrDF0.max(axis=0).sort_values(ascending=False)
corrDF0['PoolQC'].sort_values(ascending=False) #PoolQC is highly correlated with PoolArea (r = 0.87). Remove PoolArea

PoolQC                  0.865471
PoolArea                0.865471
FireplaceQu             0.860687
Fireplaces              0.860687
Neighborhood_Somerst    0.856975
MSZoning_FV             0.856975
RoofMatl_Tar&Grv        0.856280
RoofStyle_Flat          0.856280
TotalSF                 0.825145
TotRmsAbvGrd            0.825145
KitchenAbvGr            0.728572
BldgType_Duplex         0.728572
OverallQual             0.722220
ExterQual               0.722220
KitchenQual             0.715134
BedroomAbvGr            0.685216
Neighborhood_NPkVill    0.664943
Exterior_BrkComm        0.664943
GarageScore             0.639730
GarageArea              0.639730
LowQualFinSF            0.632350
HouseStyle_2.5Fin       0.632350
BsmtQual                0.630755
BsmtCond                0.629026
TotalBath               0.605381
YearsSinceRemodel       0.595028
YearsAgoBuilt           0.595028
TotalBsmtSF             0.581484
MasVnrType_BrkFace      0.574884
MasVnrArea              0.574884
          

In [25]:
#Also remove PoolArea, rerun the analysis, and extend the list of removed features
RemovedFeatures = ['SaleCondition_Partial', 'PoolArea']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [26]:
#Check the matrix out, eliminate a variable then rerun the process:
#axis=0 gives the per-feature maxima; np.max(DataFrame) collapses to a single scalar on pandas >= 2.0
corrDF0.max(axis=0).sort_values(ascending=False)
corrDF0['Fireplaces'].sort_values(ascending=False) #Fireplaces is highly correlated with FireplaceQu (r = 0.86). Remove Fireplaces

Fireplaces              0.860687
FireplaceQu             0.860687
Neighborhood_Somerst    0.856975
MSZoning_FV             0.856975
RoofMatl_Tar&Grv        0.856280
RoofStyle_Flat          0.856280
TotalSF                 0.825145
TotRmsAbvGrd            0.825145
KitchenAbvGr            0.728572
BldgType_Duplex         0.728572
ExterQual               0.722220
OverallQual             0.722220
KitchenQual             0.715134
BedroomAbvGr            0.685216
Exterior_BrkComm        0.664943
Neighborhood_NPkVill    0.664943
GarageScore             0.639730
GarageArea              0.639730
HouseStyle_2.5Fin       0.632350
LowQualFinSF            0.632350
BsmtQual                0.630755
BsmtCond                0.629026
TotalBath               0.605381
YearsSinceRemodel       0.595028
YearsAgoBuilt           0.595028
TotalBsmtSF             0.581484
MasVnrType_BrkFace      0.574884
MasVnrArea              0.574884
Neighborhood_OldTown    0.564227
MSZoning_RM             0.564227
          

In [27]:
#Also remove Fireplaces, rerun the analysis, and extend the list of removed features
RemovedFeatures = ['SaleCondition_Partial', 'PoolArea', 'Fireplaces']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
corrDF0 = CreateCorrelationMatrix(AmesDummiesMultiReduction, 'SalePrice')

In [11]:
#Check the matrix out, eliminate a variable then rerun the process:
#np.max(corrDF0).sort_values(ascending=False)
#One other cause for concern is KitchenAbvGr, which does (and should) correlate highly with duplexes (r = 0.73 with BldgType_Duplex)
RemovedFeatures = ['SaleCondition_Partial', 'PoolArea', 'Fireplaces', 'KitchenAbvGr']
AmesDummiesMultiReduction = AmesDummiesOrdinal.drop(RemovedFeatures, axis=1)
AmesDummiesOrdinal = AmesDummiesMultiReduction.copy()

In [None]:
#Persist the reduced Train frame. The path must be a string: the bare name AmesOrdinalManual raised NameError.
#NOTE(review): file name inferred from that identifier -- confirm the intended path.
AmesDummiesOrdinal.to_csv('AmesOrdinalManual.csv')

In [31]:
print(AmesDummiesOrdinal.shape)
print(AmesDummiesOrdinalAll.shape)

(1166, 136)
(1166, 159)


In [32]:
'''After removing obvious features, our DF went from 159 to 136 feature columns. These remaining columns can now be subjected to either Backward or Forward feature selection techniques
to decide which ones contribute meaningfully to our model'''

'After removing obvious features, our DF went from 159 to 136 feature columns. These remaining columns can bow be subjected to either Backward or Forward feature selection techniques\nto decide which ones contribute meaningfully to our model'

In [33]:
#The reduced DF (19 sparse + 4 collinear features dropped) retains essentially the same R^2 on the Train subset (0.921 to 0.920)
#AIC is essentially unchanged as well (27010 to 27020). The ~33660 AIC seen earlier came from the full 1458-row fit, so it is not comparable with these 1166-row fits.
AmesDummiesOrdinalX = AmesDummiesOrdinal.drop('SalePrice', axis=1)
AmesDummiesOrdinalY = AmesDummiesOrdinal['SalePrice']

X = AmesDummiesOrdinalX
Y = AmesDummiesOrdinalY

X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.920
Model:                            OLS   Adj. R-squared:                  0.910
Method:                 Least Squares   F-statistic:                     87.96
Date:                Fri, 16 Nov 2018   Prob (F-statistic):               0.00
Time:                        16:45:42   Log-Likelihood:                -13373.
No. Observations:                1166   AIC:                         2.702e+04
Df Residuals:                    1030   BIC:                         2.771e+04
Df Model:                         135                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                 -1.985e+

## Now, we need to select features to use in our analysis. Let's first try a method of Forward selection, where we add, one at a time, the feature that lowers AIC the most, until no remaining feature lowers it further.

In [34]:
#Define method to add features one at a time based on which subtract the most from AIC:
def AddFeatureListbyAIC(df, dependent):
    """Greedy forward feature selection for an OLS model, scored by AIC.

    Starting from the intercept-only model, each round fits one OLS model per
    unused feature (current selection + that feature + intercept) and adds the
    feature giving the lowest AIC. Selection stops when no remaining feature
    lowers the AIC of the current model, or when every feature has been added.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus the ``dependent`` column.
    dependent : str
        Name of the response column.

    Returns
    -------
    pandas.DataFrame
        One row per added feature, in the order they were added:
        ``CreatedFeatures`` (feature name) and ``NewFScore`` (the model AIC
        after adding it; the column name is kept for compatibility).
    """
    features = df.drop(dependent, axis=1)
    target = df[dependent]
    candidates = list(features.columns)
    CreatedFeatureList = []
    AICEvolutionList = []

    # Baseline is the intercept-only model rather than an arbitrary large
    # constant, so the first feature must genuinely improve on "no features".
    current_aic = sm.OLS(target, np.ones((len(target), 1))).fit().aic

    while True:
        remaining = [name for name in candidates if name not in CreatedFeatureList]
        if not remaining:
            # Every feature is already in the model (or there were none)
            break

        trial_aics = np.array([
            sm.OLS(target, sm.add_constant(features[CreatedFeatureList + [name]])).fit().aic
            for name in remaining
        ])

        if not np.any(trial_aics < current_aic):
            break

        # First occurrence of the smallest (non-NaN) AIC wins ties
        best = int(np.nanargmin(trial_aics))
        CreatedFeatureList.append(remaining[best])
        AICEvolutionList.append(trial_aics[best])
        # The winning trial IS the next round's base model, so no refit is needed
        current_aic = trial_aics[best]

    resultDF = pd.DataFrame({'CreatedFeatures': np.array(CreatedFeatureList), 'NewFScore': np.array(AICEvolutionList)})

    return resultDF

In [35]:
#Define a method to search for the lowest AIC through random feature additions/subtractions, starting from ALL features:
def FindLowestAICNonLogBackward(df, dependent, max_stale_tries=250):
    '''Randomised add/remove search for a low-AIC OLS feature subset, starting from all features.

    Input: DF to AIC-modify and the dependent variable. WILL RETURN: A tuple: [0] is the modified DF
    (with dependent) and tuple[1] will give you the summary DF.

    Each step picks one feature at random (global NumPy RNG). If it is currently in the model the step
    tries removing it, otherwise it tries adding it back. The move is kept only when it lowers the AIC
    of the OLS fit (with intercept). The search stops once more than max_stale_tries consecutive moves
    have been rejected.

    df: DataFrame holding the feature columns and the dependent column.
    dependent: name of the response column.
    max_stale_tries: consecutive rejected moves tolerated before stopping (default 250, the value that
        used to be hard-coded).

    The summary DF has one row per accepted move: 'Feature', 'AddOrSubtract' ('Added'/'Subtracted')
    and 'AIC' (the model AIC after the move).
    '''
    df2X = df.drop(dependent, axis=1)
    df2Y = df[dependent]
    AllFeatures = list(df2X.columns)

    ModList = []
    AddedSubtracted = []
    AIC = []

    # No feature columns: nothing to search over
    if not AllFeatures:
        SummaryDF = pd.DataFrame({'Feature': ModList, 'AddOrSubtract': AddedSubtracted, 'AIC': AIC})
        return df[[dependent]].copy(), SummaryDF

    CurrentColumns = list(AllFeatures)
    CurrentAIC = sm.OLS(df2Y, sm.add_constant(df2X)).fit().aic
    TriesSinceReset = 0

    while TriesSinceReset <= max_stale_tries:
        Choice = np.random.choice(AllFeatures)
        # Legacy draw: its value was never used, but it is kept so the global RNG stream is
        # consumed exactly as before and seeded runs stay comparable with earlier ones.
        np.random.randint(2)

        Adding = Choice not in CurrentColumns
        TrialColumns = list(CurrentColumns)
        if Adding:
            TrialColumns.append(Choice)
        else:
            TrialColumns.remove(Choice)

        # Select straight from the full feature frame instead of assigning columns into a slice
        # (the old approach triggered SettingWithCopyWarning)
        NewAIC = sm.OLS(df2Y, sm.add_constant(df2X[TrialColumns])).fit().aic

        if NewAIC < CurrentAIC:
            TriesSinceReset = 0
            CurrentAIC = NewAIC
            CurrentColumns = TrialColumns

            if Adding:
                print(Choice + " added: New AIC = " + str(CurrentAIC))
                AddedSubtracted.append('Added')
            else:
                print(Choice + " removed: New AIC = " + str(CurrentAIC))
                AddedSubtracted.append('Subtracted')
            ModList.append(Choice)
            AIC.append(CurrentAIC)

        else:
            TriesSinceReset += 1
            if not Adding:
                # A rejected removal puts the feature back at the END of the column list,
                # which matches the column order the previous implementation produced
                CurrentColumns = TrialColumns + [Choice]

    SummaryDF = pd.DataFrame({'Feature': ModList, 'AddOrSubtract': AddedSubtracted, 'AIC': AIC})
    NewDF = pd.concat([df2X[CurrentColumns], df[[dependent]]], axis=1)

    return NewDF, SummaryDF
    

In [37]:
AmesManualBackwardAIC = FindLowestAICNonLogBackward(AmesDummiesOrdinal, 'SalePrice')

FireplaceQu removed: New AIC = 27015.55527625765
SaleCondition_Alloca removed: New AIC = 27014.351172351293
Utilities removed: New AIC = 27014.069392816233
LotShape removed: New AIC = 27012.07603019577
HouseStyle_2.5Unf removed: New AIC = 27011.314930103617
LowQualFinSF removed: New AIC = 27009.38955371237
RoofStyle_Flat removed: New AIC = 27008.6823854167
HouseStyle_1.5Unf removed: New AIC = 27006.685284912724
RoofMatl_Tar&Grv removed: New AIC = 27004.725240658983
Exterior_WdShing removed: New AIC = 27003.285231827227
MSZoning_RM removed: New AIC = 27001.333791181893
Heating_GasW removed: New AIC = 26999.398448509946
RoofMatl_WdShake removed: New AIC = 26999.13610738428
MSZoning_RH removed: New AIC = 26997.488145881453
GarageType_Basment removed: New AIC = 26995.976919833844
MiscVal removed: New AIC = 26994.685727766482
Neighborhood_ClearCr removed: New AIC = 26992.69177420568
HouseStyle_2.5Fin removed: New AIC = 26992.67601300353
Heating_Grav removed: New AIC = 26990.959015518063
Ext

In [38]:
AmesManualBackwardAIC[0].to_csv('AmesManualAICBackward.csv')

In [40]:
AmesManualBackwardAIC2 = FindLowestAICNonLogBackward(AmesDummiesOrdinal, 'SalePrice')

LotConfig_FR2 removed: New AIC = 27015.814730809303
HouseStyle_1.5Unf removed: New AIC = 27013.826295344094
SaleType_COD removed: New AIC = 27012.294705844637
MSZoning_C removed: New AIC = 27010.29568121836
Neighborhood_Mitchel removed: New AIC = 27008.39056177152
Exterior_Wd Sdng removed: New AIC = 27006.76361679862
PavedDrive removed: New AIC = 27004.970190788987
LandContour_HLS removed: New AIC = 27003.139479095753
Neighborhood_SWISU removed: New AIC = 27002.58826213341
Neighborhood_Veenker removed: New AIC = 27002.100279886894
Foundation_Stone removed: New AIC = 27000.49568203012
Neighborhood_SawyerW removed: New AIC = 26998.57331057573
MoSold_Autumn removed: New AIC = 26996.89238619925
HeatingQC removed: New AIC = 26995.09186739753
Exterior_BrkComm removed: New AIC = 26993.145373446587
WoodDeckSF removed: New AIC = 26992.583687311802
RoofMatl_Tar&Grv removed: New AIC = 26991.97760320298
LandContour_Bnk removed: New AIC = 26991.030128673887
GarageType_2Types removed: New AIC = 2698

In [41]:
AmesManualBackwardAIC2[0].to_csv('AmesManualAICBackward2.csv')

In [42]:
AmesManualBackwardAIC3 = FindLowestAICNonLogBackward(AmesDummiesOrdinal, 'SalePrice')

Exterior_AsbShng removed: New AIC = 27015.560120454487
Fence_GdWo removed: New AIC = 27014.632565742493
WoodDeckSF removed: New AIC = 27013.916659996383
MSZoning_RM removed: New AIC = 27011.94069027121
GarageType_CarPort removed: New AIC = 27011.025057716288
LandContour_Low removed: New AIC = 27010.561231263553
MiscVal removed: New AIC = 27009.173057732907
MSZoning_FV removed: New AIC = 27008.943183250773
MSZoning_RH removed: New AIC = 27007.187386484555
Exterior_Wd Sdng removed: New AIC = 27005.655201774807
LotConfig_Corner removed: New AIC = 27003.883436942684
Neighborhood_Veenker removed: New AIC = 27003.589199290578
LotConfig_FR2 removed: New AIC = 27001.79939260597
Heating_GasW removed: New AIC = 26999.86893060104
LandContour_Bnk removed: New AIC = 26998.65491184757
Exterior_WdShing removed: New AIC = 26996.961331005197
MasVnrType_BrkCmn removed: New AIC = 26996.422834962657
RoofStyle_Mansard removed: New AIC = 26995.10673803551
Foundation_BrkTil removed: New AIC = 26994.147479575

In [43]:
AmesManualBackwardAIC3[0].to_csv('AmesManualAICBackward3.csv')

In [50]:
#Keep only the columns that survived all three randomised backward-AIC runs (SalePrice is always among them).
SharedColumns = set(AmesManualBackwardAIC[0].columns) & set(AmesManualBackwardAIC2[0].columns) & set(AmesManualBackwardAIC3[0].columns)
#A set has no stable order, so rebuild the list in the source frame's column order; this keeps the
#list (and any CSV written from it) identical from one kernel session to the next.
FeatureIntersect = [col for col in AmesDummiesOrdinal.columns if col in SharedColumns]
print(len(FeatureIntersect))
FeatureIntersect

62


['BldgType_2fmCon',
 'Exterior_Stucco',
 'TotalBath',
 'PoolQC',
 'MasVnrArea',
 'OverallCond',
 'Exterior_BrkFace',
 'Foundation_Slab',
 'BedroomAbvGr',
 'RoofMatl_WdShngl',
 'BsmtScore',
 'BldgType_Duplex',
 'HouseStyle_1.5Fin',
 'HouseStyle_SLvl',
 'Neighborhood_NridgHt',
 'GarageType_BuiltIn',
 'GarageType_No',
 'GarageScore',
 'Neighborhood_StoneBr',
 'Neighborhood_NWAmes',
 'BsmtExposure',
 'Neighborhood_Sawyer',
 'LotArea',
 'ExterCond',
 'Street_Grvl',
 'Neighborhood_BrDale',
 'Exterior_CemntBd',
 'Neighborhood_Blmngtn',
 'LotConfig_CulDSac',
 'BldgType_Twnhs',
 'GarageArea',
 'MoSold_Spring',
 'TotalSF',
 'GarageType_Detchd',
 'MSZoning_FV',
 'OverallQual',
 'KitchenQual',
 'ScreenPorch',
 'RoofStyle_Hip',
 'BsmtCond',
 'Exterior_MetalSd',
 'MasVnrType_BrkFace',
 'Neighborhood_NPkVill',
 'Functional',
 'BldgType_TwnhsE',
 'SalePrice',
 'Exterior_ImStucc',
 'Fence_GdPrv',
 'Neighborhood_BrkSide',
 'SaleType_New',
 'YearsAgoBuilt',
 'HouseStyle_2Story',
 'Neighborhood_Crawfor',


In [51]:
#Create final DF with only these 62 features:
AmesOrdinalManualAICFinal = AmesDummiesOrdinal[FeatureIntersect]
AmesOrdinalManualAICFinal.to_csv('AmesOrdinalManualAICFinal.csv')

In [36]:
#Define a method to search for the lowest AIC on log(dependent) through random feature additions/subtractions, starting from a single feature:
def FindLowestAICLogForward(df, dependent, topchoice, max_stale_tries=250):
    '''Randomised add/remove search for a low-AIC OLS feature subset on log(dependent), starting from one feature.

    Input: DF to AIC-modify, the dependent variable and the feature to start from. WILL RETURN: A tuple:
    [0] is the modified DF (with dependent) and tuple[1] will give you the summary DF.

    The model regresses np.log(df[dependent]) on the selected features plus an intercept. Each step picks
    one feature at random (global NumPy RNG). If it is currently in the model the step tries removing it,
    otherwise it tries adding it. The move is kept only when it lowers the AIC. The search stops once more
    than max_stale_tries consecutive moves have been rejected.

    The starting AIC is that of the model containing only topchoice. (It used to be the constant 1000000,
    which made the very first random move count as an improvement no matter what it was.)

    df: DataFrame holding the feature columns and the dependent column.
    dependent: name of the response column (its values are logged for fitting only).
    topchoice: name of the feature column the search starts from.
    max_stale_tries: consecutive rejected moves tolerated before stopping (default 250, the value that
        used to be hard-coded).

    The returned DF carries the selected feature columns and the dependent column as given (not logged).
    The summary DF has one row per accepted move: 'Feature', 'AddOrSubtract' ('Added'/'Subtracted')
    and 'AIC' (the model AIC after the move).
    '''
    df2X = df.drop(dependent, axis=1)
    df2Y = np.log(df[dependent])
    AllFeatures = list(df2X.columns)

    ModList = []
    AddedSubtracted = []
    AIC = []

    CurrentColumns = [topchoice]
    CurrentAIC = sm.OLS(df2Y, sm.add_constant(df2X[CurrentColumns])).fit().aic
    TriesSinceReset = 0

    while TriesSinceReset <= max_stale_tries:
        Choice = np.random.choice(AllFeatures)

        Adding = Choice not in CurrentColumns
        TrialColumns = list(CurrentColumns)
        if Adding:
            TrialColumns.append(Choice)
        else:
            TrialColumns.remove(Choice)

        # Select straight from the full feature frame instead of assigning columns into a slice
        # (the old approach triggered SettingWithCopyWarning)
        NewAIC = sm.OLS(df2Y, sm.add_constant(df2X[TrialColumns])).fit().aic

        if NewAIC < CurrentAIC:
            TriesSinceReset = 0
            CurrentAIC = NewAIC
            CurrentColumns = TrialColumns

            if Adding:
                print(Choice + " added: New AIC = " + str(CurrentAIC))
                AddedSubtracted.append('Added')
            else:
                print(Choice + " removed: New AIC = " + str(CurrentAIC))
                AddedSubtracted.append('Subtracted')
            ModList.append(Choice)
            AIC.append(CurrentAIC)

        else:
            TriesSinceReset += 1
            if not Adding:
                # A rejected removal puts the feature back at the END of the column list,
                # which matches the column order the previous implementation produced
                CurrentColumns = TrialColumns + [Choice]

    SummaryDF = pd.DataFrame({'Feature': ModList, 'AddOrSubtract': AddedSubtracted, 'AIC': AIC})
    NewDF = pd.concat([df2X[CurrentColumns], df[[dependent]]], axis=1)

    return NewDF, SummaryDF

In [62]:
#Run the forward AIC selection. This call was missing, so AmesDummiesForwardAICList was undefined on a fresh kernel.
#NOTE(review): arguments mirror the TrimFeatureListByAIC call on the same frame -- confirm.
AmesDummiesForwardAICList = AddFeatureListbyAIC(AmesDummiesMultiReduction, 'SalePrice')

#This could be a promising method to use: in the recorded run R^2 is still 0.916 with only 64 of the 135 candidate features
ForwardFeatures = list(AmesDummiesForwardAICList['CreatedFeatures'])
AmesDummiesForwardAIC = pd.concat([AmesDummiesMultiReduction[ForwardFeatures],
                                   AmesDummiesMultiReduction['SalePrice']], axis=1)

AmesDummiesOrdinalX = AmesDummiesForwardAIC.drop('SalePrice', axis=1)
AmesDummiesOrdinalY = AmesDummiesForwardAIC['SalePrice']

X = AmesDummiesOrdinalX
Y = AmesDummiesOrdinalY

#Refit the OLS (with intercept) on the forward-selected features only
X2 = sm.add_constant(X)
est = sm.OLS(Y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.916
Model:                            OLS   Adj. R-squared:                  0.911
Method:                 Least Squares   F-statistic:                     187.3
Date:                Wed, 14 Nov 2018   Prob (F-statistic):               0.00
Time:                        17:29:24   Log-Likelihood:                -13403.
No. Observations:                1166   AIC:                         2.694e+04
Df Residuals:                    1101   BIC:                         2.727e+04
Df Model:                          64                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                 -1.065e+

## Would our selected features differ drastically if we selected using Backwards selection?  If so, we should take any differences into account

In [63]:
#Define method to remove features one at a time, each time dropping the feature whose removal lowers AIC the most:
def TrimFeatureListByAIC(df, dependent):
    """Greedy backward feature elimination for an OLS model, scored by AIC.

    Starting from the model with every feature (plus intercept), each round
    refits the model once per remaining feature with that feature left out and
    removes the feature whose removal gives the lowest AIC. Elimination stops
    when no single removal lowers the AIC of the current model, or when no
    features are left.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus the ``dependent`` column.
    dependent : str
        Name of the response column.

    Returns
    -------
    pandas.DataFrame
        One row per removed feature, in removal order: ``RemovedFeatures``
        (feature name) and ``NewFScore`` (the model AIC after removing it; the
        column name is kept for compatibility).
    """
    remaining_features = df.drop(dependent, axis=1)
    target = df[dependent]
    RemovedFeatureList = []
    AICEvolutionList = []

    candidates = list(remaining_features.columns)
    if candidates:
        current_aic = sm.OLS(target, sm.add_constant(remaining_features)).fit().aic

    # With no feature columns the loop is skipped and an empty result is returned
    while candidates:
        trial_aics = np.array([
            sm.OLS(target, sm.add_constant(remaining_features.drop(labels=name, axis=1))).fit().aic
            for name in candidates
        ])

        if not np.any(trial_aics < current_aic):
            break

        # First occurrence of the smallest (non-NaN) AIC wins ties
        best = int(np.nanargmin(trial_aics))
        RemovedValue = candidates[best]
        RemovedFeatureList.append(RemovedValue)
        AICEvolutionList.append(trial_aics[best])

        # The winning trial IS the next round's base model, so no refit is needed
        current_aic = trial_aics[best]
        remaining_features = remaining_features.drop(RemovedValue, axis=1)
        candidates = list(remaining_features.columns)

    resultDF = pd.DataFrame({'RemovedFeatures': np.array(RemovedFeatureList), 'NewFScore': np.array(AICEvolutionList)})

    return resultDF

In [64]:
AmesDummiesBackwardAICList = TrimFeatureListByAIC(AmesDummiesMultiReduction, 'SalePrice')

In [65]:
AmesDummiesBackwardAICList

Unnamed: 0,RemovedFeatures,NewFScore
0,Neighborhood_Edwards,27049.819357
1,Heating_GasW,27047.822050
2,MSSubClass_160,27045.826337
3,GarageFinish,27043.831954
4,SaleType_ConLw,27041.839705
5,Neighborhood_Mitchel,27039.847851
6,Alley_Grvl,27037.858082
7,Neighborhood_ClearCr,27035.869946
8,MSZoning_RM,27033.884151
9,MoSold_Winter,27031.899149
