In [1]:
import pandas as pd
import numpy as np
import math

In [29]:
#Import the CSV to make DF, count all NA values in the DF
amesdf1 = pd.read_csv('train.csv', index_col=0)
amesdf1cols = np.array(amesdf1.columns)
amesdf1nas = []
for i in range(len(amesdf1cols)):
    amesdf1nas.append(amesdf1[amesdf1cols[i]].isna().sum())
amesdf1summarydf = pd.DataFrame({'varnames': amesdf1cols, 'nas': amesdf1nas})

In [30]:
#Print the DF in descending order of number of NA values
amesdf1summarydf.sort_values(by='nas', ascending=False)

Unnamed: 0,varnames,nas
71,PoolQC,1453
73,MiscFeature,1406
5,Alley,1369
72,Fence,1179
56,FireplaceQu,690
2,LotFrontage,259
58,GarageYrBlt,81
63,GarageCond,81
57,GarageType,81
59,GarageFinish,81


In [31]:
#For most columns (as seen from the documentation), NA does not mean NA but that the house does not have that feature
#These aren't actually NAs, as they contain information
#We can replace NA with the word 'No', to indicate the feature doesn't exist for this house
#Define function to do this for multiple column names
def ConvertNaToNone(df, varlist):
    amescopy = df.copy()
    for var in varlist:
        series = amesdf1[var]
        series[series.isna()] = 'No'
        amescopy[var] = series
    return amescopy   

In [32]:
#Apply this function with our list of features containing NA
amesNoNAs = ConvertNaToNone(amesdf1, ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageYrBlt', 'GarageCond', 'BsmtExposure', 'BsmtFinType2', 'GarageType', 'GarageFinish', 'GarageQual', 'BsmtCond', 'BsmtQual', 'BsmtFinType1', 'MasVnrType'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [33]:
#Now, count NAs for the remaining features as we did above
amesNoNAcols = np.array(amesNoNAs.columns)
amesNoNAnas = []
for i in range(len(amesNoNAcols)):
    amesNoNAnas.append(amesNoNAs[amesNoNAcols[i]].isna().sum())
amesNoNAsummarydf = pd.DataFrame({'varnames': amesNoNAcols, 'nas': amesNoNAnas})
amesNoNAsummarydf.sort_values(by='nas', ascending=False)

Unnamed: 0,varnames,nas
2,LotFrontage,259
25,MasVnrArea,8
41,Electrical,1
51,KitchenAbvGr,0
58,GarageYrBlt,0
57,GarageType,0
56,FireplaceQu,0
55,Fireplaces,0
54,Functional,0
53,TotRmsAbvGrd,0


In [34]:
#For the one missing 'Electrical' entry, replace with the most common category
print(amesNoNAs['Electrical'].value_counts())
amesNoNAs.loc[amesNoNAs['Electrical'].isna(),'Electrical'] = 'SBrkr'
print(amesNoNAs['Electrical'].value_counts())

#For the two numerical entries (LotFrontage, MasVnrArea), replace with random sample of the non-NA values
amesNoNAs.loc[amesNoNAs['LotFrontage'].isna(), 'LotFrontage'] = np.random.choice(amesNoNAs['LotFrontage'].dropna(), size=259, replace=True)
amesNoNAs.loc[amesNoNAs['MasVnrArea'].isna(), 'MasVnrArea'] = np.random.choice(amesNoNAs['MasVnrArea'].dropna(), size=8, replace=True)

SBrkr    1334
FuseA      94
FuseF      27
FuseP       3
Mix         1
Name: Electrical, dtype: int64
SBrkr    1335
FuseA      94
FuseF      27
FuseP       3
Mix         1
Name: Electrical, dtype: int64


In [35]:
#Sum all the row of the Series counting NA's in each column (total NAs in dataframe) to show all NA's are gone
np.sum(np.sum(amesNoNAs.isna()))

0

In [26]:
PureCategorical = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LandCountour', 'LotConfig', 'LandSlope', 'Neighborhood',
                  'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Foundation', 'MasVnrType', 'CentralAir', 'Electrical',
                  'GarageType','MiscFeature', 'SaleType','SaleCondition']
QuesCategorial = ['LotShape', 'Utilities', ]
OrdCategorical = ['OverallQual', 'OverallCond', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
                 'HeatingQC', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageFinished', 'GarageQual', 'GarageCond',
                 'PavedDrive', 'PoolQC','Fence']
JointFeatures = ['Condition1', 'Condition2', 'Exterior1st', 'Exterior2nd']
Numerical = ['LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'TotalBsmtSF', 'BsmtScore', 'TotalBsmtSF', 'TotalSF',
            'PctLowQual', 'AboveGroundPct', 'TotalBath', 'Bedroom', 'Kitchen', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBuilt',
            'GarageCars','GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea',
            'MiscVal', 'DateSold']

In [None]:
#NOTES
'''BsmtFinType1 and BsmtFinType2. Super problematic. These are if the person has 2 types of finished basement. 
Its partly ordinal and then contains corresponding SQFT values. I would propose combining these 2 numbers (give a 
score for the different finish types, then weight the average. And give 'No Basement' a score of zero for both metrics).
This will convert 5 total features (BsmtFinSF1, BsmtFinSF2, BsmtFinType1, BsmtFinType2, BsmtUnfSF into 1 (BsmtScore))'''

'''FullBath, HalfBath, BsmtFullBath, BsmtHalfBath could realistically be converted to TotalBath, removing 3 features'''

'''1stFlrSF and 2ndFltSF could realistically be converted to TotalSF'''

'''LowQualFinSF/TotalSF converted to PctLowQual and remove LowQualFinSF? Same for GrLivArea/TotalSF=AboveGroundSF?'''

'''Misc Feature should really be able to be removed since we are provided with MiscVal, the dollar value of misc features'''

'''Row index 333 is a little off. It has a basement, but not 2 feature types (its all unfinished). It values should be 
Type1=Unf, Type1SF=#, Type2=Unf, Type2SF=0. But it went Type2=NA, Type2SF=0, meaning one value must be fixed because 
the NA turned to No, which is incorrect'''


In [36]:
#Fix index 333, BsmtFinType2 to 'Unf' as it should be
amesNoNAs.loc[333,'BsmtFinType2']='Unf'

In [40]:
'''Weighted index score to determine overall basement 'finishedness'. 'BsmtScore'. Will be 0 if no basement, 
1 if totally unfinished, 6 is completely finished at full living quarter quality'''

amesNoNAs['BsmtFinType1'] = amesNoNAs['BsmtFinType1'].astype('str').copy()
amesNoNAs['BsmtFinType1'] = pd.to_numeric(amesNoNAs['BsmtFinType1'].replace(['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'No'], 
                                                              [6,5,4,3,2,1,0]))

amesNoNAs['BsmtFinType2'] = amesNoNAs['BsmtFinType2'].astype('str').copy()
amesNoNAs['BsmtFinType2'] = pd.to_numeric(amesNoNAs['BsmtFinType2'].replace(['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'No'], 
                                                              [6,5,4,3,2,1,0]))
amesNoNAs[['BsmtFinType1', 'BsmtFinType2', 'BsmtFinSF1', 'BsmtFinSF2', 'TotalBsmtSF']].dtypes

amesNoNAs['BsmtFinSF1'] =  amesNoNAs['BsmtFinSF1'].replace(0, 1)
amesNoNAs['BsmtFinSF2'] =  amesNoNAs['BsmtFinSF2'].replace(0, 1)
amesNoNAs['TotalBsmtSFTemp'] =  amesNoNAs['TotalBsmtSF'].replace(0, 1)

amesNoNAs['BsmtScore'] = (amesNoNAs['BsmtFinType1'].values*amesNoNAs['BsmtFinSF1'].values + 
                       amesNoNAs['BsmtFinType2'].values*amesNoNAs['BsmtFinSF2'].values) / (amesNoNAs['BsmtFinSF1'].values + amesNoNAs['BsmtFinSF2'].values)

In [41]:
#Check everything is between 0 and 6
amesNoNAs['BsmtScore'].describe()

count    1460.000000
mean        3.527698
std         2.072100
min         0.000000
25%         1.000000
50%         3.992491
75%         5.854515
max         5.999114
Name: BsmtScore, dtype: float64

In [43]:
#Remove BsmtFin1,2 BsmtFinSF1,2, BsmtUnfSF, TotalBsmtSFTemp
amesNoNAs = amesNoNAs.drop(['BsmtFinType1', 'BsmtFinType2', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSFTemp'], axis=1)

In [45]:
#Make new column, TotalSF, and remove 1stFlrSF and 2ndFlrSF
amesNoNAs['TotalSF'] = amesNoNAs['1stFlrSF'].values + amesNoNAs['2ndFlrSF'].values
amesNoNAs = amesNoNAs.drop(['1stFlrSF', '2ndFlrSF'], axis=1)

KeyError: '1stFlrSF'

In [48]:
#Check that 5 columns have been removed
print(amesdf1.shape)
print(amesNoNAs.shape)

(1460, 80)
(1460, 75)


In [49]:
#Convert FullBath, HalfBath, BsmtFullBath, BsmtHalfBath into TotalBath
amesNoNAs['TotalBath'] = amesNoNAs['FullBath'].values + amesNoNAs['BsmtFullBath'].values + 0.5*amesNoNAs['HalfBath'].values + 0.5*amesNoNAs['BsmtHalfBath'].values
amesNoNAs = amesNoNAs.drop(['FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath'], axis=1)
print(amesNoNAs.shape)

(1460, 72)


In [50]:
#Create FractionLowQual, FractionAboveGround, remove the source columns
amesNoNAs['PropLowQualSF'] = amesNoNAs['LowQualFinSF'].values/amesNoNAs['TotalSF'].values
amesNoNAs['PropAboveGroundSF'] = amesNoNAs['GrLivArea'].values/amesNoNAs['TotalSF'].values
amesNoNAs = amesNoNAs.drop(['LowQualFinSF','GrLivArea'], axis=1)

In [51]:
#Remove MiscFeature
amesNoNAs = amesNoNAs.drop('MiscFeature', axis=1)
print(amesNoNAs.shape)

(1460, 71)
