In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import math
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, StackingRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.svm import SVR
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import make_scorer
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from regressors import stats
from scipy.stats import kurtosis, skew, boxcox
from sklearn.kernel_ridge import KernelRidge

In [2]:
train = pd.read_csv('./data/train.csv',index_col='Id')
test = pd.read_csv('./data/test.csv',index_col='Id')

In [3]:
class DataCleaner:
    
    def __init__(self, train, test):
        self.train = train.copy()
        self.test = test.copy()
        
    
    def cleanAlley(self):
        self.train.loc[:,'Alley'] = self.train['Alley'].fillna('No_Alley_Access')
        self.test.loc[:,'Alley'] = self.test['Alley'].fillna('No_Alley_Access')
        
    
    def cleanBsmt(self):
        self.cleanBsmtQual()
        self.cleanBsmtCond()
        self.cleanBsmtExposure()
        self.cleanBsmtFinType1()
        self.cleanBsmtFinType2()
        self.cleanBsmtFinSF1()
        self.cleanBsmtFinSF2()
        self.cleanBsmtUnfSF()
        self.cleanTotalBsmtSF()
        self.cleanBsmtFullBath()
        self.cleanBsmtHalfBath()
    
    def cleanBsmtQual(self):
        self.train.loc[:,'BsmtQual'] = self.train['BsmtQual'].fillna(0)
        self.test.loc[:,'BsmtQual'] = self.test['BsmtQual'].fillna(0)
        
        
    def cleanBsmtCond(self):
        self.train.loc[:,'BsmtCond'] = self.train['BsmtCond'].fillna(0)
        self.test.loc[:,'BsmtCond'] = self.test['BsmtCond'].fillna(0)
        
    
    def cleanBsmtExposure(self):
        self.train.loc[:,'BsmtExposure'] = self.train['BsmtExposure'].fillna('No_Basement')
        self.test.loc[:,'BsmtExposure'] = self.test['BsmtExposure'].fillna('No_Basement')
        
        
    def cleanBsmtFinType1(self):
        self.train.loc[:,'BsmtFinType1'] = self.train['BsmtFinType1'].fillna('No_Basement')
        self.test.loc[:,'BsmtFinType1'] = self.test['BsmtFinType1'].fillna('No_Basement')

        
    def cleanBsmtFinType2(self):
        self.train.loc[:,'BsmtFinType2'] = self.train['BsmtFinType2'].fillna('No_Basement')
        self.test.loc[:,'BsmtFinType2'] = self.test['BsmtFinType2'].fillna('No_Basement')
        
        
    def cleanBsmtFinSF1(self):

        self.train.loc[:,'BsmtFinSF1'] = self.train['BsmtFinSF1'].fillna(0)
        self.test.loc[:,'BsmtFinSF1'] = self.test['BsmtFinSF1'].fillna(0)

        
    def cleanBsmtFinSF2(self):
        self.train.loc[:,'BsmtFinSF2'] = self.train['BsmtFinSF2'].fillna(0)
        self.test.loc[:,'BsmtFinSF2'] = self.test['BsmtFinSF2'].fillna(0)
        

    def cleanBsmtUnfSF(self):
        self.train.loc[:,'BsmtUnfSF'] = self.train['BsmtUnfSF'].fillna(0)
        self.test.loc[:,'BsmtUnfSF'] = self.test['BsmtUnfSF'].fillna(0)
        
        
    def cleanTotalBsmtSF(self):
        self.train.loc[:,'TotalBsmtSF'] = self.train['TotalBsmtSF'].fillna(0)   
        self.test.loc[:,'TotalBsmtSF'] = self.test['TotalBsmtSF'].fillna(0)
        
        
    def cleanBsmtFullBath(self):
        self.train.loc[:,'BsmtFullBath'] = self.train['BsmtFullBath'].fillna(0)
        self.test.loc[:,'BsmtFullBath'] = self.test['BsmtFullBath'].fillna(0)
        
    def cleanBsmtHalfBath(self):
        self.train.loc[:,'BsmtHalfBath'] = self.train['BsmtHalfBath'].fillna(0)
        self.test.loc[:,'BsmtHalfBath'] = self.test['BsmtHalfBath'].fillna(0)
    
    
    def cleanFireplaceQu(self):
        self.train.loc[:,'FireplaceQu'] = self.train['FireplaceQu'].fillna(0)
        self.test.loc[:,'FireplaceQu'] = self.test['FireplaceQu'].fillna(0)
    
    
    def cleanGarage(self):
        self.cleanGarageType()
        self.cleanGarageYrBlt()
        self.cleanGarageFinish()
        self.cleanGarageQual()
        self.cleanGarageCond()
        self.cleanGarageCars()
        self.cleanGarageArea()
        
        
    def cleanGarageType(self):
        self.train.loc[:,'GarageType'] = self.train['GarageType'].fillna('No_Garage')
        self.test.loc[:,'GarageType'] = self.test['GarageType'].fillna('No_Garage')
        
        
    def cleanGarageYrBlt(self):
        self.train.loc[:,'GarageYrBlt'] = self.train['GarageYrBlt'].fillna(0)
        self.test.loc[:,'GarageYrBlt'] = self.test['GarageYrBlt'].fillna(0)

        
    def cleanGarageFinish(self):
        self.train.loc[:,'GarageFinish'] = self.train['GarageFinish'].fillna('No_Garage')
        self.test.loc[:,'GarageFinish'] = self.test['GarageFinish'].fillna('No_Garage')
        
        
    def cleanGarageQual(self):
        self.train.loc[:,'GarageQual'] = self.train['GarageQual'].fillna(0)        
        self.test.loc[:,'GarageQual'] = self.test['GarageQual'].fillna(0)
        
          
    def cleanGarageCond(self):
        self.train.loc[:,'GarageCond'] = self.train['GarageCond'].fillna(0)        
        self.test.loc[:,'GarageCond'] = self.test['GarageCond'].fillna(0)
        
        
    def cleanGarageCars(self):
        self.train.loc[:,'GarageCars'] = self.train['GarageCars'].fillna(0)        
        self.test.loc[:,'GarageCars'] = self.test['GarageCars'].fillna(0)
        
        
        
    def cleanGarageArea(self):
        self.train.loc[:,'GarageArea'] = self.train['GarageArea'].fillna(0)        
        self.test.loc[:,'GarageArea'] = self.test['GarageArea'].fillna(0)
        
        
    def cleanPoolQC(self):
        self.train.loc[:,'PoolQC'] = self.train['PoolQC'].fillna(0)
        self.test.loc[:,'PoolQC'] = self.test['PoolQC'].fillna(0)
    
    
    def cleanFence(self):
        self.train.loc[:,'Fence'] = self.train['Fence'].fillna(0)
        self.test.loc[:,'Fence'] = self.test['Fence'].fillna(0)

    
    def cleanMiscFeature(self):
        self.train.loc[:,'MiscFeature'] = self.train['MiscFeature'].fillna('No_Misc_Feature')
        self.test.loc[:,'MiscFeature'] = self.test['MiscFeature'].fillna('No_Misc_Feature')
        
        
    def cleanMasVnrType(self):
        #self.imputeColumn('MasVnrType')
        self.train.loc[:,'MasVnrType'] = self.train['MasVnrType'].fillna('No_MasVnr')
        self.test.loc[:,'MasVnrType'] = self.test['MasVnrType'].fillna('No_MasVnr')

        
    def cleanMasVnrArea(self):
        #self.imputeColumn('MasVnrArea')
        self.train.loc[:,'MasVnrArea'] = self.train['MasVnrArea'].fillna(0)
        self.test.loc[:,'MasVnrArea'] = self.test['MasVnrArea'].fillna(0)
        
        
    def cleanMSSubClass(self):
        self.train.loc[:,'MSSubClass'] = self.train['MSSubClass'].astype('object')
        self.test.loc[:,'MSSubClass'] = self.test['MSSubClass'].astype('object')


    def cleanLotFrontage(self):
         self.imputeColumn('LotFrontage')
        
    def cleanElectrical(self):
        self.imputeColumn('Electrical')
        
        
    def cleanMSZoning(self):
        self.imputeColumn('MSZoning')
        
    def cleanUtilities(self):
        self.imputeColumn('Utilities')
        
        
    def cleanExterior1st(self):
        self.imputeColumn('Exterior1st')
        
    def cleanExterior2nd(self):
        self.imputeColumn('Exterior2nd')
        
    def cleanKitchenQual(self):
        self.imputeColumn('KitchenQual')
        
        
    def cleanFunctional(self):
        self.imputeColumn('Functional')
        
        
    def cleanSaleType(self):
        self.imputeColumn('SaleType')
        
        
    def imputeColumn(self, colName):
        train, test = self.dummify([colName])
        missing_train = train[train[colName].isnull()]
        not_missing_train = train[train[colName].notnull()]
        missing_test = test[test[colName].isnull()]
        
        # For all the missing observations of the target column 'colName', checks to see if the other 
        # feature variables contain any missing values as well. If they do not, use all columns when imputing 
        # the missing values for the column of interest.  If they do, drop all the observations that contain
        # a missing value when imputing for the missing values for the column of interest.  
        isNotMissingFeatureColVals_Train = sum(missing_train.columns[missing_train.isnull().sum() != 0] != colName) == 0
        isNotMissingFeatureColVals_Test = sum(missing_test.columns[missing_test.isnull().sum() != 0] != colName) == 0
        
        if (isNotMissingFeatureColVals_Train) & (isNotMissingFeatureColVals_Test):
            print('Dropping Rows')
            not_missing_train = train[train[colName].notnull()]
            not_missing_train = not_missing_train.dropna()
            not_missing_train_X = not_missing_train.loc[:,not_missing_train.columns.difference([colName,'SalePrice'])]
            not_missing_train_y = not_missing_train[colName]
            
            missing_train = missing_train.loc[:,missing_train.columns.difference([colName, 'SalePrice'])]
            missing_test = missing_test.loc[:,missing_test.columns.difference([colName, 'SalePrice'])]
             
            rf = self.randomForestImputation(not_missing_train_X, not_missing_train_y, train[colName].dtypes)
                
            if len(missing_train) != 0:
                imputed = rf.predict(missing_train)
                m = self.train[colName].isnull()
                self.train.loc[m, colName] = imputed
                
            if len(missing_test) != 0:
                imputed = rf.predict(missing_test)
                m = self.test[colName].isnull()
                self.test.loc[m, colName] = imputed  
        else:
            print('DroppingCols')
            cols_with_na_from_not_null_obs = not_missing_train.columns[not_missing_train.isnull().sum() != 0].to_list()
            cols_with_na_from_null_obs = missing_train.columns[missing_train.isnull().sum() != 0].to_list()
            target_cols = [colName,'SalePrice']
            colsToDrop = target_cols + cols_with_na_from_not_null_obs + cols_with_na_from_null_obs

            not_missing_train_X = not_missing_train.loc[:,not_missing_train.columns.difference(colsToDrop)]
            not_missing_train_y = not_missing_train[colName]

            missing_train = missing_train.loc[:,missing_train.columns.difference(colsToDrop)]
            missing_test = missing_test.loc[:,missing_test.columns.difference(colsToDrop)]
            
            rf = self.randomForestImputation(not_missing_train_X, not_missing_train_y, train[colName].dtypes)
        
            if len(missing_train) != 0:
                imputed = rf.predict(missing_train)
                m = self.train[colName].isnull()
                self.train.loc[m, colName] = imputed
                
            if len(missing_test) != 0:
                imputed = rf.predict(missing_test)
                m = self.test[colName].isnull()
                self.test.loc[m, colName] = imputed
                
    def randomForestImputation(self, train_X, train_y, forestType):
        
        # performs random forest regression or random forest classification depending on if column is categorical
        # or numeric
        if forestType == 'object':
            rf = RandomForestClassifier()
            X_train, X_test, y_train, y_test = train_test_split(train_X, train_y, test_size=0.20, random_state=42)
            rf.fit(X_train, y_train)
            predictions = rf.predict(X_test)
            print(confusion_matrix(predictions, y_test))
            print(accuracy_score(predictions, y_test))
            print(precision_score(predictions, y_test, average='weighted'))
            
        else:
            rf = RandomForestRegressor()
            X_train, X_test, y_train, y_test = train_test_split(train_X, train_y, test_size=0.20, random_state=42)
            rf.fit(X_train, y_train)
            predictions = rf.predict(X_test)
            print(pd.DataFrame(predictions,y_test))
            print(mean_squared_error(predictions, y_test))
        
        return rf

        
    def dummify(self, cols_not_to_dummify):
        
        # get all category type columns
        category_cols = self.train.dtypes[self.train.dtypes == 'object'].index.to_list()

        try:
            for col in cols_not_to_dummify:
                category_cols.remove(col)
        except:
            print('Column not in category cols')
        
        dum_train_temp = pd.DataFrame()
        dum_test_temp = pd.DataFrame()
        dummies_train = []
        dummies_test = []
        for col in category_cols:
            dum_train_temp = pd.get_dummies(self.train[col], prefix=f'{col}Dummy', drop_first=True)
            dum_test_temp = pd.get_dummies(self.test[col], prefix= f'{col}Dummy', drop_first=True)
            dummies_train.append(dum_train_temp)
            dummies_test.append(dum_test_temp)
            
        
        dummies_train_concatenated = pd.concat(dummies_train, axis=1)
        train = pd.concat([self.train, dummies_train_concatenated], axis=1)
        train = train.drop(category_cols, axis=1)
       
        dummies_test_concatenated = pd.concat(dummies_test, axis=1)
        test = pd.concat([self.test, dummies_test_concatenated],axis=1)
        test = test.drop(category_cols, axis=1)
        
        missingCols = set(train.columns) - set(test.columns)
        for col in missingCols:
            if col != 'SalePrice':
                test[col] = 0
                
        
        missingCols = set(test.columns) - set(train.columns)        
        for col in missingCols:
            train[col] = 0
            
        return [train, test]
    
    
    def ordinalEncode(self, ordCols):
        
        for col in ordCols:
            self.train[col] = self.train[col].astype(str)
            self.test[col] = self.test[col].astype(str)

        full = pd.concat([self.train[ordCols], self.test[ordCols]],axis=0)
        
        
        ordEnc = OrdinalEncoder()
        ordEnc.fit(full)
        self.train[ordCols] = ordEnc.transform(self.train[ordCols])
        self.test[ordCols] = ordEnc.transform(self.test[ordCols])
        
        
    def fillAllMissingValues(self):
        self.cleanAlley()
        self.cleanBsmtQual()
        self.cleanBsmtCond()
        self.cleanBsmtExposure()
        self.cleanBsmtFinType1()
        self.cleanBsmtFinType2()
        self.cleanBsmtFinSF1()
        self.cleanBsmtFinSF2()
        self.cleanBsmtUnfSF()
        self.cleanMasVnrType()
        self.cleanMasVnrArea()
        self.cleanTotalBsmtSF()
        self.cleanBsmtHalfBath()
        self.cleanBsmtFullBath()
        self.cleanFireplaceQu()
        self.cleanGarageType()
        self.cleanGarageYrBlt()
        self.cleanGarageFinish()
        self.cleanGarageQual()
        self.cleanGarageFinish()
        self.cleanGarageQual()
        self.cleanGarageCond()
        self.cleanGarageCars()
        self.cleanGarageArea()
        self.cleanPoolQC()
        self.cleanFence()
        self.cleanMiscFeature()
        self.cleanMSSubClass()
        self.cleanElectrical()
        self.cleanMSZoning()
        self.cleanUtilities()
        self.cleanExterior1st()
        self.cleanExterior2nd()
        self.cleanKitchenQual()
        self.cleanFunctional()
        self.cleanSaleType()
        self.cleanLotFrontage()
        
    def addHasGarage(self):
        self.train['hasGarage'] = self.train['GarageType'].apply(lambda x: 0 if x=='No_Garage' else 1)
        self.test['hasGarage'] = self.test['GarageType'].apply(lambda x: 0 if x=='No_Garage' else 1)


    
    def addHasBsmt(self): 
        self.train['hasBsmt'] = self.train['BsmtQual'].apply(lambda x: 0 if x==0 else 1)
        self.test['hasBsmt'] = self.test['BsmtQual'].apply(lambda x: 0 if x==0 else 1)


    def addHasAlley_Access(self): 
        self.train['hasAlley_Access'] = self.train['Alley'].apply(lambda x: 0 if x=='No_Alley_Access' else 1)
        self.test['hasAlley_Access'] = self.test['Alley'].apply(lambda x: 0 if x=='No_Alley_Access' else 1)


    def addHasFireplace(self):
        self.train['hasFireplace'] = self.train['FireplaceQu'].apply(lambda x: 0 if x==0 else 1)
        self.test['hasFireplace'] = self.test['FireplaceQu'].apply(lambda x: 0 if x==0 else 1)


    def addHasPool(self):
        self.train['hasPool'] = self.train['PoolQC'].apply(lambda x: 0 if x==0 else 1)
        self.test['hasPool'] = self.test['PoolQC'].apply(lambda x: 0 if x==0 else 1)


    def addHasFence(self): 
        self.train['hasFence'] = self.train['Fence'].apply(lambda x: 0 if x==0 else 1)
        self.test['hasFence'] = self.test['Fence'].apply(lambda x: 0 if x==0 else 1)

        
    def addHasMisc_Feature(self):
        self.train['hasMisc_Feature'] = self.train['MiscFeature'].apply(lambda x: 0 if x=='No_Misc_Feature' else 1)
        self.test['hasMisc_Feature'] = self.test['MiscFeature'].apply(lambda x: 0 if x=='No_Misc_Feature' else 1)


    def addHasMasVnr(self):
        self.train['hasMasVnr'] = self.train['MasVnrType'].apply(lambda x: 0 if x=='No_MasVnr' else 1)
        self.test['hasMasVnr'] = self.test['MasVnrType'].apply(lambda x: 0 if x=='No_MasVnr' else 1)

    def getNumFloors(self):
        self.train['NumFloors'] = self.train.apply(self.getNumFloorsHelper, axis=1)
        self.test['NumFloors'] = self.test.apply(self.getNumFloorsHelper, axis=1)

    def getNumFloorsHelper(self, row):
        count = 0
        if row['TotalBsmtSF'] != 0:
            count = count + 1
        if row['1stFlrSF'] != 0:
            count = count + 1
        if row['2ndFlrSF'] != 0:
            count = count + 1
        return count


    def addGrLivAreaHighQualitySF(self):
        self.train['GrLivAreaHighQualitySF'] = (self.train['1stFlrSF'] + self.train['2ndFlrSF']) - self.train['LowQualFinSF']
        self.test['GrLivAreaHighQualitySF'] = (self.test['1stFlrSF'] + self.test['2ndFlrSF']) - self.test['LowQualFinSF']


    def addTotalIndoorSF(self):
        self.train['TotalIndoorSF'] = self.train['TotalBsmtSF'] + self.train['GrLivAreaHighQualitySF']
        self.test['TotalIndoorSF'] = self.test['TotalBsmtSF'] + self.test['GrLivAreaHighQualitySF']


    def addTotalOutdoorSF(self):
        self.train['TotalOutdoorSF'] = self.train['OpenPorchSF'] + self.train['EnclosedPorch'] + self.train['3SsnPorch'] + self.train['ScreenPorch'] 
        + self.train['WoodDeckSF'] + self.train['PoolArea']
        
        self.test['TotalOutdoorSF'] = self.test['OpenPorchSF'] + self.test['EnclosedPorch'] + self.test['3SsnPorch'] + self.test['ScreenPorch'] 
        + self.test['WoodDeckSF'] + self.test['PoolArea']


    def addTotalSF(self):
        self.train['TotalSF'] = self.train['TotalIndoorSF'] + self.train['TotalOutdoorSF'] + self.train['GarageArea']
        self.test['TotalSF'] = self.test['TotalIndoorSF'] + self.test['TotalOutdoorSF'] + self.test['GarageArea']


    def addBsmtBaths(self):
        self.train['BsmtBaths'] = self.train['BsmtFullBath'] + (0.5 * self.train['BsmtHalfBath'])
        self.test['BsmtBaths'] = self.test['BsmtFullBath'] + (0.5 * self.test['BsmtHalfBath'])


    def addBaths(self):
        self.train['Baths'] = self.train['FullBath'] + (0.5 * self.train['HalfBath'])
        self.test['Baths'] = self.test['FullBath'] + (0.5 * self.test['HalfBath'])


    def addHasBeenRemodeled(self):
        self.train['hasBeenRemodeled'] = self.train.apply(lambda x: 0 if x['YearBuilt'] == x['YearRemodAdd'] else 1, axis=1)
        self.test['hasBeenRemodeled'] = self.test.apply(lambda x: 0 if x['YearBuilt'] == x['YearRemodAdd'] else 1, axis=1)
        
    def addAllNewFeatures(self):
        self.addHasGarage()
        self.addHasBsmt()
        self.addHasAlley_Access()
        self.addHasFireplace()
        self.addHasPool()
        self.addHasFence()
        self.addHasMisc_Feature()
        self.addHasMasVnr()
        self.addGrLivAreaHighQualitySF()
        self.addTotalIndoorSF()
        self.addTotalOutdoorSF()
        self.addTotalSF()
        self.addBsmtBaths()
        self.addBaths()
        self.addHasBeenRemodeled()
        self.getNumFloors()
        

        
    def getCategoryColumns(self):
        catColsTrain = self.train.columns[self.train.dtypes == 'object']
        catColsTest = self.test.columns[self.test.dtypes == 'object']
        
        return [catColsTrain, catColsTest]
    
    def getNumericColumns(self):
        numColsTrain = self.train.columns[self.train.dtypes != 'object']
        numColsTest = self.test.columns[self.test.dtypes != 'object']
        
        return [numColsTrain, numColsTest]
    
        
    
            
            
        
        

In [4]:
cleaner = DataCleaner(train, test)
cleaner.fillAllMissingValues()
cleaner.addAllNewFeatures()

Dropping Rows
[[  0   0   0   0]
 [  0   0   0   0]
 [  0   0   0   0]
 [ 18   5   1 216]]
0.9
1.0
DroppingCols
[[  0   0   0   0   0]
 [  0  11   0   2   0]
 [  0   0   0   0   0]
 [  3   1   1 218   8]
 [  3   0   0   7  38]]
0.9143835616438356
0.9363364735319866
DroppingCols
[[292]]
1.0
1.0
Dropping Rows
[[ 0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  3  0  0  0  0  0  0  1  2]
 [ 0  0  0 17  0  0  0  0  0  0  0]
 [ 0  1  1  1 32  1  0  0  0  0  2]
 [ 1  0  0  0  0 32  0  0  0  0  0]
 [ 0  0  0  0  1  0 10  0  0  1  0]
 [ 0  0  0  0  0  1  0  3  0  1  0]
 [ 0  0  0  0  0  0  0  0 85  0  1]
 [ 3  0  2  1  0  1  1  1  0 32  3]
 [ 0  0  0  0  0  0  0  0  0  0  0]]
0.8879668049792531
0.9382493861965362
Dropping Rows
[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  1  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  1  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 18  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 

# Ordinal Encode Only

In [5]:
# ordinal encode columns with similar mappings
mapping = {'Po': 1, 'Fa': 2, 'TA' : 3, 'Gd' : 4, 'Ex': 5}
cleaner.train = cleaner.train.replace({'KitchenQual': mapping, 'FireplaceQu': mapping, 'GarageQual': mapping, 
                                       'GarageCond': mapping, 'PoolQC': mapping, 'HeatingQC': mapping, 
                                       'BsmtQual': mapping, 'BsmtCond': mapping,'ExterQual': mapping, 'ExterCond': mapping})

cleaner.test = cleaner.test.replace({'KitchenQual': mapping, 'FireplaceQu': mapping, 'GarageQual': mapping, 
                                     'GarageCond': mapping,'PoolQC': mapping, 'HeatingQC': mapping, 'BsmtQual': mapping, 
                                     'BsmtCond': mapping, 'ExterQual': mapping, 'ExterCond': mapping})


# ordinal encode all category columns
ordCols = cleaner.train.columns[cleaner.train.dtypes == 'object']
cleaner.ordinalEncode(ordCols)

#cleaner.train.to_csv('./data/train_filled_na_ord.csv')
#cleaner.test.to_csv('./data/test_filled_na_ord.csv')


# Dummify and Ordinal Encode

In [None]:
# Ordinal encode only columns with a natural ordering
ordCols = ['OverallQual', 'OverallCond', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 
            'KitchenAbvGr','TotRmsAbvGrd', 'Fireplaces', 'GarageCars','MoSold', 'YrSold','BsmtExposure',
           'BsmtFinType1', 'BsmtFinType2','Fence','LandSlope','GarageFinish']

cleaner.ordinalEncode(ordCols)

# dummify all category columns that are not ordinal
trained, tested = cleaner.dummify(cols_not_to_dummify=ordCols)

# trained.to_csv('./data/train_filled_na.csv')
# tested.to_csv('./data/test_filled_na.csv')