In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_log_error, mean_squared_error


In [2]:
train = pd.read_csv('train.csv',index_col='Id')
test = pd.read_csv('test.csv',index_col='Id')

In [3]:
class DataCleaner:
    
    
    def __init__(self, train, test):
        self.train = train.copy()
        self.test = test.copy()
        print(self.train.dtypes[self.train.dtypes == 'object'].index)
        
    
    def cleanAlley(self):
        self.train.loc[:,'Alley'] = self.train['Alley'].fillna('No_Alley_Access')
        self.test.loc[:,'Alley'] = self.test['Alley'].fillna('No_Alley_Access')

        
    def cleanMasVnrType(self):
        self.train = self.train[self.train['MasVnrType'].notna()]
        self.test = self.test[self.test['MasVnrType'].notna()]
        
    
    def cleanMasVnrArea(self):
        self.train = self.train[self.train['MasVnrArea'].notna()]
        self.test = self.test[self.test['MasVnrArea'].notna()]
    
    
    def cleanBsmt(self):
        self.cleanBsmtQual()
        self.cleanBsmtCond()
        self.cleanBsmtExposure()
        self.cleanBsmtFinType1()
        self.cleanBsmtFinType2()
        self.cleanBsmtFinSF1()
        self.cleanBsmtFinSF2()
        self.cleanBsmtUnfSF()
        self.cleanTotalBsmtSF()
        self.cleanBsmtFullBath()
        self.cleanBsmtHalfBath()
    
    def cleanBsmtQual(self):
        self.train.loc[:,'BsmtQual'] = self.train['BsmtQual'].fillna('No_Basement')
        self.test.loc[:,'BsmtQual'] = self.test['BsmtQual'].fillna('No_Basement')
        
        
    def cleanBsmtCond(self):
        self.train.loc[:,'BsmtCond'] = self.train['BsmtCond'].fillna('No_Basement')
        self.test.loc[:,'BsmtCond'] = self.test['BsmtCond'].fillna('No_Basement')
        
    
    def cleanBsmtExposure(self):
        self.train.loc[:,'BsmtExposure'] = self.train['BsmtExposure'].fillna('No_Basement')
        self.test.loc[:,'BsmtExposure'] = self.test['BsmtExposure'].fillna('No_Basement')
        
        
    def cleanBsmtFinType1(self):
        self.train.loc[:,'BsmtFinType1'] = self.train['BsmtFinType1'].fillna('No_Basement')
        self.test.loc[:,'BsmtFinType1'] = self.test['BsmtFinType1'].fillna('No_Basement')

        
    def cleanBsmtFinType2(self):
        self.train.loc[:,'BsmtFinType2'] = self.train['BsmtFinType2'].fillna('No_Basement')
        self.test.loc[:,'BsmtFinType2'] = self.test['BsmtFinType2'].fillna('No_Basement')
        
        
    def cleanBsmtFinSF1(self):

        self.train.loc[:,'BsmtFinSF1'] = self.train['BsmtFinSF1'].fillna('No_Basement')
        self.train['BsmtFinSF1'] = self.train['BsmtFinSF1'].astype('object')
        
        self.test.loc[:,'BsmtFinSF1'] = self.test['BsmtFinSF1'].fillna('No_Basement')
        self.test['BsmtFinSF1'] = self.test['BsmtFinSF1'].astype('object')
        

        
        self.train[(self.train['BsmtQual'] == 'No_Basement') & (self.train['BsmtCond'] == 'No_Basement')
                  & (self.train['BsmtExposure'] == 'No_Basement') & (self.train['BsmtFinType1'] == 'No_Basement')
                  & (self.train['BsmtFinType2'] == 'No_Basement') & (self.train['BsmtFinSF1'] == 0)]['BsmtFinSF1'] = 'No_Basement'
        
        self.test[(self.test['BsmtQual'] == 'No_Basement') & (self.test['BsmtCond'] == 'No_Basement')
                  & (self.test['BsmtExposure'] == 'No_Basement') & (self.test['BsmtFinType1'] == 'No_Basement')
                  & (self.test['BsmtFinType2'] == 'No_Basement') & (self.test['BsmtFinSF1'] == 0)]['BsmtFinSF1'] = 'No_Basement'
        

        print(self.test.dtypes)
        
    def cleanBsmtFinSF2(self):
        self.train.loc[:,'BsmtFinSF2'] = self.train['BsmtFinSF2'].fillna('No_Basement')
        self.train['BsmtFinSF2'] = self.train['BsmtFinSF2'].astype('object')
        
        self.test.loc[:,'BsmtFinSF2'] = self.test['BsmtFinSF2'].fillna('No_Basement')
        self.test['BsmtFinSF1'] = self.test['BsmtFinSF1'].astype('object')
        
        self.train[(self.train['BsmtQual'] == 'No_Basement') & (self.train['BsmtCond'] == 'No_Basement')
                  & (self.train['BsmtExposure'] == 'No_Basement') & (self.train['BsmtFinType1'] == 'No_Basement')
                  & (self.train['BsmtFinType2'] == 'No_Basement') & (self.train['BsmtFinSF2'] == 0)]['BsmtFinSF2'] = 'No_Basement'
        
        self.test[(self.test['BsmtQual'] == 'No_Basement') & (self.test['BsmtCond'] == 'No_Basement')
          & (self.test['BsmtExposure'] == 'No_Basement') & (self.test['BsmtFinType1'] == 'No_Basement')
          & (self.test['BsmtFinType2'] == 'No_Basement') & (self.test['BsmtFinSF2'] == 0)]['BsmtFinSF2'] = 'No_Basement'
        

    def cleanBsmtUnfSF(self):
        self.train.loc[:,'BsmtUnfSF'] = self.train['BsmtUnfSF'].fillna('No_Basement')
        self.train['BsmtUnfSF'] = self.train['BsmtUnfSF'].astype('object')
        
        self.test.loc[:,'BsmtUnfSF'] = self.test['BsmtUnfSF'].fillna('No_Basement')
        self.test['BsmtUnfSF'] = self.test['BsmtUnfSF'].astype('object')
        

        self.train[(self.train['BsmtQual'] == 'No_Basement') & (self.train['BsmtCond'] == 'No_Basement')
          & (self.train['BsmtExposure'] == 'No_Basement') & (self.train['BsmtFinType1'] == 'No_Basement')
          & (self.train['BsmtFinType2'] == 'No_Basement') & (self.train['BsmtUnfSF'] == 0)]['BsmtUnfSF'] = 'No_Basement'
        
        self.test[(self.test['BsmtQual'] == 'No_Basement') & (self.test['BsmtCond'] == 'No_Basement')
          & (self.test['BsmtExposure'] == 'No_Basement') & (self.test['BsmtFinType1'] == 'No_Basement')
          & (self.test['BsmtFinType2'] == 'No_Basement') & (self.test['BsmtUnfSF'] == 0)]['BsmtUnfSF'] = 'No_Basement'
        
        
    def cleanTotalBsmtSF(self):
        self.train.loc[:,'TotalBsmtSF'] = self.train['TotalBsmtSF'].fillna('No_Basement')
        self.train['TotalBsmtSF'] = self.train['TotalBsmtSF'].astype('object')
        
        self.test.loc[:,'TotalBsmtSF'] = self.test['TotalBsmtSF'].fillna('No_Basement')
        self.test['TotalBsmtSF'] = self.test['TotalBsmtSF'].astype('object')
        

        self.train[(self.train['BsmtQual'] == 'No_Basement') & (self.train['BsmtCond'] == 'No_Basement')
          & (self.train['BsmtExposure'] == 'No_Basement') & (self.train['BsmtFinType1'] == 'No_Basement')
          & (self.train['BsmtFinType2'] == 'No_Basement') & (self.train['TotalBsmtSF'] == 0)]['TotalBsmtSF'] = 'No_Basement'
        
        self.test[(self.test['BsmtQual'] == 'No_Basement') & (self.test['BsmtCond'] == 'No_Basement')
          & (self.test['BsmtExposure'] == 'No_Basement') & (self.test['BsmtFinType1'] == 'No_Basement')
          & (self.test['BsmtFinType2'] == 'No_Basement') & (self.test['TotalBsmtSF'] == 0)]['TotalBsmtSF'] = 'No_Basement'

        
        
    def cleanBsmtFullBath(self):
        self.train.loc[:,'BsmtFullBath'] = self.train['BsmtFullBath'].fillna('No_Basement')
        self.train['BsmtFullBath'] = self.train['BsmtFullBath'].astype('object')

        
        self.test.loc[:,'BsmtFullBath'] = self.test['BsmtFullBath'].fillna('No_Basement')
        self.test['BsmtFullBath'] = self.test['BsmtFullBath'].astype('object')
        
        
        self.train[(self.train['BsmtQual'] == 'No_Basement') & (self.train['BsmtCond'] == 'No_Basement')
          & (self.train['BsmtExposure'] == 'No_Basement') & (self.train['BsmtFinType1'] == 'No_Basement')
          & (self.train['BsmtFinType2'] == 'No_Basement') & (self.train['BsmtFullBath'] == 0)]['BsmtFullBath'] = 'No_Basement'
        
        self.test[(self.test['BsmtQual'] == 'No_Basement') & (self.test['BsmtCond'] == 'No_Basement')
          & (self.test['BsmtExposure'] == 'No_Basement') & (self.test['BsmtFinType1'] == 'No_Basement')
          & (self.test['BsmtFinType2'] == 'No_Basement') & (self.test['BsmtFullBath'] == 0)]['BsmtFullBath'] = 'No_Basement'

        
        
    def cleanBsmtHalfBath(self):
        self.train.loc[:,'BsmtHalfBath'] = self.train['BsmtHalfBath'].fillna('No_Basement')
        self.train['BsmtHalfBath'] = self.train['BsmtHalfBath'].astype('object')

        self.test.loc[:,'BsmtHalfBath'] = self.test['BsmtHalfBath'].fillna('No_Basement')
        self.test['BsmtHalfBath'] = self.test['BsmtHalfBath'].astype('object')
        
        
        self.train[(self.train['BsmtQual'] == 'No_Basement') & (self.train['BsmtCond'] == 'No_Basement')
          & (self.train['BsmtExposure'] == 'No_Basement') & (self.train['BsmtFinType1'] == 'No_Basement')
          & (self.train['BsmtFinType2'] == 'No_Basement') & (self.train['BsmtHalfBath'] == 0)]['BsmtHalfBath'] = 'No_Basement'
        
        self.test[(self.test['BsmtQual'] == 'No_Basement') & (self.test['BsmtCond'] == 'No_Basement')
          & (self.test['BsmtExposure'] == 'No_Basement') & (self.test['BsmtFinType1'] == 'No_Basement')
          & (self.test['BsmtFinType2'] == 'No_Basement') & (self.test['BsmtHalfBath'] == 0)]['BsmtHalfBath'] = 'No_Basement'
        
        
    def cleanElectrical(self):
        self.train = self.train[self.train['Electrical'].notna()]
        self.test = self.test[self.test['Electrical'].notna()]
    
    
    def cleanFireplaceQu(self):
        self.train.loc[:,'FireplaceQu'] = self.train['FireplaceQu'].fillna('No_Fireplace')
        self.test.loc[:,'FireplaceQu'] = self.test['FireplaceQu'].fillna('No_Fireplace')
    
    
    def cleanGarage(self):
        self.cleanGarageType()
        self.cleanGarageYrBlt()
        self.cleanGarageFinish()
        self.cleanGarageQual()
        self.cleanGarageCond()
        self.cleanGarageCars()
        self.cleanGarageArea()
        
        
    def cleanGarageType(self):
        self.train.loc[:,'GarageType'] = self.train['GarageType'].fillna('No_Garage')
        self.test.loc[:,'GarageType'] = self.test['GarageType'].fillna('No_Garage')
        
        
    def cleanGarageYrBlt(self):
        self.train.loc[:,'GarageYrBlt'] = self.train['GarageYrBlt'].fillna('No_Garage')
        self.train['GarageYrBlt'] = self.train['GarageYrBlt'].astype('object')
        
        self.test.loc[:,'GarageYrBlt'] = self.test['GarageYrBlt'].fillna('No_Garage')
        self.test['GarageYrBlt'] = self.test['GarageYrBlt'].astype('object')
        
        
    def cleanGarageFinish(self):
        self.train.loc[:,'GarageFinish'] = self.train['GarageFinish'].fillna('No_Garage')
        self.train['GarageFinish'] = self.train['GarageFinish'].astype('object')
        
        self.test.loc[:,'GarageFinish'] = self.test['GarageFinish'].fillna('No_Garage')
        self.test['GarageFinish'] = self.test['GarageFinish'].astype('object')
        
        
    def cleanGarageQual(self):
        self.train.loc[:,'GarageQual'] = self.train['GarageQual'].fillna('No_Garage')
        self.train['GarageQual'] = self.train['GarageQual'].astype('object')
        
        self.test.loc[:,'GarageQual'] = self.test['GarageQual'].fillna('No_Garage')
        self.test['GarageQual'] = self.test['GarageQual'].astype('object')
        
          
    def cleanGarageCond(self):
        self.train.loc[:,'GarageCond'] = self.train['GarageCond'].fillna('No_Garage')
        self.train['GarageCond'] = self.train['GarageCond'].astype('object')
        
        self.test.loc[:,'GarageCond'] = self.test['GarageCond'].fillna('No_Garage')
        self.test['GarageCond'] = self.test['GarageCond'].astype('object')
        
        
    def cleanGarageCars(self):
        self.train.loc[:,'GarageCars'] = self.train['GarageCars'].fillna('No_Garage')
        self.train['GarageCars'] = self.train['GarageCars'].astype('object')
        
        self.test.loc[:,'GarageCars'] = self.test['GarageCars'].fillna('No_Garage')
        self.test['GarageCars'] = self.test['GarageCars'].astype('object')
        
        
        self.train[(self.train['GarageType'] == 'No_Garage') & (self.train['GarageYrBlt'] == 'No_Garage')
          & (self.train['GarageFinish'] == 'No_Garage') & (self.train['GarageQual'] == 'No_Garage')
          & (self.train['GarageCond'] == 'No_Garage') & (self.train['GarageCars'] == 0)]['GarageCars'] = 'No_Garage'
        
        self.test[(self.test['GarageType'] == 'No_Garage') & (self.test['GarageYrBlt'] == 'No_Garage')
          & (self.test['GarageFinish'] == 'No_Garage') & (self.test['GarageQual'] == 'No_Garage')
          & (self.test['GarageCond'] == 'No_Garage') & (self.test['GarageCars'] == 0)]['GarageCars'] = 'No_Garage'
        
        
    def cleanGarageArea(self):
        self.train.loc[:,'GarageArea'] = self.train['GarageArea'].fillna('No_Garage')
        self.train['GarageArea'] = self.train['GarageArea'].astype('object')
        
        self.test.loc[:,'GarageArea'] = self.test['GarageArea'].fillna('No_Garage')
        self.test['GarageArea'] = self.test['GarageArea'].astype('object')
        
        
        self.train[(self.train['GarageType'] == 'No_Garage') & (self.train['GarageYrBlt'] == 'No_Garage')
          & (self.train['GarageFinish'] == 'No_Garage') & (self.train['GarageQual'] == 'No_Garage')
          & (self.train['GarageCond'] == 'No_Garage') & (self.train['GarageArea'] == 0)]['GarageArea'] = 'No_Garage'
        
        self.test[(self.test['GarageType'] == 'No_Garage') & (self.test['GarageYrBlt'] == 'No_Garage')
          & (self.test['GarageFinish'] == 'No_Garage') & (self.test['GarageQual'] == 'No_Garage')
          & (self.test['GarageCond'] == 'No_Garage') & (self.test['GarageArea'] == 0)]['GarageArea'] = 'No_Garage'
        
        
    def cleanPoolQC(self):
        self.train.loc[:,'PoolQC'] = self.train['PoolQC'].fillna('No_Pool')
        self.test.loc[:,'PoolQC'] = self.test['PoolQC'].fillna('No_Pool')
    
    
    def cleanFence(self):
        self.train.loc[:,'Fence'] = self.train['Fence'].fillna('No_Fence')
        self.test.loc[:,'Fence'] = self.test['Fence'].fillna('No_Fence')

    
    def cleanMiscFeature(self):
        self.train.loc[:,'MiscFeature'] = self.train['MiscFeature'].fillna('No_Misc_Feature')
        self.test.loc[:,'MiscFeature'] = self.test['MiscFeature'].fillna('No_Misc_Feature')

    
    def cleanLotFrontage(self):
        test_na = self.train[self.train['LotFrontage'].isnull()]
        train_na = self.train[self.train['LotFrontage'].notnull()]
        test_na_X = test_na.drop(['LotFrontage'], axis=1)
        train_na_X = train_na.drop(['LotFrontage'], axis=1)
        train_na_y = train_na['LotFrontage']
        rf = RandomForestRegressor()
        rf.fit(train_na_X, train_na_y)
        lot_frontage_imputed = rf.predict(test_na_X)
        m = self.train['LotFrontage'].isnull()
        self.train.loc[m, 'LotFrontage'] = lot_frontage_imputed
        
        test_na = self.test[self.test['LotFrontage'].isnull()]
        train_na = self.test[self.test['LotFrontage'].notnull()]
        test_na_X = test_na.drop(['LotFrontage'], axis=1)
        train_na_X = train_na.drop(['LotFrontage'], axis=1)
        train_na_y = train_na['LotFrontage']
        rf = RandomForestRegressor()
        rf.fit(train_na_X, train_na_y)
        lot_frontage_imputed = rf.predict(test_na_X)
        m = self.test['LotFrontage'].isnull()
        self.test.loc[m, 'LotFrontage'] = lot_frontage_imputed
    
    def cleanMSZoning(self):
        self.train = self.train[self.train['MSZoning'].notna()]
        self.test = self.test[self.test['MSZoning'].notna()]
        
        
    def cleanUtilities(self):
        self.train = self.train[self.train['Utilities'].notna()]
        self.test = self.test[self.test['Utilities'].notna()]
        
        
    def cleanExterior1st(self):
        self.train = self.train[self.train['Exterior1st'].notna()]
        self.test = self.test[self.test['Exterior1st'].notna()]
        
    def cleanExterior2nd(self):
        self.train = self.train[self.train['Exterior2nd'].notna()]
        self.test = self.test[self.test['Exterior2nd'].notna()]
        
    def cleanKitchenQual(self):
        self.train = self.train[self.train['KitchenQual'].notna()]
        self.test = self.test[self.test['KitchenQual'].notna()]
        
        
    def cleanFunctional(self):
        self.train = self.train[self.train['Functional'].notna()]
        self.test = self.test[self.test['Functional'].notna()]
        
        
    def cleanSaleType(self):
        self.train = self.train[self.train['SaleType'].notna()]
        self.test = self.test[self.test['SaleType'].notna()]
        
        
    def cleanAllColumns(self):
        self.cleanAlley()
        self.cleanBsmt()
        self.cleanElectrical()
        self.cleanFence()
        self.cleanFireplaceQu()
        self.cleanGarage()
        self.cleanMasVnrArea()
        self.cleanMasVnrType()
        self.cleanMiscFeature()
        self.cleanPoolQC()
        self.cleanMSZoning()
        self.cleanUtilities()
        self.cleanExterior1st()
        self.cleanExterior2nd()
        self.cleanKitchenQual()
        self.cleanFunctional()
        self.cleanSaleType() 

        
    def dummify(self):
        cat_cols = self.train.dtypes[self.train.dtypes == 'object'].index
        
        dummies_train = []
        dummies_test = []
        for col in cat_cols:
            dummies_train.append(pd.get_dummies(self.train[col], prefix=f'{col}Dummy', drop_first=True))
            dummies_test.append(pd.get_dummies(self.test[col], prefix= f'{col}Dummy', drop_first=True))
        
        
        self.train = pd.concat([self.train, pd.concat(dummies_train, axis=1)],axis=1)
        self.train = self.train.drop(cat_cols, axis=1)

        
        self.test = pd.concat([self.test, pd.concat(dummies_test, axis=1)],axis=1)
        self.test = self.test.drop(cat_cols, axis=1)
        
        missingCols = set(self.train.columns) - set(self.test.columns)
        for col in missingCols:
            if col != 'SalePrice':
                self.test[col] = 0
            
        missingCols = set(self.test.columns) - set(self.train.columns)        
        for col in missingCols:
            self.train[col] = 0
            
            
        
        

In [4]:
cleaned = DataCleaner(train, test)

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')


In [5]:
cleaned.cleanAllColumns()

MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
Street            object
                  ...   
MiscVal            int64
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
Length: 79, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.train[(self.train['BsmtQual'] == 'No_Basement') & (self.train['BsmtCond'] == 'No_Basement')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.test[(self.test['BsmtQual'] == 'No_Basement') & (self.test['BsmtCond'] == 'No_Basement')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.train[(se

In [6]:
data = cleaned.dummify()

In [7]:
cleaned.cleanLotFrontage()

In [8]:
X = cleaned.train.loc[:, cleaned.train.columns != 'SalePrice']
y = cleaned.train['SalePrice']

In [9]:
forest = RandomForestRegressor()
params = {'n_estimators': [100]}
grid = GridSearchCV(forest, param_grid=params, cv=5)
grid.fit(X, y)
grid.cv_results_

{'mean_fit_time': array([8.01847563]),
 'std_fit_time': array([0.0602919]),
 'mean_score_time': array([0.03867011]),
 'std_score_time': array([0.00162997]),
 'param_n_estimators': masked_array(data=[100],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'n_estimators': 100}],
 'split0_test_score': array([0.86949371]),
 'split1_test_score': array([0.83330053]),
 'split2_test_score': array([0.86096663]),
 'split3_test_score': array([0.88328509]),
 'split4_test_score': array([0.78801914]),
 'mean_test_score': array([0.84701302]),
 'std_test_score': array([0.03371664]),
 'rank_test_score': array([1], dtype=int32)}