# Pre - Processing

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from Helpers import extract_cat_features, extract_num_features, minus_1_imputer, most_frequent_imputer

In [2]:
class preprocessor:
    
    def __init__(self, cols_to_filter = None, cols_to_impute_minus_1 = None, cols_to_impute_most_frequent = None):
        
        self.cols_to_filter = cols_to_filter
        self.cols_to_impute_minus_1 = cols_to_impute_minus_1
        self.cols_to_impute_most_frequent = cols_to_impute_most_frequent
        self.was_fit = False
        
    def fit(self, x, y=None):
        
        self.was_fit = True
        
        # filter
        x_new = x.drop(self.cols_to_filter, axis=1)
        
        
        # impute
        x_new = minus_1_imputer(x_new, self.cols_to_impute_minus_1)
        x_new = minus_1_imputer(x_new, self.cols_to_impute_most_frequent)
             
        
        # dummy code
        self.categorical_features = extract_cat_features(x_new)
        dummied = pd.get_dummies(x_new, columns = self.categorical_features, dummy_na=True, drop_first = True)
        self.colnames = dummied.columns
        del dummied
        
        return self
        
    def transform(self, x, y=None):
        
        if not self.was_fit:
            raise Error("need to fit preprocessor first")
            
        # filter   
        x_new = x.drop(self.cols_to_filter, axis = 1)
        
        # impute
        x_new = minus_1_imputer(x_new, self.cols_to_impute_minus_1)
        x_new = minus_1_imputer(x_new, self.cols_to_impute_most_frequent)
    
        # dummy code
        x_new = pd.get_dummies(x_new, columns=self.categorical_features, dummy_na=True, drop_first = True)
        newcols = set(self.colnames) - set(x_new.columns)
        for x in newcols:
            x_new[x] = 0
            
        x_new = x_new[self.colnames]  
        return x_new
    
    def fit_transform(self, x, y=None):
        """fit and transform wrapper method, used for sklearn pipeline"""

        return self.fit(x).transform(x)
        

## Data Load

In [3]:
from Helpers import x_y_split, read_dataframe

In [4]:
train = read_dataframe(path = 'train.csv', analyze = False)

In [5]:
x, y = x_y_split(train)

In [6]:
x.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal


In [35]:
p = preprocessor(cols_to_filter=['1stFlrSF',
                                 '3SsnPorch',
                                 'Alley',
                                 'BldgType',
                                 'BsmtFinSF2',
                                 'BsmtFinType1',
                                 'BsmtFinType2',
                                 'BsmtHalfBath',
                                 'Condition1',
                                 'Condition2',
                                 'EnclosedPorch',
                                 'Exterior1st',
                                 'Exterior2nd',
                                 'Fence',
                                 'Functional',
                                 'GarageCars',
                                 'GarageCond',
                                 'GarageType',
                                 'GarageYrBlt',
                                 'HeatingQC',
                                 'HouseStyle',
                                 'Id',
                                 'KitchenAbvGr',
                                 'LandContour',
                                 'LandSlope',
                                 'LowQualFinSF',
                                 'MSSubClass',
                                 'MiscFeature',
                                 'MiscVal',
                                 'MoSold',
                                 'PavedDrive',
                                 'PoolArea',
                                 'PoolQC',
                                 'RoofMatl',
                                 'RoofStyle',
                                 'TotRmsAbvGrd',
                                 'Utilities',
                                 'YrSold'],                 
                cols_to_impute_minus_1 = ['LotFrontage','FireplaceQu', 
                                     'GarageFinish', 'GarageQual', 'MasVnrType','Electrical'],
                cols_to_impute_most_frequent = ['MasVnrArea','BsmtQual', 'BsmtCond', 'BsmtExposure', 
                                     'GarageFinish', 'GarageQual', 'MasVnrType','OverallCond'])
                


In [37]:
x_transformed = p.fit_transform(x)
x_transformed.head()

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,...,SaleType_New,SaleType_Oth,SaleType_WD,SaleType_nan,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SaleCondition_nan
0,65.0,8450,7,5,2003,2003,196.0,706,150,856,...,0,0,1,0,0,0,0,1,0,0
1,80.0,9600,6,8,1976,1976,0.0,978,284,1262,...,0,0,1,0,0,0,0,1,0,0
2,68.0,11250,7,5,2001,2002,162.0,486,434,920,...,0,0,1,0,0,0,0,1,0,0
3,60.0,9550,7,5,1915,1970,0.0,216,540,756,...,0,0,1,0,0,0,0,0,0,0
4,84.0,14260,8,5,2000,2000,350.0,655,490,1145,...,0,0,1,0,0,0,0,1,0,0


In [38]:
assert all(x_transformed.isna().sum() == 0)

In [39]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [40]:
x_train, x_test, y_train, y_test = train_test_split(x_transformed, y, test_size = 0.333, random_state = 365, shuffle = True)

In [41]:
model = GradientBoostingRegressor()
model.fit(x_train, y_train)
predictions = model.predict(x_test)
r2_score(y_test, predictions)

0.9067296043059042