
## **Ames House Price Prediction using MLPRegressor Pipeline**


### **Introduction**

This is a continuation of this project. Pipeline was built using MLPRegressor as the estimator.


In [1]:
import numpy as np
import pandas as pd
import time
from scipy.stats import skew
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')

In [2]:
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

In [3]:
train= pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
train.head(2)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500


In [5]:
class TransformColumns(BaseEstimator, TransformerMixin):
  
  def __init__(self, transform_ordcols = True):
    self.transform_ordcols = transform_ordcols

  def fit(self, X, y=None):
    return self
    
  def transform(self, X, y=None):
    """
    Transforms ordinal features to discrete values
    
    Returns:
            X: the modified dataset      
    """
    if self.transform_ordcols:
      X['LotShape'] = X['LotShape'].replace({'Reg': 3, 'IR1': 2, 'IR2': 2, 'IR3': 1})
      X['Utilities'] = X['Utilities'].replace({'AllPub': 2, 'NoSeWa': 1})
      X['LandSlope'] = X['LandSlope'].replace({'Gtl': 3, 'Mod': 2, 'Sev': 1})
      X['ExterQual'] = X['ExterQual'].replace({'Gd': 3, 'TA': 2, 'Ex': 4, 'Fa': 1})
      X['BsmtQual'] = X['BsmtQual'].replace({'Gd': 3, 'TA': 2, 'Ex': 4, 'Fa': 1})
      X['BsmtCond'] = X['BsmtCond'].replace({'TA':3, 'Gd':4, 'Fa':2, 'Po': 1})
      X['BsmtExposure'] = X['BsmtExposure'].replace({'No': 1,  'Gd': 4, 'Mn': 2, 'Av': 3})
      X['BsmtFinType1'] = X['BsmtFinType1'].replace({'GLQ': 3, 'ALQ': 2,  'Unf': 1, 'Rec': 2, 'BLQ': 1, 'LwQ': 1})
      X['BsmtFinType2'] = X['BsmtFinType2'].replace({'GLQ': 3, 'ALQ': 2,  'Unf': 1, 'Rec': 2, 'BLQ': 1, 'LwQ': 1})
      X['HeatingQC'] = X['HeatingQC'].replace({'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1})
      X['Electrical'] = X['Electrical'].replace({'SBrkr': 4, 'FuseF': 2, 'FuseA': 3, 'FuseP': 1, 'Mix': 3 })
      X['KitchenQual'] = X['KitchenQual'].replace({'Gd':3, 'TA': 2, 'Ex': 4, 'Fa': 1})
      X['Functional'] = X['Functional'].replace({'Typ': 4, 'Min1': 3, 'Maj1': 2, 'Min2': 3,  'Mod': 3, 'Maj2': 2, 'Sev': 1})
      X['FireplaceQu'] = X['FireplaceQu'].replace({'TA': 2, 'Gd': 3, 'Fa': 2, 'Ex': 4,  'Po': 1})

      X['GarageType'] = X['GarageType'].replace({'Attchd':2, 'Detchd': 1, 'BuiltIn': 2, 'CarPort': 1, 'Basment': 2, '2Types': 2})

      X['GarageFinish'] = X['GarageFinish'].replace({'RFn':2,  'Unf':1, 'Fin': 3})

      X['GarageQual'] = X['GarageQual'].replace({'TA': 3, 'Fa': 2, 'Gd': 4, 'Ex': 5, 'Po': 1})
      X['GarageCond'] = X['GarageCond'].replace({'TA': 3, 'Fa': 2, 'Gd': 4, 'Ex': 5, 'Po': 1})
      X['PavedDrive'] = X['PavedDrive'].replace({'Y': 3, 'N': 1, 'P': 2})
      X['PoolQC'] = X['PoolQC'].replace({'Ex':3, 'Fa': 2, 'Gd':1})
      X['Fence'] = X['Fence'].replace({'MnPrv': 3, 'GdWo':2, 'GdPrv': 4, 'MnWw': 1})
      X['CentralAir'] = X['CentralAir'].replace({'Y': 1, 'N': 0})
      X['ExterCond'] = X['ExterCond'].replace({'TA': 3, 'Gd': 4, 'Fa': 2, 'Ex': 5, 'Po': 1})
      return X



In [6]:
class InputMissing(BaseEstimator, TransformerMixin):
  def __init__(self, input_missingsvals = True):
    self.input_missingsvals = input_missingsvals

  def fit(self, X, y=None):
    return self
    
  def transform(self, X, y=None):
    """
    Inputes missing values
    
    Returns:
            X: the modified dataset      
    """
    if self.input_missingsvals:
      X['LotFrontage'] = X['LotFrontage'].fillna(X['LotFrontage'].median())
      X['MasVnrArea'] = X['MasVnrArea'].fillna(X['MasVnrArea'].median())
      X['GarageYrBlt'] = X['GarageYrBlt'].fillna(0)
      X['MasVnrType'] = X['MasVnrType'].fillna(X['MasVnrType'].mode()[0])
      ordinal_cat_features= ['LotShape', 'Utilities','LandSlope','ExterQual','BsmtQual', 'BsmtCond', 'BsmtExposure','BsmtFinType1', 
                       'BsmtFinType2','HeatingQC', 'Electrical', 'KitchenQual','Functional', 'FireplaceQu',
                       'GarageType', 'GarageFinish', 'GarageQual','GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'CentralAir', 'ExterCond']

      X[ordinal_cat_features]= X[ordinal_cat_features].fillna(0)
    return X

In [7]:
class AddPolynomials(BaseEstimator, TransformerMixin):
  def __init__(self):
    pass

  def fit(self, X, y=None):
    return self
    
  def transform(self, X, y=None):
    """
    Generates polynomial features
    
    Returns:
            X: the modified dataset      
    """
    X["OverallQual_2"] = X["OverallQual"] ** 2
    X["OverallQual_3"] = X["OverallQual"] ** 3
    X["OverallQual_sq"] = np.sqrt(X["OverallQual"])

    X["AllSF_2"] = X["AllSF"] ** 2
    X["AllSF_3"] = X["AllSF"] ** 3
    X["AllSF_sq"] = np.sqrt(X["AllSF"])

    X["GrLivArea_2"] = X["GrLivArea"] ** 2
    X["GrLivArea_3"] = X["GrLivArea"] ** 3
    X["GrLivArea_sq"] = np.sqrt(X["GrLivArea"]
                                              )
    X["Median_neigh_area_2"] = X["Median_neigh_area"] ** 2
    X["Median_neigh_area_3"] = X["Median_neigh_area"] ** 3
    X["Median_neigh_area_sq"] = np.sqrt(X["Median_neigh_area"])

    X["GarageCars_2"] = X["GarageCars"] ** 2
    X["GarageCars_3"] = X["GarageCars"] ** 3
    X["GarageCars_sq"] = np.sqrt(X["GarageCars"])

    X["TotalBath_2"] = X["TotalBath"] ** 2
    X["TotalBath_3"] = X["TotalBath"] ** 3
    X["TotalBath_sq"] = np.sqrt(X["TotalBath"])

    X["GarageScore_2"] = X["GarageScore"] ** 2
    X["GarageScore_3"] = X["GarageScore"] ** 3
    X["GarageScore_sq"] = np.sqrt(X["GarageScore"])

    X["GarageGrade_2"] = X["GarageGrade"] ** 2
    X["GarageGrade_3"] = X["GarageGrade"] ** 3
    X["GarageGrade_sq"] = np.sqrt(X["GarageGrade"])

    X["GarageArea_2"] = X["GarageArea"] ** 2
    X["GarageArea_3"] = X["GarageArea"] ** 3
    X["GarageArea_sq"] = np.sqrt(X["GarageArea"])

    X["ExterGrade_2"] = X["ExterGrade"] ** 2
    X["ExterGrade_3"] = X["ExterGrade"] ** 3
    X["ExterGrade_sq"] = np.sqrt(X["ExterGrade"])

    X["TotalBsmtSF_2"] = X["TotalBsmtSF"] ** 2
    X["TotalBsmtSF_3"] = X["TotalBsmtSF"] ** 3
    X["TotalBsmtSF_sq"] = np.sqrt(X["TotalBsmtSF"])

    X["1stFlrSF_2"] = X["1stFlrSF"] ** 2
    X["1stFlrSF_3"] = X["1stFlrSF"] ** 3
    X["1stFlrSF_sq"] = np.sqrt(X["1stFlrSF"])

    X["OverallGrade_2"] = X["OverallGrade"] ** 2
    X["OverallGrade_3"] = X["OverallGrade"] ** 3
    X["OverallGrade_sq"] = np.sqrt(X["OverallGrade"])

    X["Bldg_1Fam_2"] = X["Bldg_1Fam"] ** 2
    X["Bldg_1Fam_3"] = X["Bldg_1Fam"] ** 3
    X["Bldg_1Fam_sq"] = np.sqrt(X["Bldg_1Fam"])

    X["FullBath_2"] = X["FullBath"] ** 2
    X["FullBath_3"] = X["FullBath"] ** 3
    X["FullBath_sq"] = np.sqrt(X["FullBath"])
    return X

In [8]:
class GenerateFeatures(BaseEstimator, TransformerMixin):
  def __init__(self, add_features = True):
    self.add_features = add_features

  def fit(self, X, y=None):
    return self
    
  def transform(self, X, y=None):
    """
    Generates new features
    
    Returns:
            X: the modified dataset      
    """
    if self.add_features:
      X["LivLotRatio"] = X["GrLivArea"] / X["LotArea"]
      X["Spaciousness"] = (X["1stFlrSF"] + X["2ndFlrSF"]) / X["TotRmsAbvGrd"]
      X["TotalOutsideSF"] = X['WoodDeckSF'] + X['OpenPorchSF'] + X['EnclosedPorch'] + X["3SsnPorch"] + X["ScreenPorch"]
      X["PorchCount"] = X[["WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch"]].gt(0.0).sum(axis=1)

      X["OverallGrade"] = X["OverallQual"] * X["OverallCond"]
      X["GarageGrade"] = X["GarageQual"]* X["GarageCond"]
      X["ExterGrade"] = X["ExterQual"] * X["ExterCond"]
      X["KitchenGrade"] = X["KitchenAbvGr"] * X["KitchenQual"]
      X["FireplaceGrade"] = X["Fireplaces"] * X["FireplaceQu"]
      X["GarageGrade"] = X["GarageArea"] * X["GarageQual"]
      X["GarageScore"] = X["GarageArea"] * X["GarageQual"]
      X["TotalBath"] = X["BsmtFullBath"] + (0.5 * X["BsmtHalfBath"]) + X["FullBath"] + (0.5 * X["HalfBath"])
      X["AllSF"] = X["GrLivArea"] + X["TotalBsmtSF"]
      X["house_age"] = X["YearBuilt"].apply(lambda x: pd.datetime.now().year - x)
      X["no_years_since_sold"] = X["YrSold"].apply(lambda x: pd.datetime.now().year - x)
      BldgType_dummy = pd.get_dummies(X.BldgType, prefix="Bldg")
      new_features1 = BldgType_dummy.mul(X.GrLivArea, axis=0)
      X["RecentRemodLargeBsmt"] = X.YearRemodAdd * X.TotalBsmtSF
      new_features2= pd.DataFrame()
      new_features2["Median_neigh_area"] = X.groupby("Neighborhood")["GrLivArea"].transform("median")
      X = X.join([new_features1, new_features2])
    return X

In [9]:
class RemoveSkewness(BaseEstimator, TransformerMixin):
  def __init__(self):
    pass

  def fit(self, X, y=None):
    return self
    
  def transform(self, X, y=None):
    """
    Eliminates skewness from the skewed features
    
    Returns:
            X: the modified dataset      
    """
    X_numeric= X.select_dtypes(exclude=['object']).drop(['Id'], axis= 1)
    skewness = X_numeric.apply(lambda x: skew(x))
    skewness = skewness[abs(skewness) > 0.5]
    skewed_features = skewness.index
    X[skewed_features] = np.log1p(X[skewed_features])
    return X

In [10]:
class LabelEncode(BaseEstimator, TransformerMixin):
  def __init__(self):
    pass

  def fit(self, X, y=None):
    return self
    
  def transform(self, X, y=None):
    """
    Transforms nominal features to discrete values
    
    Returns:
            X: the modified dataset      
    """
    cat_cols= ['MSZoning', 'LandContour', 'LotConfig', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 
                                                  'Heating', 'SaleType', 'SaleCondition']
    for cat_col in cat_cols:
      encoded, categories = pd.factorize(X[cat_col])
      X[cat_col]= encoded
    return X



In [11]:

class DropColumns(BaseEstimator, TransformerMixin):
  def __init__(self, remove_cols = True):
    self.remove_cols= remove_cols

  def fit(self, X, y=None):
    return self
    
  def transform(self, X, y=None):
    """
    Deletes unwanted columns
    
    Returns:
            X: the modified dataset      
    """
    if self.remove_cols:
      X.drop(['Id','GarageYrBlt','GarageCond', 'Utilities', 'Alley', 'MiscFeature', 'YearBuilt', 'YrSold', 'Street'], axis= 1, inplace= True)
    return X


In [12]:
X= train.drop('SalePrice', axis=1)
y= train['SalePrice']
X_train, X_valid, y_train, y_valid= train_test_split(X, y, test_size= 0.2, random_state= 2)

In [13]:
transformer_pipeline= Pipeline(steps=[
    ('transformer', TransformColumns()),
    ('imputer', InputMissing()),
    ('remove_skewness', RemoveSkewness()),
    ('generate_features', GenerateFeatures()),
    ('encoder', LabelEncode()),
    ('remove_columns', DropColumns()),
    ('polynomials', AddPolynomials()),
    ('scaler', StandardScaler())])

In [14]:
# transformer_pipeline= Pipeline(steps=[
#     ('transformer', TransformColumns()),
#     ('imputer', InputMissing()),
#     ('remove_skewness', RemoveSkewness()),
#     ('generate_features', GenerateFeatures()),
#     ('encoder', LabelEncode()),
#     ('remove_columns', DropColumns()),
#     ('scaler', StandardScaler())])

In [15]:
start = time.time()
pipe_MLP= Pipeline(steps = [ ( 'transformer_list', transformer_pipeline),
                                  
                                  ( 'model', MLPRegressor(
                      hidden_layer_sizes=(100,100,100,100,100),
                      alpha=0,
                      max_iter=500,
                      random_state= 200
                  ) ) ] )
pipe_MLP.fit(X_train, y_train)

MLP_training_time= time.time() - start

In [16]:
start = time.time()
y_pred_MLP = pipe_MLP.predict(X_valid)

MLP_inference_time= time.time() - start
rmse_MLP= mean_squared_error(np.log(y_valid), np.log(y_pred_MLP),  squared= False)

print('Validation root mean squared error is', rmse_MLP)

Validation root mean squared error is 1.7802497908915336


In [18]:
results = pd.DataFrame({'Model':['Neural nets'], 'Training time':[MLP_training_time], 
                          'Inference time':[MLP_inference_time], 'rmse':[rmse_MLP]})
results

Unnamed: 0,Model,Training time,Inference time,rmse
0,Neural nets,25.287365,0.164841,1.78025


This achieved a worse RMSE score than the other two algorithms, GradientBoostingRegressor and xgboostRegressor. This could be due to the high dimensionality of the dataset