In [23]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.model_selection import train_test_split
import category_encoders as ce
from catboost import CatBoostRegressor, Pool
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_val_score

from sklearn.pipeline import make_pipeline

from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV

train_raw, test_raw = pd.read_csv('~/Downloads/house-pricing/train.csv'), pd.read_csv('~/Downloads/house-pricing/test.csv')

In [24]:
categorical = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
              'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
              'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu',
              'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
# Street, Utilities, Condition2, RoofMatl: bad repartition
# Alley, PoolQC: most are missing

numerical = ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
             'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF',
             'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'MiscVal', 'OverallCond', 'OverallQual']
# MasVnrArea, BsmtFinSF1, 2ndFlrSF, LowQualFinSF, PoolArea: Had a lot of 0.0
date_related = ['GarageYrBlt', 'MoSold', 'YrSold']
boolean = ['CentralAir']
target = 'SalePrice'
to_drop = ['Id', 'PoolArea', 'PoolQC', 'YearBuilt', 'YearRemodAdd']

assert len(categorical )+ len(numerical) + len(date_related) + len(boolean) + len(to_drop) + 1 == train_raw.shape[1]
assert set(categorical + numerical + date_related + boolean + [target] + to_drop) == set(train_raw.columns)



In [25]:
train = train_raw.copy()
train['TotalArea'] = train['1stFlrSF'] + train['2ndFlrSF'] + train['OpenPorchSF']
train['LivibleRatio'] = train['1stFlrSF'] / train['LotArea']
train['HasMasVnr'] = train['MasVnrArea'] == 0
train['BuildingAge'] = 2020 - train['YearBuilt']
train['LastRemodling'] = 2020 - train['YearRemodAdd']
train = train.drop(columns=to_drop, axis=1)

In [26]:
train[numerical + date_related] = train[numerical + date_related].fillna(0)

features = ['Street', 'OverallQual', 'BsmtQual', 'FullBath', 'GarageCars', 'LastRemodling', 'BuildingAge', 'MiscVal', '2ndFlrSF', 'LotArea']
X_train, X_test = train_test_split(train, test_size=0.3)
Y_train, Y_test = np.log(X_train.pop(target)), np.log(X_test.pop(target))


encoder = ce.TargetEncoder(cols=categorical + boolean)
encoder.fit(X_train, Y_train)
X_train, X_test = encoder.transform(X_train, Y_train), encoder.transform(X_test, Y_test)


  elif pd.api.types.is_categorical(cols):


In [27]:
model = RandomForestRegressor(n_jobs=-1)
model.fit(X_train, Y_train)

RandomForestRegressor(n_jobs=-1)

In [28]:
def rmse():
    train_pred, test_pred  = model.predict(X_train), model.predict(X_test)
    train_mse, test_mse = mean_squared_error(train_pred, Y_train), mean_squared_error(test_pred, Y_test)
    return np.sqrt(train_mse), np.sqrt(test_mse) 

rmse()

(0.05427861580566846, 0.15703645389676105)

In [31]:

param_dist = {'objective':'reg:squarederror', 'n_estimators':100, 'reg_lambda': 3, 'max_depth':3, 'colsample_bynode': 0.2, }
model = XGBRegressor(**param_dist)
model.fit(X_train, Y_train, eval_set=[(X_train, Y_train), (X_test, Y_test)], eval_metric="rmse")

[0]	validation_0-rmse:8.07415	validation_1-rmse:8.08828
[1]	validation_0-rmse:5.65679	validation_1-rmse:5.66786
[2]	validation_0-rmse:3.96522	validation_1-rmse:3.97187
[3]	validation_0-rmse:2.78180	validation_1-rmse:2.78211
[4]	validation_0-rmse:1.95386	validation_1-rmse:1.95524
[5]	validation_0-rmse:1.37493	validation_1-rmse:1.37416
[6]	validation_0-rmse:0.97144	validation_1-rmse:0.97055
[7]	validation_0-rmse:0.69156	validation_1-rmse:0.68881
[8]	validation_0-rmse:0.49882	validation_1-rmse:0.49665
[9]	validation_0-rmse:0.36650	validation_1-rmse:0.36838
[10]	validation_0-rmse:0.27741	validation_1-rmse:0.28400
[11]	validation_0-rmse:0.22026	validation_1-rmse:0.23210
[12]	validation_0-rmse:0.18445	validation_1-rmse:0.20229
[13]	validation_0-rmse:0.16164	validation_1-rmse:0.18381
[14]	validation_0-rmse:0.14871	validation_1-rmse:0.17525
[15]	validation_0-rmse:0.14068	validation_1-rmse:0.17069
[16]	validation_0-rmse:0.13550	validation_1-rmse:0.16672
[17]	validation_0-rmse:0.13239	validation

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=0.2, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=2,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=500, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=0.3, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

# 11.7 model

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.model_selection import train_test_split
import category_encoders as ce
from catboost import CatBoostRegressor, Pool
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler, PowerTransformer


from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV

In [2]:
train_raw, test_raw = pd.read_csv('~/Downloads/house-pricing/train.csv'), pd.read_csv('~/Downloads/house-pricing/test.csv')

In [7]:

# Street, Utilities, Condition2, RoofMatl: bad repartition
# Alley, PoolQC: most are missing


# MasVnrArea, BsmtFinSF1, 2ndFlrSF, LowQualFinSF, PoolArea: Had a lot of 0.0
date_related = ['GarageYrBlt', 'MoSold', 'YrSold']
boolean = ['CentralAir']
target = 'SalePrice'
to_drop = ['Id', 'PoolArea', 'PoolQC', 'YearBuilt', 'YearRemodAdd']

numerical = ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
             'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF',
             'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'MiscVal', 'OverallCond', 'OverallQual'] + date_related
categorical = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
              'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
              'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu',
              'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']  + boolean

assert len(categorical )+ len(numerical) + len(to_drop) + 1 == train_raw.shape[1]
assert set(categorical + numerical + date_related + boolean + [target] + to_drop) == set(train_raw.columns)

discrete = []
for var in numerical:
    if len(train_raw[var].unique())<20:
        discrete.append(var)
        

# Missing values:
mostly_missing = list()
treshold = 0.8
for var in train_raw.columns:
    missing_prc =  train_raw[var].isna().mean()
    if missing_prc > treshold:
        mostly_missing.append(var)
        

# Dist and outliers for continuous features
continuous = [var for var in numerical if var not in discrete and var not in ['Id', 'SalePrice']]

In [8]:
train = train_raw.copy()
# train['TotalArea'] = train['1stFlrSF'] + train['2ndFlrSF']
train['LivibleRatio'] = train['1stFlrSF'] / train['LotArea']
train['HasMasVnr'] = train['MasVnrArea'] == 0
train['BuildingAge'] = 2020 - train['YearBuilt']
train['LastRemodling'] = 2020 - train['YearRemodAdd']
train['HasPool'] = train['PoolArea'] > 0
train['BuiltPercentage'] = train['1stFlrSF'] / train['LotArea']
train['DiffBuitSold'] = train['YearBuilt'] - train['YrSold']
train['AvrRoomSF'] = train['TotRmsAbvGrd'] / (train['1stFlrSF'] + train['2ndFlrSF'])
train['2ndfloorPercentage'] = train['2ndFlrSF'] / train['1stFlrSF']
train['HasBasment'] = train['TotalBsmtSF'] > 0
train['DiffRemodBuilt'] = train['YearRemodAdd'] - train['YearBuilt']


# to_drop += ['YrSold', ]


# train['TotalArea'] = train['1stFlrSF'] + train['2ndFlrSF'] + train['OpenPorchSF']
# train['LivibleRatio'] = train['1stFlrSF'] / train['LotArea']
# train['HasMasVnr'] = train['MasVnrArea'] == 0
# train['BuildingAge'] = 2020 - train['YearBuilt']
# train['LastRemodling'] = 2020 - train['YearRemodAdd']


engineered = ['TotalArea', 'LivibleRatio', 'HasMasVnr', 'BuildingAge', 'LastRemodling', 'HasPool', 'BuiltPercentage', 'DiffBuitSold', 'AvrRoomSF',
             '2ndfloorPercentage', 'HasBasment', 'DiffRemodBuilt']

to_normalize = ['DiffBuitSold', 'BuildingAge', 'LastRemodling', 'DiffRemodBuilt']
train = train.drop(columns=to_drop, axis=1)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(train, train.SalePrice, test_size=0.2,
                                                    random_state=0)
X_train.pop(target), X_test.pop(target)

y_train, y_test = np.log(y_train), np.log(y_test)

In [10]:
# Dealing with missing values in continuous
# If most of the values are 0, then create seperate boolean column to indicate if 0, and replace the 0 values with median, then apply transformation

drop = True
keep_outliers = True
sensitivity = 2.9
for var in continuous:
    
    if ((X_train[var] == 0).sum() / X_train.shape[0]) > 0.3:
        X_train[var + 'Is0'] = X_train[var] == 0
        X_test[var + 'Is0'] = X_test[var] == 0
        if drop:
            X_train.pop(var), X_test.pop(var)
            continue
        else:
            mean = X_train[var][X_train[var] != 0].mean()
            X_train.loc[:, var] = X_train.loc[:, var].replace(0, mean)
            X_test.loc[:, var] = X_test.loc[:, var].replace(0, mean)
        
    
    transformer = PowerTransformer()
    transformer.fit(X_train[[var]])
    X_train[var], X_test[var] = transformer.transform(X_train[[var]]).flatten(), transformer.transform(X_test[[var]]).flatten()

    if not keep_outliers:
        mean, std = X_train[var].mean(), X_train[var].std()
        z_score = (X_train[var] - mean) / std
        X_train[var] = np.clip(X_train[var], - sensitivity * z_score, sensitivity * z_score)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[var], X_test[var] = transformer.transform(X_train[[var]]).flatten(), transformer.transform(X_test[[var]]).flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[var], X_test[var] = transformer.transform(X_train[[var]]).flatten(), transformer.transform(X_test[[var]]).flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/inde

In [11]:
# Missing values in continuous variables
# add variable indicating missingness + median imputation
for df in [X_train, X_test]:
    for var in ['LotFrontage', 'GarageYrBlt']:
        df[var+'_NA'] = np.where(df[var].isnull(), 1, 0)
        df[var].fillna(X_train[var].median(), inplace=True) 

for df in [X_train, X_test]:
    df.MasVnrArea.fillna(X_train.MasVnrArea.median(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[var+'_NA'] = np.where(df[var].isnull(), 1, 0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


AttributeError: 'DataFrame' object has no attribute 'MasVnrArea'

In [12]:
# Missing values in categorical data

# Adding missing label
for df in [X_train, X_test]:
    for var in categorical:
        df[var].fillna('Missing', inplace=True)

# Adding rare label
# treshold = 0.05
# def rare_imputation(variable):
#     # find frequent labels / discrete numbers
#     temp = X_train.groupby([variable])[variable].count()/np.float(len(X_train))
#     frequent_cat = [x for x in temp.loc[temp > treshold].index.values]
    
#     X_train.loc[:, variable] = np.where(X_train[variable].isin(frequent_cat), X_train[variable], 'Rare')
#     X_test.loc[:, variable] = np.where(X_test[variable].isin(frequent_cat), X_test[variable], 'Rare')
    
# # find unfrequent labels in categorical variables
# for var in categorical:
#     rare_imputation(var)


In [13]:
# Encoding
encoder = ce.TargetEncoder(cols=categorical)
encoder.fit(X_train, y_train)
X_train, X_test = encoder.transform(X_train), encoder.transform(X_test)

  elif pd.api.types.is_categorical(cols):


In [14]:
# Scaling
scaler = RobustScaler() # create an instance
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [15]:
param_dist = {'objective':'reg:squarederror', 'n_estimators':1_000, 'reg_lambda': 0.4, 'max_depth':1, 'colsample_bynode': 0.2, }
model = XGBRegressor(**param_dist)
model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], eval_metric="rmse")


[0]	validation_0-rmse:8.07680	validation_1-rmse:8.08716
[1]	validation_0-rmse:5.65918	validation_1-rmse:5.66671
[2]	validation_0-rmse:3.96758	validation_1-rmse:3.97648
[3]	validation_0-rmse:2.78435	validation_1-rmse:2.79246
[4]	validation_0-rmse:1.95751	validation_1-rmse:1.96522
[5]	validation_0-rmse:1.38102	validation_1-rmse:1.38674
[6]	validation_0-rmse:0.97929	validation_1-rmse:0.98269
[7]	validation_0-rmse:0.70278	validation_1-rmse:0.70544
[8]	validation_0-rmse:0.51427	validation_1-rmse:0.51355
[9]	validation_0-rmse:0.38903	validation_1-rmse:0.38668
[10]	validation_0-rmse:0.30630	validation_1-rmse:0.30040
[11]	validation_0-rmse:0.25342	validation_1-rmse:0.24758
[12]	validation_0-rmse:0.22254	validation_1-rmse:0.21548
[13]	validation_0-rmse:0.20261	validation_1-rmse:0.19466
[14]	validation_0-rmse:0.19036	validation_1-rmse:0.18224
[15]	validation_0-rmse:0.18196	validation_1-rmse:0.17300
[16]	validation_0-rmse:0.17735	validation_1-rmse:0.16935
[17]	validation_0-rmse:0.17263	validation

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=0.2, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=1,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=0.4, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [None]:
class XGBModel:
    def __init__(self, drop_continous=False, keep_continous_outliers=False, outliers_sensitivity=2.8):
        self.drop_continuous = drop_continous
        self.keep_continuous_outliers = keep_continuous_outliers
        self.outliers_sensitivity = outliers_sensitivit
    
    def _continuous_engineering():
        drop = self.drop_continuous
        keep_outliers = self.keep_continuous_outliers
        sensitivity = 2.9
        for var in continuous:

            if ((X_train[var] == 0).sum() / X_train.shape[0]) > 0.3:
                X_train[var + 'Is0'] = X_train[var] == 0
                X_test[var + 'Is0'] = X_test[var] == 0
                if drop:
                    X_train.pop(var), X_test.pop(var)
                    continue
                else:
                    mean = X_train[var][X_train[var] != 0].mean()
                    X_train.loc[:, var] = X_train.loc[:, var].replace(0, mean)
                    X_test.loc[:, var] = X_test.loc[:, var].replace(0, mean)


            transformer = PowerTransformer()
            transformer.fit(X_train[[var]])
            X_train[var], X_test[var] = transformer.transform(X_train[[var]]).flatten(), transformer.transform(X_test[[var]]).flatten()

            if not keep_outliers:
                mean, std = X_train[var].mean(), X_train[var].std()
                z_score = (X_train[var] - mean) / std
                X_train[var] = np.clip(X_train[var], - sensitivity * z_score, sensitivity * z_score)
