In [2]:
import json
import numpy as np
import numpy as pn
import pandas as pd
import seaborn as sns
import sklearn.metrics
import xgboost as xgb

from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from matplotlib import pyplot as plt
from numpyencoder import NumpyEncoder
from src.data.create_data import *
from src.features.create_features import *
from src.models.create_model import FitEstimator

In [3]:
train_features, target = load_train_data('../data/external/train.csv')
test_features = load_test_data('../data/external/test.csv')

In [4]:
cat = train_features.select_dtypes(include=object).columns
num = train_features.select_dtypes(include=[int, float]).columns
disc_cols = [*cat, 'mssubclass', 'overallqual', 'overallcond']

In [5]:
encode_ordinal_features(train_features)
freq_encode_nominal_features(train_features)
train_features

Unnamed: 0_level_0,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,lotconfig,...,screenporch,poolarea,poolqc,fence,miscfeature,miscval,mosold,yrsold,saletype,salecondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,1151,65.0,8450,1454,1369,925,1311,1459,1052,...,0,0,-1,1179,1406,0,2,2008,1267,1198
2,20,1151,80.0,9600,1454,1369,925,1311,1459,47,...,0,0,-1,1179,1406,0,5,2007,1267,1198
3,60,1151,68.0,11250,1454,1369,484,1311,1459,1052,...,0,0,-1,1179,1406,0,9,2008,1267,1198
4,70,1151,60.0,9550,1454,1369,484,1311,1459,263,...,0,0,-1,1179,1406,0,2,2006,1267,101
5,60,1151,84.0,14260,1454,1369,484,1311,1459,47,...,0,0,-1,1179,1406,0,12,2008,1267,1198
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,1151,62.0,7917,1454,1369,925,1311,1459,1052,...,0,0,-1,1179,1406,0,8,2007,1267,1198
1457,20,1151,85.0,13175,1454,1369,925,1311,1459,1052,...,0,0,-1,157,1406,0,2,2010,1267,1198
1458,70,1151,66.0,9042,1454,1369,925,1311,1459,1052,...,0,0,-1,59,49,2500,5,2010,1267,1198
1459,20,1151,68.0,9717,1454,1369,925,1311,1459,1052,...,0,0,-1,1179,1406,0,4,2010,1267,1198


In [6]:
encode_ordinal_features(test_features)
freq_encode_nominal_features(test_features)
test_features

Unnamed: 0_level_0,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,lotconfig,...,screenporch,poolarea,poolqc,fence,miscfeature,miscval,mosold,yrsold,saletype,salecondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,10,80.0,11622,1453,1352,934,1311,1457,1081,...,120,0,-1,172,1408,0,6,2010,1258,1204
1462,20,1114,81.0,14267,1453,1352,484,1311,1457,248,...,0,0,-1,1169,3,12500,6,2010,1258,1204
1463,60,1114,74.0,13830,1453,1352,484,1311,1457,1081,...,0,0,-1,172,1408,0,3,2010,1258,1204
1464,60,1114,78.0,9978,1453,1352,484,1311,1457,1081,...,0,0,-1,1169,1408,0,6,2010,1258,1204
1465,120,1114,43.0,5005,1453,1352,484,70,1457,1081,...,144,0,-1,1169,1408,0,1,2010,1258,1204
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,160,242,21.0,1936,1453,1352,934,1311,1457,1081,...,0,0,-1,1169,1408,0,6,2006,1258,1204
2916,160,242,21.0,1894,1453,1352,934,1311,1457,1081,...,0,0,-1,1169,1408,0,4,2006,1258,89
2917,20,1114,160.0,20000,1453,1352,934,1311,1457,1081,...,0,0,-1,1169,1408,0,9,2006,1258,89
2918,85,1114,62.0,10441,1453,1352,934,1311,1457,1081,...,0,0,-1,172,46,700,7,2006,1258,1204


In [7]:
train_features[num]

Unnamed: 0_level_0,mssubclass,lotfrontage,lotarea,overallqual,overallcond,yearbuilt,yearremodadd,masvnrarea,bsmtfinsf1,bsmtfinsf2,...,garagearea,wooddecksf,openporchsf,enclosedporch,3ssnporch,screenporch,poolarea,miscval,mosold,yrsold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,548,0,61,0,0,0,0,0,2,2008
2,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,460,298,0,0,0,0,0,0,5,2007
3,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,608,0,42,0,0,0,0,0,9,2008
4,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,642,0,35,272,0,0,0,0,2,2006
5,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,836,192,84,0,0,0,0,0,12,2008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,62.0,7917,6,5,1999,2000,0.0,0,0,...,460,0,40,0,0,0,0,0,8,2007
1457,20,85.0,13175,6,6,1978,1988,119.0,790,163,...,500,349,0,0,0,0,0,0,2,2010
1458,70,66.0,9042,7,9,1941,2006,0.0,275,0,...,252,0,60,0,0,0,0,2500,5,2010
1459,20,68.0,9717,5,6,1950,1996,0.0,49,1029,...,240,366,0,112,0,0,0,0,4,2010


In [8]:
derived_features(train_features)
train_features

Unnamed: 0_level_0,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,lotconfig,...,drivfeat1,drivfeat2,drivfeat3,drivfeat4,drivfeat5,drivfeat6,pcafeat1,pcafeat2,pcafeat3,pcafeat4
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,1151,65.0,91.923882,1454,1369,925,1311,1459,1052,...,35,12,12,61,259.008989,524.587356,0.000000,0,259560,6009.0
2,20,1151,80.0,97.979590,1454,1369,925,1311,1459,47,...,48,12,9,298,1.084862,524.587356,0.000000,3,47080,5928.0
3,60,1151,68.0,106.066017,1454,1369,484,1311,1459,1052,...,35,12,12,42,259.008989,524.587356,0.000000,3,259560,6003.0
4,70,1151,60.0,97.724101,1454,1369,484,1311,1459,263,...,35,12,9,307,1.084862,426.857881,0.000000,4,7828,5994.0
5,60,1151,84.0,119.415242,1454,1369,484,1311,1459,47,...,40,12,12,276,259.008989,524.587356,0.000000,3,259560,6000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,1151,62.0,88.977525,1454,1369,925,1311,1459,1052,...,30,12,9,40,1.084862,524.587356,0.000000,3,259560,5997.0
1457,20,1151,85.0,114.782403,1454,1369,925,1311,1459,1052,...,36,12,9,349,239.304688,524.587356,392.462963,6,15336,5934.0
1458,70,1151,66.0,95.089432,1454,1369,925,1311,1459,1052,...,63,12,20,60,1.084862,524.587356,0.000000,8,3660,5823.0
1459,20,1151,68.0,98.574845,1454,1369,925,1311,1459,1052,...,30,9,9,478,1.084862,524.587356,392.462963,0,47080,5850.0


In [9]:
derived_features(test_features)
test_features

Unnamed: 0_level_0,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,lotconfig,...,drivfeat1,drivfeat2,drivfeat3,drivfeat4,drivfeat5,drivfeat6,pcafeat1,pcafeat2,pcafeat3,pcafeat4
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,10,80.0,107.805380,1453,1352,934,1311,1457,1081,...,30,9,9,260,0.541387,527.256741,315.268293,0,260100,5883.0
1462,20,1114,81.0,119.444548,1453,1352,484,1311,1457,248,...,36,9,9,429,264.403226,527.256741,0.004850,0,39770,5874.0
1463,60,1114,74.0,117.601020,1453,1352,484,1311,1457,1081,...,25,12,9,246,0.541387,527.256741,0.004850,3,260100,5991.0
1464,60,1114,78.0,99.889939,1453,1352,484,1311,1457,1081,...,36,9,9,396,264.403226,527.256741,0.004850,4,260100,5994.0
1465,120,1114,43.0,70.746025,1453,1352,484,70,1457,1081,...,40,12,12,226,0.541387,527.256741,0.004850,0,43780,5976.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,160,242,21.0,44.000000,1453,1352,934,1311,1457,1081,...,28,9,9,0,0.541387,0.000000,0.004850,0,4290,1.0
2916,160,242,21.0,43.520110,1453,1352,934,1311,1457,1081,...,20,9,9,24,0.541387,353.833333,0.004850,0,4290,5910.0
2917,20,1114,160.0,141.421356,1453,1352,934,1311,1457,1081,...,35,9,9,474,0.541387,411.150510,0.004850,3,260100,5880.0
2918,85,1114,62.0,102.181212,1453,1352,934,1311,1457,1081,...,25,12,9,112,0.541387,0.000000,0.004850,0,9460,1.0


In [10]:
def arithmetic_features(df):
    df_c = df.copy()

    df_c['feat_1'] = df_c['bsmtfintype1'] / df_c['bsmtfintype2']
    df_c['feat_2'] = df_c['bsmtfinsf1'] / df_c['bsmtfinsf2']
    df_c['feat_3'] = df_c['bsmtunfsf'] + df_c['bsmtfinsf1'] + df_c['bsmtfinsf2']
    df_c['feat_4'] = df_c['garagefinish'] / df_c['garagearea']

    df_c['feat_5'] = np.sqrt(df_c[[col for col in df_c if 'sf' in col]].sum(axis=1))
    df_c['feat_6'] = df_c[[col for col in df_c if 'sf' in col]].sum(axis=1)
    df_c['feat_7'] = df_c[[col for col in df_c if 'abv' in col]].sum(axis=1)

    df_c['feat_8'] = df_c.groupby('bsmtexposure')['bsmtfinsf1'].transform('mean')
    df_c['feat_9'] = df_c.groupby('bsmtfintype1')['bsmtfinsf1'].transform('mean')
    df_c['feat_10'] = df_c.groupby('bsmtfintype1')['bsmtfinsf1'].transform('std')
    df_c['feat_11'] = df_c.groupby('bsmtfintype2')['bsmtfinsf2'].transform('std')
    df_c['feat_12'] = df_c.groupby('neighborhood')['garagearea'].transform('mean')
    df_c['feat_13'] = df_c.groupby('neighborhood')['garagecond'].transform('mean')
    df_c['feat_14'] = df_c.groupby('neighborhood')['garagequal'].transform('mean')

    df_c['feat_15'] = df_c.groupby('neighborhood')['lotfrontage'].transform('mean')
    df_c['feat_16'] = df_c.groupby('neighborhood')['overallqual'].transform('mean')
    df_c['feat_17'] = df_c.groupby('neighborhood')['overallcond'].transform('mean')
    df_c['feat_18'] = df_c.groupby('neighborhood')['lotarea'].transform('mean')
    df_c['feat_19'] = df_c.groupby('neighborhood')['propage'].transform('mean')
    df_c['feat_20'] = df_c.groupby('neighborhood')['modage'].transform('mean')
    df_c['feat_21'] = df_c.groupby('neighborhood')['feat_3'].transform('mean')

    return df_c

In [11]:
train_features = arithmetic_features(train_features)
train_features

Unnamed: 0_level_0,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,lotconfig,...,feat_12,feat_13,feat_14,feat_15,feat_16,feat_17,feat_18,feat_19,feat_20,feat_21
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,1151,65.0,91.923882,1454,1369,925,1311,1459,1052,...,23.143119,2.940000,2.953333,70.868514,6.640000,5.240000,97.307794,9.920000,8.666667,1157.586667
2,20,1151,80.0,97.979590,1454,1369,925,1311,1459,47,...,23.280424,3.000000,3.000000,61.532512,6.727273,6.272727,119.927309,24.636364,17.181818,1321.272727
3,60,1151,68.0,106.066017,1454,1369,484,1311,1459,1052,...,23.143119,2.940000,2.953333,70.868514,6.640000,5.240000,97.307794,9.920000,8.666667,1157.586667
4,70,1151,60.0,97.724101,1454,1369,484,1311,1459,263,...,19.698430,2.882353,2.862745,72.297417,6.274510,6.588235,106.167553,65.941176,28.294118,1032.901961
5,60,1151,84.0,119.415242,1454,1369,484,1311,1459,47,...,26.413201,3.024390,3.024390,87.166926,7.926829,5.219512,116.713574,12.390244,11.170732,1493.756098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,1151,62.0,88.977525,1454,1369,925,1311,1459,1052,...,21.188152,3.000000,3.000000,76.922746,6.556962,5.126582,104.985862,9.303797,8.734177,869.025316
1457,20,1151,85.0,114.782403,1454,1369,925,1311,1459,1052,...,23.045718,3.000000,3.027397,77.281194,6.328767,5.945205,108.177926,32.191781,26.301370,1153.835616
1458,70,1151,66.0,95.089432,1454,1369,925,1311,1459,1052,...,19.698430,2.882353,2.862745,72.297417,6.274510,6.588235,106.167553,65.941176,28.294118,1032.901961
1459,20,1151,68.0,98.574845,1454,1369,925,1311,1459,1052,...,20.099080,2.902222,2.902222,75.643985,5.360000,5.791111,99.688647,47.871111,36.244444,1026.168889


In [12]:
test_features = arithmetic_features(test_features)
test_features

Unnamed: 0_level_0,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,lotconfig,...,feat_12,feat_13,feat_14,feat_15,feat_16,feat_17,feat_18,feat_19,feat_20,feat_21
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,10,80.0,107.805380,1453,1352,934,1311,1457,1081,...,19.992988,2.922018,2.917431,73.266354,5.307339,5.834862,98.748300,47.963303,38.064220,1036.550459
1462,20,1114,81.0,119.444548,1453,1352,484,1311,1457,248,...,19.992988,2.922018,2.917431,73.266354,5.307339,5.834862,98.748300,47.963303,38.064220,1036.550459
1463,60,1114,74.0,117.601020,1453,1352,484,1311,1457,1081,...,21.243621,3.000000,3.000000,69.443080,6.476744,5.058140,104.086420,9.476744,8.674419,881.069767
1464,60,1114,78.0,99.889939,1453,1352,484,1311,1457,1081,...,21.243621,3.000000,3.000000,69.443080,6.476744,5.058140,104.086420,9.476744,8.674419,881.069767
1465,120,1114,43.0,70.746025,1453,1352,484,70,1457,1081,...,24.762729,3.000000,3.038462,61.769231,8.576923,5.000000,104.637114,6.884615,6.653846,1597.461538
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,160,242,21.0,44.000000,1453,1352,934,1311,1457,1081,...,11.303202,1.550000,1.600000,28.070975,4.300000,5.700000,43.406838,35.550000,32.050000,574.950000
2916,160,242,21.0,43.520110,1453,1352,934,1311,1457,1081,...,11.303202,1.550000,1.600000,28.070975,4.300000,5.700000,43.406838,35.550000,32.050000,574.950000
2917,20,1114,160.0,141.421356,1453,1352,934,1311,1457,1081,...,21.857507,2.784615,2.800000,77.018158,5.507692,5.507692,108.096275,25.861538,22.153846,1067.338462
2918,85,1114,62.0,102.181212,1453,1352,934,1311,1457,1081,...,21.857507,2.784615,2.800000,77.018158,5.507692,5.507692,108.096275,25.861538,22.153846,1067.338462


In [13]:
train_features.replace([np.inf, -np.inf], np.nan, inplace=True)
train_features.replace([np.inf, -np.inf], np.nan, inplace=True)

train_features.fillna(0, inplace=True)
test_features.fillna(0, inplace=True)

In [14]:
mask = []

for t in train_features.columns:
    if t in disc_cols:
        mask.append(True)
    else:
        mask.append(False)

score_mi = mi_score(train_features, target, mask)
score_mi

overallqual     5.727079e-01
neighborhood    4.959247e-01
grlivarea       4.887517e-01
feat_16         4.730256e-01
feat_4          4.704703e-01
                    ...     
utilities       1.887379e-15
feat_2          0.000000e+00
mosold          0.000000e+00
yrsold          0.000000e+00
bsmtfinsf2      0.000000e+00
Length: 117, dtype: float64

In [15]:
cols_to_trans = ['lotfrontage', 'lotarea', 'masvnrarea',
                 'bsmtfinsf1', 'bsmtfinsf2', 'bsmtunfsf',
                 'totalbsmtsf', '1stflrsf', '2ndflrsf',
                 'lowqualfinsf', 'grlivarea', 'garagearea',
                 'wooddecksf', 'screenporch', 'enclosedporch',
                 'openporchsf', 'propage', 'modage', 'timetomod',
                 'drivfeat4', 'pcafeat4']
trans = train_features[cols_to_trans]

In [16]:
q_transformer = quantile_transformation(features=trans, output_distribution='normal',
                                        n_quantiles=1000)
q_transformer

QuantileTransformer(output_distribution='normal')

In [17]:
train_features[cols_to_trans] = q_transformer.transform(trans)
test_features[cols_to_trans] = q_transformer.transform(test_features[cols_to_trans])

In [18]:
least_informative = score_mi[score_mi == 0].index.tolist()

final_train = train_features.drop(least_informative,
                                  axis=1)
final_test = test_features.drop(least_informative,
                                axis=1)

In [19]:
trainX, validX, trainY, validY = train_test_split(final_train, target, test_size=0.2, random_state=48)
trainX.shape, validX.shape, trainY.shape, validY.shape

((1168, 113), (292, 113), (1168,), (292,))

In [43]:
estimator = xgb.XGBRegressor(learning_rate=0.01)

paraDis = {'max_depth': np.random.randint(1, 21, 10),
           'n_estimators': np.random.randint(1000, 4000, 10),
           'min_child_weight': range(1, 10),
           'colsample_bytree': np.random.uniform(0, 1, 10),
           'subsample': np.random.uniform(0, 1, 10),
           }

fitter = FitEstimator(estimator, random_state=48)

In [44]:
search = fitter.random_search(param_distributions=paraDis,
                              n_iter=10,
                              cv=3,
                              scoring=make_scorer(mean_squared_error, greater_is_better=False),
                              verbose=1)

In [45]:
search.fit(trainX, trainY)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=3,
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None,
                                          enable_categorical=False, gamma=None,
                                          gpu_id=None, importance_type=None,
                                          interaction_constraints=None,
                                          learning_rate=0.01,
                                          max_delta_step=None, max_depth=None,
                                          min_child_weight=None, missing=nan,
                                          monotone_constraints=...
                                        'max_depth': array([10,  2,  5, 11,  6, 20, 16,  3, 19,  8]),
                                        'min_child_weight': range(1, 10),
               

In [46]:
search.best_estimator_

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.30616383334574826,
             enable_categorical=False, gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.01, max_delta_step=0,
             max_depth=10, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1697, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=0.2255137388719427,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [47]:
model = xgb.XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                         colsample_bynode=1, colsample_bytree=0.30616383334574826,
                         enable_categorical=False, gamma=0, gpu_id=-1, importance_type=None,
                         interaction_constraints='', learning_rate=0.01, max_delta_step=0,
                         max_depth=10, min_child_weight=1, missing=np.nan,
                         monotone_constraints='()', n_estimators=1697, n_jobs=8,
                         num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
                         reg_lambda=1, scale_pos_weight=1, subsample=0.2255137388719427,
                         tree_method='exact', validate_parameters=1, verbosity=None)

In [49]:
i = 0
models = []
kfold = KFold(n_splits=9, shuffle=True, random_state=48)

for t, v in kfold.split(trainX, trainY):
    model.fit(trainX.iloc[t], trainY.iloc[t])
    train_score = mean_squared_error(np.log(trainY.iloc[t]), np.log(model.predict(trainX.iloc[t])), squared=False)
    valid_score = mean_squared_error(np.log(trainY.iloc[v]), np.log(model.predict(trainX.iloc[v])), squared=False)

    i+=1
    print(f'{i} -- complete')

    models.append([model, train_score, valid_score])

1 -- complete
2 -- complete
3 -- complete
4 -- complete
5 -- complete
6 -- complete
7 -- complete
8 -- complete
9 -- complete


In [50]:
target_t = np.zeros((validX.shape[0]))

for i in models:
    p = i[0].predict(validX)
    target_t += p

target_t = target_t / 9

In [51]:
mean_squared_error(np.log(validY), np.log(target_t), squared=False)

0.12621253703308455

In [52]:
target = np.zeros((final_test.shape[0]))

for i in models:
    p = i[0].predict(final_test)
    target += p

target = target / 9
target

array([123992.2734375, 163584.421875 , 185800.03125  , ...,
       226028.75     , 116893.4375   , 215290.078125 ])

In [53]:
pd.DataFrame(target, columns=['SalePrice'], index=final_test.index).reset_index().to_csv(
    '../data/submission/submission_5_1.csv', index=False)