In [1]:
import json
import numpy as np
import numpy as pn
import pandas as pd
import seaborn as sns
import sklearn.metrics
import xgboost as xgb
import pickle

from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from matplotlib import pyplot as plt
from numpyencoder import NumpyEncoder
from src.data.create_data import *
from src.features.create_features import *
from src.models.create_model import FitEstimator

In [2]:
train_features, target = load_train_data('../data/external/train.csv')
test_features = load_test_data('../data/external/test.csv')

In [3]:
train_features.shape, target.shape

((1460, 79), (1460,))

In [4]:
cat = train_features.select_dtypes(include=object).columns
num = train_features.select_dtypes(include=[int, float]).columns
disc_cols = [*cat, 'mssubclass', 'overallqual', 'overallcond']

In [5]:
encode_ordinal_features(train_features)
freq_encode_nominal_features(train_features)
train_features

Unnamed: 0_level_0,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,lotconfig,...,screenporch,poolarea,poolqc,fence,miscfeature,miscval,mosold,yrsold,saletype,salecondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,1151,65.0,8450,1454,1369,925,1311,1459,1052,...,0,0,-1,1179,1406,0,2,2008,1267,1198
2,20,1151,80.0,9600,1454,1369,925,1311,1459,47,...,0,0,-1,1179,1406,0,5,2007,1267,1198
3,60,1151,68.0,11250,1454,1369,484,1311,1459,1052,...,0,0,-1,1179,1406,0,9,2008,1267,1198
4,70,1151,60.0,9550,1454,1369,484,1311,1459,263,...,0,0,-1,1179,1406,0,2,2006,1267,101
5,60,1151,84.0,14260,1454,1369,484,1311,1459,47,...,0,0,-1,1179,1406,0,12,2008,1267,1198
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,1151,62.0,7917,1454,1369,925,1311,1459,1052,...,0,0,-1,1179,1406,0,8,2007,1267,1198
1457,20,1151,85.0,13175,1454,1369,925,1311,1459,1052,...,0,0,-1,157,1406,0,2,2010,1267,1198
1458,70,1151,66.0,9042,1454,1369,925,1311,1459,1052,...,0,0,-1,59,49,2500,5,2010,1267,1198
1459,20,1151,68.0,9717,1454,1369,925,1311,1459,1052,...,0,0,-1,1179,1406,0,4,2010,1267,1198


In [6]:
encode_ordinal_features(test_features)
freq_encode_nominal_features(test_features)
test_features

Unnamed: 0_level_0,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,lotconfig,...,screenporch,poolarea,poolqc,fence,miscfeature,miscval,mosold,yrsold,saletype,salecondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,10,80.0,11622,1453,1352,934,1311,1457,1081,...,120,0,-1,172,1408,0,6,2010,1258,1204
1462,20,1114,81.0,14267,1453,1352,484,1311,1457,248,...,0,0,-1,1169,3,12500,6,2010,1258,1204
1463,60,1114,74.0,13830,1453,1352,484,1311,1457,1081,...,0,0,-1,172,1408,0,3,2010,1258,1204
1464,60,1114,78.0,9978,1453,1352,484,1311,1457,1081,...,0,0,-1,1169,1408,0,6,2010,1258,1204
1465,120,1114,43.0,5005,1453,1352,484,70,1457,1081,...,144,0,-1,1169,1408,0,1,2010,1258,1204
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,160,242,21.0,1936,1453,1352,934,1311,1457,1081,...,0,0,-1,1169,1408,0,6,2006,1258,1204
2916,160,242,21.0,1894,1453,1352,934,1311,1457,1081,...,0,0,-1,1169,1408,0,4,2006,1258,89
2917,20,1114,160.0,20000,1453,1352,934,1311,1457,1081,...,0,0,-1,1169,1408,0,9,2006,1258,89
2918,85,1114,62.0,10441,1453,1352,934,1311,1457,1081,...,0,0,-1,172,46,700,7,2006,1258,1204


In [7]:
train_features[num]

Unnamed: 0_level_0,mssubclass,lotfrontage,lotarea,overallqual,overallcond,yearbuilt,yearremodadd,masvnrarea,bsmtfinsf1,bsmtfinsf2,...,garagearea,wooddecksf,openporchsf,enclosedporch,3ssnporch,screenporch,poolarea,miscval,mosold,yrsold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,548,0,61,0,0,0,0,0,2,2008
2,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,460,298,0,0,0,0,0,0,5,2007
3,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,608,0,42,0,0,0,0,0,9,2008
4,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,642,0,35,272,0,0,0,0,2,2006
5,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,836,192,84,0,0,0,0,0,12,2008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,62.0,7917,6,5,1999,2000,0.0,0,0,...,460,0,40,0,0,0,0,0,8,2007
1457,20,85.0,13175,6,6,1978,1988,119.0,790,163,...,500,349,0,0,0,0,0,0,2,2010
1458,70,66.0,9042,7,9,1941,2006,0.0,275,0,...,252,0,60,0,0,0,0,2500,5,2010
1459,20,68.0,9717,5,6,1950,1996,0.0,49,1029,...,240,366,0,112,0,0,0,0,4,2010


In [8]:
derived_features(train_features)
train_features

Unnamed: 0_level_0,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,lotconfig,...,drivfeat1,drivfeat2,drivfeat3,drivfeat4,drivfeat5,drivfeat6,pcafeat1,pcafeat2,pcafeat3,pcafeat4
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,1151,65.0,91.923882,1454,1369,925,1311,1459,1052,...,35,12,12,61,259.008989,524.587356,0.000000,0,259560,6009.0
2,20,1151,80.0,97.979590,1454,1369,925,1311,1459,47,...,48,12,9,298,1.084862,524.587356,0.000000,3,47080,5928.0
3,60,1151,68.0,106.066017,1454,1369,484,1311,1459,1052,...,35,12,12,42,259.008989,524.587356,0.000000,3,259560,6003.0
4,70,1151,60.0,97.724101,1454,1369,484,1311,1459,263,...,35,12,9,307,1.084862,426.857881,0.000000,4,7828,5994.0
5,60,1151,84.0,119.415242,1454,1369,484,1311,1459,47,...,40,12,12,276,259.008989,524.587356,0.000000,3,259560,6000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,1151,62.0,88.977525,1454,1369,925,1311,1459,1052,...,30,12,9,40,1.084862,524.587356,0.000000,3,259560,5997.0
1457,20,1151,85.0,114.782403,1454,1369,925,1311,1459,1052,...,36,12,9,349,239.304688,524.587356,392.462963,6,15336,5934.0
1458,70,1151,66.0,95.089432,1454,1369,925,1311,1459,1052,...,63,12,20,60,1.084862,524.587356,0.000000,8,3660,5823.0
1459,20,1151,68.0,98.574845,1454,1369,925,1311,1459,1052,...,30,9,9,478,1.084862,524.587356,392.462963,0,47080,5850.0


In [9]:
derived_features(test_features)
test_features

Unnamed: 0_level_0,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,lotconfig,...,drivfeat1,drivfeat2,drivfeat3,drivfeat4,drivfeat5,drivfeat6,pcafeat1,pcafeat2,pcafeat3,pcafeat4
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,10,80.0,107.805380,1453,1352,934,1311,1457,1081,...,30,9,9,260,0.541387,527.256741,315.268293,0,260100,5883.0
1462,20,1114,81.0,119.444548,1453,1352,484,1311,1457,248,...,36,9,9,429,264.403226,527.256741,0.004850,0,39770,5874.0
1463,60,1114,74.0,117.601020,1453,1352,484,1311,1457,1081,...,25,12,9,246,0.541387,527.256741,0.004850,3,260100,5991.0
1464,60,1114,78.0,99.889939,1453,1352,484,1311,1457,1081,...,36,9,9,396,264.403226,527.256741,0.004850,4,260100,5994.0
1465,120,1114,43.0,70.746025,1453,1352,484,70,1457,1081,...,40,12,12,226,0.541387,527.256741,0.004850,0,43780,5976.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,160,242,21.0,44.000000,1453,1352,934,1311,1457,1081,...,28,9,9,0,0.541387,0.000000,0.004850,0,4290,1.0
2916,160,242,21.0,43.520110,1453,1352,934,1311,1457,1081,...,20,9,9,24,0.541387,353.833333,0.004850,0,4290,5910.0
2917,20,1114,160.0,141.421356,1453,1352,934,1311,1457,1081,...,35,9,9,474,0.541387,411.150510,0.004850,3,260100,5880.0
2918,85,1114,62.0,102.181212,1453,1352,934,1311,1457,1081,...,25,12,9,112,0.541387,0.000000,0.004850,0,9460,1.0


In [10]:
def arithmetic_features(df):
    df_c = df.copy()

    df_c['feat_1'] = df_c['bsmtfintype1'] / df_c['bsmtfintype2']
    df_c['feat_2'] = df_c['bsmtfinsf1'] / df_c['bsmtfinsf2']
    df_c['feat_3'] = df_c['bsmtunfsf'] + df_c['bsmtfinsf1'] + df_c['bsmtfinsf2']
    df_c['feat_4'] = df_c['garagefinish'] / df_c['garagearea']

    df_c['feat_5'] = np.sqrt(df_c[[col for col in df_c if 'sf' in col]].sum(axis=1))
    df_c['feat_6'] = df_c[[col for col in df_c if 'sf' in col]].sum(axis=1)
    df_c['feat_7'] = df_c[[col for col in df_c if 'abv' in col]].sum(axis=1)

    df_c['feat_8'] = df_c.groupby('bsmtexposure')['bsmtfinsf1'].transform('mean')
    df_c['feat_9'] = df_c.groupby('bsmtfintype1')['bsmtfinsf1'].transform('mean')
    df_c['feat_10'] = df_c.groupby('bsmtfintype1')['bsmtfinsf1'].transform('std')
    df_c['feat_11'] = df_c.groupby('bsmtfintype2')['bsmtfinsf2'].transform('std')
    df_c['feat_12'] = df_c.groupby('neighborhood')['garagearea'].transform('mean')
    df_c['feat_12_1'] = df_c.groupby('overallqual')['garagearea'].transform('mean')
    df_c['feat_13'] = df_c.groupby('neighborhood')['garagecond'].transform('mean')
    df_c['feat_13_1'] = df_c.groupby('overallqual')['garagecond'].transform('mean')
    df_c['feat_14'] = df_c.groupby('neighborhood')['garagequal'].transform('mean')
    df_c['feat_14_1'] = df_c.groupby('overallqual')['garagequal'].transform('mean')

    df_c['feat_15'] = df_c.groupby('neighborhood')['lotfrontage'].transform('mean')
    df_c['feat_15_1'] = df_c.groupby('overallqual')['lotfrontage'].transform('mean')
    df_c['feat_16'] = df_c.groupby('neighborhood')['overallqual'].transform('mean')
    df_c['feat_16_1'] = df_c.groupby('overallqual')['overallqual'].transform('mean')
    df_c['feat_17'] = df_c.groupby('neighborhood')['overallcond'].transform('mean')
    df_c['feat_17_1'] = df_c.groupby('overallqual')['overallcond'].transform('mean')
    df_c['feat_18'] = df_c.groupby('neighborhood')['lotarea'].transform('mean')
    df_c['feat_18_1'] = df_c.groupby('overallqual')['lotarea'].transform('mean')
    df_c['feat_19'] = df_c.groupby('neighborhood')['propage'].transform('mean')
    df_c['feat_19_1'] = df_c.groupby('overallqual')['propage'].transform('mean')
    df_c['feat_20'] = df_c.groupby('neighborhood')['modage'].transform('mean')
    df_c['feat_20_1'] = df_c.groupby('overallqual')['modage'].transform('mean')
    df_c['feat_21'] = df_c.groupby('neighborhood')['feat_3'].transform('mean')
    df_c['feat_21_1'] = df_c.groupby('overallqual')['feat_3'].transform('mean')

    df_c['feat_22'] = df_c.groupby('yearbuilt')['lotfrontage'].transform('mean')
    df_c['feat_22_1'] = df_c.groupby('yearbuilt')['lotarea'].transform('mean')

    return df_c

In [11]:
train_features = arithmetic_features(train_features)
train_features

Unnamed: 0_level_0,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,lotconfig,...,feat_18,feat_18_1,feat_19,feat_19_1,feat_20,feat_20_1,feat_21,feat_21_1,feat_22,feat_22_1
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,1151,65.0,91.923882,1454,1369,925,1311,1459,1052,...,97.307794,99.493379,9.920000,19.517241,8.666667,11.460815,1157.586667,1108.514107,68.547956,94.027019
2,20,1151,80.0,97.979590,1454,1369,925,1311,1459,47,...,119.927309,96.044816,24.636364,40.219251,17.181818,26.459893,1321.272727,983.526738,68.113382,93.271920
3,60,1151,68.0,106.066017,1454,1369,484,1311,1459,1052,...,97.307794,99.493379,9.920000,19.517241,8.666667,11.460815,1157.586667,1108.514107,68.191601,100.865613
4,70,1151,60.0,97.724101,1454,1369,484,1311,1459,263,...,106.167553,99.493379,65.941176,19.517241,28.294118,11.460815,1032.901961,1108.514107,68.400000,93.539145
5,60,1151,84.0,119.415242,1454,1369,484,1311,1459,47,...,116.713574,103.063223,12.390244,10.886905,11.170732,6.464286,1493.756098,1417.886905,69.009457,88.123110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,1151,62.0,88.977525,1454,1369,925,1311,1459,1052,...,104.985862,96.044816,9.303797,40.219251,8.734177,26.459893,869.025316,983.526738,57.914101,83.904488
1457,20,1151,85.0,114.782403,1454,1369,925,1311,1459,1052,...,108.177926,96.044816,32.191781,40.219251,26.301370,26.459893,1153.835616,983.526738,64.094945,96.042650
1458,70,1151,66.0,95.089432,1454,1369,925,1311,1459,1052,...,106.167553,99.493379,65.941176,19.517241,28.294118,11.460815,1032.901961,1108.514107,69.917032,91.768164
1459,20,1151,68.0,98.574845,1454,1369,925,1311,1459,1052,...,99.688647,97.353790,47.871111,53.042821,36.244444,32.518892,1026.168889,929.539043,67.801990,95.145714


In [12]:
test_features = arithmetic_features(test_features)
test_features

Unnamed: 0_level_0,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,lotconfig,...,feat_18,feat_18_1,feat_19,feat_19_1,feat_20,feat_20_1,feat_21,feat_21_1,feat_22,feat_22_1
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,10,80.0,107.805380,1453,1352,934,1311,1457,1081,...,98.748300,97.217518,47.963303,52.331776,38.064220,35.023364,1036.550459,936.432243,82.181023,103.822566
1462,20,1114,81.0,119.444548,1453,1352,484,1311,1457,248,...,98.748300,94.358162,47.963303,39.532213,38.064220,26.893557,1036.550459,966.414566,71.777155,101.258053
1463,60,1114,74.0,117.601020,1453,1352,484,1311,1457,1081,...,104.086420,97.217518,9.476744,52.331776,8.674419,35.023364,881.069767,936.432243,71.228251,95.834676
1464,60,1114,78.0,99.889939,1453,1352,484,1311,1457,1081,...,104.086420,94.358162,9.476744,39.532213,8.674419,26.893557,881.069767,966.414566,72.646104,99.852799
1465,120,1114,43.0,70.746025,1453,1352,484,70,1457,1081,...,104.637114,100.553621,6.884615,9.149425,6.653846,6.166667,1597.461538,1407.264368,69.546551,96.049347
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,160,242,21.0,44.000000,1453,1352,934,1311,1457,1081,...,43.406838,86.124544,35.550000,63.945455,32.050000,40.790909,574.950000,709.545455,44.972390,63.991154
2916,160,242,21.0,43.520110,1453,1352,934,1311,1457,1081,...,43.406838,86.124544,35.550000,63.945455,32.050000,40.790909,574.950000,709.545455,44.972390,63.991154
2917,20,1114,160.0,141.421356,1453,1352,934,1311,1457,1081,...,108.096275,97.217518,25.861538,52.331776,22.153846,35.023364,1067.338462,936.432243,79.686762,106.194191
2918,85,1114,62.0,102.181212,1453,1352,934,1311,1457,1081,...,108.096275,97.217518,25.861538,52.331776,22.153846,35.023364,1067.338462,936.432243,69.546551,96.049347


In [13]:
train_features.replace([np.inf, -np.inf], np.nan, inplace=True)
test_features.replace([np.inf, -np.inf], np.nan, inplace=True)

train_features.fillna(0, inplace=True)
test_features.fillna(0, inplace=True)

In [14]:
mask = []

for t in train_features.columns:
    if t in disc_cols:
        mask.append(True)
    else:
        mask.append(False)

score_mi = mi_score(train_features, target, mask)
score_mi

overallqual     0.573137
feat_18_1       0.569403
feat_21_1       0.568510
feat_20_1       0.566330
feat_16_1       0.562123
                  ...   
feat_2          0.000000
poolqc          0.000000
poolarea        0.000000
lowqualfinsf    0.000000
mosold          0.000000
Length: 129, dtype: float64

In [15]:
high_mi = score_mi[:26].index.tolist()

In [16]:
corr_ = {}

for col in high_mi:
    corr_[col] = (np.corrcoef(train_features[col], target)[0, 1])

corr_

{'overallqual': 0.7909816005838052,
 'feat_18_1': 0.7378689651969845,
 'feat_21_1': 0.8120392658665697,
 'feat_20_1': -0.7500935784658546,
 'feat_16_1': 0.7909816005838052,
 'feat_15_1': 0.7347965234027727,
 'feat_19_1': -0.7410835287813533,
 'feat_12_1': 0.7836259037514758,
 'feat_17_1': -0.3657736730097563,
 'feat_14_1': 0.5446019455086608,
 'feat_13_1': 0.519635496035474,
 'neighborhood': -0.19620355518024993,
 'grlivarea': 0.7087645388955903,
 'feat_4': -0.35635608228244897,
 'feat_12': 0.6159344600563479,
 'feat_16': 0.6726943942304486,
 'feat_15': 0.3422876582474918,
 'feat_21': 0.6349907514594413,
 'feat_20': -0.5481790187526568,
 'drivfeat1': 0.5652938448019195,
 'feat_19': -0.4821895367509899,
 'feat_17': -0.3538337218580938,
 'feat_18': 0.2802663739404662,
 'feat_13': 0.4291057424078436,
 'feat_5': 0.6272389123171918,
 'feat_6': 0.6634130915470114}

In [17]:
cols_to_trans = ['lotfrontage', 'lotarea', 'masvnrarea',
                 'bsmtfinsf1', 'bsmtfinsf2', 'bsmtunfsf',
                 'totalbsmtsf', '1stflrsf', '2ndflrsf',
                 'lowqualfinsf', 'grlivarea', 'garagearea',
                 'wooddecksf', 'screenporch', 'enclosedporch',
                 'openporchsf', 'propage', 'modage', 'timetomod',
                 'drivfeat4', 'pcafeat4']

trans = train_features[cols_to_trans]

In [18]:
q_transformer = quantile_transformation(features=trans, output_distribution='normal',
                                        n_quantiles=1000)
q_transformer

In [19]:
train_features[cols_to_trans] = q_transformer.fit_transform(trans)
test_features[cols_to_trans] = q_transformer.transform(test_features[cols_to_trans])

In [20]:
# save processed data
with open('../data/processed/train.pkl', 'wb') as file:
    pd.merge(train_features,
             pd.Series(target),
             how='inner',
             right_index=True,
             left_index=True).to_pickle(file)

with open('../data/processed/test.pkl', 'wb') as file:
    test_features.to_pickle(file)

In [21]:
least_informative = score_mi[score_mi == 0].index.tolist()

final_train = train_features.drop(least_informative,
                                  axis=1)
final_test = test_features.drop(least_informative,
                                axis=1)

In [22]:
trainX, validX, trainY, validY = train_test_split(final_train, target, test_size=0.2, random_state=48)
trainX.shape, validX.shape, trainY.shape, validY.shape

((1168, 122), (292, 122), (1168,), (292,))

In [23]:
estimator = xgb.XGBRegressor(learning_rate=0.01)

paraDis = {'max_depth': np.random.randint(1, 21, 10),
           'n_estimators': np.random.randint(1000, 4000, 10),
           'min_child_weight': range(1, 10),
           'colsample_bytree': np.random.uniform(0, 1, 10),
           'subsample': np.random.uniform(0, 1, 10),
           }

fitter = FitEstimator(estimator, random_state=48)

In [24]:
search = fitter.random_search(param_distributions=paraDis,
                              n_iter=25,
                              cv=3,
                              scoring=make_scorer(mean_squared_error, greater_is_better=False),
                              verbose=1)

In [25]:
search.fit(trainX, trainY)

Fitting 3 folds for each of 25 candidates, totalling 75 fits


In [26]:
search.best_params_

{'subsample': 0.3227908605588724,
 'n_estimators': 1785,
 'min_child_weight': 7,
 'max_depth': 20,
 'colsample_bytree': 0.4633756731718641}

In [27]:
model = xgb.XGBRegressor(learning_rate=0.01,
                         subsample=0.3227908605588724,
                         n_estimators=1785,
                         min_child_weight=7,
                         max_depth=20,
                         colsample_bytree=0.4633756731718641)

In [41]:
model.fit(trainX, trainY)

with open('../model/xbg_reg_v2.pkl', 'wb') as file:
    pickle.dump(model, file)

In [28]:
i = 0
models = []
kfold = KFold(n_splits=9, shuffle=True, random_state=48)

for t, v in kfold.split(trainX, trainY):
    model.fit(trainX.iloc[t], trainY.iloc[t])
    train_score = mean_squared_error(np.log(trainY.iloc[t]), np.log(model.predict(trainX.iloc[t])), squared=False)
    valid_score = mean_squared_error(np.log(trainY.iloc[v]), np.log(model.predict(trainX.iloc[v])), squared=False)

    i += 1
    print(f'{i} -- train score:{train_score} -- validation score:{valid_score}-- complete')

    models.append([model, train_score, valid_score])

1 -- train score:0.04218789922579955 -- validation score:0.12698517088258124-- complete
2 -- train score:0.04194420467002328 -- validation score:0.12039740454019082-- complete
3 -- train score:0.0320988957898333 -- validation score:0.1505269603511499-- complete
4 -- train score:0.04194956762545456 -- validation score:0.11579608800452901-- complete
5 -- train score:0.04110829705238722 -- validation score:0.10282860098969857-- complete
6 -- train score:0.04146482277837838 -- validation score:0.09168640683061599-- complete
7 -- train score:0.04011996170610958 -- validation score:0.11772181402518442-- complete
8 -- train score:0.0404528368055187 -- validation score:0.13006810778209463-- complete
9 -- train score:0.038533769210921394 -- validation score:0.15071230438776137-- complete


In [29]:
target_t = np.zeros((validX.shape[0]))

for i in models:
    p = i[0].predict(validX)
    target_t += p

target_t = target_t / 9

In [30]:
mean_squared_error(np.log(validY), np.log(target_t), squared=False)

0.13010283281798274

In [31]:
model.fit(trainX, trainY)

In [31]:
target = np.zeros((final_test.shape[0]))

for i in models:
    p = i[0].predict(final_test)
    target += p

target = target / 9
target

array([122917.8359375, 160221.890625 , 183111.109375 , ...,
       234829.59375  , 119948.9296875, 222015.953125 ])

In [32]:
target = model.predict(final_test)

In [33]:
pd.DataFrame(target, columns=['SalePrice'], index=final_test.index).reset_index().to_csv(
    '../data/submission/submission_5_5.csv', index=False)