In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tnrange, tqdm_notebook
import gc

In [2]:
sns.set_context('talk')

In [3]:
import warnings
warnings.filterwarnings('ignore', message='Changing the shape of non-C contiguous array')

# Read the data

In [32]:
# Load the preprocessed feature tables (indexed by their `id` column) and the
# training targets. All three files are ';'-separated. y_train.csv is read
# without a header row, so its two columns are named explicitly.
dfXtrain = pd.read_csv('preprocessed_csv/train.csv', sep=';', index_col='id')
dfXtest = pd.read_csv('preprocessed_csv/test.csv', sep=';', index_col='id')
dfYtrain = pd.read_csv(
    'preprocessed_csv/y_train.csv',
    sep=';',
    header=None,
    names=['ID', 'COTIS'],
)

# Preprocessing

Encode categorical (simple)

In [33]:
# Dtype names treated as numeric, grouped by kind.
integer_dtypes = ['int16', 'int32', 'int64']
float_dtypes = ['float16', 'float32', 'float64']
numerics = integer_dtypes + float_dtypes

In [34]:
# Every training column whose dtype is not listed in `numerics`.
non_numeric_frame = dfXtrain.select_dtypes(exclude=numerics)
categorical_cols = non_numeric_frame.columns.tolist()

In [35]:
categorical_cols

['marque', 'energie_veh', 'profession', 'var6', 'var8', 'var14']

In [36]:
list(dfXtest.select_dtypes(exclude=numerics).columns)

['marque', 'energie_veh', 'profession', 'var6', 'var8', 'var14']

In [37]:
from sklearn.preprocessing import LabelEncoder

# One fitted LabelEncoder per categorical column, keyed by column name.
encoder = dict()

for col in categorical_cols:
    print(col)
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the `df[col]` selection: the in-place form works on an intermediate
    # object and is not guaranteed to update the parent frame (it does not
    # under pandas Copy-on-Write).
    dfXtrain[col] = dfXtrain[col].fillna('MISSING')
    dfXtest[col] = dfXtest[col].fillna('MISSING')
    # Fit on train and test together so both frames share a single
    # label -> integer mapping and transform() never sees an unknown label.
    big_col = pd.concat([dfXtrain[col], dfXtest[col]])
    encoder[col] = LabelEncoder().fit(big_col)
    dfXtrain[col] = encoder[col].transform(dfXtrain[col])
    dfXtest[col] = encoder[col].transform(dfXtest[col])


marque
energie_veh
profession
var6
var8
var14


Заполняем пропуски в численных признаках значением -9999

In [38]:
# Replace every remaining missing value with the sentinel -9999, modifying
# both feature frames in place.
for frame in (dfXtrain, dfXtest):
    frame.fillna(-9999, inplace=True)

Делим выборку на train и validation. В validation попадают все строки с почтовыми индексами (`codepostal`) вида 10\* и 971\*, в train — всё остальное. (Индексы 10\* и 971\* встречаются в тесте.)

In [55]:
# Validation rows are those with codepostal // 1000 == 10 or
# codepostal // 100 == 971; every other row goes to the training split.
in_validation = (dfXtrain.codepostal // 1000 == 10) | (dfXtrain.codepostal // 100 == 971)

# Targets indexed by ID so the boolean mask (indexed like dfXtrain) can select them.
cotis_by_id = dfYtrain.set_index('ID')['COTIS']

x_validation = np.array(dfXtrain[in_validation])
x_train = np.array(dfXtrain[~in_validation])
x_bigtrain = np.array(dfXtrain)

x_test = np.array(dfXtest)

y_validation = np.array(cotis_by_id[in_validation])
y_train = np.array(cotis_by_id[~in_validation])

In [127]:
# Targets for the full training set, looked up by ID in the row order of
# dfXtrain (x_bigtrain is built from dfXtrain). This matches how y_train and
# y_validation are aligned and does not depend on y_train.csv listing the
# rows in the same order as train.csv.
y_bigtrain = np.array(dfYtrain.set_index('ID')['COTIS'].loc[dfXtrain.index])

# Save routines

In [133]:
# Submission template: one row per test id with a zero placeholder in COTIS,
# columns ordered as ID, COTIS.
dfYtest = pd.DataFrame(
    {'ID': dfXtest.index, 'COTIS': np.zeros(len(x_test))},
    columns=['ID', 'COTIS'],
)
dfYtest.head()

Unnamed: 0,ID,COTIS
0,300001,0.0
1,300002,0.0
2,300003,0.0
3,300004,0.0
4,300005,0.0


In [134]:
def save_to_file(y, file_name):
    """Write predictions `y` to ``results/<file_name>``.

    The COTIS column of the module-level ``dfYtest`` frame is overwritten
    with `y`, and the frame is then saved as a ';'-separated CSV without the
    index column.
    """
    dfYtest['COTIS'] = y
    target_path = 'results/{}'.format(file_name)
    dfYtest.to_csv(target_path, sep=';', index=False)

# Train XGB

In [60]:
import xgboost as xgb
XGBR = xgb.XGBRegressor

In [61]:
def mape(y_true, y_pred):
    """Return the negated mean absolute percentage error, in percent.

    The result is ``-100 * mean(|y_true - y_pred| / |y_true|)``, so it is
    always <= 0 and values closer to zero are better (presumably so it can be
    used directly as a "greater is better" score).

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted values of the same length. Both are converted to
        NumPy arrays first, so plain lists work and two pandas Series are
        compared by position rather than aligned on their index labels.

    Notes
    -----
    A zero in `y_true` makes the corresponding term infinite or NaN.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return -np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [62]:
def mape_scorer(est, X, y):
    """Score estimator `est` on ``(X, y)``: `mape` of `y` against ``est.predict(X)``."""
    predictions = est.predict(X)
    return mape(y, predictions)

In [63]:
def log_mape_scorer(est, X, y):
    """Score `est` with `mape` after exponentiating both `y` and ``est.predict(X)``.

    Intended for targets and predictions on a log scale: both are passed
    through ``np.exp`` before the error is computed.
    """
    actual = np.exp(y)
    predicted = np.exp(est.predict(X))
    return mape(actual, predicted)

In [65]:
# XGBRegressor settings: squared-error objective, -9999 as the missing-value
# marker, fixed seed, 500 trees.
kwargs = {
    'objective': 'reg:linear',
    'missing': -9999,
    'seed': 56,
    'n_estimators': 500,
}

# Fit on the raw target of the training split.
clf = XGBR(**kwargs)
clf.fit(X=x_train, y=y_train)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=-9999, n_estimators=500, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=56, silent=True, subsample=1)

In [68]:
y_pred = clf.predict(x_validation)
mape(y_validation, y_pred)

-16.881022061685456

Деление на train и validation даёт более справедливые результаты. Теперь посмотрим на lmse (MSE на логарифме целевой переменной)

In [70]:
%%time

# Same XGBRegressor settings as for the raw-target fit.
kwargs = {
    'objective': 'reg:linear',
    'missing': -9999,
    'seed': 56,
    'n_estimators': 500,
}

# Fit on the natural log of the target; predictions must be passed through
# np.exp to get back to the original scale.
clf = XGBR(**kwargs)
clf.fit(X=x_train, y=np.log(y_train))

CPU times: user 5min 19s, sys: 1.53 s, total: 5min 21s
Wall time: 1min 23s


In [71]:
y_pred = np.exp(clf.predict(x_validation))
mape(y_validation, y_pred)

-16.324240730866656

Теперь попробуем asymmetric mse, но сначала проверим, что мы умеем запускать xgb через стандартный интерфейс

In [82]:
%%time

# Training and validation matrices for the native API; -9999 is the
# missing-value marker.
dtrain = xgb.DMatrix(data=x_train, label=y_train, missing=-9999)
dvalidation = xgb.DMatrix(data=x_validation, missing=-9999)

# Booster settings written out explicitly, using the values shown in the
# XGBRegressor repr, with the built-in squared-error objective.
param = {
    'base_score': 0.5,
    'colsample_bylevel': 1,
    'colsample_bytree': 1,
    'gamma': 0,
    'eta': 0.1,
    'max_delta_step': 0,
    'max_depth': 3,
    'min_child_weight': 1,
    'nthread': -1,
    'objective': 'reg:linear',
    'reg_alpha': 0,
    'reg_lambda': 1,
    'scale_pos_weight': 1,
    'seed': 56,
    'silent': True,
    'subsample': 1,
}
num_round = 500

bst = xgb.train(params=param, dtrain=dtrain, num_boost_round=num_round)

CPU times: user 2min 56s, sys: 177 ms, total: 2min 57s
Wall time: 2min 57s


In [83]:
y_pred = bst.predict(dvalidation)
mape(y_validation, y_pred)

-16.881022061685456

Норм, теперь результаты совпадают. Значения у параметров по умолчанию разные в xgb.train и в sklearn'овских обёртках

### AMSE

Опять сначала проверка

In [119]:
def amse(preds, dtrain, divider=1):
    """Asymmetric squared-error objective for ``xgb.train(..., obj=...)``.

    Where ``preds > labels`` the residual ``preds - labels`` is used as is;
    elsewhere the residual and the second-order term are divided by
    `divider`. The first-order term is then scaled by 2 and the second-order
    term by 4.

    Parameters
    ----------
    preds : numpy.ndarray
        Current predictions.
    dtrain : object with ``get_label()``
        Training matrix whose ``get_label()`` returns the true labels.
    divider : number, default 1
        Positive factor by which the non-over-predicted side is
        down-weighted; ``divider=1`` gives a symmetric loss.

    Returns
    -------
    (grad, second_grad) : tuple of numpy.ndarray
        First- and second-order terms, one value per row.

    Raises
    ------
    ValueError
        If `divider` is not positive: zero cannot be divided by, and a
        negative value would make the second-order term negative.
    """
    if divider <= 0:
        raise ValueError('divider must be positive, got {!r}'.format(divider))
    labels = dtrain.get_label()
    residual = preds - labels
    overshoot = preds > labels
    grad = 2 * np.where(overshoot, residual, residual / divider)
    # 1.0 / divider keeps this a true division even for integer dividers.
    second_grad = 4 * np.where(overshoot, 1, 1.0 / divider)
    return grad, second_grad

In [120]:
def amse_divider(divider):
    """Return an objective callable that forwards to `amse` with `divider` fixed.

    The returned function passes its positional and keyword arguments
    through to `amse` unchanged and adds ``divider=divider``.
    """
    def bound_objective(*positional, **named):
        return amse(*positional, divider=divider, **named)

    return bound_objective

(По-видимому, для проверки ниже `amse` возвращала `grad / 2, second_grad / 2`.)

In [107]:
%%time

# Training and validation matrices; -9999 is the missing-value marker.
dtrain = xgb.DMatrix(data=x_train, label=y_train, missing=-9999)
dvalidation = xgb.DMatrix(data=x_validation, missing=-9999)

# Booster settings with eta = 0.1. No built-in objective is named here
# because the custom objective is supplied through `obj`.
param = {
    'base_score': 0.5,
    'colsample_bylevel': 1,
    'colsample_bytree': 1,
    'gamma': 0,
    'eta': 0.1,
    'max_delta_step': 0,
    'max_depth': 3,
    'min_child_weight': 1,
    'nthread': -1,
    'alpha': 0,
    'lambda': 1,
    'scale_pos_weight': 1,
    'seed': 56,
    'silent': True,
    'subsample': 1,
}
num_round = 500

# divider=1 makes the custom loss symmetric.
bst = xgb.train(params=param, dtrain=dtrain, num_boost_round=num_round, obj=amse_divider(1))

CPU times: user 4min 31s, sys: 3.62 s, total: 4min 35s
Wall time: 4min 35s


In [108]:
y_pred = bst.predict(dvalidation)
mape(y_validation, y_pred)

-16.881022061685456

Ок, всё работает (если поделить на два настоящие градиент и вторую производную MSE, как и реализовано в xgboost). Теперь немного поправим MSE: градиент увеличим в два раза (то есть вернём настоящий), вторую производную — в четыре раза (в два раза больше настоящей) и уменьшим eta.

In [115]:
%%time

# Training and validation matrices; -9999 is the missing-value marker.
dtrain = xgb.DMatrix(data=x_train, label=y_train, missing=-9999)
dvalidation = xgb.DMatrix(data=x_validation, missing=-9999)

# Booster settings with the learning rate lowered to eta = 0.05.
param = {
    'base_score': 0.5,
    'colsample_bylevel': 1,
    'colsample_bytree': 1,
    'gamma': 0,
    'eta': 0.05,
    'max_delta_step': 0,
    'max_depth': 3,
    'min_child_weight': 1,
    'nthread': -1,
    'alpha': 0,
    'lambda': 1,
    'scale_pos_weight': 1,
    'seed': 56,
    'silent': True,
    'subsample': 1,
}
num_round = 500

# Symmetric custom objective (divider=1).
bst = xgb.train(params=param, dtrain=dtrain, num_boost_round=num_round, obj=amse_divider(1))

CPU times: user 4min 34s, sys: 3.49 s, total: 4min 37s
Wall time: 4min 37s


In [116]:
y_pred = bst.predict(dvalidation)
mape(y_validation, y_pred)

-11.156682332738244

AMSE

In [125]:
%%time

# Training and validation matrices; -9999 is the missing-value marker.
dtrain = xgb.DMatrix(data=x_train, label=y_train, missing=-9999)
dvalidation = xgb.DMatrix(data=x_validation, missing=-9999)

# Booster settings with eta = 0.05.
param = {
    'base_score': 0.5,
    'colsample_bylevel': 1,
    'colsample_bytree': 1,
    'gamma': 0,
    'eta': 0.05,
    'max_delta_step': 0,
    'max_depth': 3,
    'min_child_weight': 1,
    'nthread': -1,
    'alpha': 0,
    'lambda': 1,
    'scale_pos_weight': 1,
    'seed': 56,
    'silent': True,
    'subsample': 1,
}
num_round = 500

# Asymmetric objective: the side where preds <= labels is divided by 2.
divider = 2

bst = xgb.train(params=param, dtrain=dtrain, num_boost_round=num_round, obj=amse_divider(divider))

CPU times: user 4min 31s, sys: 3.38 s, total: 4min 35s
Wall time: 4min 35s


In [126]:
y_pred = bst.predict(dvalidation)
mape(y_validation, y_pred)

-9.578534964952782

# Save

In [128]:
%%time

# Full training set and test matrices; -9999 is the missing-value marker.
dtrain = xgb.DMatrix(data=x_bigtrain, label=y_bigtrain, missing=-9999)
dtest = xgb.DMatrix(data=x_test, missing=-9999)

# Booster settings with eta = 0.05.
param = {
    'base_score': 0.5,
    'colsample_bylevel': 1,
    'colsample_bytree': 1,
    'gamma': 0,
    'eta': 0.05,
    'max_delta_step': 0,
    'max_depth': 3,
    'min_child_weight': 1,
    'nthread': -1,
    'alpha': 0,
    'lambda': 1,
    'scale_pos_weight': 1,
    'seed': 56,
    'silent': True,
    'subsample': 1,
}
num_round = 500

# Asymmetric objective: the side where preds <= labels is divided by 2.
divider = 2

bst = xgb.train(params=param, dtrain=dtrain, num_boost_round=num_round, obj=amse_divider(divider))

CPU times: user 4min 42s, sys: 3.86 s, total: 4min 46s
Wall time: 4min 48s


In [136]:
y_pred = bst.predict(dtrain)
mape(y_bigtrain, y_pred)

-10.73870528204605

In [137]:
%%time

# Back to the train/validation split (note: `dtrain` and `bst` are rebound
# here, replacing the full-data versions). -9999 is the missing-value marker.
dtrain = xgb.DMatrix(data=x_train, label=y_train, missing=-9999)
dvalidation = xgb.DMatrix(data=x_validation, missing=-9999)

# Booster settings with eta = 0.05.
param = {
    'base_score': 0.5,
    'colsample_bylevel': 1,
    'colsample_bytree': 1,
    'gamma': 0,
    'eta': 0.05,
    'max_delta_step': 0,
    'max_depth': 3,
    'min_child_weight': 1,
    'nthread': -1,
    'alpha': 0,
    'lambda': 1,
    'scale_pos_weight': 1,
    'seed': 56,
    'silent': True,
    'subsample': 1,
}
num_round = 500

# Asymmetric objective: the side where preds <= labels is divided by 2.
divider = 2

bst = xgb.train(params=param, dtrain=dtrain, num_boost_round=num_round, obj=amse_divider(divider))

CPU times: user 4min 59s, sys: 3.86 s, total: 5min 3s
Wall time: 5min 3s


In [138]:
y_pred = bst.predict(dtrain)
mape(y_train, y_pred)

-10.746545997142359

In [139]:
y_predict = bst.predict(dtest)

In [141]:
def amse(preds, dtrain, divider=1):
    """Asymmetric squared-error objective for ``xgb.train(..., obj=...)``.

    Where ``preds > labels`` the residual ``preds - labels`` is used as is
    with a second-order term of 1; elsewhere both the residual and the
    second-order term are divided by `divider`. Unlike the earlier
    definition of this function in the notebook, no extra scaling factor is
    applied.

    Parameters
    ----------
    preds : numpy.ndarray
        Current predictions.
    dtrain : object with ``get_label()``
        Training matrix whose ``get_label()`` returns the true labels.
    divider : number, default 1
        Positive factor by which the non-over-predicted side is
        down-weighted; ``divider=1`` gives a symmetric loss.

    Returns
    -------
    (grad, second_grad) : tuple of numpy.ndarray
        First- and second-order terms, one value per row.

    Raises
    ------
    ValueError
        If `divider` is not positive: zero cannot be divided by, and a
        negative value would make the second-order term negative.
    """
    if divider <= 0:
        raise ValueError('divider must be positive, got {!r}'.format(divider))
    labels = dtrain.get_label()
    residual = preds - labels
    overshoot = preds > labels
    grad = 1 * np.where(overshoot, residual, residual / divider)
    # 1.0 / divider keeps this a true division even for integer dividers.
    second_grad = 1 * np.where(overshoot, 1, 1.0 / divider)
    return grad, second_grad

In [142]:
def amse_divider(divider):
    """Return an objective callable that forwards to `amse` with `divider` fixed.

    The returned function passes its positional and keyword arguments
    through to `amse` unchanged and adds ``divider=divider``.
    """
    def bound_objective(*positional, **named):
        return amse(*positional, divider=divider, **named)

    return bound_objective

In [143]:
%%time

# Full training set and test matrices; -9999 is the missing-value marker.
dtrain = xgb.DMatrix(data=x_bigtrain, label=y_bigtrain, missing=-9999)
dtest = xgb.DMatrix(data=x_test, missing=-9999)

# Booster settings with eta = 0.1.
param = {
    'base_score': 0.5,
    'colsample_bylevel': 1,
    'colsample_bytree': 1,
    'gamma': 0,
    'eta': 0.1,
    'max_delta_step': 0,
    'max_depth': 3,
    'min_child_weight': 1,
    'nthread': -1,
    'alpha': 0,
    'lambda': 1,
    'scale_pos_weight': 1,
    'seed': 56,
    'silent': True,
    'subsample': 1,
}
num_round = 500

# Asymmetric objective: the side where preds <= labels is divided by 2.
divider = 2

bst = xgb.train(params=param, dtrain=dtrain, num_boost_round=num_round, obj=amse_divider(divider))

CPU times: user 4min 40s, sys: 3.75 s, total: 4min 44s
Wall time: 4min 45s


In [145]:
y_pred = bst.predict(dtrain)
mape(y_bigtrain, y_pred)

-9.578671529090105

In [146]:
y_predict = bst.predict(dtest)

In [147]:
save_to_file(y_predict, 'xbg_500_amse.csv')