In [67]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
from sklearn.preprocessing import QuantileTransformer
import matplotlib.pyplot as plt

In [68]:
# Directories: input CSVs are read from DATA_DIR, the reference solution from SOLUTION_DIR.
DATA_DIR, SOLUTION_DIR = 'problem', '.'
# File names of the training set, the test set and the reference solution.
TRAIN_FN, TEST_FN, SOLUTION_FN = 'training.csv', 'testing.csv', 'solution.csv'
# Separator inserted between a directory and a file name when paths are built.
SEP = '/'

## Raw Data

In [69]:
# Each path is <directory><SEP><file name>.
train_path = f'{DATA_DIR}{SEP}{TRAIN_FN}'
test_path = f'{DATA_DIR}{SEP}{TEST_FN}'
solution_path = f'{SOLUTION_DIR}{SEP}{SOLUTION_FN}'

# Load the three CSVs with pandas defaults (no explicit dtype or date parsing).
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)
solution = pd.read_csv(solution_path)

In [70]:
# Derive a calendar-month feature from each row's Date; the new 'month'
# column is written onto both the training and the test frame.
df_train['month'] = df_train['Date'].pipe(pd.to_datetime).dt.month
df_test['month'] = df_test['Date'].pipe(pd.to_datetime).dt.month

## Create Train, Validation, and Test Sets

In [71]:
# Time-based split.  Date is compared against string literals, so this
# presumably relies on ISO-formatted (YYYY-MM-DD) date strings -- verify
# against the CSV.  Rows dated 2014-07-23 through 2014-10-22 fall in neither
# subset, leaving a gap between training and validation.
train_range = df_train['Date'].lt('2014-07-23')
valid_range = df_train['Date'].ge('2014-10-23')
train = df_train.loc[train_range]
valid = df_train.loc[valid_range]
# The test frame is used as loaded.
test = df_test

In [72]:
# Columns removed from the model inputs: the date, the row identifier and the target.
non_feature_cols = ['Date', 'Identifier', 'Dep_Var']

X_train = train.drop(columns=non_feature_cols)
y_train = train['Dep_Var']

X_valid = valid.drop(columns=non_feature_cols)
y_valid = valid['Dep_Var']

# The same three columns are dropped from the test frame, so it must contain
# a 'Dep_Var' column as well.
X_test = test.drop(columns=non_feature_cols)

## Transform Dep_Var

In [73]:
# Map the target onto a standard-normal shape with a quantile transform.
# random_state is pinned so that any subsampling the transformer performs
# while estimating quantiles is reproducible across kernel restarts; the
# comparisons made later in this notebook differ only in the 3rd-4th decimal.
q = QuantileTransformer(output_distribution='normal', random_state=0)

# fit_transform expects a 2-D column, hence the reshape to (-1, 1); the result
# is flattened back to 1-D and wrapped in a Series with a fresh default index.
y_train_gauss = pd.Series(
    q.fit_transform(y_train.values.reshape(-1, 1)).reshape(-1)
)

In [74]:
# Round-trip sanity check: inverting the Gaussianised target should give back
# the raw target values.
gauss_column = y_train_gauss.to_numpy()[:, None]
check = q.inverse_transform(gauss_column).ravel()

# Side-by-side preview of raw / round-tripped / transformed values.  It is not
# the cell's last expression, so the notebook does not render it.
pd.DataFrame({'raw': y_train, 'check': check, 'gauss': y_train_gauss}).head()

# Cell output: True when the round trip matches within np.allclose tolerances.
np.allclose(check, y_train)

True

## Fit GBM Model to Training Subset and Predict Validation Set

In [75]:
# LightGBM training set: float32 copy of the training features paired with
# the Gaussianised target.
X = X_train.astype('float32')
d_train = lgb.Dataset(data=X, label=y_train_gauss)

### Original validation result (without bug)

In [76]:
# Blend of three LightGBM regressors trained on the Gaussianised target and
# scored on the validation set.  `params` is rebuilt from an empty dict here,
# so models 1 and 2 are trained without bagging and with learning_rate 0.004.
params = {}
params['learning_rate'] = 0.004 # 0.004 # shrinkage_rate
params['sub_feature'] = 0.35    # feature_fraction (small values => use very different submodels)
params['min_data'] = 500        # min_data_in_leaf
params['max_depth'] = 2

nboost=130

# Model 1: depth 2, nboost rounds.
reg = lgb.train(params, d_train, nboost)
pred = reg.predict(X_valid)

# Model 2: depth 3, fewer rounds; averaged 50/50 with model 1.
n_minus = 30
params['max_depth'] = 3
reg = lgb.train(params, d_train, nboost-n_minus)
alpha = 0.5
pred = alpha*pred + (1-alpha)*reg.predict(X_valid)

# Model 3: depth 2 with bagging, a slightly higher learning rate and more
# rounds.  These settings stay in `params` after this cell finishes.
n_plus = 40
params['max_depth'] = 2
params['bagging_freq'] = 5
params['bagging_fraction'] = 0.78
params['learning_rate'] = .005
reg = lgb.train(params, d_train, nboost+n_plus)

# Final weights: 0.39 / 0.39 / 0.22 for models 1 / 2 / 3.
alpha = 0.78
pred = alpha*pred + (1-alpha)*reg.predict(X_valid)

# Map the blended prediction back to the raw target scale and round to integers.
pred_uniform = q.inverse_transform(pred.reshape(-1, 1)).reshape(-1)
# Builtin int replaces the np.int alias, which was removed in NumPy 1.24.
y_pred = np.round(pred_uniform).astype(int)

spearmanr(y_valid, y_pred)

SpearmanrResult(correlation=0.10850310725319776, pvalue=7.577737140155415e-34)

### Simulate validation result with bug

In [77]:
# Same three-model blend, but `params` is deliberately NOT rebuilt: only
# max_depth is reset.  Whatever else is already in `params` (after the
# previous cell: bagging_freq 5, bagging_fraction 0.78, learning_rate 0.005)
# therefore also applies to models 1 and 2.  That leftover state is the bug
# being simulated; this cell depends on being run after the previous one.
params['max_depth'] = 2
nboost=130

# Model 1: depth 2, nboost rounds (with the leaked settings).
reg = lgb.train(params, d_train, nboost)
pred = reg.predict(X_valid)

# Model 2: depth 3, fewer rounds; averaged 50/50 with model 1.
n_minus = 30
params['max_depth'] = 3
reg = lgb.train(params, d_train, nboost-n_minus)
alpha = 0.5
pred = alpha*pred + (1-alpha)*reg.predict(X_valid)

# Model 3: depth 2 with bagging and learning_rate 0.005, more rounds.
n_plus = 40
params['max_depth'] = 2
params['bagging_freq'] = 5
params['bagging_fraction'] = 0.78
params['learning_rate'] = .005
reg = lgb.train(params, d_train, nboost+n_plus)

# Final weights: 0.39 / 0.39 / 0.22 for models 1 / 2 / 3.
alpha = 0.78
pred = alpha*pred + (1-alpha)*reg.predict(X_valid)

# Map the blended prediction back to the raw target scale and round to integers.
pred_uniform = q.inverse_transform(pred.reshape(-1, 1)).reshape(-1)
# Builtin int replaces the np.int alias, which was removed in NumPy 1.24.
y_pred = np.round(pred_uniform).astype(int)

spearmanr(y_valid, y_pred)

SpearmanrResult(correlation=0.10470714057225206, pvalue=1.2661271970807093e-31)

## Fit same model to full training data and predict test data

In [78]:
# Scale the boosting-round counts by how much larger the full training set is
# than the training subset.
# NOTE: n_minus and n_plus are overwritten in place, so re-running this cell
# without first re-running a cell that resets them scales them a second time.
bigger = df_train.shape[0] / train.shape[0]
full_nboost = int(bigger * nboost)
n_minus, n_plus = int(bigger * n_minus), int(bigger * n_plus)
full_nboost

225

### Original test result (with bug)

In [79]:
# Refit the three-model blend on the full training data and score it on the
# test set.  `params` is not rebuilt here, so models 1 and 2 inherit whatever
# bagging / learning-rate settings earlier cells left in it -- this is the
# "with bug" configuration.
full_X = df_train.drop(['Date', 'Identifier', 'Dep_Var'], axis=1)
full_y = df_train.Dep_Var
# NOTE: this refits `q` on the full target, replacing the fit made on the
# training subset; later uses of `q` see the full-data fit.
full_y_gauss = pd.Series(q.fit_transform(full_y.values.reshape(-1, 1)).reshape(-1))
d_full_train = lgb.Dataset(full_X.values.astype(np.float32), label=full_y_gauss)

# Model 1: depth 2, full_nboost rounds.
params['max_depth'] = 2
full_reg = lgb.train(params, d_full_train, full_nboost)
test_pred_gauss = full_reg.predict(X_test)

# Model 2: depth 3, fewer rounds; averaged 50/50 with model 1.
params['max_depth'] = 3
full_reg = lgb.train(params, d_full_train, full_nboost-n_minus)
alpha = 0.5
test_pred_gauss = alpha*test_pred_gauss + (1-alpha)*full_reg.predict(X_test)

# Model 3: depth 2 with bagging and learning_rate 0.005, more rounds.
params['max_depth'] = 2
params['bagging_freq'] = 5
params['bagging_fraction'] = 0.78
params['learning_rate'] = .005
full_reg = lgb.train(params, d_full_train, full_nboost+n_plus)
# Final weights: 0.39 / 0.39 / 0.22 for models 1 / 2 / 3.
alpha = 0.78
test_pred_gauss = alpha*test_pred_gauss + (1-alpha)*full_reg.predict(X_test)


# Map the blended prediction back to the raw target scale and round to integers.
test_pred_uniform = q.inverse_transform(test_pred_gauss.reshape(-1, 1)).reshape(-1)
# Builtin int replaces the np.int alias, which was removed in NumPy 1.24.
test_pred = np.round(test_pred_uniform).astype(int)
spearmanr(solution.Dep_Var, test_pred)

SpearmanrResult(correlation=0.10933380428545146, pvalue=6.97545350538751e-35)

(Apparently the competition's test criterion was slightly different and somehow took consistency into account.  Hence the value here differs from the one reported for the competition, but it should still be generally indicative of the relative result.)

### Simulated test result without bug

In [80]:
# Full-data blend with `params` rebuilt from scratch, so models 1 and 2 are
# trained without bagging and with learning_rate 0.004 -- the "without bug"
# configuration.  Reuses d_full_train and the `q` fitted on the full target.
params = {}
params['learning_rate'] = 0.004 # 0.004 # shrinkage_rate
params['sub_feature'] = 0.35    # feature_fraction (small values => use very different submodels)
params['min_data'] = 500        # min_data_in_leaf
params['max_depth'] = 2
# Model 1: depth 2, full_nboost rounds.
full_reg = lgb.train(params, d_full_train, full_nboost)
test_pred_gauss = full_reg.predict(X_test)

# Model 2: depth 3, fewer rounds; averaged 50/50 with model 1.
params['max_depth'] = 3
full_reg = lgb.train(params, d_full_train, full_nboost-n_minus)
alpha = 0.5
test_pred_gauss = alpha*test_pred_gauss + (1-alpha)*full_reg.predict(X_test)

# Model 3: depth 2 with bagging and learning_rate 0.005, more rounds.
params['max_depth'] = 2
params['bagging_freq'] = 5
params['bagging_fraction'] = 0.78
params['learning_rate'] = .005
full_reg = lgb.train(params, d_full_train, full_nboost+n_plus)
# Final weights: 0.39 / 0.39 / 0.22 for models 1 / 2 / 3.
alpha = 0.78
test_pred_gauss = alpha*test_pred_gauss + (1-alpha)*full_reg.predict(X_test)


# Map the blended prediction back to the raw target scale and round to integers.
test_pred_uniform = q.inverse_transform(test_pred_gauss.reshape(-1, 1)).reshape(-1)
# Builtin int replaces the np.int alias, which was removed in NumPy 1.24.
test_pred = np.round(test_pred_uniform).astype(int)
spearmanr(solution.Dep_Var, test_pred)

SpearmanrResult(correlation=0.10892773890449726, pvalue=1.2332473701811744e-34)

So it looks like the bug would have made the validation result worse but made the test result better.  "Carelessness as a form of regularization" is a thing.