In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score as AUC
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import ParameterGrid

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_rows', 200)

In [2]:
# all features with big XGB model importance measure > 0.0012 (188/1136)
best_features = ['ord_2_x_ord_3', 'ord_0_x_ord_3', 'bin_0', 'ord_0_x_ord_2', 'ord_3_x_ord_5', 'ord_5_x_month', 
'bin_2_x_month', 'ord_0_x_month', 'ord_1_x_month', 'ord_0_x_ord_5', 'ord_2_x_month', 'ord_0_x_ord_4', 
'day_x_x_nom_1_Triangle', 'ord_2_x_ord_5', 'bin_2_x_ord_4', 'nom_1_target_mean', 'month_x_nom_4_Bassoon', 
'nom_8_target_mean', 'bin_0_target_mean', 'nom_3_target_mean', 'nom_9_target_mean', 'nom_7_target_mean', 
'ord_1_x_ord_3', 'ord_4_x_ord_5', 'ord_1_x_ord_4', 'day_target_mean', 'day_x_x_nom_3_India', 'bin_0_x_nom_3_lat', 
'bin_0_x_ord_5', 'ord_2', 'bin_1', 'ord_1_x_ord_2', 'month_x_nom_1_Trapezoid', 'bin_2_x_ord_3', 
'ord_4_x_nom_3_Russia', 'nom_5_target_mean', 'month_x_nom_3_Russia', 'ord_2_x_nom_4_wind', 'bin_2_x_ord_2', 
'nom_2_target_mean', 'month_target_mean', 'nom_1_Triangle', 'ord_4_x_nom_4_Bassoon', 'ord_0_x_ord_1', 
'bin_2_x_ord_5', 'bin_2_x_ord_0', 'ord_2_target_mean', 'ord_4_x_day_x', 'bin_2_x_ord_1', 'ord_3_x_nom_4_wind', 
'month_x_day_x', 'bin_2_x_nom_4_Bassoon', 'nom_4_Piano', 'ord_0_x_nom_4_Piano', 'ord_4_x_nom_1_Trapezoid', 
'bin_0_x_month_x', 'ord_3_x_nom_1_Trapezoid', 'nom_6_target_mean', 'ord_0_x_nom_4_Bassoon', 'bin_4_x_nom_4_Bassoon', 
'nom_4_target_mean', 'ord_3', 'ord_2_x_nom_1_Trapezoid', 'ord_3_x_nom_4_Bassoon', 'ord_3_x_nom_3_Russia', 
'bin_4_x_month', 'bin_2_x_bin_4', 'bin_0_x_month', 'ord_3_x_day_x', 'bin_1_x_day', 'day_x_x_nom_4_Bassoon', 
'ord_2_x_day_x', 'ord_2_x_nom_4_Bassoon', 'ord_4_x_nom_2_Lion', 'nom_0_Blue_x_nom_3_India', 'month_x_nom_2_Lion', 
'ord_0_x_nom_4_wind', 'nom_4_wind_x_nom_3_Russia', 'month_x_x_nom_0_Red', 'nom_8_was_na', 'ord_1_x_ord_5', 
'bin_1_x_month_x', 'bin_4_x_nom_0_Blue', 'ord_1_x_nom_2_Lion', 'nom_4_wind_x_nom_2_Lion', 'bin_0_x_ord_3', 
'ord_5_target_mean', 'ord_0_x_day_x', 'bin_4_x_ord_3', 'bin_1_x_nom_4_wind', 'bin_4_x_nom_3_Russia', 
'nom_0_Blue_x_nom_2_Lion', 'nom_1_Polygon_x_nom_4_Bassoon', 'bin_2_x_day_x', 'bin_4_x_day_x', 'ord_5_x_nom_3_Russia', 
'ord_0', 'ord_1_x_nom_4_Bassoon', 'bin_2_was_na', 'ord_4_x_nom_0_Blue', 'ord_2_x_nom_4_Piano', 'day_x_day_y', 
'ord_1_x_day_x', 'bin_2_x_nom_1_Trapezoid', 'ord_5_x_nom_4_Piano', 'bin_2_x_nom_0_Blue', 'ord_3_target_mean', 
'month_x_nom_0_Blue', 'bin_1_x_nom_4_Theremin', 'ord_0_x_nom_3_India', 'ord_1_x_nom_1_Trapezoid', 
'ord_5_x_nom_1_Trapezoid', 'ord_4_x_day_y', 'bin_4_x_ord_4', 'bin_0_x_ord_0', 'ord_4_target_mean', 
'ord_0_x_nom_2_Lion', 'day_x_x_nom_0_Blue', 'bin_2_x_nom_3_lat', 'ord_3_x_nom_4_Piano', 'ord_3_x_nom_0_Blue', 
'nom_9_was_na', 'ord_4_x_nom_4_wind', 'nom_3_Russia_x_nom_4_Bassoon', 'day_x_x_nom_1_Trapezoid', 
'day_y_x_nom_1_Triangle', 'day_y_x_nom_4_Bassoon', 'ord_3_was_na', 'ord_5_x_nom_4_Bassoon', 'bin_0_x_day', 
'bin_4_x_ord_1', 'day_x_x_nom_3_Russia', 'nom_1_Star_x_nom_4_Theremin', 'ord_2_x_nom_2_Lion', 'bin_4_x_nom_4_wind', 
'bin_0_x_ord_4', 'bin_2_x_nom_2_Lion', 'bin_1_x_ord_4', 'ord_2_x_nom_1_Star', 'month_y', 'ord_3_x_nom_2_Lion', 
'bin_4_x_ord_2', 'bin_1_x_ord_5', 'ord_2_x_ord_4', 'nom_1_Trapezoid_x_nom_4_Bassoon', 'nom_7_was_na', 
'month_x_x_nom_3_Finland', 'nom_0_Red_x_nom_4_Theremin', 'day_x_nom_1_Trapezoid', 'day_x_nom_4_Piano', 
'month_y_x_nom_0_Blue', 'nom_5_was_na', 'month_was_na', 'nom_3_Costa Rica_x_nom_4_Bassoon', 
'nom_3_lat_x_nom_1_Triangle', 'bin_3_x_nom_0_Red', 'bin_2_x_nom_3_Russia', 'day_x_nom_4_Bassoon', 
'ord_5_x_nom_3_Costa Rica', 'month_x_nom_4_Piano', 'day_x_x_nom_3_Costa Rica', 'ord_0_x_nom_2_Axolotl', 
'month_x_nom_2_Axolotl', 'nom_1_Trapezoid_x_nom_3_Russia', 'day_x_nom_2_Axolotl', 'ord_4_x_nom_1_Triangle', 
'ord_5_x_nom_3_China', 'nom_0_Red_x_nom_1_Circle', 'ord_4_x_nom_1_Polygon', 'ord_2_was_na', 'ord_5_x_day_x', 
'ord_5_x_nom_3_Finland', 'nom_4_wind_x_month_y', 'bin_1_x_nom_3_India', 'bin_2_x_nom_4_wind', 'ord_0_x_nom_1_Polygon', 
'month_x_day_y', 'month_y_x_nom_0_Red', 'ord_5', 'day_x_nom_1_Star', 'ord_4_x_nom_2_Axolotl', 'ord_0_x_nom_3_Russia', 
'bin_3_x_nom_2_Hamster', 'ord_2_x_nom_3_Russia', 'day_x_x_nom_2_Axolotl', 'month_x_nom_0_Green', 
'nom_0_Green_x_nom_2_Axolotl', 'ord_3_x_day_y']

train = pd.read_pickle('data/train.p')[best_features + ['target', 'id']]#.sample(frac=0.001)
validation = pd.read_pickle('data/validation.p')[best_features + ['target', 'id']]
test = pd.read_pickle('data/test.p')[best_features + ['id']]
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 480000 entries, 0 to 479999
Columns: 190 entries, ord_2_x_ord_3 to id
dtypes: float16(134), float32(8), int16(10), int32(1), int8(37)
memory usage: 165.3 MB


In [3]:
%%time

train_X = train.drop('target', axis=1)
train_y = train['target']
del train
validate_X = validation.drop('target', axis=1)
validate_y = validation['target']
del validation
predict_X = test
del test

"""
model_class = RandomForestRegressor
params = {
    'n_jobs': [-1],
    'random_state': [123],
    'n_estimators': [150],
    
    'max_features': [0.25],
    'max_samples': [0.8],
    
    'max_depth': [None],
    'min_samples_leaf': [50]
}
"""

model_class = XGBRegressor
params = {
    'n_jobs': [-1],
    'random_state': [123],
    'n_estimators': [5000],
    'learning_rate': [0.2],
    
    'max_depth': [1],
    'min_child_weight': [2500],

    'subsample': [0.85],
    'colsample_bytree': [0.55],
    
    'reg_alpha': [0],
    'reg_lambda': [0]
}

results = {}
for key in params.keys():
    results[key] = []
results['train'] = []
results['validate'] = []

if model_class == XGBRegressor:
    results['trees'] = []

n = 0

for combo in ParameterGrid(params):
    n += 1
    print(n)
    
    model_params = {
        'n_jobs': combo['n_jobs'],
        'random_state': combo['random_state'],
        'n_estimators': combo['n_estimators'],
        
        'learning_rate': combo['learning_rate'],
        'max_depth': combo['max_depth'],
        'min_child_weight': combo['min_child_weight'],
        
        'subsample': combo['subsample'],
        'colsample_bytree': combo['colsample_bytree'],
        
        'reg_alpha': combo['reg_alpha'],
        'reg_lambda': combo['colsample_bytree']
    }

    model = model_class(**model_params)

    if model_class == XGBRegressor:
        fit_params = {
            'early_stopping_rounds': 50
        }
        
        eval_set = [(train_X, train_y), (validate_X, validate_y)]
        model.fit(train_X, train_y, eval_set=eval_set, verbose=False, **fit_params)
    else:
        model.fit(train_X, train_y)

    if model_class == XGBRegressor:
        if 'early_stopping_rounds' in fit_params.keys() or 'n_estimators' not in model_params.keys():
            ntree_limit = model.best_ntree_limit
        else:
            ntree_limit = model_params['n_estimators']
        results['trees'].append(ntree_limit)

    if model_class == XGBRegressor:
        train_prediction = model.predict(train_X, ntree_limit=ntree_limit)
        validate_prediction = model.predict(validate_X, ntree_limit=ntree_limit)
    else:
        train_prediction = model.predict(train_X)
        validate_prediction = model.predict(validate_X)

    for key in params.keys():
        results[key].append(combo[key])

    results['train'].append(AUC(train_y, train_prediction))
    results['validate'].append(AUC(validate_y, validate_prediction))

results = pd.DataFrame(results)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
Wall time: 6h 52min 28s


In [4]:
results = results.sort_values(by='validate', ascending=False)
results['overfit'] = results['train'] - results['validate']
results['overfit_ratio'] = results['train'] / results['validate']
results

Unnamed: 0,n_jobs,random_state,n_estimators,learning_rate,max_depth,min_child_weight,subsample,colsample_bytree,reg_alpha,reg_lambda,train,validate,trees,overfit,overfit_ratio
0,-1,123,10000,0.2,1,2500,0.85,0.55,0.0,0.0,0.793271,0.779644,557,0.013627,1.017478
2,-1,123,10000,0.2,1,2500,0.85,0.55,0.0,1.0,0.793271,0.779644,557,0.013627,1.017478
3,-1,123,10000,0.2,1,2500,0.85,0.55,0.0,5.0,0.793271,0.779644,557,0.013627,1.017478
4,-1,123,10000,0.2,1,2500,0.85,0.55,0.0,10.0,0.793271,0.779644,557,0.013627,1.017478
1,-1,123,10000,0.2,1,2500,0.85,0.55,0.0,0.5,0.793271,0.779644,557,0.013627,1.017478
5,-1,123,10000,0.2,1,2500,0.85,0.55,0.5,0.0,0.793223,0.77964,541,0.013583,1.017422
6,-1,123,10000,0.2,1,2500,0.85,0.55,0.5,0.5,0.793223,0.77964,541,0.013583,1.017422
7,-1,123,10000,0.2,1,2500,0.85,0.55,0.5,1.0,0.793223,0.77964,541,0.013583,1.017422
8,-1,123,10000,0.2,1,2500,0.85,0.55,0.5,5.0,0.793223,0.77964,541,0.013583,1.017422
9,-1,123,10000,0.2,1,2500,0.85,0.55,0.5,10.0,0.793223,0.77964,541,0.013583,1.017422
