In [1]:
# Last time index that belongs to the training set. Rows with a later time
# index below 34 form the validation set; rows with time == 34 are predicted.
# When set to 32 the time-34 predictions are exported; with any other value
# (the original note mentions 29) the validation predictions are exported.
LAST_TRAINSET_TIME = 32 # 29 and 32

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
# Wall-clock start of the run; read further down to report the total run time.
totalstart=time()
from sklearn.metrics import mean_squared_error as MSE

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from xgboost import XGBRegressor, plot_importance

%matplotlib inline

import warnings
# Suppresses every warning for the rest of the session.
# NOTE(review): this also hides pandas/sklearn/xgboost warnings that may point
# at real problems — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Load the two pre-computed feature tables used by the models below.
# Manual toggles kept from the original cell:
#   - file names: features_develop_* / features_submit_*
#   - append .sample(frac=0.001) to a read_pickle call for a quick run on a tiny subset
# NOTE: unpickling can execute arbitrary code — only load pickle files produced
# by this project, never files from an untrusted source.
all_data = pd.read_pickle('data/features_submit_all.p')#.sample(frac=0.001) #features_develop/submit
minus_cheaps_data = pd.read_pickle('data/features_submit_minus_cheaps.p')#.sample(frac=0.001) #features_develop/submit

# DataFrame.info() prints the summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line after each summary.
all_data.info(max_cols=1)
minus_cheaps_data.info(max_cols=1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4555656 entries, 0 to 4555655
Columns: 47 entries, count to revenue_3_mean
dtypes: float16(23), float32(8), int16(8), int8(8)
memory usage: 443.2 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4555656 entries, 0 to 4555655
Columns: 47 entries, count to revenue_3_mean
dtypes: float16(23), float32(9), int16(6), int8(9)
memory usage: 447.5 MB
None


In [3]:
# Model zoo: each entry names a model, the estimator class to instantiate,
# the feature table it is trained on and the constructor keyword arguments.
# Settings identical for both variants of a model family are kept in a shared
# base dict; every entry gets its own fresh `params` dict built from it.

# Shared by both XGBoost configurations.
_XGB_BASE = dict(
    n_jobs=-1,
    verbosity=0,
    objective='reg:squarederror',
    seed=123,
    n_estimators=1000,
    learning_rate=0.1,
    min_child_weight=50,
)

# Shared by both GradientBoostingRegressor configurations.
_GBR_BASE = dict(
    verbose=0,
    random_state=123,
    learning_rate=0.3,
)

# Shared by both RandomForestRegressor configurations.
_RF_BASE = dict(
    n_jobs=-1,
    random_state=123,
    n_estimators=300,
)

models = [
    dict(
        name='XGB all',
        model=XGBRegressor,
        data=all_data,
        params=dict(_XGB_BASE, subsample=0.7, colsample_bytree=0.15, max_depth=2),
    ),
    dict(
        name='XGB minus cheaps',
        model=XGBRegressor,
        data=minus_cheaps_data,
        params=dict(_XGB_BASE, subsample=0.8, colsample_bytree=0.8, max_depth=4),
    ),
    dict(
        name='GBR all',
        model=GradientBoostingRegressor,
        data=all_data,
        params=dict(
            _GBR_BASE,
            n_estimators=150,
            n_iter_no_change=20,
            tol=0.005,
            subsample=0.7,
            max_features=0.1,
            max_depth=2,
            min_samples_leaf=500,
        ),
    ),
    dict(
        name='GBR minus cheaps',
        model=GradientBoostingRegressor,
        data=minus_cheaps_data,
        params=dict(
            _GBR_BASE,
            n_estimators=500,
            subsample=0.9,
            max_features=0.85,
            max_depth=4,
            min_samples_leaf=750,
        ),
    ),
    dict(
        name='RF all',
        model=RandomForestRegressor,
        data=all_data,
        params=dict(_RF_BASE, max_features=0.15, max_depth=13, min_samples_leaf=1000),
    ),
    dict(
        name='RF minus cheaps',
        model=RandomForestRegressor,
        data=minus_cheaps_data,
        params=dict(_RF_BASE, max_features=0.25, max_depth=11, min_samples_leaf=500),
    ),
]

In [4]:
# Fit every configured model, print its train / validation RMSE and collect the
# predictions of all models side by side in `output`, keyed by (item, shop, time).
# When LAST_TRAINSET_TIME == 32 the time-34 predictions are collected, otherwise
# the validation-window predictions are collected.
output = None

# The config dict is bound to `spec` so that `model` always means the estimator
# (the original loop reused the name `model` for both).
for spec in models:

    name = spec['name']
    model_class = spec['model']
    data = spec['data']
    model_params = spec['params']

    model = model_class(**model_params)

    # Split by time index: training window, validation window, rows to predict.
    train = data[data['time'] <= LAST_TRAINSET_TIME]
    validate = data[(data['time'] > LAST_TRAINSET_TIME) & (data['time'] < 34)]
    predict = data[data['time'] == 34]
    del data  # drops only the local name; the frame is still referenced by `models`

    # Intermediate frames are deleted as soon as they are no longer needed to
    # limit peak memory while several large copies are alive.
    train_X = train.drop('count', axis=1)
    train_y = train['count']
    del train
    validate_X = validate.drop('count', axis=1)
    validate_y = validate['count']
    validate_join = validate[['item', 'shop', 'time', 'count']]
    del validate
    predict_X = predict.drop('count', axis=1)
    predict_join = predict[['item', 'shop', 'time', 'count']]
    del predict

    if model_class is XGBRegressor:
        # Early stopping is always enabled for XGBoost; per the XGBoost docs the
        # last eval_set entry (the validation window) is the one monitored.
        # NOTE(review): the validation RMSE printed below is therefore measured
        # on the same rows that selected the number of trees.
        model.fit(
            train_X, train_y,
            eval_set=[(train_X, train_y), (validate_X, validate_y)],
            early_stopping_rounds=50,
            verbose=False,
        )
        # Predict with the tree count picked by early stopping. The former
        # fallback `ntree_limit = 50` could never run and has been removed.
        predict_kwargs = {'ntree_limit': model.best_ntree_limit}
    else:
        model.fit(train_X, train_y)
        predict_kwargs = {}

    train_prediction = model.predict(train_X, **predict_kwargs)
    validate_prediction = model.predict(validate_X, **predict_kwargs)
    predict_prediction = model.predict(predict_X, **predict_kwargs)

    del train_X
    del validate_X
    del predict_X

    # RMSE on train and validation; arguments follow sklearn's (y_true, y_pred)
    # order (the value is the same either way because the error is squared).
    print(name, np.sqrt(MSE(train_y, train_prediction)), np.sqrt(MSE(validate_y, validate_prediction)))

    # .copy() yields an independent frame, so adding the prediction column is
    # no longer an assignment on a slice of predict_join / validate_join.
    if LAST_TRAINSET_TIME == 32:
        this_output = predict_join[['item', 'shop', 'time']].copy()
        this_output[name] = predict_prediction
    else:
        this_output = validate_join[['item', 'shop', 'time']].copy()
        this_output[name] = validate_prediction

    if output is None:
        output = this_output
    else:
        output = pd.merge(output, this_output, how='inner', on=['item', 'shop', 'time'])

if output is None:
    raise ValueError('`models` is empty: no model was trained, nothing to collect')

# Attach the 'count' column of the collected rows. predict_join / validate_join
# are the frames left over from the last loop iteration, i.e. they come from
# the data of the last entry in `models`.
if LAST_TRAINSET_TIME == 32:
    output = pd.merge(output, predict_join, how='inner', on=['item', 'shop', 'time'])
else:
    output = pd.merge(output, validate_join, how='inner', on=['item', 'shop', 'time'])

XGB all 0.70163244 0.82552725
XGB minus cheaps 0.6720879 0.7830488
GBR all 0.6917166184633453 0.8330738430599288
GBR minus cheaps 0.6399271233700499 0.7843077239056717
RF all 0.6756236919868033 0.835198667732695
RF minus cheaps 0.7050390561556447 0.7857661823043347


In [5]:
# Write the merged per-model prediction frame to disk.
ensemble_path = 'data/ensemble_submit.p' # ensemble_develop/submit
output.to_pickle(ensemble_path)

In [6]:
# Report the wall-clock time since `totalstart`, first in minutes, then in hours.
elapsed_minutes = (time() - totalstart) / 60
print(f'total time: {round(elapsed_minutes, 1)} min')
elapsed_hours = (time() - totalstart) / 60 / 60
print(f'total time: {round(elapsed_hours, 1)} h')

total time: 281.2 min
total time: 4.7 h


In [7]:
# Sanity check: each feature table must contain exactly as many collected rows
# as `output`. The collected rows are the time-34 rows when
# LAST_TRAINSET_TIME == 32 and the validation-window rows otherwise; the
# original check always counted the validation window, which is the wrong
# subset when the time-34 predictions were collected.
for features in [all_data, minus_cheaps_data]:
    if LAST_TRAINSET_TIME == 32:
        collected_rows = features[features['time'] == 34]
    else:
        collected_rows = features[(features['time'] > LAST_TRAINSET_TIME) & (features['time'] < 34)]
    print(len(collected_rows)) # have to be same and same as output length
    # Enforce the invariant instead of relying on a visual comparison.
    assert len(collected_rows) == len(output), 'row count differs from output length'

output.info()

198072
198072
<class 'pandas.core.frame.DataFrame'>
Int64Index: 198072 entries, 0 to 198071
Data columns (total 10 columns):
item                198072 non-null int16
shop                198072 non-null int8
time                198072 non-null int8
XGB all             198072 non-null float32
XGB minus cheaps    198072 non-null float32
GBR all             198072 non-null float64
GBR minus cheaps    198072 non-null float64
RF all              198072 non-null float64
RF minus cheaps     198072 non-null float64
count               198072 non-null int16
dtypes: float32(2), float64(4), int16(2), int8(2)
memory usage: 10.2 MB


In [8]:
# Preview the first rows of the merged frame: key columns, one prediction
# column per model, and 'count'.
output.head(20)

Unnamed: 0,item,shop,time,XGB all,XGB minus cheaps,GBR all,GBR minus cheaps,RF all,RF minus cheaps,count
0,30,2,34,0.015962,0.065874,-0.068465,0.032309,0.08898,0.053635,0
1,31,2,34,0.166572,0.253893,0.038145,0.140293,0.154328,0.249916,0
2,32,2,34,0.248509,0.228818,0.353395,0.214236,0.373704,0.228383,0
3,33,2,34,0.151981,0.240572,0.126416,0.223524,0.274846,0.242289,0
4,38,2,34,-0.073268,-0.01156,-0.033774,0.001381,0.004396,0.001747,0
5,42,2,34,-0.017594,-0.014132,-0.031449,-0.023249,0.001431,0.000225,0
6,45,2,34,-0.072535,-0.018087,-0.048905,0.00052,0.001405,0.000225,0
7,51,2,34,-0.025675,-0.014313,-0.031449,0.000931,0.001405,0.000225,0
8,53,2,34,0.063994,0.030665,-0.021976,-0.014272,0.028433,0.043233,0
9,57,2,34,-0.072535,-0.016165,-0.044253,-0.009406,0.00143,0.000388,0
