In [1]:
# Last value of the `time` column (inclusive) that goes into the training split.
# Rows with a later `time` (but below 34) form the validation split, and
# submission files are only written when this constant equals 33.
LAST_TRAINSET_TIME = 33

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
# Wall-clock start of the run; the last cell subtracts it from time() to report total runtime.
totalstart=time()
from sklearn.metrics import mean_squared_error as MSE

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor, plot_importance
from sklearn.model_selection import ParameterGrid

# Show up to 100 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 100)

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings from pandas / xgboost. Consider narrowing the
# filter to specific categories or modules.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Development frame: feature columns plus the `count` target, split into
# train / validate further down.
# NOTE: unpickling can execute arbitrary code - only load pickle files that this
# project produced itself.
# (Append `.sample(frac=0.001)` to the next line for a quick smoke run.)
data = pd.read_pickle('data/ensemble_develop.p')#.sample(frac=0.001)
# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
data.info()
# Frame the fitted models predict on when submission files are written.
submit_data = pd.read_pickle('data/ensemble_submit.p')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 792288 entries, 0 to 792287
Data columns (total 10 columns):
item                792288 non-null int16
shop                792288 non-null int8
time                792288 non-null int8
XGB all             792288 non-null float32
XGB minus cheaps    792288 non-null float32
GBR all             792288 non-null float64
GBR minus cheaps    792288 non-null float64
RF all              792288 non-null float64
RF minus cheaps     792288 non-null float64
count               792288 non-null int16
dtypes: float32(2), float64(4), int16(2), int8(2)
memory usage: 40.8 MB
None


In [3]:
# Estimator class instantiated once per parameter combination.
model_class = XGBRegressor

# Search grid: every key maps to the list of candidate values that
# ParameterGrid expands into individual combinations.
params = dict(
    # runtime, objective and reproducibility
    n_jobs=[-1],
    objective=['reg:squarederror'],
    seed=[123],

    # boosting schedule (early stopping is currently not part of the grid)
    learning_rate=[0.05],
    n_estimators=[330, 60],
    # early_stopping_rounds=[50],

    # row / column subsampling
    subsample=[0.6],
    colsample_bytree=[0.9],

    # tree shape and regularisation
    max_depth=[1],
    min_child_weight=[100],
    reg_alpha=[0.5],
    reg_lambda=[0.5],
)

In [4]:
# --- Split into train / validate / submission inputs -------------------------
# Rows with `time` <= LAST_TRAINSET_TIME are trained on; later rows below 34 are
# held out. With LAST_TRAINSET_TIME == 33 and an integer `time` column the
# hold-out frame is empty, which is why no validation score is recorded below.
train = data[data['time'] <= LAST_TRAINSET_TIME]
validate = data[(data['time'] > LAST_TRAINSET_TIME) & (data['time'] < 34)]

train_X = train.drop('count', axis=1)
train_y = train['count']
del train
validate_X = validate.drop('count', axis=1)
validate_y = validate['count']
del validate
predict_X = submit_data.drop('count', axis=1)
# Free the source frames. Re-running this cell therefore requires re-running
# the loading cell first.
del data
del submit_data

# --- Grid search ---------------------------------------------------------------
# Keys of `params` that are forwarded to the model constructor.
model_param_keys = (
    'n_jobs', 'objective', 'seed',
    'n_estimators',
    'subsample', 'colsample_bytree',
    'learning_rate', 'max_depth', 'min_child_weight',
    'reg_alpha', 'reg_lambda',
)

# Submission files are only produced when training used every step up to 33.
write_submission = LAST_TRAINSET_TIME == 33
# The test index does not depend on the parameter combination: read it once
# instead of once per iteration.
test_index = pd.read_csv('data/test.csv') if write_submission else None

# One record per parameter combination; turned into the `results` frame at the end.
result_rows = []

for n, combo in enumerate(ParameterGrid(params), start=1):
    print(n)

    model_params = {key: combo[key] for key in model_param_keys}
    model = model_class(**model_params)

    fit_params = {}
    if model_class == XGBRegressor:
        fit_params = {
            #'early_stopping_rounds': combo['early_stopping_rounds']
        }
        #eval_set = [(train_X, train_y), (validate_X, validate_y)]
        #model.fit(train_X, train_y, eval_set=eval_set, verbose=True, **fit_params)
        model.fit(train_X, train_y, verbose=True, **fit_params)
    else:
        model.fit(train_X, train_y)

    # Number of trees behind the predictions. This is recorded for every model
    # class so that all result columns end up with the same length.
    if model_class == XGBRegressor and 'early_stopping_rounds' in fit_params:
        trees_used = model.best_ntree_limit
    else:
        trees_used = model_params['n_estimators']

    # No explicit tree limit is passed: without early stopping predict() uses
    # every fitted tree, which is exactly what ntree_limit=n_estimators selected.
    # `ntree_limit` is deprecated in XGBoost in favour of `iteration_range`.
    # The same call also works for the scikit-learn regressors, so both model
    # families now produce `final_prediction`.
    train_prediction = model.predict(train_X)
    #validate_prediction = model.predict(validate_X)
    final_prediction = model.predict(predict_X)

    row = {key: combo[key] for key in params}
    # RMSE on the training rows; arguments in scikit-learn's (y_true, y_pred) order.
    row['train'] = np.sqrt(MSE(train_y, train_prediction))
    #row['validate'] = np.sqrt(MSE(validate_y, validate_prediction))
    row['trees'] = trees_used
    result_rows.append(row)

    if write_submission:
        # NOTE: the file name depends on n_estimators only, so two combinations
        # sharing that value overwrite each other's submission.
        filename = 'data/submission_{}.csv'.format(model_params['n_estimators'])
        print(filename, model_class, model_params['n_estimators'])

        # rename() returns a new frame, so adding a column below neither touches
        # predict_X nor writes into a slice of it.
        prediction = predict_X[['shop', 'item']].rename(
            columns={'shop': 'shop_id', 'item': 'item_id'}
        )
        prediction['item_cnt_month'] = np.clip(final_prediction, 0, 20)

        # Right join keeps every test row; rows without a prediction become NaN
        # and are then set to 0.
        prediction = pd.merge(prediction, test_index, how='right')
        # Assign the result back: a chained fillna(..., inplace=True) on a column
        # selection is not guaranteed to modify the frame.
        prediction['item_cnt_month'] = prediction['item_cnt_month'].fillna(0) # maybe something better can be done?

        prediction = prediction.sort_values(by='ID')[['ID', 'item_cnt_month']]
        prediction.to_csv(filename, index=False)

results = pd.DataFrame(result_rows, columns=list(params) + ['train', 'trees'])

1
data/submission_330.csv <class 'xgboost.sklearn.XGBRegressor'> 330
2
data/submission_60.csv <class 'xgboost.sklearn.XGBRegressor'> 60


In [5]:
# The validation-based ranking below is disabled: the training loop does not
# record a 'validate' column, so only the training RMSE and the tree count per
# parameter combination are displayed.
#results = results.sort_values(by='validate')
#results['overfit'] = results['validate'] - results['train']
#results['overfit_ratio'] = results['validate'] / results['train']
results

Unnamed: 0,n_jobs,objective,seed,learning_rate,n_estimators,subsample,colsample_bytree,max_depth,min_child_weight,reg_alpha,reg_lambda,train,trees
0,-1,reg:squarederror,123,0.05,330,0.6,0.9,1,100,0.5,0.5,0.723163,330
1,-1,reg:squarederror,123,0.05,60,0.6,0.9,1,100,0.5,0.5,0.761941,60


In [6]:
# Report the wall-clock time since `totalstart`, in minutes and in hours.
elapsed_seconds = time() - totalstart
elapsed_minutes = elapsed_seconds / 60
print('total time:', round(elapsed_minutes, 1), 'min')
print('total time:', round(elapsed_minutes / 60, 1), 'h')

total time: 1.8 min
total time: 0.0 h
