In [1]:
LAST_TRAINSET_TIME = 29 # 29 for develop, 32 for submit

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
totalstart=time()
from sklearn.metrics import mean_squared_error as MSE

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor, plot_importance

pd.set_option('display.max_columns', 100)

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

figsize = (6,14)

def plot_importances(model, error=False, title=''):
    
    importances = model.feature_importances_
    indices = np.argsort(importances)#[::-1]

    fig, ax = plt.subplots(figsize=figsize)
    if error:
        std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)
        plt.barh(range(train_X.shape[1]), importances[indices], xerr=std[indices], align="center")
    else:
        plt.barh(range(train_X.shape[1]), importances[indices], align="center")

    plt.yticks(range(train_X.shape[1]), [train_X.columns[ix] for ix in indices], rotation='horizontal')
    plt.ylim([-1, train_X.shape[1]])
    ax.xaxis.tick_top()
    plt.title(title, y=1.03)
    plt.show()

def plot_xgboost_importances(booster, model_name):
    fig, ax = plt.subplots(1,1,figsize=figsize)
    plot_importance(booster=booster, ax=ax, height=0.8, title=model_name+' - uses', importance_type='weight')
    plt.show()
    fig, ax = plt.subplots(1,1,figsize=figsize)
    plot_importance(booster=booster, ax=ax, height=0.8, title=model_name+' - avg gain', importance_type='gain')
    plt.show()
    fig, ax = plt.subplots(1,1,figsize=figsize)
    plot_importance(booster=booster, ax=ax, height=0.8, title=model_name+' - avg affected samples', importance_type='cover')
    plt.show()

def plot_tree_usefulness(model, validate_X, validate_y, model_name):
    predictions = []
    for tree in model.estimators_:
        predictions.append(tree.predict(validate_X))
        
    predictions = np.vstack(predictions)
    predictions = np.cumsum(predictions, axis=0)
    predictions = [predictions[i]/(i+1) for i in range(len(predictions))] #cum mean

    scores = []
    for pred in predictions:
        scores.append(np.sqrt(MSE(pred, validate_y)))
        
    plt.figure(figsize=(8, 4))
    plt.plot(scores, linewidth=3)
    plt.xlabel('trees')
    plt.ylabel('rmse')
    plt.title(model_name+' tree usefulnesses')
    plt.show()

In [2]:
# 1-cheap, 2-powerful, 3-third, S-shit; all,minus cheaps,harsh 
model_features = [
    'count', # label, dropped later
    'item', # need for prediction S,S,3,
    'shop', # need for prediction S,S,S,
    'time', # need for prediction 3,S,3,3
    
    'shop_city', # S,S,S,3x trash candidate, all trash
    'shop_type', # S,S,S,3x trash candidate, all trash
    'category', # S,S,S,3k trash candidate
    'larger_category', # S,S,S,3x trash candidate, all trash
    'item_return_percent', # S,3,-,
    
    'month', # 3,-,-,3 cheap candidate, old cheap
    #'year', # S,S,S,
    #'month_length', # S,S,S,
    #'holidays_in_month', # S,S,S,
    
    #'shop_mean_count', # S,S,33,3
    'category_mean_count', # S,3,S,
    'item_mean_count', # 3,-,-,2 cheap candidate, old cheap, chosen cheap, best cheap subset
    'shop_item_mean_count', # 1,-,-, cheap candidate, old cheap, chosen cheap, best cheap subset
    'shop_category_mean_count', # S,2,-,2 cheap candidate, best cheap subset
    
    'count_trend', # 3,3,-,3
    'item_count_trend', # 2,2,-, cheap candidate
    'item_price_trend', # S,S,S,2k trash candidate
    'months_since_first_item_sale', # 2,3,-, cheap candidate
    'months_since_last_item_sale', # S3,S3,-,3
    'months_since_first_sale', # 2,2,-, cheap candidate
    'months_since_last_sale', # 3,3,-,3
    
    'item_name_vector_1', # S,S,S32, trash candidate, all trash
    'item_name_vector_2', # S,S,S3, trash candidate, all trash
    'item_name_vector_3', # S,3,33, trash candidate, all trash
    #'item_name_tSNE_vector_1', # S,S,S32,
    #'item_name_tSNE_vector_2', # S,S,S32,
    'all_data_PCA_vector_1', # S,S,1131, trash candidate
    'all_data_PCA_vector_2', # S,S,223, trash candidate
    'all_data_PCA_vector_3', # S,3,223, trash candidate, all trash
    'all_data_PCA_vector_4', # S,S,322, trash candidate, all trash
    
    #'shop_price_lag_1', # S,S,S,
    #'category_price_lag_1', # S,S,S,
    #'shop_revenue_lag_1', # S,S,
    #'category_revenue_lag_1', # S,S,S,
    'item_revenue_lag_1', # S,S,2 trash candidate
    
    'count_lag_1', # 1,-,-,1 cheap candidate, old cheap, best cheap subset
    'count_lag_2', # S,-,-,2 cheap candidate, old cheap, chosen cheap
    'count_lag_3', # S,-,-,2 cheap candidate, old cheap, chosen cheap, best cheap subset
    #'count_lag_4', # S,S,2211,
    #'count_lag_5', # S,S,3,3x
    #'count_lag_6', # S,S,3,2
    #'count_lag_7', # S,S,SS33,3
    #'count_lag_8', # S,S,32,2
    #'count_lag_9', # S,S,S,33
    #'count_lag_10', # S,S,S,
    #'count_lag_11', # S,S,S,
    #'count_lag_12', # S,S,S,3
    'count_12_mean', # 3,1,-, cheap candidate, best cheap subset
    'count_6_mean', # 2,-,-, cheap candidate, old cheap
    'count_3_mean', # 1,-,-, cheap candidate, old cheap
    
    'item_count_lag_1', # 3,-,-,21 cheap candidate, old cheap
    #'item_count_lag_2', # S,S,2213,
    #'item_count_lag_3', # S,S,3,3
    #'item_count_lag_4', # 
    #'item_count_lag_5', # 
    'item_count_lag_6', # S,S,SS3, trash candidate
    #'item_count_lag_7', # 
    #'item_count_lag_8', # 
    #'item_count_lag_9', # 
    #'item_count_lag_10', # 
    #'item_count_lag_11', # 
    'item_count_lag_12', # S,S,S,3 trash candidate, all trash
    'item_count_12_mean', # 3,3,S, cheap candidate
    'item_count_6_mean', # 3,S,2212 cheap candidate, best cheap subset
    'item_count_3_mean', # S,2,-,
    
    #'shop_count_lag_1', # S,S,S,
    #'shop_count_lag_2', # S,S,S,
    #'shop_count_lag_3', # S,S,S,
    #'shop_count_lag_4', # 
    #'shop_count_lag_5', # 
    #'shop_count_lag_6', # S,S,S,
    #'shop_count_lag_7', # 
    #'shop_count_lag_8', # 
    #'shop_count_lag_9', # 
    #'shop_count_lag_10', # 
    #'shop_count_lag_11', # 
    'shop_count_lag_12', # S,S,S,32x
    #'shop_count_12_mean', # S,S,S,
    #'shop_count_6_mean', # S,S,S,
    #'shop_count_3_mean', # S,S,S,
    
    'shop_count_per_item_lag_1', # S,S,SSS2, trash candidate
    #'shop_count_per_item_lag_2', # S,S,S,
    #'shop_count_per_item_lag_3', # S,S,S,
    #'shop_count_per_item_lag_4', # 
    #'shop_count_per_item_lag_5', # 
    #'shop_count_per_item_lag_6', # S,S,S,
    #'shop_count_per_item_lag_7', # 
    #'shop_count_per_item_lag_8', # 
    #'shop_count_per_item_lag_9', # 
    #'shop_count_per_item_lag_10', # 
    #'shop_count_per_item_lag_11', # 
    'shop_count_per_item_lag_12', # S,S,33,3
    'shop_count_per_item_12_mean', # S,S,SSS3,3X
    #'shop_count_per_item_6_mean', # S,S,S,
    #'shop_count_per_item_3_mean', # S,S,S,
    
    'category_count_per_item_lag_1', # S,S,3,
    #'category_count_per_item_lag_2', # S,S,SSS2,
    #'category_count_per_item_lag_3', # S,S,S,
    #'category_count_per_item_lag_4', # 
    #'category_count_per_item_lag_5', # 
    #'category_count_per_item_lag_6', # S,S,S,
    #'category_count_per_item_lag_7', # 
    #'category_count_per_item_lag_8', # 
    #'category_count_per_item_lag_9', # 
    #'category_count_per_item_lag_10', # 
    #'category_count_per_item_lag_11', # 
    #'category_count_per_item_lag_12', # S,S,S,
    'category_count_per_item_12_mean', # S,S,S,2x
    'category_count_per_item_6_mean', # S,S,SSS2, trash candidate, all trash
    'category_count_per_item_3_mean', # S,S,SSS2,
    
    'shop_category_count_per_item_lag_1', # S,32,-,2 cheap candidate, best cheap subset
    #'shop_category_count_per_item_lag_2', # S,S,2
    #'shop_category_count_per_item_lag_3', # S,S,S33,
    #'shop_category_count_per_item_lag_4', # 
    #'shop_category_count_per_item_lag_5', # 
    #'shop_category_count_per_item_lag_6', # S,S,S,
    #'shop_category_count_per_item_lag_7', # 
    #'shop_category_count_per_item_lag_8', # 
    #'shop_category_count_per_item_lag_9', # 
    #'shop_category_count_per_item_lag_10', # 
    #'shop_category_count_per_item_lag_11', # 
    #'shop_category_count_per_item_lag_12', # S,S,S,
    'shop_category_count_per_item_12_mean', # S,S,233,
    'shop_category_count_per_item_6_mean', # S,S,2,3 trash candidate, all trash
    'shop_category_count_per_item_3_mean', # S,3,-,2x cheap candidate, best cheap subset
    
    'item_price_lag_1', # S,S,S,3 trash candidate, all trash
    'item_price_lag_2', # S3,S,-,2 trash candidate, all trash
    'item_price_lag_3', # S,S,2,2 trash candidate, all trash
    #'item_price_lag_4', # 
    #'item_price_lag_5', # 
    'item_price_lag_6', # S,S,S,3x trash candidate, all trash
    #'item_price_lag_7', # 
    #'item_price_lag_8', # 
    #'item_price_lag_9', # 
    #'item_price_lag_10', # 
    #'item_price_lag_11', # 
    #'item_price_lag_12', # S,S,S,
    'item_price_12_mean', # 2,2,-, cheap candidate, best cheap subset
    'item_price_6_mean', # S3,3,-,3 cheap candidate
    'item_price_3_mean', # S,S,2,2 
    
    'revenue_lag_1', # S,1,-,3 cheap candidate, chosen cheap, best cheap subset
    #'revenue_lag_2', # S,S,23
    #'revenue_lag_3', # S,S,S,
    #'revenue_lag_4', # 
    #'revenue_lag_5', # 
    #'revenue_lag_6', # S,S,S,
    #'revenue_lag_7', # 
    #'revenue_lag_8', # 
    #'revenue_lag_9', # 
    #'revenue_lag_10', # 
    #'revenue_lag_11', # 
    #'revenue_lag_12', # S,S,S,
    'revenue_12_mean', # S,3,-, thrash candidate, all trash
    #'revenue_6_mean', # S,3,-,
    'revenue_3_mean' # S,2,-,1 cheap candidate
]

trash_candidates = ['shop_city', 'shop_type', 'category', 'larger_category', 'item_price_trend', 'item_name_vector_1', 
                    'item_name_vector_2', 'item_name_vector_3', 'all_data_PCA_vector_1', 'all_data_PCA_vector_2', 
                    'all_data_PCA_vector_3', 'all_data_PCA_vector_4', 'item_revenue_lag_1', 'item_count_lag_6', 
                    'item_count_lag_12', 'shop_count_per_item_lag_1', 'category_count_per_item_6_mean', 
                    'shop_category_count_per_item_6_mean', 'item_price_lag_1', 'item_price_lag_2', 'item_price_lag_3', 
                    'revenue_12_mean'] # 22

cheap_candidates = ['month', 'item_mean_count', 'shop_item_mean_count', 'shop_category_mean_count', 'item_count_trend', 
                    'months_since_first_item_sale', 'months_since_first_sale', 'count_lag_1', 'count_lag_2', 
                    'count_lag_3', 'count_12_mean', 'count_6_mean', 'count_3_mean', 'item_count_lag_1', 
                    'item_count_12_mean', 'item_count_6_mean', 'shop_category_count_per_item_lag_1', 
                    'shop_category_count_per_item_3_mean', 'item_price_12_mean', 'item_price_6_mean', 'revenue_lag_1', 
                    'revenue_3_mean'] # 22 (9 old cheap)

data = pd.read_pickle('data/features_develop_all.p')#[model_features]#.sample(frac=0.001)
print(data.info(max_cols=1))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4555656 entries, 0 to 4555655
Columns: 47 entries, count to revenue_3_mean
dtypes: float16(23), float32(8), int16(8), int8(8)
memory usage: 443.2 MB
None


In [3]:
presets = [
    
    {
        'model_name' : '',
        'model_class' : XGBRegressor,
        'model_params' : {
            'n_estimators':50
        },
        'fit_params' : {
        }
    }
    
]

In [4]:
### Development model training
results = {'fraction': [], 'removed': [], 'train': [], 'validate': []}

for fraction in [0.7, 0.5]:
    
    for i in range(30):
        
        droppable_features = list(set(data.columns) - set(['count', 'item', 'shop', 'time']))
        
        to_drop = np.random.choice(droppable_features, int(fraction*len(droppable_features)), False)
        subdata = data.drop(to_drop, axis=1)

        # Data split

        train = subdata[subdata['time'] <= LAST_TRAINSET_TIME]
        validate = subdata[(subdata['time'] > LAST_TRAINSET_TIME) & (subdata['time'] < 34)]

        train_X = train.drop('count', axis=1)
        train_y = train['count']
        del train
        validate_X = validate.drop('count', axis=1)
        validate_y = validate['count']
        del validate
        predict = subdata[subdata['time'] == 34]
        predict_X = predict.drop('count', axis=1)
        del predict
        del subdata
    
        for preset in presets:

            model_name = preset['model_name']
            model_class = preset['model_class']
            model_params = preset['model_params']
            show_error_bars = model_class == RandomForestRegressor or model_class == ExtraTreesRegressor

            #print('Model:', model_name)

            # Train

            start = time()
            model = model_class(**model_params)

            if model_class == XGBRegressor:
                fit_params = preset['fit_params']
                eval_set = [(train_X, train_y), (validate_X, validate_y)]
                model.fit(train_X, train_y, eval_set=eval_set, verbose=False, **fit_params)
            else:
                model.fit(train_X, train_y)

            preset['model'] = model
            #print('({} min)'.format(round((time()-start)/60, 1)))

            #if 'n_iter_no_change' in model_params.keys():
                #print('Model contains', len(model.estimators_), 'trees')
            if model_class == XGBRegressor:
                if 'early_stopping_rounds' in fit_params.keys():
                    #print('Model contains', model.best_iteration+1, 'trees')
                    ntree_limit = model.best_ntree_limit
                else:
                    ntree_limit = model_params['n_estimators']

            # Validate

            if model_class == XGBRegressor:
                train_prediction = model.predict(train_X, ntree_limit=ntree_limit)
                validate_prediction = model.predict(validate_X, ntree_limit=ntree_limit)
            else:
                train_prediction = model.predict(train_X)
                validate_prediction = model.predict(validate_X)

            #print('train RMSE: {}'.format(np.sqrt(MSE(train_prediction, train_y))))
            #print('validation RMSE: {}'.format(np.sqrt(MSE(validate_prediction, validate_y))))

            # Analysis

            #plot_importances(model, error=show_error_bars, title=model_name)
            #if model_class == XGBRegressor:
            #    plot_xgboost_importances(model, model_name)

            #if model_class == RandomForestRegressor:
            #    plot_tree_usefulness(model, validate_X, validate_y, model_name)

            #print()

            results['removed'].append(to_drop)
            results['fraction'].append(fraction)
            results['train'].append(np.sqrt(MSE(train_prediction, train_y)))
            results['validate'].append(np.sqrt(MSE(validate_prediction, validate_y)))

results = pd.DataFrame(results)





In [5]:
results['overfit'] = results['validate'] - results['train']
results['overfit_ratio'] = results['validate'] / results['train']
results = results.sort_values(by='overfit_ratio')
results

Unnamed: 0,fraction,removed,train,validate,overfit,overfit_ratio
21,0.7,"[item_count_trend, revenue_3_mean, item_price_...",0.869541,0.875875,0.006334,1.007284
22,0.7,"[item_count_lag_1, item_mean_count, count_lag_...",0.844315,0.853703,0.009388,1.011119
16,0.7,"[revenue_lag_1, count_lag_1, item_count_lag_1,...",0.770595,0.781815,0.01122,1.014561
19,0.7,"[all_data_PCA_vector_1, count_12_mean, count_t...",0.867713,0.880406,0.012693,1.014628
29,0.7,"[item_return_percent, months_since_first_item_...",0.835218,0.847949,0.012731,1.015243
28,0.7,"[shop_category_mean_count, months_since_last_i...",0.825279,0.837924,0.012645,1.015322
49,0.5,"[months_since_first_sale, item_count_3_mean, a...",0.822946,0.836392,0.013446,1.016339
55,0.5,"[item_count_trend, count_lag_3, item_mean_coun...",0.827648,0.842605,0.014957,1.018072
51,0.5,"[all_data_PCA_vector_1, count_lag_3, count_6_m...",0.8147,0.830594,0.015893,1.019508
12,0.7,"[category_mean_count, all_data_PCA_vector_1, c...",0.841027,0.861196,0.020169,1.023981


In [2]:
#[list(arr) for arr in results['removed'][:1]]
#print(sorted(results['removed'][16]))

In [7]:
print('total time:', round((time()-totalstart)/60, 1), 'min')
print('total time:', round((time()-totalstart)/60/60, 1), 'h')

total time: 412.8 min
total time: 6.9 h
