In [1]:
# Largest value of the `time` column that still belongs to the training split;
# rows with a later `time` are used for validation / prediction further down.
LAST_TRAINSET_TIME = 29

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
# Wall-clock start of the whole run. The final cell subtracts this from time()
# to report total runtime; it is taken before the sklearn/xgboost imports, so
# their import time is included in that figure.
totalstart=time()
from sklearn.metrics import mean_squared_error as MSE

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor, plot_importance
from sklearn.model_selection import ParameterGrid

# Let pandas show up to 100 columns before truncating a displayed DataFrame.
pd.set_option('display.max_columns', 100)

%matplotlib inline

import warnings
# Suppress every warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation and data-quality warnings raised by
# the libraries used below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Load the pre-built feature table and keep a 0.1% random sample of its rows.
# The sample is seeded so that Restart & Run All trains on the same rows every
# time (an unseeded sample makes the recorded scores impossible to reproduce).
# NOTE(review): the saved output of this cell describes the full, unsampled
# frame, so the sampling looks like a later quick-run shortcut — confirm which
# variant the recorded scores belong to.
# NOTE(review): read_pickle unpickles arbitrary objects; only load trusted files.
data = pd.read_pickle('data/features_develop_minus_cheaps.p').sample(frac=0.001, random_state=123)

# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
data.info(max_cols=1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4555656 entries, 0 to 4555655
Columns: 47 entries, count to revenue_3_mean
dtypes: float16(23), float32(9), int16(6), int8(9)
memory usage: 447.5 MB
None


```
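- all XGB (scores are RMSE; rows with parentheses appear to read validate (train), tab-separated rows read train, validate, overfit, overfit_ratio)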
baseline 0.830 (0.621) (104 trees)
depth, row+col sample baseline 0.826 (0.577) (52 trees)
lr 0.05 0.820 (0.654)
lr 0.02 0.822 (0.628) (421 trees)
fixed lr, depth, min_child_weight (0.817) (0.692) 0.124591 1.179991
tuned 0.720054	0.783156	0.063102	1.087636

params = {
    'n_jobs': [-1],
    'objective': ['reg:squarederror'],
    'seed': [123],

    'n_estimators': [1000],
    'early_stopping_rounds': [50],
    
    'subsample': [0.7],
    'colsample_bytree': [0.15],
    
    'learning_rate': [0.1],
    'max_depth': [2],
    'min_child_weight': [50]
}

- cheaps XGB (columns: trees, train, validate, overfit, overfit_ratio)
baseline 127	0.684306	0.748132	0.063827	1.093272

params = {
    'n_jobs': [-1],
    'objective': ['reg:squarederror'],
    'seed': [123],
    
    'n_estimators': [10000],
    'early_stopping_rounds': [50],
    
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    
    'learning_rate': [0.1],
    'max_depth': [4],
    'min_child_weight': [50]
}

- cheaps GBR

params = {
    'verbose': [1],
    'random_state': [123],
    
    'n_estimators': [500],
    
    'subsample': [0.9],
    'max_features': [0.85],
    
    'learning_rate': [0.3],
    'max_depth': [4],
    'min_samples_leaf': [750]
}

- all GBR

params = {
    'verbose': [1],
    'random_state': [123],
    
    'n_estimators': [150],
    'n_iter_no_change': [20],
    'tol': [0.005],
    
    'subsample': [0.7],
    'max_features': [0.1],
    
    'learning_rate': [0.3],
    'max_depth': [2],
    'min_samples_leaf': [500]
}

- all RF

params = {
    'n_jobs': [-1],
    'random_state': [123],
    'n_estimators': [150],
    
    'max_features': [0.15],
    'max_depth': [13],
    'min_samples_leaf': [1000]
}

- cheaps RF

params = {
    'n_jobs': [-1],
    'random_state': [123],
    'n_estimators': [150],
    
    'max_features': [0.25],
    'max_depth': [11],
    'min_samples_leaf': [500]
}

```

In [3]:
# Time period held out for prediction. Validation covers the periods strictly
# between LAST_TRAINSET_TIME and this one. Named once here so the validation
# upper bound and the prediction filter cannot drift apart.
PREDICT_TIME = 34

# Chronological split on the `time` column.
train = data[data['time'] <= LAST_TRAINSET_TIME]
validate = data[(data['time'] > LAST_TRAINSET_TIME) & (data['time'] < PREDICT_TIME)]

# `count` is the regression target; every other column is passed on as a feature.
# Intermediate frames are deleted as soon as X / y are extracted to limit memory use.
train_X = train.drop('count', axis=1)
train_y = train['count']
del train

validate_X = validate.drop('count', axis=1)
validate_y = validate['count']
del validate

# Rows of the prediction period: features only, no target is extracted.
predict = data[data['time'] == PREDICT_TIME]
predict_X = predict.drop('count', axis=1)
del predict

# The full frame is dropped as well, so this cell can only be re-run after the
# data-loading cell has been executed again.
del data

# Estimator class to tune and its search grid. Every value is a list because
# the grid is expanded with ParameterGrid in the search loop; single-element
# lists pin that setting to one value.
model_class = RandomForestRegressor
params = dict(
    n_jobs=[-1],
    random_state=[123],
    n_estimators=[150],

    max_features=[0.25],
    max_depth=[11],
    min_samples_leaf=[500],
)

# Grid search over `params` for `model_class`.
#
# For every parameter combination a fresh model is fitted on the training split
# and scored (RMSE) on both the training and the validation split. The outcome
# is `results`, a DataFrame with one row per combination: the grid values, the
# number of trees actually built (boosting models only), and the two scores.

# Grid keys that belong to fit() rather than to the estimator constructor.
FIT_ONLY_KEYS = {'early_stopping_rounds'}

is_xgb = model_class is XGBRegressor
# Only the boosting models report a tree count, so only they get a 'trees' column.
reports_trees = model_class in (XGBRegressor, GradientBoostingRegressor)

records = []
for n, combo in enumerate(ParameterGrid(params), start=1):
    print(n)  # progress: 1-based index of the current combination

    # Derive constructor / fit arguments from the grid itself, so switching
    # `model_class` and `params` to another estimator needs no edits in here.
    model_params = {key: value for key, value in combo.items() if key not in FIT_ONLY_KEYS}
    fit_params = {key: value for key, value in combo.items() if key in FIT_ONLY_KEYS}

    model = model_class(**model_params)
    predict_kwargs = {}
    # Iterate `params` (not `combo`) to keep the grid's own column order.
    record = {key: combo[key] for key in params}

    if is_xgb:
        # NOTE(review): fit(early_stopping_rounds=...), best_ntree_limit and
        # predict(ntree_limit=...) are the legacy XGBoost scikit-learn API —
        # confirm they are still available in the installed XGBoost version.
        eval_set = [(train_X, train_y), (validate_X, validate_y)]
        model.fit(train_X, train_y, eval_set=eval_set, **fit_params)

        if 'early_stopping_rounds' in fit_params or 'n_estimators' not in model_params:
            ntree_limit = model.best_ntree_limit
        else:
            ntree_limit = model_params['n_estimators']

        # Predict with the same number of trees that is reported in the results.
        predict_kwargs['ntree_limit'] = ntree_limit
        record['trees'] = ntree_limit
    else:
        model.fit(train_X, train_y)
        if model_class is GradientBoostingRegressor:
            # One train_score_ entry per boosting stage that was fitted.
            record['trees'] = len(model.train_score_)

    train_prediction = model.predict(train_X, **predict_kwargs)
    validate_prediction = model.predict(validate_X, **predict_kwargs)

    # mean_squared_error expects (y_true, y_pred); the value is the same either
    # way, but the conventional order avoids surprises if the metric is swapped.
    record['train'] = np.sqrt(MSE(train_y, train_prediction))
    record['validate'] = np.sqrt(MSE(validate_y, validate_prediction))
    records.append(record)

# Fix the column order explicitly so an empty grid still yields a well-formed frame.
result_columns = list(params) + (['trees'] if reports_trees else []) + ['train', 'validate']
results = pd.DataFrame(records, columns=result_columns)

1
2
3
4


In [4]:
# Rank the combinations by validation score (best first) and add two overfit
# indicators: the absolute gap and the ratio between validation and train error.
results = (
    results
    .sort_values(by='validate')
    .assign(
        overfit=lambda scores: scores['validate'] - scores['train'],
        overfit_ratio=lambda scores: scores['validate'] / scores['train'],
    )
)
results

Unnamed: 0,n_jobs,random_state,n_estimators,max_features,max_depth,min_samples_leaf,train,validate,overfit,overfit_ratio
1,-1,123,150,0.25,11,500,0.706916,0.740716,0.033799,1.047812
2,-1,123,150,0.3,11,500,0.703671,0.741084,0.037413,1.053168
0,-1,123,150,0.2,11,500,0.711375,0.74179,0.030415,1.042755
3,-1,123,150,0.35,11,500,0.701583,0.742957,0.041374,1.058972


In [5]:
# Report the wall-clock time since `totalstart`, first in minutes, then in hours.
elapsed_minutes = (time() - totalstart) / 60
print('total time:', round(elapsed_minutes, 1), 'min')

elapsed_hours = (time() - totalstart) / 60 / 60
print('total time:', round(elapsed_hours, 1), 'h')

total time: 83.3 min
total time: 1.4 h
