In [43]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [44]:
def preprocess(df):
    """Drop columns that contain missing values or carry no information.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame holding only the columns of ``df`` that
        have no NaN values and more than one distinct value.
    """
    # Non-inplace dropna: the earlier ``inplace=True`` silently removed columns
    # from the caller's frame as a side effect of calling this function.
    complete = df.dropna(axis=1)
    # nunique() > 1 filters out constant columns; .copy() detaches the result
    # from the intermediate frame.
    return complete.loc[:, complete.nunique() > 1].copy()

In [54]:
import pandas as pd
from src.utils import get_fps_offset, get_fps_cols, OffsetScaler

# Shared settings: a single seed for every stochastic component and a single
# thread budget for every model.
RANDOM_SEED = 42
N_JOBS = 12

# Target column for the training rows.
y_train = pd.read_csv('../data/processed/y_train.csv').target

# Training features go through preprocess(); the surviving column list is then
# used to pick the same columns out of the test features.
X_train = preprocess(pd.read_csv('../data/processed/X_train.csv'))
cols_to_keep = X_train.columns.tolist()
X_test = pd.read_csv('../data/processed/X_test.csv')
X_test = X_test[cols_to_keep]

# The scaler is sized by the number of columns reported by get_fps_cols().
# (get_fps_offset is imported but not used here.)
fps_cols = get_fps_cols(X_train)
scaler = OffsetScaler(len(fps_cols))

# Fit on the training features only, then reuse the fitted scaler on the test
# features. Both results are wrapped back into DataFrames with the original
# index and column labels.
X_train = pd.DataFrame(scaler.fit_transform(X_train.values),
                       index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test.values),
                      index=X_test.index, columns=X_test.columns)

In [25]:
import time
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score

def evaluate_single(model, X, y=None):
    """Cross-validate ``model`` on ``X`` and print a one-line summary.

    Runs 5-fold shuffled cross-validation (seeded with ``RANDOM_SEED``) scored
    with the notebook-level ``rmse`` scorer, then prints, in order:
    ``mean - std``, ``mean ± std`` and the elapsed time in seconds.

    Parameters
    ----------
    model : estimator
        Any estimator accepted by ``cross_val_score``.
    X : array-like or DataFrame
        Feature matrix to cross-validate on.
    y : array-like, optional
        Targets aligned with ``X``. When omitted, the notebook-level
        ``y_train`` is used, so existing two-argument calls keep working.

    Returns
    -------
    The per-fold scores returned by ``cross_val_score``.
    """
    target = y_train if y is None else y
    kfold = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

    # perf_counter() is monotonic; time.time() can jump when the system clock
    # is adjusted, which would corrupt the reported duration.
    tic = time.perf_counter()
    cv_res = cross_val_score(model, X, target, cv=kfold, scoring=rmse)
    elapsed = time.perf_counter() - tic

    mean, std = cv_res.mean(), cv_res.std()
    # The scorer is a negated error (higher is better), so mean - std is the
    # pessimistic end of the one-sigma band.
    final_score = mean - std
    print("%3.3f     %3.3f ± %3.3f      %.1fs" % (final_score, mean, std, elapsed))
    return cv_res

# Scorer name that evaluate_single() hands to cross_val_score.
rmse = 'neg_root_mean_squared_error'

# Hyper-parameters per model family, keyed by a short model tag.
# Only 'rf', 'cb' and 'xgb' are instantiated below; 'lasso' and 'knn' are
# stored but not used in this cell.
best_params = {}

# Random forest
best_params['rf'] = {
    'n_estimators': 200,
    'max_depth': 25,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'bootstrap': True,
    'max_features': 0.2,
    'random_state': RANDOM_SEED,
    'n_jobs': N_JOBS,
}

# Lasso
best_params['lasso'] = {
    'alpha': 1.0531212524084377,
}

# k-nearest neighbours
best_params['knn'] = {
    'n_neighbors': 10,
    'weights': 'distance',
    'algorithm': 'kd_tree',
}

# XGBoost
best_params['xgb'] = {
    'n_estimators': 2000,
    'max_depth': 3,
    'learning_rate': 0.006929151892786309,
    'subsample': 0.580956029244851,
    'colsample_bytree': 0.5326676728408477,
    'gamma': 0.638354307456016,
    'reg_alpha': 0.7784317061199424,
    'reg_lambda': 0.8587656947103454,
    'random_state': RANDOM_SEED,
    'n_jobs': N_JOBS,
}

# CatBoost
best_params['cb'] = {
    'n_estimators': 2000,
    'depth': 4,
    'learning_rate': 0.014471212873059244,
    'l2_leaf_reg': 0.40949275910093563,
    'subsample': 0.7134111640862024,
    'colsample_bylevel': 0.7109836464817926,
    'bootstrap_type': 'Bernoulli',
    'verbose': False,
    'random_seed': RANDOM_SEED,
    'thread_count': N_JOBS,
}

# Models built from the stored hyper-parameters.
rf = RandomForestRegressor(**best_params['rf'])
cb = CatBoostRegressor(**best_params['cb'])
xgb = XGBRegressor(**best_params['xgb'])

In [26]:
# Cross-validate each model in turn; evaluate_single() prints one summary line
# per model (mean - std, mean ± std, elapsed seconds).
for candidate in (rf, cb, xgb):
    evaluate_single(candidate, X_train)

-25.764     -24.129 ± 1.635      11.4s
-24.981     -23.621 ± 1.360      221.2s
-25.021     -23.623 ± 1.398      163.0s


In [30]:
# Baselines: the same three model families with only the seed and thread
# settings supplied, every other hyper-parameter left at the library default.
# Note that this rebinds rf / cb / xgb.
rf = RandomForestRegressor(n_jobs=N_JOBS, random_state=RANDOM_SEED)
cb = CatBoostRegressor(verbose=False, thread_count=N_JOBS, random_seed=RANDOM_SEED)
xgb = XGBRegressor(n_jobs=N_JOBS, random_state=RANDOM_SEED)

for baseline in (rf, cb, xgb):
    evaluate_single(baseline, X_train)

-25.853     -24.100 ± 1.753      23.7s
-25.142     -23.736 ± 1.407      274.1s


Parameters: { "random_seed" } are not used.

Parameters: { "random_seed" } are not used.

Parameters: { "random_seed" } are not used.

Parameters: { "random_seed" } are not used.

Parameters: { "random_seed" } are not used.



-26.472     -24.798 ± 1.675      18.3s


In [36]:
# Short CatBoost run: 100 iterations with colsample_bylevel set to 0.1.
# Rebinds cb.
cb = CatBoostRegressor(
    verbose=False,
    thread_count=N_JOBS,
    random_seed=RANDOM_SEED,
    colsample_bylevel=0.1,
    iterations=100,
)

# Trailing semicolon keeps the returned score array out of the cell output.
evaluate_single(cb, X_train);

-25.672     -24.447 ± 1.225      16.8s


In [40]:
# Short CatBoost run: 100 iterations, with colsample_bylevel=0.1 deliberately
# disabled so that argument is not passed at all. Rebinds cb.
cb = CatBoostRegressor(
    verbose=False,
    thread_count=N_JOBS,
    random_seed=RANDOM_SEED,
    iterations=100,
)

# Trailing semicolon keeps the returned score array out of the cell output.
evaluate_single(cb, X_train);

-25.740     -24.713 ± 1.027      26.4s


In [38]:
# Fit the current cb model on the full training set (no cross-validation) so
# the fitted model can be inspected in the next cell.
cb.fit(X_train, y_train)

<catboost.core.CatBoostRegressor at 0x7f4fa0fb68f0>

In [39]:
# Display the full parameter set of the fitted model, including values that
# were not passed explicitly to the constructor.
cb.get_all_params()

{'nan_mode': 'Min',
 'eval_metric': 'RMSE',
 'iterations': 100,
 'sampling_frequency': 'PerTree',
 'leaf_estimation_method': 'Newton',
 'random_score_type': 'NormalWithModelSizeDecrease',
 'grow_policy': 'SymmetricTree',
 'penalties_coefficient': 1,
 'boosting_type': 'Plain',
 'model_shrink_mode': 'Constant',
 'feature_border_type': 'GreedyLogSum',
 'bayesian_matrix_reg': 0.10000000149011612,
 'eval_fraction': 0,
 'force_unit_auto_pair_weights': False,
 'l2_leaf_reg': 3,
 'random_strength': 1,
 'rsm': 0.10000000149011612,
 'boost_from_average': True,
 'model_size_reg': 0.5,
 'pool_metainfo_options': {'tags': {}},
 'subsample': 0.800000011920929,
 'use_best_model': False,
 'random_seed': 42,
 'depth': 6,
 'posterior_sampling': False,
 'border_count': 254,
 'classes_count': 0,
 'auto_class_weights': 'None',
 'sparse_features_conflict_fraction': 0,
 'leaf_estimation_backtracking': 'AnyImprovement',
 'best_model_min_trees': 1,
 'model_shrink_rate': 0,
 'min_data_in_leaf': 1,
 'loss_functio