# In [None]:
# %% [code]
# Kaggle input locations. Every directory keeps its trailing slash because
# file paths are built from these by plain string concatenation.
PARQUETS_DIR = '/kaggle/input/parquets/'
MODELS_DIR = '/kaggle/input/riiid-answer-correctness-prediction-models/'

# Directory that receives the optimizer's <label>_opt.json files.
OUT_DIR = '/kaggle/working/'

# %% [code]
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from shutil import copyfile
import stacking_classifiers as sc
from sklearn.ensemble import RandomForestClassifier

# %% [code]
def get_data():
    """Load the training parquet and return its ``(train, valid)`` splits.

    Only the ``FEATURES`` columns plus ``TARGET`` are read from
    ``train_df.parquet``. The last 1,000,000 rows form the validation split.
    Each returned element is the ``{'x': ..., 'y': ...}`` dict built by
    ``split_train_valid``.
    """
    frame = pd.read_parquet(
        PARQUETS_DIR + 'train_df.parquet',
        columns=FEATURES + [TARGET],
    )
    splits = split_train_valid(frame, val_size=1_000_000)
    return splits['trn'], splits['val']

# %% [code]
def split_train_valid(dt, val_size, target=None):
    """Split ``dt`` by position into a leading train part and a trailing
    validation part, separating the target column from the features.

    The split point is computed as an explicit row index. Slicing with
    ``-val_size`` breaks for ``val_size == 0``: ``iloc[-0:]`` selects every
    row and ``iloc[:-0]`` selects none, which would swap the two parts.

    Args:
        dt: DataFrame containing the feature columns and the target column.
        val_size: Number of trailing rows to hold out for validation. Values
            larger than ``len(dt)`` put every row in the validation part.
        target: Name of the target column. Defaults to the module-level
            ``TARGET``.

    Returns:
        ``{'trn': {'x': features, 'y': target},
        'val': {'x': features, 'y': target}}``.

    Raises:
        ValueError: If ``val_size`` is negative.
    """
    if val_size < 0:
        raise ValueError(f'val_size must be non-negative, got {val_size}')
    if target is None:
        target = TARGET

    # Clamp at 0 so an oversized val_size yields an empty train part.
    split_at = max(len(dt) - val_size, 0)
    trn = dt.iloc[:split_at]
    val = dt.iloc[split_at:]

    return {
        'trn': {'x': trn.drop(columns=[target]), 'y': trn[target]},
        'val': {'x': val.drop(columns=[target]), 'y': val[target]},
    }

# %% [code]
def optimize(clf, label, init_points=10, cont_opt=False, version='v1', **kwargs):
    """Run the hyper-parameter search for one classifier wrapper.

    The fixed settings, search space, dtypes and probe points are taken from
    the module-level ``fixed_params``, ``opt_grid``, ``opt_dtypes`` and
    ``probes`` as they are bound when this function is called.

    Args:
        clf: Classifier wrapper handed to ``sc.get_scorer``.
        label: Short model name. It selects the ``<label>_opt.json`` file and
            the ``<label>/<version>/`` folder under ``MODELS_DIR``.
        init_points: Forwarded to ``optimizer.maximize``.
        cont_opt: When truthy, ``<label>_opt.json`` is copied from
            ``MODELS_DIR`` into ``OUT_DIR`` unless a file with that name is
            already there. The flag is also forwarded to ``sc.get_optimizer``.
        version: Sub-folder of ``MODELS_DIR/<label>/`` to copy the file from.
        **kwargs: Extra keyword arguments forwarded to ``sc.get_optimizer``.
    """
    json_name = f'{label}_opt.json'
    json_path = OUT_DIR + json_name
    if cont_opt:
        # Never overwrite a file that already exists in the output directory.
        if not os.path.exists(json_path):
            copyfile(MODELS_DIR + f'{label}/{version}/' + json_name, json_path)

    trn_split, val_split = get_data()
    score_fn = sc.get_scorer(clf, trn_split, val_split, fixed_params, opt_dtypes)
    search = sc.get_optimizer(
        score_fn,
        params_grid=opt_grid,
        probes=probes,
        cont_opt=cont_opt,
        filename=json_path,
        **kwargs,
    )
    search.maximize(init_points=init_points, n_iter=200, alpha=1e-6)

# %% [code]
# Label column.
TARGET = 'answered_correctly'

# Feature columns requested from train_df.parquet, in this order.
# Candidates currently left out: user_id, content_id,
# prior_question_had_explanation, tag1..tag6, slots 4 and 5 of every per-tag
# family, tags_hmean and user_tags_hmean.
FEATURES = [
    'prior_question_elapsed_time',
    'part',
    'user_id_count',
    'user_id_mean',
    'user_id_attempts',
    'content_id_count',
    'content_id_mean',
    # Per-tag families: slots 0-3 of each family, one family after another.
    *(
        f'{family}_{slot}'
        for family in ('tag_count', 'tag_mean',
                       'user_id_tag_count', 'user_id_tag_mean')
        for slot in range(4)
    ),
    'user_content_hmean',
    'tags_whmean',
    'user_tags_whmean',
]

CAT_FEATURES = ['part']

# %% [markdown]
# # Hyperparameter Optimization

# %% [markdown]
# ## LGBM

# %% [code]
# Fixed LGBM parameters (not part of the search):
fixed_params = dict(
    num_iterations=100,
    metric='auc',
    objective='binary',
    boosting_type='gbdt',
    is_unbalance=True,
    num_threads=2,
    verbose=0,
)

# Search space kept in one table: name -> ((lower, upper), dtype).
# opt_grid and opt_dtypes are derived from it so they cannot drift apart.
_lgbm_space = {
    'learning_rate': ((0.001, 0.2), float),
    'feature_fraction': ((0.5, 0.9), float),
    'lambda_l2': ((0, 5), float),
    'num_leaves': ((50, 1500), int),
    'min_data_in_leaf': ((10, 2000), int),
}
opt_grid = {name: bounds for name, (bounds, _) in _lgbm_space.items()}
opt_dtypes = {name: dtype for name, (_, dtype) in _lgbm_space.items()}

# Hand-picked points handed to the optimizer as `probes`.
probes = [
    dict(
        feature_fraction=0.689624417022823,
        lambda_l2=4.450308176307347,
        learning_rate=0.12494856029217047,
        min_data_in_leaf=1513,
        num_leaves=741,
    ),
]

# %% [code]
# Disabled: switch the guard to True to run the LGBM search. With
# cont_opt=True, optimize() first copies the saved lgbm_opt.json from
# MODELS_DIR into OUT_DIR when it is not already there.
if False:
    optimize(
        clf=sc.LGBMClassifier,
        label='lgbm',
        init_points=10,
        cont_opt=True,
        use_transformer=False,
    )

# %% [markdown]
# ## XGBoost

# %% [code]
# Fixed XGBoost parameters (not part of the search):
# NOTE(review): 'nrounds' is the name used by XGBoost's R interface; confirm
# that stacking_classifiers translates it into a boosting-round count.
fixed_params = dict(
    nrounds=50,
    eval_metric='auc',
    objective='binary:logistic',
    tree_method='exact',
    num_parallel_tree=1,
    verbosity=0,
)

# Search space kept in one table: name -> ((lower, upper), dtype).
# opt_grid and opt_dtypes are derived from it so they cannot drift apart.
_xgb_space = {
    'learning_rate': ((0.001, 0.2), float),
    'colsample_bytree': ((0.5, 0.9), float),
    'subsample': ((0.5, 1), float),
    'lambda': ((1, 100), float),
    # NOTE(review): the 7.99 upper bound presumably lets an int cast reach 7
    # as often as the other depths; confirm how the dtypes are applied.
    'max_depth': ((5, 7.99), int),
    'min_child_weight': ((1, 2000), int),
}
opt_grid = {name: bounds for name, (bounds, _) in _xgb_space.items()}
opt_dtypes = {name: dtype for name, (_, dtype) in _xgb_space.items()}

# Hand-picked points handed to the optimizer as `probes`.
probes = [
    {
        'learning_rate': 0.12494856029217047,
        'colsample_bytree': 0.689624417022823,
        'subsample': 1,
        'lambda': 4.450308176307347,
        'max_depth': 6,
        'min_child_weight': 1513,
    },
    {
        'learning_rate': 0.18,
        'colsample_bytree': 0.5014,
        'subsample': 0.7202,
        'lambda': 1.654,
        'max_depth': 6,
        'min_child_weight': 348,
    },
]

# %% [code]
# Disabled: switch the guard to True to run the XGBoost search. With
# cont_opt=True, optimize() first copies the saved xgb_opt.json from
# MODELS_DIR/xgb/v1/ into OUT_DIR when it is not already there.
if False:
    optimize(
        clf=sc.XGBClassifier,
        label='xgb',
        init_points=10,
        cont_opt=True,
        use_transformer=False,
        version='v1',
    )

# %% [markdown]
# ## CatBoost

# %% [code]
# Fixed CatBoost parameters (not part of the search):
fixed_params = dict(
    iterations=50,
    bootstrap_type='Bernoulli',
    eval_metric='AUC',
    grow_policy='Lossguide',
    allow_writing_files=False,
    od_type='Iter',
    auto_class_weights='Balanced',
    use_best_model=True,
)

# Search space kept in one table: name -> ((lower, upper), dtype).
# opt_grid and opt_dtypes are derived from it so they cannot drift apart.
_catb_space = {
    'learning_rate': ((0.001, 0.2), float),
    'depth': ((8, 16.99), int),
    'l2_leaf_reg': ((1, 100), float),
    'colsample_bylevel': ((0.5, 1), float),
    'subsample': ((0.6, 1), float),
    'min_data_in_leaf': ((1, 2000), int),
    'max_leaves': ((50, 2000), int),
}
opt_grid = {name: bounds for name, (bounds, _) in _catb_space.items()}
opt_dtypes = {name: dtype for name, (_, dtype) in _catb_space.items()}

# Hand-picked points handed to the optimizer as `probes`.
# NOTE(review): depth=6 lies below the opt_grid lower bound of 8 - confirm
# that probing outside the grid is intended.
probes = [
    dict(
        learning_rate=0.12494856029217047,
        colsample_bylevel=0.689624417022823,
        subsample=0.9,
        l2_leaf_reg=4.450308176307347,
        depth=6,
        min_data_in_leaf=1513,
        max_leaves=741,
    ),
]

# %% [code]
# Disabled: switch the guard to True to run the CatBoost search. With
# cont_opt=True, optimize() first copies the saved catb_opt.json from
# MODELS_DIR/catb/v1/ into OUT_DIR when it is not already there.
if False:
    optimize(
        clf=sc.CATBClassifier,
        label='catb',
        init_points=10,
        cont_opt=True,
        use_transformer=False,
        version='v1',
    )

# %% [markdown]
# ## Random Forest

# %% [code]
# Fixed Random Forest parameters (not part of the search). 'clf_type' carries
# the estimator class itself alongside the ordinary settings.
fixed_params = dict(
    n_jobs=2,
    clf_type=RandomForestClassifier,
)

# Search space kept in one table: name -> ((lower, upper), dtype).
# opt_grid and opt_dtypes are derived from it so they cannot drift apart.
_rf_space = {
    'max_depth': ((6, 16.99), int),
    'n_estimators': ((10, 1000), int),
    'max_features': ((0.5, 1), float),
    'max_samples': ((0.5, 1), float),
    'min_samples_leaf': ((0.05, 0.3), float),
    'min_samples_split': ((0.05, 0.3), float),
}
opt_grid = {name: bounds for name, (bounds, _) in _rf_space.items()}
opt_dtypes = {name: dtype for name, (_, dtype) in _rf_space.items()}

# Hand-picked points handed to the optimizer as `probes`.
probes = [
    dict(
        max_depth=6,
        n_estimators=200,
        max_features=0.9,
        max_samples=0.8,
        min_samples_leaf=0.1,
        min_samples_split=0.1,
    ),
]

# %% [code]
# Enabled: runs the Random Forest search. cont_opt=False means optimize()
# does not copy a saved rf_opt.json from MODELS_DIR before starting.
# Real booleans are used for the guard and the flag, matching the other
# optimize() calls in this file.
if True:
    optimize(
        clf=sc.SklearnClassifier,
        label='rf',
        init_points=25,
        cont_opt=False,
        use_transformer=False,
        version='v1',
    )

# %% [code]
# Fit a single model on the first probe point, outside the optimizer.
train, valid = get_data()
# Merge into a new dict. Updating probes[0] in place would leave the fixed
# parameters inside the probe list for any later optimize() call.
params = {**probes[0], **fixed_params}
clf = sc.SklearnClassifier(params)
# NOTE(review): the meaning of the third positional argument (True) is not
# visible here - confirm against stacking_classifiers.
clf.train(train, valid, True)

# %% [code]
# Bare expression: as the last statement of a notebook cell its value is
# rendered as the cell output; run as a plain script it has no visible effect.
# NOTE(review): presumably the score recorded by clf.train() - confirm in
# stacking_classifiers.
clf.best_score

# %% [markdown]
# ## Neural Network

# %% [code]


# %% [code]


# %% [code]
# lgbm_model = LGBMClassifier(lgbm_params)
# xgb_model = XGBClassifier(xgb_params)
# base_model = LogisticRegressionClassifier()
# model = StackingClassifier(classifiers=[lgbm_model, xgb_model],
#                            base_classifier = base_model,
#                            nfolds=5)

# data = pd.read_parquet(parquets_dir + 'train_df.parquet', columns=FEATURES+[TARGET])
# train_data, valid_data = split_train_valid(data, val_size=1_000_000).values()
# del data
# gc.collect()

# model.train(train_data['x'], train_data['y'])

# %% [code]
# model.score(valid_data['x'], valid_data['y'])

# %% [code]
