In [None]:
import time 
# Start-of-notebook timestamp; the final cell subtracts it from time.time()
# to report the total execution time.
t_script = time.time()

### Load libraries, functions, palette, theme

In [None]:
# Execute the companion libraries notebook in this kernel. Apart from `time`,
# nothing is imported in this notebook itself, so names used below (ast, pickle,
# np, dt, the classifier classes, StandardScaler) presumably come from here
# -- TODO confirm against __libraries.ipynb.
%run __libraries.ipynb

In [None]:
# Execute the helper notebook. read_from_files, grid_create, model_performance_CV,
# get_summary_performances and plot_performance are called below but not defined
# in this notebook, so they are presumably defined here -- TODO confirm.
%run __functions.ipynb

## Model Evaluation

### Load Data

In [None]:
# Date bounds forwarded to read_from_files as BEGIN_DATE / END_DATE.
start_date, end_date = '2022-07-12', '2022-10-15'

In [None]:
# Read the input files under ./data-transformed/ for the configured date range.
# NOTE(review): the return type is not visible here; later cells read `data.shape`
# and pass `data` to model_performance_CV.
data = read_from_files(
    END_DATE=end_date,
    BEGIN_DATE=start_date,
    DIR_INPUT='./data-transformed/',
)

In [None]:
# Sanity check: show the dimensions of the loaded data.
data.shape

### Feature Selection

In [None]:
# Input columns, one per line; the order is the same as before.
features = [
    # transaction-level columns
    'tx_amount',
    'tx_during_weekend',
    'tx_during_night',
    # customer_id_* columns for the 1 / 7 / 30 day windows
    'customer_id_nb_tx_1day_window',
    'customer_id_avg_amount_1day_window',
    'customer_id_nb_tx_7day_window',
    'customer_id_avg_amount_7day_window',
    'customer_id_nb_tx_30day_window',
    'customer_id_avg_amount_30day_window',
    # terminal_id_* columns for the 1 / 7 / 30 day windows
    'terminal_id_nb_tx_1day_window',
    'terminal_id_risk_1day_window',
    'terminal_id_nb_tx_7day_window',
    'terminal_id_risk_7day_window',
    'terminal_id_nb_tx_30day_window',
    'terminal_id_risk_30day_window',
]

# Target column name. Neither `features` nor `target` is referenced directly in
# this notebook; presumably the helper functions read them -- TODO confirm.
target = 'tx_fraud'

In [None]:
# Number of selected feature columns.
len(features)

### Primary Parameters

In [None]:
# Execute the shared parameters notebook. seed, metrics, subsets and the
# train_start* values are used below without being defined in this notebook,
# so they presumably come from here -- TODO confirm.
%run __primary_parameters.ipynb

In [None]:
# Display the three train_start* parameters for a quick visual check.
train_start, train_start_valid, train_start_test

### Model Training and Evaluation

In [None]:
# NOTE(review): hyper_list is never read or appended to in the visible part of
# this notebook -- confirm whether it is still needed.
hyper_list = []

In [None]:
# Final hyperparameters per model. Entries set to None are placeholders that the
# "Hyperparameters Dict" cells overwrite with the best grid-search values.
# Key order matters: those cells pair the keys positionally (via zip) with the
# '/'-separated best-parameter string, so the tuned keys must come first.
hyper_dict = {
    'Logistic Regression': dict(
        penalty=None,
        C=None,
        solver=None,
        max_iter=5000,
        random_state=seed,
    ),
    'Random Forest': dict(
        n_estimators=None,
        max_depth=None,
        random_state=seed,
    ),
    'XGBoost': dict(
        n_estimators=None,
        max_depth=None,
        learning_rate=None,
        random_state=seed,
    ),
    'LightGBM': dict(
        n_estimators=None,
        max_depth=None,
        learning_rate=None,
        random_state=seed,
    ),
}

#### Logistic Regression

##### LR Baseline

In [None]:
# Baseline logistic-regression search: the estimator, its parameter grid and the
# preprocessing steps are handed to grid_create; the result is later unpacked as
# keyword arguments into model_performance_CV.
clf_lr_base = LogisticRegression()

# The 'clf__' prefix presumably addresses a pipeline step named 'clf' that
# grid_create builds -- TODO confirm.
params_lr_base = dict(
    clf__penalty=['l1', 'l2'],
    clf__C=[0.1, 1, 10],
    clf__solver=['saga'],
    clf__max_iter=[5000],
    clf__random_state=[seed],
)

preprop_lr_base = [('scaler', StandardScaler())]

# NOTE(review): key_params_idxs presumably selects, by position in the grid,
# which parameters form the 'Parameters summary' string -- confirm in grid_create.
grid_lr_base = grid_create(
    clf_lr_base,
    params_lr_base,
    preprop_lr_base,
    key_params_idxs=[0, 1, 2],
    search_type='grid',
)

In [None]:
# Time the model_performance_CV run for logistic regression.
# perf_counter() is a monotonic, high-resolution clock, so the measured duration
# cannot be distorted by system clock adjustments the way a difference of
# time.time() readings can.
t_perf = time.perf_counter()
performance_lr_base = model_performance_CV(data, **grid_lr_base)
exec_time_lr_base = time.perf_counter() - t_perf  # elapsed seconds

In [None]:
# Elapsed time of the logistic-regression run, in seconds.
exec_time_lr_base

In [None]:
# Summarise the LR results for the configured metrics. The result is later indexed
# by row 'Best estimated parameters' and column 'Average Precision'.
summary_lr_base = get_summary_performances(performance_lr_base, metrics)

In [None]:
# Display the logistic-regression summary.
summary_lr_base

In [None]:
# Keep only the rows whose 'Parameters summary' starts with 'l1', i.e. the runs
# with the l1 penalty (presumably the first component of the summary string).
estmators_esolate_lvl_1 = 'l1'

performance_lr_base_plot_estmators = performance_lr_base.loc[
    performance_lr_base['Parameters summary'].str.startswith(estmators_esolate_lvl_1)
]

In [None]:
# Plot the filtered LR results and save the figure under the name 'performance_lr'.
# The colour list has one entry for palette[1] followed by palette[-1] twice.
plot_performance(
    data=performance_lr_base_plot_estmators,
    xlabel='C',
    metrics=metrics,
    subsets=subsets,
    colors=[palette[1]] + [palette[-1]] * 2,
    filename='performance_lr',
    save=True,
)

##### Hyperparameters Dict

In [None]:
# Best parameter combination according to Average Precision. It is a single
# '/'-separated string, which the following cells split into components.
best_params_lr = summary_lr_base.loc['Best estimated parameters', 'Average Precision']

In [None]:
# Display the selected LR parameter string.
best_params_lr

In [None]:
# Keys of the LR entry in declaration order (penalty, C, solver, max_iter,
# random_state). This is a live view of the dict, not a copy.
hyper_lr_keys = hyper_dict['Logistic Regression'].keys()

In [None]:
# Display the LR hyperparameter keys.
hyper_lr_keys

In [None]:
# Split the summary string into its components; every element is still a str here.
hyper_lr_params = best_params_lr.split('/')

In [None]:
# Convert the second component from text to a number; it is later paired with
# the 'C' key. The other components stay strings.
# The isinstance guard keeps this cell re-runnable: ast.literal_eval() raises
# ValueError when it is handed an already-converted number instead of a string.
if isinstance(hyper_lr_params[1], str):
    hyper_lr_params[1] = ast.literal_eval(hyper_lr_params[1])

In [None]:
# Display the parsed LR parameter values.
hyper_lr_params

In [None]:
# Write the parsed values into the LR entry, pairing keys and values by position.
# zip() stops at the shorter input, so keys beyond the parsed values keep their
# preset values.
hyper_dict['Logistic Regression'].update(zip(hyper_lr_keys, hyper_lr_params))

#### Random Forest

##### RF Baseline

In [None]:
# Baseline random-forest search: the estimator, its parameter grid and the
# preprocessing steps are handed to grid_create; the result is later unpacked as
# keyword arguments into model_performance_CV.
clf_rf_base = RandomForestClassifier()

params_rf_base = dict(
    clf__n_estimators=[5, 10, 25, 50, 75, 100, 150],
    clf__max_depth=[10, 25, 50],
    clf__random_state=[seed],
)

preprop_rf_base = [('scaler', StandardScaler())]

# NOTE(review): key_params_idxs presumably selects, by position in the grid,
# which parameters form the 'Parameters summary' string -- confirm in grid_create.
grid_rf_base = grid_create(
    clf_rf_base,
    params_rf_base,
    preprop_rf_base,
    key_params_idxs=[0, 1],
    search_type='grid',
)

In [None]:
# Time the model_performance_CV run for the random forest.
# perf_counter() is a monotonic, high-resolution clock, so the measured duration
# cannot be distorted by system clock adjustments the way a difference of
# time.time() readings can.
t_perf = time.perf_counter()
performance_rf_base = model_performance_CV(data, **grid_rf_base)
exec_time_rf_base = time.perf_counter() - t_perf  # elapsed seconds

In [None]:
# Elapsed time of the random-forest run, in seconds.
exec_time_rf_base

In [None]:
# Summarise the RF results for the configured metrics. The result is later indexed
# by row 'Best estimated parameters' and column 'Average Precision'.
summary_rf_base = get_summary_performances(performance_rf_base, metrics)

In [None]:
# Display the random-forest summary.
summary_rf_base

In [None]:
# Keep only the rows whose 'Parameters summary' starts with '150'. The summary is
# presumably '<n_estimators>/<max_depth>', so this fixes the number of estimators.
estmators_esolate_lvl = '150'

performance_rf_base_plot_estmators = performance_rf_base.loc[
    performance_rf_base['Parameters summary'].str.startswith(estmators_esolate_lvl)
]

In [None]:
# Plot the RF rows selected by the estimator filter (figure is not saved).
# The colour list has one entry for palette[2] followed by palette[-1] twice.
plot_performance(
    data=performance_rf_base_plot_estmators,
    xlabel='Estimators / Depth',
    metrics=metrics,
    subsets=subsets,
    colors=[palette[2]] + [palette[-1]] * 2,
)

In [None]:
# Keep only the rows whose 'Parameters summary' ends with '25'. The summary is
# presumably '<n_estimators>/<max_depth>', so this fixes the depth at 25.
depth_esolate_lvl = '25'

performance_rf_base_plot_depth = performance_rf_base.loc[
    performance_rf_base['Parameters summary'].str.endswith(depth_esolate_lvl)
]

In [None]:
# Plot the RF rows selected by the depth filter and save the figure under the
# name 'performance_rf'.
plot_performance(
    data=performance_rf_base_plot_depth,
    xlabel='Estimators / Depth',
    metrics=metrics,
    subsets=subsets,
    colors=[palette[2]] + [palette[-1]] * 2,
    # rotation=45  (left disabled, as before)
    filename='performance_rf',
    save=True,
)

##### Hyperparameters Dict

In [None]:
# Best parameter combination according to Average Precision. It is a single
# '/'-separated string, which is split into components below.
best_params_rf = summary_rf_base.loc['Best estimated parameters', 'Average Precision']

In [None]:
# Display the selected RF parameter string.
best_params_rf

In [None]:
# Keys of the RF entry in declaration order (n_estimators, max_depth,
# random_state). This is a live view of the dict, not a copy.
hyper_rf_keys = hyper_dict['Random Forest'].keys()

In [None]:
# Split the summary string on '/' and turn every component into a Python literal.
hyper_rf_params = list(map(ast.literal_eval, best_params_rf.split('/')))

In [None]:
# Write the parsed values into the RF entry, pairing keys and values by position.
# zip() stops at the shorter input, so keys beyond the parsed values keep their
# preset values.
hyper_dict['Random Forest'].update(zip(hyper_rf_keys, hyper_rf_params))

#### XGBoost

##### XGB Baseline

In [None]:
# Baseline XGBoost search: the estimator, its parameter grid and the
# preprocessing steps are handed to grid_create; the result is later unpacked as
# keyword arguments into model_performance_CV.
clf_xgb_base = XGBClassifier()

params_xgb_base = dict(
    clf__n_estimators=[5, 10, 25, 50, 75, 100, 150],
    clf__max_depth=[3, 6, 9],
    clf__learning_rate=[0.1, 0.3],
    clf__random_state=[seed],
)

preprop_xgb_base = [('scaler', StandardScaler())]

# NOTE(review): key_params_idxs presumably selects, by position in the grid,
# which parameters form the 'Parameters summary' string -- confirm in grid_create.
grid_xgb_base = grid_create(
    clf_xgb_base,
    params_xgb_base,
    preprop_xgb_base,
    key_params_idxs=[0, 1, 2],
    search_type='grid',
)

In [None]:
# Time the model_performance_CV run for XGBoost.
# perf_counter() is a monotonic, high-resolution clock, so the measured duration
# cannot be distorted by system clock adjustments the way a difference of
# time.time() readings can.
t_perf = time.perf_counter()
performance_xgb_base = model_performance_CV(data, **grid_xgb_base)
exec_time_xgb_base = time.perf_counter() - t_perf  # elapsed seconds

In [None]:
# Elapsed time of the XGBoost run, in seconds.
exec_time_xgb_base

In [None]:
# Summarise the XGBoost results for the configured metrics. The result is later
# indexed by row 'Best estimated parameters' and column 'Average Precision'.
summary_xgb_base = get_summary_performances(performance_xgb_base, metrics)

In [None]:
# Display the XGBoost summary.
summary_xgb_base

In [None]:
# Keep only the rows whose 'Parameters summary' ends with '3/0.1'. The summary is
# presumably '<n_estimators>/<max_depth>/<learning_rate>', so this fixes depth 3
# and learning rate 0.1 and leaves the number of estimators varying.
estmators_esolate_lvl = '3/0.1'

performance_xgb_base_plot_estmators = performance_xgb_base.loc[
    performance_xgb_base['Parameters summary'].str.endswith(estmators_esolate_lvl)
]

In [None]:
# Plot the XGBoost rows selected by the estimator filter and save the figure
# under the name 'performance_xgb'.
# The axis label now follows the order of the summary string. The grid is declared
# as n_estimators, max_depth, learning_rate, and the filters used in this section
# ('150/3' as a prefix, '3/0.1' as a suffix) only match that order, so the former
# label 'Trees / Learning Rate / Depth' had the last two components swapped.
plot_performance(
    data=performance_xgb_base_plot_estmators,
    metrics=metrics,
    subsets=subsets,
    xlabel='Trees / Depth / Learning Rate',
    colors=[palette[-7], palette[-1], palette[-1]],
    rotation=45,
    save=True,
    filename='performance_xgb'
)

In [None]:
# Keep only the rows whose 'Parameters summary' starts with '150/3'. The summary is
# presumably '<n_estimators>/<max_depth>/<learning_rate>', so this fixes 150
# estimators and depth 3 and leaves the learning rate varying.
depth_esolate_lvl = '150/3'

performance_xgb_base_plot_depth = performance_xgb_base.loc[
    performance_xgb_base['Parameters summary'].str.startswith(depth_esolate_lvl)
]

In [None]:
# Plot the XGBoost rows selected by the '150/3' prefix filter (figure is not saved).
# The axis label now follows the order of the summary string. The grid is declared
# as n_estimators, max_depth, learning_rate, and no leaf-count parameter is tuned,
# so the former label 'Trees / Learning Rate / Leafs' did not describe the x values.
plot_performance(
    data=performance_xgb_base_plot_depth,
    metrics=metrics,
    subsets=subsets,
    xlabel='Trees / Depth / Learning Rate',
    colors=[palette[-7], palette[-1], palette[-1]],
    rotation=45
)

##### Hyperparameters Dict

In [None]:
# Best parameter combination according to Average Precision. It is a single
# '/'-separated string, which is split into components below.
best_params_xgb = summary_xgb_base.loc['Best estimated parameters', 'Average Precision']

In [None]:
# Display the selected XGBoost parameter string.
best_params_xgb

In [None]:
# Keys of the XGBoost entry in declaration order (n_estimators, max_depth,
# learning_rate, random_state). This is a live view of the dict, not a copy.
hyper_xgb_keys = hyper_dict['XGBoost'].keys()

In [None]:
# Split the summary string on '/' and turn every component into a Python literal.
hyper_xgb_params = list(map(ast.literal_eval, best_params_xgb.split('/')))

In [None]:
# Write the parsed values into the XGBoost entry, pairing keys and values by
# position. zip() stops at the shorter input, so keys beyond the parsed values
# keep their preset values.
hyper_dict['XGBoost'].update(zip(hyper_xgb_keys, hyper_xgb_params))

#### LightGBM

##### LGB Baseline

In [None]:
# Baseline LightGBM search: the estimator, its parameter grid and the
# preprocessing steps are handed to grid_create; the result is later unpacked as
# keyword arguments into model_performance_CV.
clf_lgb_base = LGBMClassifier()

params_lgb_base = dict(
    clf__n_estimators=[5, 10, 25, 50, 75, 100, 150],
    clf__max_depth=[3, 6, 9],
    clf__learning_rate=[0.1, 0.3],
    clf__random_state=[seed],
    clf__verbose=[-1],
)

preprop_lgb_base = [('scaler', StandardScaler())]

# NOTE(review): key_params_idxs presumably selects, by position in the grid,
# which parameters form the 'Parameters summary' string -- confirm in grid_create.
grid_lgb_base = grid_create(
    clf_lgb_base,
    params_lgb_base,
    preprop_lgb_base,
    key_params_idxs=[0, 1, 2],
    search_type='grid',
)

In [None]:
# Time the model_performance_CV run for LightGBM.
# perf_counter() is a monotonic, high-resolution clock, so the measured duration
# cannot be distorted by system clock adjustments the way a difference of
# time.time() readings can.
t_perf = time.perf_counter()
performance_lgb_base = model_performance_CV(data, **grid_lgb_base)
exec_time_lgb_base = time.perf_counter() - t_perf  # elapsed seconds

In [None]:
# Elapsed time of the LightGBM run, in seconds.
exec_time_lgb_base

In [None]:
# Summarise the LightGBM results for the configured metrics. The result is later
# indexed by row 'Best estimated parameters' and column 'Average Precision'.
summary_lgb_base = get_summary_performances(performance_lgb_base, metrics)

In [None]:
# Display the LightGBM summary.
summary_lgb_base

In [None]:
# Keep only the rows whose 'Parameters summary' starts with '150/3'. The summary is
# presumably '<n_estimators>/<max_depth>/<learning_rate>', so this fixes 150
# estimators and depth 3 and leaves the learning rate varying.
estmators_esolate_lvl = '150/3'

performance_lgb_base_plot_estmators = performance_lgb_base.loc[
    performance_lgb_base['Parameters summary'].str.startswith(estmators_esolate_lvl)
]

In [None]:
# Plot the LightGBM rows selected by the '150/3' prefix filter (figure is not saved).
# The axis label now follows the order of the summary string. The grid is declared
# as n_estimators, max_depth, learning_rate, and the filters used in this section
# ('150/3' as a prefix, '3/0.1' as a suffix) only match that order, so the former
# label 'Trees / Learning Rate / Depth' had the last two components swapped.
plot_performance(
    data=performance_lgb_base_plot_estmators,
    metrics=metrics,
    subsets=subsets,
    xlabel='Trees / Depth / Learning Rate',
    colors=[palette[-9], palette[-1], palette[-1]]
)

In [None]:
# Keep only the rows whose 'Parameters summary' ends with '3/0.1'. The summary is
# presumably '<n_estimators>/<max_depth>/<learning_rate>', so this fixes depth 3
# and learning rate 0.1 and leaves the number of estimators varying.
depth_esolate_lvl = '3/0.1'

performance_lgb_base_plot_depth = performance_lgb_base.loc[
    performance_lgb_base['Parameters summary'].str.endswith(depth_esolate_lvl)
]

In [None]:
# Plot the LightGBM rows selected by the '3/0.1' suffix filter and save the figure
# under the name 'performance_lgb'.
# The axis label now follows the order of the summary string. The grid is declared
# as n_estimators, max_depth, learning_rate, and no leaf-count parameter is tuned,
# so the former label 'Trees / Learning Rate / Leafs' did not describe the x values.
plot_performance(
    data=performance_lgb_base_plot_depth,
    metrics=metrics,
    subsets=subsets,
    xlabel='Trees / Depth / Learning Rate',
    colors=[palette[-9], palette[-1], palette[-1]],
    rotation=45,
    save=True,
    filename='performance_lgb'
)

##### Hyperparameters Dict

In [None]:
# Best parameter combination according to Average Precision. It is a single
# '/'-separated string, which is split into components below.
best_params_lgb = summary_lgb_base.loc['Best estimated parameters', 'Average Precision']

In [None]:
# Display the selected LightGBM parameter string.
best_params_lgb

In [None]:
# Keys of the LightGBM entry in declaration order (n_estimators, max_depth,
# learning_rate, random_state). This is a live view of the dict, not a copy.
hyper_lgb_keys = hyper_dict['LightGBM'].keys()

In [None]:
# Split the summary string on '/' and turn every component into a Python literal.
hyper_lgb_params = list(map(ast.literal_eval, best_params_lgb.split('/')))

In [None]:
# Write the parsed values into the LightGBM entry, pairing keys and values by
# position. zip() stops at the shorter input, so keys beyond the parsed values
# keep their preset values.
hyper_dict['LightGBM'].update(zip(hyper_lgb_keys, hyper_lgb_params))

### Save Results

In [None]:
# Per-model performance results, keyed by model display name, collected for pickling.
performance_dict = {
    'Logistic Regression': performance_lr_base,
    'Random Forest': performance_rf_base,
    'XGBoost': performance_xgb_base,
    'LightGBM': performance_lgb_base,
}

In [None]:
# Persist the performance results. The context manager closes the file even if
# pickle.dump() raises, which the manual open()/close() pair did not guarantee.
with open('_performance_base.pkl', 'wb') as filehandler:
    pickle.dump(performance_dict, filehandler)

In [None]:
# Per-model summaries, keyed by model display name, collected for pickling.
summary_dict = {
    'Logistic Regression': summary_lr_base,
    'Random Forest': summary_rf_base,
    'XGBoost': summary_xgb_base,
    'LightGBM': summary_lgb_base,
}

In [None]:
# Persist the summaries. The context manager closes the file even if
# pickle.dump() raises, which the manual open()/close() pair did not guarantee.
with open('_summary_base.pkl', 'wb') as filehandler:
    pickle.dump(summary_dict, filehandler)

In [None]:
# Display the hyperparameter dict after the per-model cells have filled it in.
hyper_dict

In [None]:
# Persist the hyperparameters. The context manager closes the file even if
# pickle.dump() raises, which the manual open()/close() pair did not guarantee.
with open('_hyperparameters_base.pkl', 'wb') as filehandler:
    pickle.dump(hyper_dict, filehandler)

In [None]:
# Per-model execution times in seconds, keyed by model display name.
time_exec_dict = {
    'Logistic Regression': exec_time_lr_base,
    'Random Forest': exec_time_rf_base,
    'XGBoost': exec_time_xgb_base,
    'LightGBM': exec_time_lgb_base,
}

In [None]:
# Persist the execution times. The context manager closes the file even if
# pickle.dump() raises, which the manual open()/close() pair did not guarantee.
with open('_time_exec_base.pkl', 'wb') as filehandler:
    pickle.dump(time_exec_dict, filehandler)

In [None]:
# Total run time since t_script was taken, rounded to whole seconds and rendered
# through timedelta's string form (assumes `dt` is the datetime module -- TODO confirm).
e_time = str(
    dt.timedelta(seconds=np.round(time.time() - t_script))
)
print(f'Execution time: {e_time}')