In [1]:
import time 
t_script = time.time()

### Load libraries, functions, palette, theme

In [2]:
%run __libraries.ipynb

In [3]:
%run __functions.ipynb

## Unbalanced Techniques

### Load Data

In [4]:
start_date = '2022-07-16'
end_date = '2022-10-19'

In [5]:
data = read_from_files(
    DIR_INPUT = './data-transformed/',
    BEGIN_DATE = start_date,
    END_DATE = end_date
)

### Features Selection

In [6]:
features = [
    'tx_amount', 'tx_during_weekend', 'tx_during_night',
    'customer_id_nb_tx_1day_window', 'customer_id_avg_amount_1day_window',
    'customer_id_nb_tx_7day_window', 'customer_id_avg_amount_7day_window',
    'customer_id_nb_tx_30day_window', 'customer_id_avg_amount_30day_window',
    'terminal_id_nb_tx_1day_window', 'terminal_id_risk_1day_window',
    'terminal_id_nb_tx_7day_window', 'terminal_id_risk_7day_window',
    'terminal_id_nb_tx_30day_window', 'terminal_id_risk_30day_window'
]

target = 'tx_fraud'

### Primary Parameters

In [7]:
%run __primary_parameters.ipynb

In [8]:
models_list = ['Logistic Regression', 'Random Forest', 'XGBoost', 'LightGBM']

### Hyperparameters

In [9]:
hyper_dict = pd.read_pickle(r'_hyperparameters_base.pkl')

In [10]:
params_lr = extract_hyperparams(hyper_dict, 'Logistic Regression', kind='list')
params_rf = extract_hyperparams(hyper_dict, 'Random Forest', kind='list')
params_xgb = extract_hyperparams(hyper_dict, 'XGBoost', kind='list')
params_lgb = extract_hyperparams(hyper_dict, 'LightGBM', kind='list')

### Training and Evaluation Models

#### Logistic Regression

##### LR RUS

In [11]:
clf_lr_rus = LogisticRegression()

params_lr_rus = {f'clf__{i}': j for i, j in params_lr.items()}
params_lr_rus.update(
    {
        'rus__sampling_strategy':[0.01, 0.05, 0.1, 0.5, 1]
    }
)

preprop_lr_rus = [
    ('scaler', StandardScaler()),
    ('rus', RandomUnderSampler(sampling_strategy=0.5,random_state=seed))
]

grid_lr_rus = grid_create(
    clf_lr_rus, params_lr_rus, preprop_lr_rus,
    search_type='grid', key_params_idxs=[0,1]
)

In [12]:
t_perf = time.time()
performance_lr_rus = model_performance_CV(data, **grid_lr_rus)
exec_time_lr_rus = time.time() - t_perf

In [13]:
exec_time_lr_rus

83.76973700523376

In [14]:
summary_lr_rus = get_summary_performances(performance_lr_rus, metrics)

In [15]:
summary_lr_rus

Unnamed: 0,AUC ROC,Average Precision,Card Precision@100
Best estimated parameters,l2/1,l2/1,l2/1
Validation performance,0.880+/-0.010,0.635+/-0.023,0.315+/-0.015
Test performance,0.875+/-0.018,0.614+/-0.028,0.294+/-0.018
Optimal parameter(s),l2/1,l2/1,l2/1
Optimal test performance,0.878+/-0.017,0.614+/-0.028,0.298+/-0.016


##### LR SMOTE+RUS

In [16]:
clf_lr_smrus = LogisticRegression()

params_lr_smrus = {f'clf__{i}': j for i, j in params_lr.items()}
params_lr_smrus.update(
    {
        'clf__random_state':[seed],
        'smote__sampling_strategy':[0.1, 0.5],
        'rus__sampling_strategy':[0.5, 1],
        'smote__random_state':[seed],
        'rus__random_state':[seed]
    }
)

preprop_lr_smrus = [
    ('scaler', StandardScaler()),
    ('smote', SMOTE(sampling_strategy=0.5, random_state=seed)),
    ('rus', RandomUnderSampler(sampling_strategy=1.0,random_state=seed))
]

grid_lr_smrus = grid_create(
    clf_lr_smrus, params_lr_smrus, preprop_lr_smrus,
    search_type='grid', key_params_idxs=[0,2,3]
)

In [17]:
t_perf = time.time()
performance_lr_smrus = model_performance_CV(data, **grid_lr_smrus)
exec_time_lr_smrus = time.time() - t_perf

In [18]:
exec_time_lr_smrus

32.21787905693054

In [19]:
summary_lr_smrus = get_summary_performances(performance_lr_smrus, metrics)

In [20]:
summary_lr_smrus

Unnamed: 0,AUC ROC,Average Precision,Card Precision@100
Best estimated parameters,l2/saga/5000,l2/saga/5000,l2/saga/5000
Validation performance,0.879+/-0.012,0.560+/-0.072,0.311+/-0.015
Test performance,0.879+/-0.017,0.578+/-0.021,0.290+/-0.018
Optimal parameter(s),l2/saga/5000,l2/saga/5000,l2/saga/5000
Optimal test performance,0.880+/-0.017,0.584+/-0.024,0.291+/-0.018


##### LR Weighted

In [21]:
clf_lr_weight = LogisticRegression()

params_lr_weight = {f'clf__{i}': j for i, j in params_lr.items()}
params_lr_weight.update(
    {
        'clf__class_weight': [{0: w} for w in [ir, 0.1, 0.5, 1]],
        'clf__random_state':[seed]
    }
)

preprop_lr_weight = [
    ('scaler', StandardScaler()),
]

grid_lr_weight = grid_create(
    clf_lr_weight, params_lr_weight, preprop_lr_weight,
    search_type='grid', key_params_idxs=[0,1]
)

In [22]:
t_perf = time.time()
performance_lr_weight = model_performance_CV(data, **grid_lr_weight)
exec_time_lr_weight = time.time() - t_perf

In [23]:
exec_time_lr_weight

163.10336995124817

In [24]:
summary_lr_weight = get_summary_performances(performance_lr_weight, metrics)

In [25]:
summary_lr_weight

Unnamed: 0,AUC ROC,Average Precision,Card Precision@100
Best estimated parameters,l2/1,l2/1,l2/1
Validation performance,0.878+/-0.010,0.635+/-0.022,0.314+/-0.014
Test performance,0.879+/-0.016,0.614+/-0.028,0.295+/-0.016
Optimal parameter(s),l2/1,l2/1,l2/1
Optimal test performance,0.879+/-0.016,0.614+/-0.028,0.298+/-0.016


#### Random Forest

**RF RUS**

In [26]:
clf_rf_rus = RandomForestClassifier()

params_rf_rus = {f'clf__{i}': j for i, j in params_rf.items()}
params_rf_rus.update(
    {
        'clf__random_state':[seed],
        'rus__sampling_strategy':[0.01, 0.05, 0.1, 0.5, 1]
    }
)

preprop_rf_rus = [
    ('scaler', StandardScaler()),
    ('rus', RandomUnderSampler(sampling_strategy=0.5,random_state=seed))
]

grid_rf_rus = grid_create(
    clf_rf_rus, params_rf_rus, preprop_rf_rus,
    search_type='grid', key_params_idxs=[0,1,3]
)

In [27]:
t_perf = time.time()
performance_rf_rus = model_performance_CV(data, **grid_rf_rus)
exec_time_rf_rus = time.time() - t_perf

In [28]:
exec_time_rf_rus

360.24807691574097

In [29]:
summary_rf_rus = get_summary_performances(performance_rf_rus, metrics)

In [30]:
summary_rf_rus

Unnamed: 0,AUC ROC,Average Precision,Card Precision@100
Best estimated parameters,150/25/0.1,150/25/0.01,150/25/0.01
Validation performance,0.885+/-0.012,0.700+/-0.012,0.322+/-0.018
Test performance,0.872+/-0.021,0.678+/-0.034,0.306+/-0.018
Optimal parameter(s),150/25/0.01,150/25/0.01,150/25/0.1
Optimal test performance,0.874+/-0.024,0.678+/-0.034,0.307+/-0.019


##### RF SMOTE+RUS

In [31]:
clf_rf_smrus = RandomForestClassifier()

params_rf_smrus = {f'clf__{i}': j for i, j in params_rf.items()}
params_rf_smrus.update(
    {
        'clf__random_state':[seed],
        'smote__sampling_strategy':[0.1, 0.5],
        'rus__sampling_strategy':[0.5, 1],
        'smote__random_state':[seed],
        'rus__random_state':[seed]
    }
)

preprop_rf_smrus = [
    ('scaler', StandardScaler()),
    ('smote', SMOTE(sampling_strategy=0.5, random_state=seed)),
    ('rus', RandomUnderSampler(sampling_strategy=1.0,random_state=seed))
]

grid_rf_smrus = grid_create(
    clf_rf_smrus, params_rf_smrus, preprop_rf_smrus,
    search_type='grid', key_params_idxs=[0,1,3,4]
)

In [32]:
t_perf = time.time()
performance_rf_smrus = model_performance_CV(data, **grid_rf_smrus)
exec_time_rf_smrus = time.time() - t_perf

In [33]:
exec_time_rf_smrus

584.8601472377777

In [34]:
summary_rf_smrus = get_summary_performances(performance_rf_smrus, metrics)

In [35]:
summary_rf_smrus

Unnamed: 0,AUC ROC,Average Precision,Card Precision@100
Best estimated parameters,150/25/0.1/0.5,150/25/0.5/0.5,150/25/0.1/0.5
Validation performance,0.880+/-0.012,0.659+/-0.023,0.318+/-0.017
Test performance,0.875+/-0.018,0.627+/-0.034,0.302+/-0.018
Optimal parameter(s),150/25/0.1/0.5,150/25/0.5/0.5,150/25/0.1/0.5
Optimal test performance,0.875+/-0.018,0.627+/-0.034,0.302+/-0.018


##### RF Weighted

In [36]:
clf_rf_weight = RandomForestClassifier()

params_rf_weight = {f'clf__{i}': j for i, j in params_rf.items()}
params_rf_weight.update(
    {
        'clf__class_weight': [{0: w} for w in [ir, 0.1, 0.5, 1]],
        'clf__random_state':[seed]
    }
)

preprop_rf_weight = [
    ('scaler', StandardScaler()),
]

grid_rf_weight = grid_create(
    clf_rf_weight, params_rf_weight, preprop_rf_weight,
    search_type='grid', key_params_idxs=[0,1,2]
)

In [37]:
t_perf = time.time()
performance_rf_weight = model_performance_CV(data, **grid_rf_weight)
exec_time_rf_weight = time.time() - t_perf

In [38]:
exec_time_rf_weight

715.435553073883

In [39]:
summary_rf_weight = get_summary_performances(performance_rf_weight, metrics)

In [40]:
summary_rf_weight

Unnamed: 0,AUC ROC,Average Precision,Card Precision@100
Best estimated parameters,150/25/42,150/25/42,150/25/42
Validation performance,0.874+/-0.006,0.701+/-0.013,0.321+/-0.018
Test performance,0.872+/-0.019,0.678+/-0.029,0.305+/-0.017
Optimal parameter(s),150/25/42,150/25/42,150/25/42
Optimal test performance,0.878+/-0.018,0.680+/-0.030,0.307+/-0.019


#### XGBoost

##### XGB RUS

In [41]:
clf_xgb_rus = XGBClassifier()
params_xgb_rus = {f'clf__{i}': j for i, j in params_xgb.items()}
params_xgb_rus.update(
    {
        'rus__sampling_strategy':[0.01, 0.05, 0.1, 0.5, 1],
        'clf__random_state':[seed]
    }
)

preprop_xgb_rus = [
    ('scaler', StandardScaler()),
    ('rus', RandomUnderSampler(sampling_strategy=0.5,random_state=seed))
]

grid_xgb_rus = grid_create(
    clf_xgb_rus, params_xgb_rus, preprop_xgb_rus,
    search_type='grid', key_params_idxs=[0,1,2,3]
)

In [42]:
t_perf = time.time()
performance_xgb_rus = model_performance_CV(data, **grid_xgb_rus)
exec_time_xgb_rus = time.time() - t_perf

In [43]:
exec_time_xgb_rus

123.82899212837219

In [44]:
summary_xgb_rus = get_summary_performances(performance_xgb_rus, metrics)

In [45]:
summary_xgb_rus

Unnamed: 0,AUC ROC,Average Precision,Card Precision@100
Best estimated parameters,150/3/0.1/42,150/3/0.1/42,150/3/0.1/42
Validation performance,0.875+/-0.009,0.699+/-0.010,0.321+/-0.017
Test performance,0.875+/-0.014,0.692+/-0.030,0.307+/-0.018
Optimal parameter(s),150/3/0.1/42,150/3/0.1/42,150/3/0.1/42
Optimal test performance,0.876+/-0.013,0.692+/-0.030,0.307+/-0.018


##### XGB SMOTE+RUS

In [46]:
clf_xgb_smrus = XGBClassifier()
params_xgb_smrus = {f'clf__{i}': j for i, j in params_xgb.items()}
params_xgb_smrus.update(
    {
        'clf__random_state':[seed],
        'smote__sampling_strategy':[0.1, 0.5],
        'rus__sampling_strategy':[0.5, 1],
        'smote__random_state':[seed],
        'rus__random_state':[seed]
    }
)

preprop_xgb_smrus = [
    ('scaler', StandardScaler()),
    ('smote', SMOTE(sampling_strategy=0.5, random_state=seed)),
    ('rus', RandomUnderSampler(sampling_strategy=1.0,random_state=seed))
]

grid_xgb_smrus = grid_create(
    clf_xgb_smrus, params_xgb_smrus, preprop_xgb_smrus,
    search_type='grid', key_params_idxs=[0,1,2,4,5]
)

In [47]:
t_perf = time.time()
performance_xgb_smrus = model_performance_CV(data, **grid_xgb_smrus)
exec_time_xgb_smrus = time.time() - t_perf

In [48]:
exec_time_xgb_smrus

315.2075688838959

In [49]:
summary_xgb_smrus = get_summary_performances(performance_xgb_smrus, metrics)

In [50]:
summary_xgb_smrus

Unnamed: 0,AUC ROC,Average Precision,Card Precision@100
Best estimated parameters,150/3/0.1/0.1/1,150/3/0.1/0.5/0.5,150/3/0.1/0.1/0.5
Validation performance,0.862+/-0.013,0.667+/-0.007,0.319+/-0.017
Test performance,0.863+/-0.020,0.640+/-0.022,0.303+/-0.017
Optimal parameter(s),150/3/0.1/0.1/0.5,150/3/0.1/0.1/0.5,150/3/0.1/0.1/0.5
Optimal test performance,0.866+/-0.019,0.645+/-0.027,0.303+/-0.017


##### XGB Weighted

In [51]:
clf_xgb_weight = XGBClassifier()

params_xgb_weight = {f'clf__{i}': j for i, j in params_xgb.items()}
params_xgb_weight.update(
    {
        'clf__scale_pos_weight': [5, 10, 50, 1/ir],
        'clf__random_state':[seed]
    }
)

preprop_xgb_weight = [
    ('scaler', StandardScaler()),
]

grid_xgb_weight = grid_create(
    clf_xgb_weight, params_xgb_weight, preprop_xgb_weight,
    search_type='grid', key_params_idxs=[0,1,2,3]
)

In [52]:
t_perf = time.time()
performance_xgb_weight = model_performance_CV(data, **grid_xgb_weight)
exec_time_xgb_weight = time.time() - t_perf

In [53]:
exec_time_xgb_weight

307.38476276397705

In [54]:
summary_xgb_weight = get_summary_performances(performance_xgb_weight, metrics)

In [55]:
summary_xgb_weight

Unnamed: 0,AUC ROC,Average Precision,Card Precision@100
Best estimated parameters,150/3/0.1/42,150/3/0.1/42,150/3/0.1/42
Validation performance,0.873+/-0.008,0.697+/-0.010,0.320+/-0.017
Test performance,0.877+/-0.014,0.687+/-0.031,0.305+/-0.016
Optimal parameter(s),150/3/0.1/42,150/3/0.1/42,150/3/0.1/42
Optimal test performance,0.877+/-0.014,0.687+/-0.031,0.307+/-0.017


#### LightGBM

##### LGB RUS

In [56]:
clf_lgb_rus = LGBMClassifier()

params_lgb_rus = {f'clf__{i}': j for i, j in params_lgb.items()}
params_lgb_rus.update(
    {
        'clf__random_state':[seed],
        'clf__verbose': [-1],
        'rus__sampling_strategy':[0.01, 0.05, 0.1, 0.5, 1]
    }
)

preprop_lgb_rus = [
    ('scaler', StandardScaler()),
    ('rus', RandomUnderSampler(sampling_strategy=0.5,random_state=seed))
]

grid_lgb_rus = grid_create(
    clf_lgb_rus, params_lgb_rus, preprop_lgb_rus,
    search_type='grid', key_params_idxs=[0,1,2,5]
)

In [57]:
t_perf = time.time()
performance_lgb_rus = model_performance_CV(data, **grid_lgb_rus)
exec_time_lgb_rus = time.time() - t_perf

In [58]:
exec_time_lgb_rus

44.11504626274109

In [59]:
summary_lgb_rus = get_summary_performances(performance_lgb_rus, metrics)

In [60]:
summary_lgb_rus

Unnamed: 0,AUC ROC,Average Precision,Card Precision@100
Best estimated parameters,150/3/0.1/0.5,150/3/0.1/0.05,150/3/0.1/0.01
Validation performance,0.873+/-0.007,0.675+/-0.021,0.320+/-0.015
Test performance,0.872+/-0.015,0.670+/-0.037,0.303+/-0.015
Optimal parameter(s),150/3/0.1/0.1,150/3/0.1/0.05,150/3/0.1/0.05
Optimal test performance,0.877+/-0.014,0.670+/-0.037,0.306+/-0.015


##### LGB SMOTE+RUS

In [61]:
clf_lgb_smrus = LGBMClassifier()

params_lgb_smrus = {f'clf__{i}': j for i, j in params_lgb.items()}
params_lgb_smrus.update(
    {
        'clf__random_state':[seed],
        'clf__verbose': [-1],
        'smote__sampling_strategy':[0.1, 0.5],
        'rus__sampling_strategy':[0.5, 1],
        'smote__random_state':[seed],
        'rus__random_state':[seed]
    }
)

preprop_lgb_smrus = [
    ('scaler', StandardScaler()),
    ('smote', SMOTE(sampling_strategy=0.5, random_state=seed)),
    ('rus', RandomUnderSampler(sampling_strategy=1.0,random_state=seed))
]


grid_lgb_smrus = grid_create(
    clf_lgb_smrus, params_lgb_smrus, preprop_lgb_smrus,
    search_type='grid', key_params_idxs=[0,1,2,5,6]
)

In [62]:
t_perf = time.time()
performance_lgb_smrus = model_performance_CV(data, **grid_lgb_smrus)
exec_time_lgb_smrus = time.time() - t_perf

In [63]:
exec_time_lgb_smrus

53.32814407348633

In [64]:
summary_lgb_smrus = get_summary_performances(performance_lgb_smrus, metrics)

In [65]:
summary_lgb_smrus

Unnamed: 0,AUC ROC,Average Precision,Card Precision@100
Best estimated parameters,150/3/0.1/0.1/1,150/3/0.1/0.5/0.5,150/3/0.1/0.5/0.5
Validation performance,0.864+/-0.013,0.661+/-0.010,0.318+/-0.016
Test performance,0.863+/-0.017,0.637+/-0.019,0.300+/-0.013
Optimal parameter(s),150/3/0.1/0.1/0.5,150/3/0.1/0.1/0.5,150/3/0.1/0.1/0.5
Optimal test performance,0.864+/-0.015,0.646+/-0.027,0.303+/-0.017


##### LGB Weighted

In [66]:
clf_lgb_weight = LGBMClassifier()

params_lgb_weight = {f'clf__{i}': j for i, j in params_lgb.items()}
params_lgb_weight.update(
    {
        'clf__random_state':[seed],
        'clf__verbose': [-1],
        'clf__scale_pos_weight': [5, 10, 50, 1/ir]
    }
)

preprop_lgb_weight = [
    ('scaler', StandardScaler()),
]

grid_lgb_weight = grid_create(
    clf_lgb_weight, params_lgb_weight, preprop_lgb_weight,
    search_type='grid', key_params_idxs=[0,1,2,5]
)

In [67]:
t_perf = time.time()
performance_lgb_weight = model_performance_CV(data, **grid_lgb_weight)
exec_time_lgb_weight = time.time() - t_perf

In [68]:
exec_time_lgb_weight

50.84895706176758

In [69]:
summary_lgb_weight = get_summary_performances(performance_lgb_weight, metrics)

In [70]:
summary_lgb_weight

Unnamed: 0,AUC ROC,Average Precision,Card Precision@100
Best estimated parameters,150/3/0.1/5,150/3/0.1/5,150/3/0.1/5
Validation performance,0.860+/-0.008,0.629+/-0.029,0.317+/-0.016
Test performance,0.867+/-0.014,0.627+/-0.028,0.301+/-0.015
Optimal parameter(s),150/3/0.1/50,150/3/0.1/5,150/3/0.1/10
Optimal test performance,0.869+/-0.015,0.627+/-0.028,0.301+/-0.015


### Save Results

In [71]:
performance_dict = {
    'Logistic Regression RUS': performance_lr_rus,
    'Logistic Regression SMOTE+RUS': performance_lr_smrus,
    'Logistic Regression Weighted': performance_lr_weight,
    'Random Forest RUS': performance_rf_rus,
    'Random Forest SMOTE+RUS': performance_rf_smrus,
    'Random Forest Weighted': performance_rf_weight,
    'XGBoost RUS': performance_xgb_rus,
    'XGBoost SMOTE+RUS': performance_xgb_smrus,
    'XGBoost Weighted': performance_xgb_weight,
    'LightGBM RUS': performance_lgb_rus,
    'LightGBM SMOTE+RUS': performance_lgb_smrus,
    'LightGBM Weighted': performance_lgb_weight
}

In [72]:
filehandler = open('_performance_unbalanced.pkl', 'wb') 
pickle.dump(performance_dict, filehandler)
filehandler.close()

In [73]:
summary_dict = {
    'Logistic Regression RUS': summary_lr_rus,
    'Logistic Regression SMOTE+RUS': summary_lr_smrus,
    'Logistic Regression Weighted': summary_lr_weight,
    'Random Forest RUS': summary_rf_rus,
    'Random Forest SMOTE+RUS': summary_rf_smrus,
    'Random Forest Weighted': summary_rf_weight,
    'XGBoost RUS': summary_xgb_rus,
    'XGBoost SMOTE+RUS': summary_xgb_smrus,
    'XGBoost Weighted': summary_xgb_weight,
    'LightGBM RUS': summary_lgb_rus,
    'LightGBM SMOTE+RUS': summary_lgb_smrus,
    'LightGBM Weighted': summary_lgb_weight
}

In [74]:
filehandler = open('_summary_unbalanced.pkl', 'wb') 
pickle.dump(summary_dict, filehandler)
filehandler.close()

In [75]:
time_exec_dict = {
    'Logistic Regression RUS': exec_time_lr_rus,
    'Logistic Regression SMOTE+RUS': exec_time_lr_smrus,
    'Logistic Regression Weighted': exec_time_lr_weight,
    'Random Forest RUS': exec_time_rf_rus,
    'Random Forest SMOTE+RUS': exec_time_rf_smrus,
    'Random Forest Weighted': exec_time_rf_weight,
    'XGBoost RUS': exec_time_xgb_rus,
    'XGBoost SMOTE+RUS': exec_time_xgb_smrus,
    'XGBoost Weighted': exec_time_xgb_weight,
    'LightGBM RUS': exec_time_lgb_rus,
    'LightGBM SMOTE+RUS': exec_time_lgb_smrus,
    'LightGBM Weighted': exec_time_lgb_weight
}

In [76]:
filehandler = open('_time_exec_unbalanced.pkl', 'wb') 
pickle.dump(time_exec_dict, filehandler)
filehandler.close()

In [77]:
e_time = str(dt.timedelta(seconds=np.round(time.time() - t_script)))
print('Execution time: {}'.format(e_time))

Execution time: 0:47:48
