In [1]:
import time
import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import (roc_auc_score, f1_score, roc_curve, auc, 
                             confusion_matrix, precision_recall_curve, make_scorer)


from tqdm import tqdm

In [11]:
from utils import utils

# Functions and configurations

In [3]:
# Seed handed to the estimators below so that repeated runs give the same result.
RANDOM_STATE = 10

# Default size (width, height in inches) of every figure drawn in this notebook.
mpl.rc('figure', figsize=(12, 10))
# Colours of the active matplotlib property cycle, in cycle order.
colors = mpl.rcParams['axes.prop_cycle'].by_key()['color']

# Processing data

In [4]:
# Load the pre-processed feature table (path is relative to the working directory).
data_raw = pd.read_csv("dataset/processed_features.csv")


In [5]:
# Pairwise correlation between the columns (pandas' default method), displayed as the cell output.
data_raw.corr()

Unnamed: 0,account_id,device_id,balance,processed_at,is_fraud,age_range,number_of_selfies_sent,time_client,cash_out_type_1,cash_out_type_2,cash_out_type_3,cash_out_type_6
account_id,1.0,0.936605,-0.115446,0.030805,-0.031733,0.014728,-0.011119,-0.12493,-0.024468,0.314274,-0.312839,0.001893
device_id,0.936605,1.0,-0.120909,0.030237,-0.03227,0.010916,-0.010421,-0.128533,-0.021081,0.339309,-0.339272,0.001675
balance,-0.115446,-0.120909,1.0,0.013672,-0.002463,0.04107,-0.015854,0.278847,0.019471,0.036898,-0.043073,-0.000271
processed_at,0.030805,0.030237,0.013672,1.0,-0.006212,0.00517,-0.010542,0.034895,-0.000934,0.011392,-0.011319,-0.001043
is_fraud,-0.031733,-0.03227,-0.002463,-0.006212,1.0,0.001232,0.000773,-0.001799,0.005346,0.017931,-0.019762,-0.000132
age_range,0.014728,0.010916,0.04107,0.00517,0.001232,1.0,-0.008867,-0.025235,0.002038,0.043571,-0.044924,0.001572
number_of_selfies_sent,-0.011119,-0.010421,-0.015854,-0.010542,0.000773,-0.008867,1.0,0.134589,0.006826,-0.008444,0.006654,-0.000253
time_client,-0.12493,-0.128533,0.278847,0.034895,-0.001799,-0.025235,0.134589,1.0,0.014227,-0.022264,0.018509,0.014154
cash_out_type_1,-0.024468,-0.021081,0.019471,-0.000934,0.005346,0.002038,0.006826,0.014227,1.0,-0.200642,-0.079909,-0.000421
cash_out_type_2,0.314274,0.339309,0.036898,0.011392,0.017931,0.043571,-0.008444,-0.022264,-0.200642,1.0,-0.960473,-0.005063


In [6]:
# Some columns hold values that exceed the float16 limit. Each of them is
# divided by its own maximum so that it lands on a scale that is easy to work
# with (the same idea as min-max scaling).

max_account = data_raw["account_id"].max()
max_device_id = data_raw["device_id"].max()
max_processed_at = data_raw["processed_at"].max()
max_time_client = data_raw["time_client"].max()

# assign() builds a new frame with the rescaled columns in their original positions.
data_raw = data_raw.assign(
    account_id=data_raw["account_id"].div(max_account),
    device_id=data_raw["device_id"].div(max_device_id),
    processed_at=data_raw["processed_at"].div(max_processed_at),
    time_client=data_raw["time_client"].div(max_time_client),
)

# seconds to days
# NOTE(review): time_client has already been divided by its maximum at this
# point, so this division only shrinks the normalised value by a constant
# factor -- confirm whether the conversion was meant to happen before the
# normalisation above.
data_raw = data_raw.assign(time_client=data_raw["time_client"].div(60 * 60 * 24))

data_raw.head()

Unnamed: 0,account_id,device_id,balance,processed_at,is_fraud,age_range,number_of_selfies_sent,time_client,cash_out_type_1,cash_out_type_2,cash_out_type_3,cash_out_type_6
0,0.0,0.0,0.000167,0.999813,0,0.0,5.0,1.785904e-07,1,0,0,0
1,1.6e-05,0.0,0.000533,0.998152,0,1.0,5.0,1.97017e-06,0,0,1,0
2,1.6e-05,0.0,0.000347,0.999702,0,1.0,5.0,2.456546e-06,0,0,1,0
3,1.6e-05,0.0,0.000572,0.998524,0,1.0,5.0,2.087007e-06,0,0,1,0
4,1.6e-05,0.0,0.000346,0.998629,0,1.0,5.0,2.119767e-06,0,0,1,0


In [7]:
# Modelling frame: every column of data_raw except the account identifier.
# drop() returns a new frame, so data_raw itself is left untouched.
data = data_raw.drop(columns=["account_id"])

In [8]:
# Count of missing values in each column of the modelling frame.
data.isna().sum()

device_id                 0
balance                   0
processed_at              0
is_fraud                  0
age_range                 0
number_of_selfies_sent    0
time_client               0
cash_out_type_1           0
cash_out_type_2           0
cash_out_type_3           0
cash_out_type_6           0
dtype: int64

In [9]:
# Target vector (the is_fraud column) and feature matrix (every other column).
Y = data["is_fraud"].to_numpy()
X = data.drop(columns="is_fraud").to_numpy()

print("X shape:", X.shape)
print("Y shape:", Y.shape)

# np.bincount returns one count per label value starting at 0; unpacking into
# two names therefore relies on the target containing exactly the labels 0 and 1.
size_negatives, size_positives = np.bincount(Y)

# Share of each class over the whole dataset.
all_pos_rate = size_positives / (size_negatives + size_positives)
all_neg_rate = size_negatives / (size_negatives + size_positives)

print(f"Proportion Positives {all_pos_rate}  / negatives {all_neg_rate}")

X shape: (940935, 10)
Y shape: (940935,)
Proportion Positives 0.0016313560447852402  / negatives 0.9983686439552147


In [10]:
# Take a smaller slice of the dataset for quick model experiments.
# NOTE(review): the meaning of the 0.1 and all_pos_rate arguments is defined by
# utils.split_indices_stritify -- presumably the slice fraction and the positive
# rate to preserve; confirm against the helper.
positive_indicies, negative_indices = utils.split_indices_stritify(Y, 0.1, all_pos_rate)

# Negatives first, then positives -- the same row order for features and labels.
X_2 = np.concatenate([X[negative_indices, :], X[positive_indicies, :]], axis=0)
X_2 = X_2.reshape(len(X_2), -1)
Y_2 = np.concatenate([Y[negative_indices], Y[positive_indicies]])
Y_2 = Y_2.reshape(len(Y_2))

# Class balance of the slice (rebinds the counters computed for the full data).
size_negatives, size_positives = np.bincount(Y_2)

pos_rate = size_positives / (size_negatives + size_positives)
neg_rate = size_negatives / (size_negatives + size_positives)

print(f"X shape {X_2.shape} / Y shape {Y_2.shape}")
print(f"Proportion Positives {pos_rate}  / negatives {neg_rate}")

# Split the slice with the project helper; no k-fold (kfold=None).
X_pack_sliced, Y_pack_sliced = utils.split_data(X_2, Y_2, kfold=None)


NameError: name 'utils' is not defined

# GridSearch

In [None]:
# Model: this cell takes a long time to run on a CPU without parallelism.
# pr_auc_score = make_scorer(pr_auc_score)

# params = {
#           "n_estimators": [400, 500],"max_depth":[None, 50],
#           "min_samples_split":[1,2],"min_samples_leaf": [1,2],
#           "oob_score": [True], "max_samples": [1,10,100]
# }


# rf_classifier = RandomForestClassifier(random_state=RANDOM_STATE, class_weight= "balanced_subsample", 
#                                        verbose=1, n_jobs=-1)

# clf = GridSearchCV(rf_classifier, params, scoring=pr_auc_score, 
#                    refit=False, return_train_score=True, verbose=2, n_jobs=-1)


In [None]:
# classifiers = train_model(clf, train_features, Y_train)


In [None]:
# classifiers.best_params_

In [None]:
# X = (train_features, val_features)
# Y = (Y_train, Y_val)
# history = compare_models(X, Y, classifiers)

In [None]:
# plot_cm(Y_val, history["clf_0"]["y_hat_val"], "RFC_tuned")

# Training model on all dataset

In [None]:
# Split the full dataset with the project helper; no k-fold here (kfold=None).
# NOTE(review): test_size=0.01 presumably reserves 1% of the rows for the test
# part -- confirm in utils.split_data.
X_pack, Y_pack = utils.split_data(X, Y, test_size=0.01, kfold=None)

# Each pack unpacks into three parts: train, validation and test, in that order.
(X_train, X_val, X_test), (Y_train, Y_val, Y_test) = X_pack, Y_pack

print(f"X_train shape {X_train.shape}")
print(f"X_val shape {X_val.shape}")
print(f"X_test shape {X_test.shape}")
# First training row, displayed as the cell output.
X_train[:1]

In [None]:
# Unfitted random forest used as the baseline model. It is bound to
# `classifier` because that is the name the training call below passes; the
# original cell bound it to `baseline` instead, so on a fresh kernel the call
# failed with NameError: name 'classifier' is not defined.
classifier = RandomForestClassifier(max_depth=50, max_features='log2', min_samples_split=2,
                                    n_estimators=400, random_state=RANDOM_STATE, n_jobs=3,
                                    class_weight="balanced_subsample", min_samples_leaf=2,
                                    verbose=1, oob_score=True)

# Train on the training split. `baseline` holds whatever utils.train_model
# returns and is what the evaluation and dump cells further down consume.
baseline = utils.train_model(classifier, X_train, Y_train)

In [None]:
# Pair the train and validation parts as (train, val) tuples, the form in which
# they are passed to utils.compare_models together with the trained baseline.
X_train_val, Y_train_val = (X_train, X_val), (Y_train, Y_val)
history = utils.compare_models(X_train_val, Y_train_val, baseline)

In [None]:
# Plot (presumably a confusion matrix -- confirm in utils.plot_cm) of the validation
# labels against the predictions stored under history["clf_0"]["y_hat_val"].
utils.plot_cm(Y_val, history["clf_0"]["y_hat_val"], "RFC")

In [None]:
# Draw the ROC curve(s) from the evaluation history via the project helper.
# NOTE(review): which splits are plotted is decided inside utils.plot_roc_auc_curve.
utils.plot_roc_auc_curve(history)

In [None]:
# Draw the precision-recall curve(s) from the evaluation history via the project helper.
# NOTE(review): which splits are plotted is decided inside utils.plot_pr_curve.
utils.plot_pr_curve(history)

# Testing

In [None]:
# Reuse utils.compare_models for the held-out test part: the first slot of each
# pair is left as None and the test data goes in the second slot.
X_test_pack, Y_test_pack = (None, X_test), (None, Y_test)

history_test = utils.compare_models(X_test_pack, Y_test_pack, baseline)

In [None]:
# Plot (presumably a confusion matrix -- confirm in utils.plot_cm) for the test labels.
# NOTE(review): the validation cell reads the key "y_hat_val" while this one reads
# "y_hat_test"; the test data was passed in the second slot of the pack, so verify
# that utils.compare_models really stores its predictions under "y_hat_test".
utils.plot_cm(Y_test, history_test["clf_0"]["y_hat_test"], "Test")

# K-fold cross-validation

In [None]:
# Split for k-fold cross-validation (kfold=10, test_size=0.05). The helper's
# result unpacks into the per-fold index mapping and a held-out (X_test, Y_test)
# pair. Note that this rebinds X_test / Y_test from the earlier hold-out split.
indices_split, (X_test, Y_test) = utils.split_data(X, Y, test_size=0.05, kfold=10)

# Report the size of every fold. The mapping is iterated with .items(); the
# original `.itens()` was a typo that raised AttributeError.
# value[0] and value[1] are the train and validation indices of the fold.
for key, value in indices_split.items():
    print(f"{key} size train: {len(value[0])}, val: {len(value[1])}")

In [None]:
# Random forest for the cross-validation run; same hyper-parameters as the
# baseline estimator defined earlier in the notebook.
clf = RandomForestClassifier(
    n_estimators=400,
    max_depth=50,
    max_features='log2',
    min_samples_split=2,
    min_samples_leaf=2,
    class_weight="balanced_subsample",
    oob_score=True,
    random_state=RANDOM_STATE,
    n_jobs=3,
    verbose=1,
)

# Train using the fold indices; `clfs` holds whatever utils.train_model returns.
clfs = utils.train_model(clf, X, Y, indices_split)

In [None]:
# Evaluate the cross-validation classifiers using the fold indices.
# Note: this rebinds `history`, replacing the hold-out evaluation computed earlier.
history = utils.compare_models(X, Y, clfs, indices_split)

In [None]:
# Draw the precision-recall curve(s) for the cross-validation history via the project helper.
utils.plot_pr_curve(history)

In [None]:
from joblib import dump

# Persist `baseline` (the object returned by utils.train_model for the hold-out
# run, not the cross-validation `clfs`) to a file in the working directory.
dump(baseline, 'baseline_rfc.joblib') 