In [None]:

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt

from joblib import dump
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import load_model


In [None]:
from utils import utils

In [None]:
# Seed passed to every stochastic step in this notebook (data splits, random forest).
RANDOM_STATE = 35

# Default size for every figure, as (width, height).
mpl.rcParams.update({'figure.figsize': (12, 10)})

# Color list of matplotlib's active property cycle.
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']

# Preprocessing

In [None]:
# Columns used by the models. The order matters: the frame is later converted
# to a positional NumPy array before being fed to the encoder.
columns = ['account_id', 'is_fraud', 'device_id', 'balance', 'processed_at',
           'age_range', 'number_of_selfies_sent', 'time_client', 'cash_out_type_1',
           'cash_out_type_2', 'cash_out_type_3', 'cash_out_type_6']

# .copy() makes the column subset an independent frame, so the assignments
# below cannot trigger pandas' SettingWithCopyWarning.
data_ts = pd.read_csv("dataset/processed_features.csv")[columns].copy()

# Per the original note, these columns hold values above the float16 limit,
# so each one is divided by its own maximum to bring it into a range that a
# min-max scaler can work with.
columns_to_scale = ["account_id", "device_id", "processed_at", "time_client"]
for column in columns_to_scale:
    data_ts[column] = data_ts[column] / data_ts[column].max()

# Seconds to days.
# NOTE(review): time_client has already been divided by its maximum above, so
# it is no longer expressed in seconds when this conversion runs. The order is
# kept as-is because the saved encoder presumably expects features prepared
# exactly this way -- confirm against the encoder's training pipeline before
# changing it.
data_ts["time_client"] = data_ts["time_client"] / (60 * 60 * 24)

data_ts.head()

In [None]:
# Remove the features weakly correlated with the target, then separate the
# target column from the remaining features.
weak_features = ["account_id", "processed_at", "number_of_selfies_sent", "cash_out_type_6"]
data_to_encode = data_ts.drop(columns=weak_features)  # drop() returns a new frame

# pop() removes "is_fraud" from data_to_encode and returns it, so X is built
# from the feature columns only.
target = data_to_encode.pop("is_fraud")
Y = np.array(target, dtype=np.float32)
X = np.array(data_to_encode, dtype=np.float32)

# Model

In [None]:
# Saved Keras model that the cells below use to encode the feature arrays.
encoder_path = "../api_fraud_detection/model/saved_models/bottleneck"
encoder = load_model(encoder_path)

In [None]:
# Columns excluded from the model inputs (single source of truth for all splits).
dropped_columns = ["account_id", "processed_at", "number_of_selfies_sent", "cash_out_type_6"]


def _split_features_labels(frame):
    """Return ``(features, labels)`` for one data split without mutating ``frame``.

    ``features`` is ``frame`` without ``dropped_columns`` and without the
    ``is_fraud`` target; ``labels`` is the ``is_fraud`` column as a NumPy array.
    """
    features = frame.drop(columns=dropped_columns + ["is_fraud"])
    labels = np.array(frame["is_fraud"])
    return features, labels


# Stratified hold-outs: 1% of the data for test, then 1% of the remainder for validation.
train_split, test_split = train_test_split(
    data_ts, test_size=0.01, stratify=data_ts.is_fraud, random_state=RANDOM_STATE)
train_split, val_split = train_test_split(
    train_split, test_size=0.01, stratify=train_split.is_fraud, random_state=RANDOM_STATE)

# Persist the test split before any column is removed (target included).
test_split.to_csv("test_df.csv", index=False)

# Build new frames instead of dropping in place on the split outputs, which
# avoids SettingWithCopyWarning and keeps the full splits available.
train, labels_train = _split_features_labels(train_split)
val, labels_val = _split_features_labels(val_split)
test, labels_test = _split_features_labels(test_split)

# Encode every split with the pre-trained encoder.
features_train = encoder.predict(np.array(train))
features_val = encoder.predict(np.array(val))
features_test = encoder.predict(np.array(test))

clf = RandomForestClassifier(
    n_estimators=400,
    max_depth=50,
    max_features='log2',
    min_samples_split=2,
    min_samples_leaf=2,
    class_weight="balanced_subsample",
    oob_score=True,
    random_state=RANDOM_STATE,
    n_jobs=3,
    verbose=1,
)

clf.fit(features_train, labels_train)

In [None]:
# Pair the encoded train/validation features with their labels, in the
# (train, validation) order that compare_models receives them.
X_pack = (features_train, features_val)
Y_pack = (labels_train, labels_val)

history = utils.compare_models(
    X_pack,
    Y_pack,
    clf,
)

In [None]:
# Draw the curves produced by utils.plot_pr_curve for the train/validation run
# (presumably precision-recall, going by the helper's name -- confirm in utils).
plt.figure()
utils.plot_pr_curve(history)
# plt.show has to be called; the bare attribute reference never invoked it and
# only echoed the function object as the cell output.
plt.show()

# Testing

In [None]:
# Same (first, second) packing as the train/validation run, with None in the
# first slot and the held-out test data in the second.
# NOTE(review): presumably compare_models skips the first slot when it is None -- confirm in utils.
X_pack_test = (None, features_test)
Y_pack_test = (None, labels_test)

history_test = utils.compare_models(
    X_pack_test,
    Y_pack_test,
    clf,
)

In [None]:
# Values stored by compare_models for the test run under "clf_0" / "y_hat_test",
# plotted against the true test labels.
y_hat_test = history_test["clf_0"]["y_hat_test"]
utils.plot_cm(labels_test, y_hat_test, "Final Model")

# K-fold cross-validation

In [None]:
# Pool the encoded train and validation sets for cross-validation.
# Note: this rebinds Y, which an earlier cell set to the labels of the full dataset.
encoded_x = np.concatenate([features_train, features_val], axis=0)
Y = np.concatenate([labels_train, labels_val], axis=0)

indices_split, (X_test, Y_test) = utils.split_data(
    encoded_x,
    Y,
    test_size=0.01,
    kfold=10,
    random_state=RANDOM_STATE,
)

# Print, for each entry of indices_split, the sizes of its first and second
# elements (labelled train and val in the message).
for key, value in indices_split.items():
    print(f"{key} size train: {len(value[0])}, val: {len(value[1])}")

In [None]:
# Train on the pooled encoded data using the splits computed by utils.split_data.
clfs = utils.train_model(
    encoded_x,
    Y,
    indices_split=indices_split,
)

In [None]:
# Evaluate the cross-validation classifiers; unlike the earlier calls, the
# data is passed unpacked together with the split indices.
history_cross = utils.compare_models(
    encoded_x,
    Y,
    clfs,
    indices_split,
)

In [None]:
# Plot the curves for the cross-validation run (presumably precision-recall,
# going by the helper's name -- confirm in utils).
utils.plot_pr_curve(history_cross)

In [None]:
# Persist the random forest fitted on the training split (clf), not the
# cross-validation classifiers (clfs). `dump` is imported in the notebook's
# top import cell.
dump(clf, 'model_rfc.joblib')