<a href="https://colab.research.google.com/github/alessandrotofani/Tesi_magistrale/blob/master/5_Dinamic_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
from google.colab import drive
drive.mount('/content/drive')
from sklearn.model_selection import train_test_split

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import sys 
sys.path.append('/content/drive/MyDrive/Tesi_magistrale/Tesi_magistrale')
import mf

In [3]:
data = pd.read_csv('/content/drive/MyDrive/Tesi_magistrale/Dataset/IEEE/Output/data.csv')
data.drop(data.columns[data.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)

# Feature engineering and scaling

In [4]:
data = mf.feature_engineering(data)
data = mf.feature_scaling(data)
data = pd.get_dummies(data)

# Fraud distribution analysis

In [5]:
initial_splits = 2
splits = 1
tot_splits = 10   

def split_data(data, tot_splits, verbose = False):
  n_data = {}
  dati_splitted = []
  dati_fraud = []

  for i in range(tot_splits):
    dati_splitted.append(data[(data['TransactionDT']>=(i/tot_splits)) & (data['TransactionDT']<((i + 1)/tot_splits))]) 
    dati_fraud.append(dati_splitted[i][dati_splitted[i]['isFraud'] == 1])
    if verbose: 
      dati_per_split = dati_splitted[i].count()[0]  
      fraud_per_split = dati_fraud[i].count()[0]  
      density = fraud_per_split / dati_per_split
      n_data[i] = [dati_per_split, fraud_per_split, density.round(3)]
  if verbose:
    print(n_data)
    print(dati_splitted)
    print(dati_fraud)

  return dati_splitted, dati_fraud

dati_splitted, dati_fraud = split_data(data, tot_splits, verbose = False)

In [6]:
class ensemble:
  def __init__(self):
    self.score = 0
    self.pred = []
    return 

  def predictions(self, X_val, xgb, soglia = 0.05):
    proba = []
    y_mean = []
    y_pred = []
    for t in xgb:
      ''' if model_t is ensemble: predict
          else: predict_proba '''
      proba.append(xgb[t].predict_proba(X_val, validate_features = False)[:, 1]) 
 

    y_mean = np.mean(proba, axis = 0)
    for p in y_mean:
      if p > soglia:
        y_pred.append(1)
      else:
        y_pred.append(0)
    # print(y_pred_t)
    return y_pred

  def voting(self, X_val, y_val, xgb):
    from sklearn.metrics import roc_auc_score
    from sklearn.metrics import f1_score

    self.pred = ensemble.predictions(self, X_val, xgb)
    print(self.pred)
    self.score = f1_score(y_val, self.pred)
    # self.score = roc_auc_score(y_val, self.pred)
    return self.pred , self.score


In [7]:
def predictions(model, X_val, soglia = 0.05):
  y_pred = []
  proba = model.predict_proba(X_val, validate_features = False)[:, 1]
  for p in proba:
    if p > soglia:
      y_pred.append(1)
    else:
      y_pred.append(0)
  return y_pred

In [9]:
def train_and_test_model(data, data_fraud, model_t, xgb, ens, score_t, t, keep_fraud = False):
  import xgboost 
  from xgboost import XGBClassifier
  from sklearn.metrics import roc_auc_score
  from sklearn.metrics import f1_score
  print('entered')
  X_train, X_val, y_train, y_val = mf.split(data, test_size = 0.2)
  if t > 0 and keep_fraud == True:
    y = data_fraud['isFraud']
    X = data_fraud.drop(['isFraud'], axis = 1)
    X_train = pd.concat([X_train, X], axis=0)
    y_train = pd.concat([y_train, y], axis=0)
  print('done')
  xgb[t] = XGBClassifier(n_estimators = 100, max_depth = 2, 
                      objective='binary:logistic', learning_rate = 0.5, 
                      tree_method='gpu_hist')
  xgb[t].fit(X_train.to_numpy(), y_train.to_numpy())
  
  # new_score = roc_auc_score(y_val, model_t[t].predict_proba(X_val, validate_features = False)[:, 1])

  y_pred = predictions(xgb[t], X_val)
  new_score = f1_score(y_val, y_pred)


  if t > 0:
    old_score = score_t[t-1]
    ens.voting(X_val, y_val, xgb)

    score_dict = {
        'new': new_score,
        'old': old_score,
        'ensemble': ens.score
    }

    print('Scores \n', score_dict)

    import operator
    best_model = max(score_dict.items(), key=operator.itemgetter(1))[0]
    best_score = score_dict[best_model]
    if best_model == 'new':
      return xgb[t], best_score, xgb[t]
    if best_model == 'old':
      return model_t[t-1], best_score, xgb[t-1]
    if best_model == 'ensemble':
      return ens, ens.score, xgb[t]
  else:
    return xgb[t], new_score, xgb[t]

In [11]:
def dinamic_model(initial_splits, splits, tot_splits, data, data_fraud):
  tmax = np.ceil((tot_splits) / splits) #  massimo numero di iterazioni possibili 
  data_t = {} # contiene i dati dello split 
  data_fraud_t = {} # contiene le transazioni fraudolente 
  model_t = {} # contiene il modello in uso al tempo t
  xgb = {} # contiene gli xgb trainati solo sullo split t 
  score_t = {} # contiene lo score dei modelli 
  ens = ensemble() # ensemble dei modelli 

  for t in range(int(tmax - 1)):
    if t == 0:
      start = 0
      end = initial_splits
      data_fraud_t[t] = None
    else:
      start = splits * (t - 1) + initial_splits
      end = start + splits
      start_fraud = splits * (t - 2) + initial_splits
      end_fraud = start + splits
      data_fraud_t[t] = pd.concat(data_fraud[start_fraud:end_fraud])
      # print(data_fraud_t[t])
    data_t[t] = pd.concat(dati_splitted[start:end])
    # if t > 0:
    #   data_list = [data_t[t]] # lista con i dati da da usare per il train 
    #   for i in range(t):
    #     data_list.append(data_fraud[i]) # aggiungo le transazioni fraudolente precedenti
    #   data_t[t] = pd.concat(data_list) 
    print(t)
    model_t[t], score_t[t], xgb[t] = train_and_test_model(data_t[t], data_fraud_t[t], model_t, xgb, ens, score_t, t, keep_fraud = False) # ricavo il modello più performante
    print('Score: ', score_t[t]) 
  return model_t[t], score_t[t], xgb[t]

model, score, xgb = dinamic_model(initial_splits, splits, tot_splits, data, dati_fraud)
print('Score: ', score)
print('Model: ', model)

0
entered
done
Score:  0.33323067446874044
1
entered
done
[0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,

In [None]:
# X_train, X_val, y_train, y_val = mf.split(pd.concat(dati_splitted[:initial_splits - 1]), test_size = 0.2)
# X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.25, random_state=42)