<a href="https://colab.research.google.com/github/alessandrotofani/Tesi_magistrale/blob/master/5_Dinamic_model_new.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
from google.colab import drive
drive.mount('/content/drive')
from sklearn.model_selection import train_test_split

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import sys 
sys.path.append('/content/drive/MyDrive/Tesi_magistrale/Tesi_magistrale')
import mf

In [None]:
# Load the pre-processed transactions table from Drive.
data = pd.read_csv('/content/drive/MyDrive/Tesi_magistrale/Dataset/IEEE/Output/data.csv')
# Discard every column whose name contains "unnamed" (case-insensitive match).
data = data.drop(columns = data.columns[data.columns.str.contains('unnamed', case = False)])

# Feature engineering and scaling

In [None]:
# Preprocessing pipeline, applied innermost call first:
#   1. mf.feature_engineering  (project helper)
#   2. mf.feature_scaling      (project helper)
#   3. pd.get_dummies          (dummy/indicator encoding of categorical columns)
data = pd.get_dummies(
  mf.feature_scaling(
    mf.feature_engineering(data)
  )
)

# Dynamic model

In [None]:
# Time-split configuration.
#   init_splits     -> number of splits consumed at time step t = 0
#   incoming_splits -> number of new splits consumed at every later time step
#   tot_splits      -> total number of TransactionDT bins the data is cut into
init_splits, incoming_splits, tot_splits = 4, 2, 10

In [None]:
def get_split_id(data, init_splits, incoming_splits, tot_splits):
  """Partition the rows of ``data`` into ``tot_splits`` equal-width time bins.

  Bin ``i`` covers ``i / tot_splits <= TransactionDT < (i + 1) / tot_splits``.
  The last bin also includes its upper edge, so rows whose ``TransactionDT``
  equals exactly 1.0 are assigned to the final split instead of being dropped.

  Parameters
  ----------
  data : pandas.DataFrame
    Must contain the columns ``TransactionDT`` and ``isFraud``.
    The bin edges assume ``TransactionDT`` lies in [0, 1] -- TODO confirm
    against the scaling applied upstream.
  init_splits, incoming_splits : int
    Unused here; kept so the signature stays unchanged.
  tot_splits : int
    Number of bins.

  Returns
  -------
  (dict, dict)
    ``{split: [index labels]}`` for all rows, and the same mapping restricted
    to rows with ``isFraud == 1``.
  """
  split_id = {} # split -> index labels of the rows in that split
  split_id_fraud = {} # split -> index labels of the fraudulent rows in that split
  dt = data['TransactionDT']
  is_fraud = (data['isFraud'] == 1).to_numpy()
  last_split = tot_splits - 1

  for i in range(tot_splits):
    lower = i / tot_splits
    upper = (i + 1) / tot_splits
    # Close the interval on the right only for the last bin.
    below_upper = (dt <= upper) if i == last_split else (dt < upper)
    in_split = ((dt >= lower) & below_upper).to_numpy()
    # Index the labels directly instead of materialising a filtered copy of the frame.
    split_id[i] = data.index[in_split].tolist()
    split_id_fraud[i] = data.index[in_split & is_fraud].tolist()

  return split_id, split_id_fraud

In [None]:
class model:
  """Wrapper around one XGBoost classifier associated with time step ``t``.

  Attributes
  ----------
  t : time step the model belongs to
  f1, precision, recall, roc : metrics filled in by ``performance``
  score : alias of ``f1`` (kept for backward compatibility)
  proba : output of ``proba_predictions``
  prediction : list of 0/1 labels produced by ``aggregate``
  train_id, val_id : row ids used for training / validation
  y_val : validation labels used by ``performance`` when none are passed
  xgb : the underlying classifier, ``None`` until ``init_xgb`` is called
  """
  name = 'XGBoost'

  def __init__(self, t):
    self.t = t
    self.f1 = 0
    self.score = 0
    self.precision = 0
    self.recall = 0
    self.roc = 0
    self.proba = []
    self.prediction = []
    self.train_id = [] # rows the model was trained on
    self.val_id = [] # rows the model is validated on
    self.y_val = None # validation labels, see performance()
    self.xgb = None # created by init_xgb()

  def __str__(self):
    return f"{self.name}, at time {self.t}, with f1 score of {self.f1}"

  def init_xgb(self, estimators = 100, depth = 6, learning_rate = 0.5, tree_method = 'gpu_hist'):
    """Create the underlying XGBClassifier.

    ``learning_rate`` and ``tree_method`` default to the values that used to be
    hard-coded; pass e.g. ``tree_method = 'hist'`` to run without a GPU.
    """
    from xgboost import XGBClassifier
    self.xgb = XGBClassifier(n_estimators = estimators, max_depth = depth,
                      objective='binary:logistic', learning_rate = learning_rate,
                      tree_method=tree_method)
    return f'{self.name} at time {self.t}, has been initialised.'

  def train_xgb(self, X_train, y_train):
    """Fit the classifier on the NumPy views of ``X_train`` / ``y_train``."""
    self.xgb.fit(X_train.to_numpy(), y_train.to_numpy())
    return f'{self.name} is now trained!'

  def get_set(self, data, id_list):
    """Return the rows of ``data`` whose index labels are in ``id_list``.

    The ids stored by this notebook come from ``DataFrame.index``, i.e. they
    are labels, so the lookup is label-based (``.loc``) rather than positional.
    """
    subset = data.loc[id_list, :]
    return subset

  def proba_predictions(self, X_val):
    """Store the class probabilities for ``X_val`` in ``self.proba``."""
    if self.xgb is None:
      return f'{self.name} is not declared. You need to use init_xgb'
    # Feature-name validation is skipped: train_xgb fits on a plain NumPy array.
    self.proba = self.xgb.predict_proba(X_val, validate_features = False)
    return f'Proba acquired.'

  def aggregate(self, soglia = 0.05):
    """Turn ``self.proba`` into 0/1 labels: 1 when P(class 1) > ``soglia``.

    ``self.prediction`` is rebuilt on every call, so repeated calls do not
    accumulate stale labels.
    """
    # len() works for both the initial empty list and an ndarray;
    # the truth value of a multi-element ndarray is ambiguous.
    if len(self.proba) == 0:
      return f'self.proba is empty. Before aggregate, you need to use proba_predictions'
    positive_proba = np.asarray(self.proba)[:, 1]
    self.prediction = [1 if p > soglia else 0 for p in positive_proba]
    return f'Classification has been councluded!'

  def performance(self, soglia = 0.05, y_val = None):
    """Plot the confusion matrix and compute f1, ROC AUC, precision and recall.

    Parameters
    ----------
    soglia : threshold forwarded to ``mf.plot_cm``.
    y_val : true labels; when omitted, ``self.y_val`` is used.

    Raises
    ------
    ValueError
      If no validation labels are available or ``aggregate`` was not run.
    """
    from sklearn.metrics import roc_auc_score, f1_score, recall_score, precision_score
    if y_val is None:
      y_val = self.y_val
    if y_val is None:
      raise ValueError('No validation labels: pass y_val or set self.y_val first.')
    if len(self.prediction) == 0:
      raise ValueError('self.prediction is empty: call aggregate first.')

    mf.plot_cm(y_val, self.prediction, 'Blues', p = soglia, dynamic = True)
    print('\n')
    print(f'{self.name} confusion matrix')

    # round(float(...)) works whether the metric is a NumPy scalar or a Python float.
    self.f1 = round(float(f1_score(y_val, self.prediction)), 2)
    self.score = self.f1 # backward-compatible alias
    # NOTE(review): ROC AUC is computed on the thresholded labels, not on the probabilities.
    self.roc = round(float(roc_auc_score(y_val, self.prediction)), 2)
    self.precision = round(float(precision_score(y_val, self.prediction,  average='binary')), 2)
    self.recall = round(float(recall_score(y_val, self.prediction, average='binary')), 2)

    print('\n')
    print('Model f1 score: ', self.f1)
    print('Model roc auc score: ', self.roc)
    print('Modelrecall: ', self.recall)
    print('Model precision: ', self.precision)
    print('\n')
    return

In [None]:
class ensemble(model): # child class of model
  """Averages the positive-class probabilities of several fitted models.

  ``proba_predictions`` and ``aggregate`` replace the parent implementations:
  here ``self.proba`` is a 1-D array of averaged positive-class probabilities.
  """
  name = 'Ensemble'

  def proba_predictions(self, X_val, model_dict):
    """Store in ``self.proba`` the mean positive-class probability over ``model_dict``.

    Parameters
    ----------
    X_val : features to score.
    model_dict : mapping ``{t: member}``. Each member is either a wrapper that
      exposes the fitted classifier as ``.xgb`` or the classifier itself.

    Raises
    ------
    ValueError
      If ``model_dict`` is empty (there is nothing to average).
    """
    proba = []
    for t in model_dict:
      member = model_dict[t]
      clf = getattr(member, 'xgb', None)
      if clf is None:
        clf = member # the entry already is a classifier
      proba.append(clf.predict_proba(X_val, validate_features = False)[:, 1])
    if not proba:
      raise ValueError('model_dict is empty: at least one model is required.')
    self.proba = np.mean(proba, axis = 0)
    return f'Proba acquired.'

  def aggregate(self, soglia = 0.05):
    """Turn the averaged probabilities into 0/1 labels (1 when > ``soglia``).

    ``self.prediction`` is rebuilt on every call instead of being extended.
    """
    self.prediction = [1 if p > soglia else 0 for p in np.atleast_1d(self.proba)]
    return f'Classification has been councluded!'

In [None]:
def get_t_id(data, id, t, init_splits, incoming_splits):
  """Collect the row ids to use at time step ``t``.

  Step 0 uses splits ``0 .. init_splits - 1``; every later step uses the next
  ``incoming_splits`` splits after those already consumed.

  Parameters
  ----------
  data : unused; kept so the signature stays unchanged.
  id : dict ``{split number: list of row ids}``.
  t : int, time step (0-based).
  init_splits, incoming_splits : int, see above.

  Returns
  -------
  list
    Concatenation of the id lists of the selected splits. Split numbers that
    are not present in ``id`` contribute nothing.
  """
  if t == 0:
    start = 0
    end = init_splits
  else:
    start = incoming_splits * (t - 1) + init_splits
    end = start + incoming_splits

  id_list = [] # rows to use at time t
  # ``id`` is a dict keyed by split number, so it is looked up key by key
  # (a dict cannot be sliced).
  for n in range(start, end):
    id_list += id.get(n, [])

  return id_list

In [None]:
def train_test_model(sel_model, data, id_t):
  """Initialise, train and evaluate ``sel_model`` on the rows ``id_t`` of ``data``.

  Parameters
  ----------
  sel_model : model object exposing ``init_xgb``, ``get_set``, ``train_xgb``,
    ``proba_predictions``, ``aggregate`` and ``performance``.
  data : full dataset.
  id_t : row ids selected for this time step.

  Returns
  -------
  str
    A short status message.
  """
  # Model initialisation
  sel_model.init_xgb()

  # Train / validation split
  subset = sel_model.get_set(data, id_t) # subset the model will be trained on
  # presumably returns (X_train, X_val, y_train, y_val) -- verify against mf.split
  X_train, X_val, y_train, y_val = mf.split(subset, test_size = 0.2)
  sel_model.train_id = X_train.index.tolist() # ids of the training rows
  sel_model.val_id = X_val.index.tolist() # ids of the validation rows
  sel_model.y_val = y_val # keep the validation labels on the model, next to val_id

  # Training and performance evaluation
  soglia = 0.05
  sel_model.train_xgb(X_train, y_train)
  sel_model.proba_predictions(X_val)
  sel_model.aggregate(soglia = soglia)
  sel_model.performance(soglia = soglia)

  return f'Training and testing concluded. \n'

In [None]:
def dinamic_model(data, id, id_fraud, init_splits, incoming_splits, tot_splits):
  """Train one model per time step over the time-ordered splits.

  Step 0 consumes the first ``init_splits`` splits, every later step consumes
  the next ``incoming_splits`` splits, until all ``tot_splits`` are used.

  Parameters
  ----------
  data : full dataset.
  id : dict ``{split: row ids}``.
  id_fraud : dict ``{split: fraudulent row ids}``; currently unused.
  init_splits, incoming_splits, tot_splits : int, split configuration.

  Returns
  -------
  (dict, dict)
    ``model_dict`` maps each time step to its trained model;
    ``ensemble_dict`` maps each time step > 0 to a newly created ensemble
    object (created here but not trained or evaluated).
  """
  # One initial step plus as many steps as needed to consume the remaining
  # splits. The count depends on init_splits: deriving it from
  # tot_splits / incoming_splits alone is only right when
  # init_splits == 2 * incoming_splits.
  remaining_splits = max(tot_splits - init_splits, 0)
  n_steps = 1 + int(np.ceil(remaining_splits / incoming_splits))
  model_dict = {}
  ensemble_dict = {}

  for t in range(n_steps):
    id_t = get_t_id(data, id, t, init_splits, incoming_splits)
    model_dict[t] = model(t)
    if t > 0:
      ensemble_dict[t] = ensemble(t)
    train_test_model(model_dict[t], data, id_t)

  return model_dict, ensemble_dict

In [None]:
# Build the per-split row ids before running the model: without this step the
# name `id` refers to the Python builtin and `id_fraud` is undefined.
split_id, split_id_fraud = get_split_id(data, init_splits, incoming_splits, tot_splits)
model_dict, ensemble_dict = dinamic_model(data, split_id, split_id_fraud, init_splits, incoming_splits, tot_splits)