# Detección de anomalías en un marco de auditoría continua

## 9. Datos desbalanceados - Modelos no supervisados - PyCaret

### 9.1. Split del dataset

Split del dataset para reservar un conjunto de datos para validación posterior al entrenamiento y desarrollo de los modelos.

In [1]:
# Importación de librerías
import pandas as pd
import numpy as np
import os
import time
import joblib

# ! pip install pandas
# ! pip install numpy
# ! pip install pycaret==2.3.5
# ! pip install scipy==1.4.1
# ! pip install joblib
# ! pip install sklearn
# ! pip install hyperopt

In [2]:
# Filtro de warnings.
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Carga del dataset.
os.chdir('..')
df = pd.read_csv('df_preprocessed.csv')
df.head()

Unnamed: 0,day,hour,amount,idOrig,oldbalanceOrig,newbalanceOrig,idDest,oldbalanceDest,newbalanceDest,Cash_in,Cash_out,Debit,Payment,Transfer,isFraud,isFlaggedFraud
0,1,1,9839.64,0,170136.0,160296.36,6353307,0.0,0.0,0,0,0,1,0,0,0
1,1,1,1864.28,1,21249.0,19384.72,6353308,0.0,0.0,0,0,0,1,0,0,0
2,1,1,181.0,2,181.0,0.0,6353309,0.0,0.0,0,0,0,0,1,1,0
3,1,1,181.0,3,181.0,0.0,6353310,21182.0,0.0,0,1,0,0,0,1,0
4,1,1,11668.14,4,41554.0,29885.86,6353311,0.0,0.0,0,0,0,1,0,0,0


In [4]:
# Split del dataset para reservar un conjunto de validación.
from sklearn.model_selection import train_test_split
df_training, df_validation = train_test_split(df, test_size = 0.2, random_state = 42, shuffle = True, stratify = np.array(df['isFraud']))

In [5]:
# Control de frecuencias de clases.
print('Proporcion de clases para dataset de training: %.4f' % (df_training['isFraud'].value_counts()[1] / df_training['isFraud'].value_counts()[0]))
print('Proporcion de clases para dataset de validación: %.4f' % (df_validation['isFraud'].value_counts()[1] / df_validation['isFraud'].value_counts()[0]))

Proporcion de clases para dataset de training: 0.0013
Proporcion de clases para dataset de validación: 0.0013


### 9.2. Detección de anomalías utilizando la librería PyCaret

https://towardsdatascience.com/unsupervised-anomaly-detection-in-python-f2e61be17c2b

https://www.oreilly.com/library/view/hands-on-unsupervised-learning/9781492035633/ch04.html

https://pycaret.org/

In [21]:
# Importación de librerías.
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, recall_score, precision_score, roc_auc_score, accuracy_score
from hyperopt import fmin, hp, tpe, space_eval, Trials
from pycaret.anomaly import *

# Definición de la función objetivo.
def objective_func(search_space):

    global counter, scorings, df_subset
    
    start_time = time.time()
    counter += 1    
    
    kfold = StratifiedKFold(n_splits = 4, shuffle = True, random_state = 42)
    
    scores = {'accuracy' : [],
              'recall' : [],
              'precision' : [],
              'f1_score' : [],
              'roc_auc' : []
             }
    
    scorings_trial = {}

    for train, test in kfold.split(df_subset, y = df_subset.iloc[:, -2]):

        anom = setup(data = df_subset.iloc[train, :],
                     ignore_features = ['isFraud', 'isFlaggedFraud'], 
                     categorical_features = ['Cash_in', 'Cash_out', 'Debit', 'Payment', 'Transfer'],
                     normalize = True,
                     normalize_method = search_space['scaler'],
                     silent = True,
                     use_gpu = True,
                     verbose = False,
                     session_id = 42, 
                    )
        
        iforest = create_model(model = search_space['model'], fraction = search_space['fraction'], verbose = False)
        
        predictions = predict_model(iforest, data = df_subset.iloc[test, :])
        
        y_true = df_subset.iloc[test, -2]
        y_pred = predictions.loc[:, 'Anomaly']
        
        scores['accuracy'].append(accuracy_score(y_true = y_true, y_pred = y_pred))
        scores['recall'].append(recall_score(y_true = y_true, y_pred = y_pred))
        scores['precision'].append(precision_score(y_true = y_true, y_pred = y_pred))
        scores['f1_score'].append(f1_score(y_true = y_true, y_pred = y_pred))
        scores['roc_auc'].append(roc_auc_score(y_true = y_true, y_score = y_pred))
    
    scorings_trial['trial'] = counter
    scorings_trial['search_space'] = str(search_space)

    for score in scores:
        scorings_trial[score + '_mean'] = np.mean(scores[score])
        scorings_trial[score + '_std'] = np.std(scores[score])

    loss = 1 - scorings_trial['f1_score_mean'] + scorings_trial['f1_score_std']

    scorings.append(scorings_trial)
    
    elapsed_time = time.time() - start_time
    
    print('Trial: %i | Loss: %.4f | Elapsed_time: %.4f seconds' % (counter, loss, elapsed_time))
    
    return loss

# Función para la impresión de resultados.
def hyperopt_printer(trials):
    print('\nBest Trial:')
    print('Trial ID :', trials.best_trial['tid'])
    print('Loss :', trials.best_trial['result']['loss'])
    print('Params :', space_eval(search_space, best_params), end = '\n\n')

# Función para exportación de resultados a formato JSON.
def json_export(scorings):
    try:
        os.mkdir('hyperopt_optimizations')
    except:
        None

    now = datetime.now()
    dt_string = now.strftime("%Y%m%d_%H%M%S")
    path = os.getcwd() + '\hyperopt_optimizations\\'
    filename = 'Opt_' + dt_string + '.json'

    f = open(path + filename, 'w')
    json.dump(scorings, f)
    f.close()
    print('Json file: ' + dt_string + '.json', end = '\n\n')

# Función para la obtención de resultados en formato tabla para visualizar scorings y desvios estándar.
def scorings_to_df(scorings):
    scorings_df = pd.DataFrame(columns = list(scorings[0].keys()))
    for scoring in scorings:
        scorings_df = scorings_df.append(scoring, ignore_index = True)

    print('Scoring details:', end = '\n\n')
    with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', 1000):  # more options can be specified also
        print(scorings_df)

    return scorings_df

In [22]:
# Importación de librerías.
from sklearn.model_selection import StratifiedShuffleSplit

# Espacio de búsqueda.
sss = StratifiedShuffleSplit(n_splits = 1, random_state = 42, train_size = 0.005)
for a, b in sss.split(df_training, df_training.iloc[:, -2]):
    df_subset = df_training.iloc[a, :]
scaler = ['zscore', 'minmax', 'maxabs', 'robust']
models = ['abod', 'cluster', 'cof', 'iforest', 'histogram', 'knn', 'lof', 'svm', 'pca', 'mcd', 'sos']
search_space = {'scaler' : hp.choice('scaler', scaler),
                'model' : hp.choice('model', models),
                'fraction' : hp.randint('fraction', 1 , 50) / 100
               }

# Ejecución del optimizador.
counter = -1
scorings = []
rstate = np.random.default_rng(42)
trials = Trials() # Para el logging de resultados.
best_params = fmin(fn = objective_func, space = search_space, algo = tpe.suggest, max_evals = 100, trials = trials, rstate = rstate)

Trial: 0 | Loss: 0.9933 | Elapsed_time: 7.2783 seconds 
Trial: 1 | Loss: 0.9882 | Elapsed_time: 15.3439 seconds                          
Trial: 2 | Loss: 0.9908 | Elapsed_time: 8.0661 seconds                           
Trial: 3 | Loss: 0.9900 | Elapsed_time: 6.2181 seconds                           
Trial: 4 | Loss: 0.9914 | Elapsed_time: 15.4294 seconds                          
Trial: 5 | Loss: 0.9892 | Elapsed_time: 6.2649 seconds                           
Trial: 6 | Loss: 0.9805 | Elapsed_time: 81.2892 seconds                          
Trial: 7 | Loss: 0.9942 | Elapsed_time: 34.3305 seconds                          
Trial: 8 | Loss: 0.9918 | Elapsed_time: 76.9776 seconds                          
Trial: 9 | Loss: 0.9906 | Elapsed_time: 15.3473 seconds                            
Trial: 10 | Loss: 0.9931 | Elapsed_time: 33.0644 seconds                           
Trial: 11 | Loss: 0.9845 | Elapsed_time: 6.3481 seconds                           
Trial: 12 | Loss: 0.9882 | Elapsed_ti

In [23]:
# Importación de librerías.
import os
import json
from datetime import datetime

hyperopt_printer(trials)
json_export(scorings)
scorings_df = scorings_to_df(scorings)


Best Trial:
Trial ID : 29
Loss : 0.9660247181470806
Params : {'fraction': 0.04, 'model': 'knn', 'scaler': 'minmax'}

Json file: 20220423_215935.json

Scoring details:

   trial                                       search_space accuracy_mean accuracy_std recall_mean recall_std precision_mean precision_std f1_score_mean f1_score_std roc_auc_mean roc_auc_std
0      0  {'fraction': 0.32, 'model': 'pca', 'scaler': '...      0.680707     0.004416    0.881944   0.078874       0.003557       0.00019      0.007085     0.000379     0.781197    0.038692
1      1  {'fraction': 0.17, 'model': 'iforest', 'scaler...       0.83108     0.008489    0.850694   0.097902       0.006473      0.000513      0.012848     0.001019     0.840876    0.046722
2      2  {'fraction': 0.15, 'model': 'cluster', 'scaler...      0.850884     0.007219    0.604167     0.0625       0.005248      0.000616      0.010405     0.001218     0.727684    0.028158
3      3  {'fraction': 0.07, 'model': 'histogram', 'scal...      0.

In [4]:
# Espacio de búsqueda.
df_subset = df_training.iloc[0:100000,:]
models = ['abod', 'cluster', 'cof', 'iforest', 'histogram', 'knn', 'lof', 'svm', 'pca', 'mcd', 'sod', 'sos']
search_space = {'scaler' : hp.choice('scaler', ['zscore', 'minmax', 'maxabs', 'robust']),
                'model' : hp.choice('model', models),
                'fraction' : hp.randint('fraction', 1 , 100) / 100}

# Ejecución del optimizador.
counter = -1
scorings = []
rstate = np.random.default_rng(42)
trials = Trials() # Para el logging de resultados.
best_params = fmin(fn = objective_func, space = search_space, algo = tpe.suggest, max_evals = 100, trials = trials, rstate = rstate)

Trial: 0 | Loss: 0.9960 | Elapsed_time: 18.5950 seconds
Trial: 1 | Loss: 0.9936 | Elapsed_time: 16.8268 seconds                          
Trial: 2 | Loss: 0.9927 | Elapsed_time: 17.8957 seconds                          
Trial: 3 | Loss: 0.9879 | Elapsed_time: 17.4905 seconds                          
Trial: 4 | Loss: 0.9955 | Elapsed_time: 17.2743 seconds                          
Trial: 5 | Loss: 0.9940 | Elapsed_time: 16.5974 seconds                          
Trial: 6 | Loss: 0.9767 | Elapsed_time: 16.6526 seconds                          
Trial: 7 | Loss: 0.9966 | Elapsed_time: 16.8212 seconds                          
Trial: 8 | Loss: 0.9934 | Elapsed_time: 17.0364 seconds                          
Trial: 9 | Loss: 0.9950 | Elapsed_time: 16.8877 seconds                          
Trial: 10 | Loss: 0.9956 | Elapsed_time: 17.2701 seconds                          
Trial: 11 | Loss: 0.9888 | Elapsed_time: 17.4749 seconds                          
Trial: 12 | Loss: 0.9933 | Elapsed_time:

In [7]:
import os
import json
from datetime import datetime
from hyperopt import fmin, hp, tpe, space_eval, Trials

# Función para la impresión de resultados.
def hyperopt_printer(trials):
    print('\nBest Trial:')
    print('Trial ID :', trials.best_trial['tid'])
    print('Loss :', trials.best_trial['result']['loss'])
    print('Params :', space_eval(search_space, best_params), end = '\n\n')

# Función para exportación de resultados a formato JSON.
def json_export(scorings):
    try:
        os.mkdir('HyperOpt Optimizations')
    except:
        None

    now = datetime.now()
    dt_string = now.strftime("%Y%m%d_%H%M%S")
    path = os.getcwd() + '\\HyperOpt Optimizations\\'
    filename = 'Opt_' + dt_string + '.json'

    f = open(path + filename, 'w')
    json.dump(scorings, f)
    f.close()
    print('Json file: ' + dt_string + '.json', end = '\n\n')

# Función para la obtención de resultados en formato tabla para visualizar scorings y desvios estándar.
def scorings_to_df(scorings):
    scorings_df = pd.DataFrame(columns = list(scorings[0]['scorings'].keys()))
    for scoring in scorings:
        scorings_df = scorings_df.append(scoring['scorings'], ignore_index = True)

    print('Scoring details:', end = '\n\n')
    with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', 1000):  # more options can be specified also
        print(scorings_df)

In [8]:
hyperopt_printer(trials)
json_export(scorings)
scorings_to_df(scorings)


Best Trial:
Trial ID : 51
Loss : 0.9583333333333333
Params : {'fraction': 0.02, 'model': 'histogram', 'scaler': 'robust'}

Json file: 20220309_011343.json



KeyError: 'scorings'

In [None]:
# plot_model(anom_model, plot = 'tsne')
# plot_model(anom_model, plot = 'umap')

In [None]:
df_test = df[100000:200000]
predictions = predict_model(iforest, data = df_test)
predictions.head()

In [None]:
save_model(model = iforest, model_name = 'iforest_model')

In [None]:
loaded_model = load_model('iforest_model')
type(loaded_model)

In [None]:
df_test = df[100000:200000]
loaded_model.predict(df_test)

In [None]:
loaded_model.predict_proba(df_test)

In [None]:
loaded_model.decision_function(df_test)