# Detección de anomalías en un marco de auditoría continua

## 6.3. Modelos no supervisados

### 6.3.3. Detección de anomalías utilizando la Ley de Benford

https://towardsdatascience.com/frawd-detection-using-benfords-law-python-code-9db8db474cf8

Split del dataset para reservar un conjunto de datos para validación posterior al entrenamiento y desarrollo de los modelos.

In [3]:
# Importación de librerías
import pandas as pd
import numpy as np
import sys
import math
import matplotlib.pyplot as plt

# Carga del dataset.
df = pd.read_csv('df_preprocessed.csv')

# Split del dataset para reservar un conjunto de validación.
from sklearn.model_selection import train_test_split
df_training, df_validation = train_test_split(df, test_size = 0.2, random_state = 42, shuffle = True, stratify = np.array(df['isFraud']))

# Control de frecuencias de clases.
print('Proporcion de clases para dataset de training: %.4f' % (df_training['isFraud'].value_counts()[1] / df_training['isFraud'].value_counts()[0]))
print('Proporcion de clases para dataset de validación: %.4f' % (df_validation['isFraud'].value_counts()[1] / df_validation['isFraud'].value_counts()[0]))

Proporcion de clases para dataset de training: 0.0013
Proporcion de clases para dataset de validación: 0.0013


In [24]:
# Configuración de número de decimales.
pd.options.display.float_format = '{:.2f}'.format

# Analisis exploratorio de datos.      
with pd.option_context('display.max_rows', None, 'display.max_columns', 10, 'display.width', 1000):
    print(df.describe().transpose())
print()
print(df.info())
print()
print(df.isnull().sum())

                    count       mean        std    min        25%        50%        75%          max
day            6362620.00      10.49       5.92   1.00       7.00      10.00      14.00        31.00
hour           6362620.00       8.37       4.56   1.00       6.00       8.00      11.00        24.00
amount         6362620.00  179861.90  603858.23   0.00   13389.57   74871.94  208721.48  92445516.64
idOrig         6362620.00 3175068.41 1834059.22   0.00 1586504.75 3174301.50 4763206.25   6353306.00
oldbalanceOrig 6362620.00  833883.10 2888242.67   0.00       0.00   14208.00  107315.18  59585040.37
newbalanceOrig 6362620.00  855113.67 2924048.50   0.00       0.00       0.00  144258.41  49585040.37
idDest         6362620.00 7275614.81  795239.09 571.00 6611227.00 7083926.00 7842574.25   9073899.00
oldbalanceDest 6362620.00 1100701.67 3399180.11   0.00       0.00  132705.66  943036.71 356015889.35
newbalanceDest 6362620.00 1224996.40 3674128.94   0.00       0.00  214661.44 1111909.25 356

In [42]:
# Función para el cálculo de frecuencias relativas.
def count_first_digit(data_str):
    mask=df[data_str]>1.
    data=list(df[mask][data_str])
    for i in range(len(data)):
        while data[i]>10:
            data[i]=data[i]/10
    first_digits=[int(x) for x in sorted(data)]
    unique=(set(first_digits))#a list with unique values of     first_digit list
    data_count=[]
    for i in unique:
        count=first_digits.count(i)
        data_count.append(count)
    total_count=sum(data_count)
    data_percentage=[(i/total_count)*100 for i in data_count]
    return  total_count,data_count, data_percentage

In [46]:
# Benford's Law percentages for leading digits 1-9
BENFORD = [30.1, 17.6, 12.5, 9.7, 7.9, 6.7, 5.8, 5.1, 4.6]

In [47]:
def get_expected_counts(total_count):
    """Return list of expected Benford's Law counts for total sample count."""
    return [round(p * total_count / 100) for p in BENFORD]
expected_counts=get_expected_counts(total_count)

In [None]:
expected_counts

[1915109, 1119798, 795311, 617161, 502636, 426287, 369024, 324487, 292674]

In [49]:
def chi_square_test(data_count,expected_counts):
    """Return boolean on chi-square test (8 degrees of freedom & P-val=0.05)."""
    chi_square_stat = 0  # chi square test statistic
    for data, expected in zip(data_count,expected_counts):

        chi_square = math.pow(data - expected, 2)

        chi_square_stat += chi_square / expected

    print("\nChi-squared Test Statistic = {:.3f}".format(chi_square_stat))
    print("Critical value at a P-value of 0.05 is 15.51.")    
    return chi_square_stat < 15.51

chi_square_test(data_count,expected_counts)


Chi-squared Test Statistic = 126770.925
Critical value at a P-value of 0.05 is 15.51.


False

Ejecución del optimizador

In [None]:
# ! pip install hyperopt

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, recall_score, precision_score, roc_auc_score, accuracy_score
from hyperopt import fmin, hp, tpe, space_eval, Trials

kfold = StratifiedKFold(n_splits = 2, shuffle = True, random_state = 42)
scores = {'accuracy' : [],
          'recall' : [],
          'precision' : [],
          'f1_score' : [],
          'roc_auc' : []
         }
scorings = {}

for train, test in kfold.split(df_train, y = df_train.isFlaggedFraud):
    
    anom = setup(data = df_train.iloc[train, :], 
             ignore_features = ['isFraud', 'isFlaggedFraud'], 
             categorical_features = ['Cash_in', 'Cash_out', 'Debit', 'Payment', 'Transfer'],
             normalize = True,
             normalize_method = 'minmax',
             silent = True,
             use_gpu = True,
             verbose = False
            )
    iforest = create_model(model = 'iforest', fraction = 0.05, verbose = False)
    
    predictions = predict_model(iforest, data = df_train.iloc[test, :])
    
    y_true = df_train.iloc[test, -2]
    y_pred = predictions.loc[:, 'Anomaly']

    scores['accuracy'].append(accuracy_score(y_true = y_true, y_pred = y_pred))
    scores['recall'].append(recall_score(y_true = y_true, y_pred = y_pred))
    scores['precision'].append(precision_score(y_true = y_true, y_pred = y_pred))
    scores['f1_score'].append(f1_score(y_true = y_true, y_pred = y_pred))
    scores['roc_auc'].append(roc_auc_score(y_true = y_true, y_score = y_pred))

for score in scores:
    scorings[score + '_mean'] = np.mean(scores[score])
    scorings[score + '_std'] = np.std(scores[score])
    
scorings

In [3]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, recall_score, precision_score, roc_auc_score, accuracy_score
from hyperopt import fmin, hp, tpe, space_eval, Trials

# Definición de la función objetivo.
def objective_func(search_space):

    global counter, scorings
    start_time = time.time()
    counter += 1    
    
    kfold = StratifiedKFold(n_splits = 2, shuffle = True, random_state = 42)
    scores = {'accuracy' : [],
              'recall' : [],
              'precision' : [],
              'f1_score' : [],
              'roc_auc' : []
             }
    scorings_trial = {}

    for train, test in kfold.split(df_train, y = df_train.isFlaggedFraud):

        anom = setup(data = df_train.iloc[train, :], 
                     ignore_features = ['isFraud', 'isFlaggedFraud'], 
                     categorical_features = ['Cash_in', 'Cash_out', 'Debit', 'Payment', 'Transfer'],
                     normalize = True,
                     normalize_method = search_space['scaler'],
                     silent = True,
                     use_gpu = True,
                     verbose = False
                )
        
        iforest = create_model(model = 'iforest', fraction = search_space['fraction'], verbose = False)
        
        predictions = predict_model(iforest, data = df_train.iloc[test, :])
        
        y_true = df_train.iloc[test, -2]
        y_pred = predictions.loc[:, 'Anomaly']

        scores['accuracy'].append(accuracy_score(y_true = y_true, y_pred = y_pred))
        scores['recall'].append(recall_score(y_true = y_true, y_pred = y_pred))
        scores['precision'].append(precision_score(y_true = y_true, y_pred = y_pred))
        scores['f1_score'].append(f1_score(y_true = y_true, y_pred = y_pred))
        scores['roc_auc'].append(roc_auc_score(y_true = y_true, y_score = y_pred))
    
    scorings_trial['trial'] = counter
    scorings_trial['search_space'] = str(search_space)

    for score in scores:
        scorings_trial[score + '_mean'] = np.mean(scores[score])
        scorings_trial[score + '_std'] = np.std(scores[score])

    loss = 1 - scorings_trial['f1_score_mean'] + scorings_trial['f1_score_std']

    scorings.append(scorings_trial)
    
    elapsed_time = time.time() - start_time
    
    print('Trial: %i | Loss: %.4f | Elapsed_time: %.4f seconds' % (counter, loss, elapsed_time))
    
    return loss

In [4]:
# Espacio de búsqueda.
models = ['abod', 'cluster', 'cof', 'iforest', 'histogram', 'knn', 'lof', 'svm', 'pca', 'mcd', 'sod', 'sos']
search_space = {'scaler' : hp.choice('scaler', ['zscore', 'minmax', 'maxabs', 'robust']),
                'model' : hp.choice('model', models),
                'fraction' : hp.randint('fraction', 1 , 100) / 100}

# Ejecución del optimizador.
counter = -1
scorings = []
rstate = np.random.default_rng(42)
trials = Trials() # Para el logging de resultados.
best_params = fmin(fn = objective_func, space = search_space, algo = tpe.suggest, max_evals = 100, trials = trials, rstate = rstate)

Trial: 0 | Loss: 0.9960 | Elapsed_time: 18.5950 seconds
Trial: 1 | Loss: 0.9936 | Elapsed_time: 16.8268 seconds                          
Trial: 2 | Loss: 0.9927 | Elapsed_time: 17.8957 seconds                          
Trial: 3 | Loss: 0.9879 | Elapsed_time: 17.4905 seconds                          
Trial: 4 | Loss: 0.9955 | Elapsed_time: 17.2743 seconds                          
Trial: 5 | Loss: 0.9940 | Elapsed_time: 16.5974 seconds                          
Trial: 6 | Loss: 0.9767 | Elapsed_time: 16.6526 seconds                          
Trial: 7 | Loss: 0.9966 | Elapsed_time: 16.8212 seconds                          
Trial: 8 | Loss: 0.9934 | Elapsed_time: 17.0364 seconds                          
Trial: 9 | Loss: 0.9950 | Elapsed_time: 16.8877 seconds                          
Trial: 10 | Loss: 0.9956 | Elapsed_time: 17.2701 seconds                          
Trial: 11 | Loss: 0.9888 | Elapsed_time: 17.4749 seconds                          
Trial: 12 | Loss: 0.9933 | Elapsed_time:

In [7]:
import os
import json
from datetime import datetime
from hyperopt import fmin, hp, tpe, space_eval, Trials

# Función para la impresión de resultados.
def hyperopt_printer(trials):
    print('\nBest Trial:')
    print('Trial ID :', trials.best_trial['tid'])
    print('Loss :', trials.best_trial['result']['loss'])
    print('Params :', space_eval(search_space, best_params), end = '\n\n')

# Función para exportación de resultados a formato JSON.
def json_export(scorings):
    try:
        os.mkdir('HyperOpt Optimizations')
    except:
        None

    now = datetime.now()
    dt_string = now.strftime("%Y%m%d_%H%M%S")
    path = os.getcwd() + '\\HyperOpt Optimizations\\'
    filename = 'Opt_' + dt_string + '.json'

    f = open(path + filename, 'w')
    json.dump(scorings, f)
    f.close()
    print('Json file: ' + dt_string + '.json', end = '\n\n')

# Función para la obtención de resultados en formato tabla para visualizar scorings y desvios estándar.
def scorings_to_df(scorings):
    scorings_df = pd.DataFrame(columns = list(scorings[0]['scorings'].keys()))
    for scoring in scorings:
        scorings_df = scorings_df.append(scoring['scorings'], ignore_index = True)

    print('Scoring details:', end = '\n\n')
    with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', 1000):  # more options can be specified also
        print(scorings_df)

In [8]:
hyperopt_printer(trials)
json_export(scorings)
scorings_to_df(scorings)


Best Trial:
Trial ID : 51
Loss : 0.9583333333333333
Params : {'fraction': 0.02, 'model': 'histogram', 'scaler': 'robust'}

Json file: 20220309_011343.json



KeyError: 'scorings'

In [None]:
# plot_model(anom_model, plot = 'tsne')
# plot_model(anom_model, plot = 'umap')

In [None]:
df_test = df[100000:200000]
predictions = predict_model(iforest, data = df_test)
predictions.head()

In [None]:
save_model(model = iforest, model_name = 'iforest_model')

In [None]:
loaded_model = load_model('iforest_model')
type(loaded_model)

In [None]:
df_test = df[100000:200000]
loaded_model.predict(df_test)

In [None]:
loaded_model.predict_proba(df_test)

In [None]:
loaded_model.decision_function(df_test)