In [None]:
import numpy as np
import pandas as pd
from scipy.spatial import distance

In [None]:
#load the dataset
def import_data(dataset):
    """Read a CSV source into a DataFrame.

    Parameters
    ----------
    dataset : anything accepted by ``pandas.read_csv`` (e.g. a file path).

    Returns
    -------
    pandas.DataFrame
        The parsed table, exactly as ``pandas.read_csv`` returns it.
    """
    return pd.read_csv(dataset)

#draw a random sample of Class == 1 rows and of Class == 0 rows
def divide_data(data,fraude,nfraude,s):
    """Sample rows of each class and stack the two samples.

    Parameters
    ----------
    data : pandas.DataFrame with a ``Class`` column containing the values 1 and 0.
    fraude : number of rows to draw from the ``Class == 1`` group.
    nfraude : number of rows to draw from the ``Class == 0`` group.
    s : seed passed as ``random_state`` to both draws.

    Returns
    -------
    pandas.DataFrame
        The ``Class == 1`` sample followed by the ``Class == 0`` sample,
        keeping the original index labels.
    """
    por_classe = data.groupby(data.Class)
    # Fetch both groups before sampling so a missing class fails first.
    positivos, negativos = por_classe.get_group(1), por_classe.get_group(0)
    return pd.concat([
        positivos.sample(n=fraude, random_state=s),
        negativos.sample(n=nfraude, random_state=s),
    ])

#strip the non-feature columns and convert to a NumPy array
def limpa_matriz(matriz_fraude):
    """Return the remaining columns of the frame as a NumPy array.

    The ``Time``, ``Amount`` and ``Class`` columns are removed; the input
    DataFrame itself is left untouched.

    Parameters
    ----------
    matriz_fraude : pandas.DataFrame containing ``Time``, ``Amount`` and ``Class``.

    Returns
    -------
    numpy.ndarray
        One row per input row, one column per remaining column.
    """
    colunas_descartadas = ['Time', 'Amount', 'Class']
    return matriz_fraude.drop(columns=colunas_descartadas).to_numpy()

#euclidean distance matrix calculation
def distanciaeuc(matrizfinal):
    """Compute the pairwise Euclidean distance matrix between rows.

    Parameters
    ----------
    matrizfinal : sequence of equal-length 1-D vectors (e.g. a 2-D array),
        one observation per row.

    Returns
    -------
    numpy.ndarray
        Square float array of shape ``(m, m)`` with ``m = len(matrizfinal)``;
        entry ``[j, i]`` is the Euclidean distance between rows ``i`` and ``j``.
    """
    # Size the result from the argument itself instead of a global ``m``,
    # so the function works no matter what the caller has defined.
    m = len(matrizfinal)
    mdisteuc = np.zeros((m, m))
    for i in range(m):
        for j in range(m):
            mdisteuc[j, i] = distance.euclidean(matrizfinal[i], matrizfinal[j])
    return mdisteuc


#jensen-shannon distance matrix calculation
def distanciajen(matrizfinal):
    """Compute the pairwise Jensen-Shannon distance matrix between rows.

    Each pair of rows is compared with ``scipy.spatial.distance.jensenshannon``
    using its default arguments.

    Parameters
    ----------
    matrizfinal : sequence of equal-length 1-D vectors (e.g. a 2-D array),
        one histogram / probability vector per row.

    Returns
    -------
    numpy.ndarray
        Square float array of shape ``(m, m)`` with ``m = len(matrizfinal)``;
        entry ``[j, i]`` is the distance between rows ``i`` and ``j``.
    """
    # Size the result from the argument itself instead of a global ``m``,
    # so the function works no matter what the caller has defined.
    m = len(matrizfinal)
    mdistjen = np.zeros((m, m))
    for i in range(m):
        for j in range(m):
            mdistjen[j, i] = distance.jensenshannon(matrizfinal[i], matrizfinal[j])
    return mdistjen

#node ranking - order nodes by decreasing row sum of the distance matrix
def retirada_nodes(mdist):
    """Rank node indices by the row sums of a distance matrix, largest first.

    Parameters
    ----------
    mdist : 2-D array-like; ``mdist[i]`` holds the distances from node ``i``.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``len(mdist)`` with the node indices ordered
        so that their row sums are non-increasing. The input is not modified.
    """
    mdist = np.asarray(mdist)
    # Derive the node count from the argument rather than a global ``m``.
    m = len(mdist)
    # ``sum`` returns a fresh array, so the in-place swaps below never touch ``mdist``.
    msomaaux = mdist.sum(axis=1)
    node = np.arange(m)
    # Exchange sort kept on purpose: the order it gives to tied sums can differ
    # from a stable argsort, and downstream outputs depend on that order.
    for i in range(m):
        for j in range(i + 1, m):
            if msomaaux[j] > msomaaux[i]:
                msomaaux[i], msomaaux[j] = msomaaux[j], msomaaux[i]
                node[i], node[j] = node[j], node[i]
    return node

#probability histogram calculation
def probabilidade(mdistnew):
    """Build, for every node, a normalized histogram of its distances to the others.

    For row ``i`` of the square distance matrix the diagonal entry is dropped,
    the remaining ``N - 1`` distances are binned into ``k = (N - 1) // 10``
    equal-width bins spanning that row's own minimum and maximum, and the
    counts are divided by ``N - 1``.

    Parameters
    ----------
    mdistnew : square 2-D array-like of pairwise distances, shape ``(N, N)``.

    Returns
    -------
    numpy.ndarray
        Float array ``P`` of shape ``(N, k)``; row ``i`` is the histogram of
        node ``i``. Note that ``k`` is 0 when ``N < 11``.
    """
    # Use the matrix that was passed in; the previous version silently read
    # the global ``mdisteuc`` and ignored this argument.
    dist = np.asarray(mdistnew, dtype=np.float64)
    N = len(dist)
    # Drop the diagonal (self-distance) from every row -> shape (N, N - 1).
    sem_diagonal = dist[~np.eye(N, dtype=bool)].reshape(N, N - 1)
    max_d = sem_diagonal.max(axis=1)
    min_d = sem_diagonal.min(axis=1)
    k = (N - 1) // 10
    P = np.zeros((N, k), dtype=np.float64)

    for i in range(N):
        # Bin edges are specific to each row: k equal-width bins over [min, max].
        bins = np.linspace(min_d[i], max_d[i], k + 1)
        hist, _ = np.histogram(sem_diagonal[i, :], bins=bins)
        P[i] = hist / (N - 1)
    return P

### Download the data file from https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud and save it in the current directory as creditcard.csv

In [None]:
# Create the output folders used below: "datam" for the per-repetition distance
# matrices and "dataos" for the outlier-score tables. Done in Python with
# exist_ok=True so re-running the notebook does not fail when they already
# exist and the cell does not depend on a particular shell.
import os

os.makedirs("datam", exist_ok=True)
os.makedirs("dataos", exist_ok=True)

In [None]:
# Load the dataset
dataset = "creditcard.csv"
data = import_data(dataset)

# Sample sizes per repetition (Class == 1 rows, Class == 0 rows) and number of repetitions
fraude, nfraude = 10, 100
rep = 200

# Total rows per repetition; also used as the prefix of every output file name
numero_simulacao = fraude + nfraude

# Node rankings (OS1, OS2): one row per repetition, one column per sampled row
OS1 = np.zeros((rep, numero_simulacao), dtype=np.int64)
OS2 = np.zeros_like(OS1)

# Row-sum scores matching OS1 and OS2, same layout but floating point
OS1_probs = np.zeros(OS1.shape, dtype=np.float64)
OS2_probs = np.zeros_like(OS1_probs)

# Folder receiving the per-repetition distance matrices
csv_diretorio = "datam/"

# Main simulation: one repetition per seed s. Each repetition samples rows,
# builds two normalized distance matrices (Euclidean on the features, then
# Jensen-Shannon on the per-row distance histograms), writes both to CSV and
# stores the resulting node rankings and row sums in the OS* arrays.
for s in range(0, rep):
    
    # Draw `fraude` rows with Class == 1 and `nfraude` rows with Class == 0,
    # using the repetition index s as the random seed.
    matriz_fraude = divide_data(data, fraude, nfraude, s)

    # Drop the Time, Amount and Class columns and convert to a NumPy array
    matrizfinal = limpa_matriz(matriz_fraude)

    # Number of sampled rows, stored in the module-level name `m`
    # (code outside this loop may rely on that name being set).
    m = len(matrizfinal)

    # Pairwise Euclidean distance matrix between the sampled rows
    mdisteuc = distanciaeuc(matrizfinal)
    
    # Scale the Euclidean distances by the largest entry of the matrix
    mdisteucnorm = mdisteuc/mdisteuc.max()   
    
    # Write the normalized Euclidean matrix to <csv_diretorio><numero_simulacao>_EUC_<s>.csv
    csv_nome = csv_diretorio + str(numero_simulacao) + '_EUC_' + str(s) + '.csv'
    pd.DataFrame(mdisteucnorm).to_csv(csv_nome, header=None, index=None)
    
    # First outlier score: node indices ordered by decreasing row sum
    OS1cs = retirada_nodes(mdisteucnorm)
    
    # Per-node normalized histogram of distances to the other nodes
    P = probabilidade(mdisteucnorm)
    
    # Number of histograms (rows of P), again stored in the module-level name `m`
    m = len(P) # size
    
    # Pairwise Jensen-Shannon distance matrix between the histograms
    mdistjen = distanciajen(P) 

    # Scale the Jensen-Shannon distances by the largest entry of the matrix
    mdistjennorm = mdistjen/mdistjen.max()
    
    # Write the normalized Jensen-Shannon matrix to <csv_diretorio><numero_simulacao>_JEN_<s>.csv
    csv_nome = csv_diretorio + str(numero_simulacao) + '_JEN_' + str(s) + '.csv'
    pd.DataFrame(mdistjennorm).to_csv(csv_nome, header=None, index=None)
    
    # Second outlier score: node indices ordered by decreasing row sum
    OS2cs = retirada_nodes(mdistjennorm)
    
    # Store this repetition's OS1 ranking
    OS1[s] = OS1cs
    
    # Store this repetition's OS2 ranking
    OS2[s] = OS2cs
    
    # Store the per-row sums (axis=1) of the normalized Euclidean matrix
    OS1_probs[s] = mdisteucnorm.sum(axis=1)
    
    # Store the per-row sums (axis=1) of the normalized Jensen-Shannon matrix
    OS2_probs[s] = mdistjennorm.sum(axis=1)

csv_diretorio = "dataos/"

# Write each result table to <csv_diretorio><numero_simulacao><suffix>.csv,
# without header or index, in the order OS1, OS2, OS1_probs, OS2_probs.
for _sufixo, _tabela in (
    ('_OS1', OS1),
    ('_OS2', OS2),
    ('_OS1_probs', OS1_probs),
    ('_OS2_probs', OS2_probs),
):
    csv_nome = csv_diretorio + str(numero_simulacao) + _sufixo + '.csv'
    pd.DataFrame(_tabela).to_csv(csv_nome, header=None, index=None)

# Keep the notebook namespace free of the loop helpers
del _sufixo, _tabela