In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

import torch
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', None)
sns.set(rc = {'figure.figsize':(15,8)})

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fraud-detection/fraudTest.csv
/kaggle/input/fraud-detection/fraudTrain.csv


In [5]:
SEED = 42

if torch.cuda.is_available():
    DEVICE = "cuda" 
else:
    DEVICE = "cpu"
print("Selected device is",DEVICE)

Selected device is cuda


## Loading and preprocessing the data 

In [147]:
df=pd.read_csv('../input/fraud-detection/fraudTrain.csv')
df.drop_duplicates(inplace=True)
df = df.drop('Unnamed: 0', axis=1)
df['age']=dt.date.today().year-pd.to_datetime(df['dob']).dt.year
df['hour']=pd.to_datetime(df['trans_date_trans_time']).dt.hour
df['daily']=pd.to_datetime(df['trans_date_trans_time']).dt.day
df['day']=pd.to_datetime(df['trans_date_trans_time']).dt.dayofweek
df['month']=pd.to_datetime(df['trans_date_trans_time']).dt.month

In [148]:
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()
df['category_encoded'] = labelencoder.fit_transform(df['category'])
df['gender_encoded'] = labelencoder.fit_transform(df['gender'])
df['city_encoded'] = labelencoder.fit_transform(df['city'])
df['state_encoded'] =labelencoder.fit_transform(df['state'])
df['job_encoded'] = labelencoder.fit_transform(df['job'])

In [149]:
X = df[['category_encoded', 'amt', 'gender_encoded', 'city_encoded', 'state_encoded', 'city_pop', 'job_encoded', 'age', 'hour', 'daily', 'day', 'month', 'is_fraud']]
input_features = ['category_encoded', 'amt', 'gender_encoded', 'city_encoded', 'state_encoded', 'city_pop', 'job_encoded', 'age', 'hour', 'daily', 'day', 'month']

#### Spliting the training set into training (90%) and validation(10%) set

In [190]:
df_train, df_val = train_test_split(X, test_size=0.1, random_state=42, stratify=X['is_fraud'])

### We scale the data

In [191]:
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
scaler.fit(df_train[input_features])

df_train[input_features]=scaler.transform(df_train[input_features])
df_val[input_features]=scaler.transform(df_val[input_features])

In [152]:
X_train = df_train.iloc[:,:-1]
y_train = df_train.iloc[:,-1]
X_val = df_val.iloc[:,:-1]
y_val = df_val.iloc[:,-1]

### We convert our features and labels into torch tensors

In [153]:
X_train_torch = torch.FloatTensor(X_train.values)
X_val_torch = torch.FloatTensor(X_val.values)
y_train_torch = torch.FloatTensor(y_train.values)
y_val_torch = torch.FloatTensor(y_val.values)

#### As we all know that in case of fraud detection, companies do not always have a labeled historical data. So for this reason why we tried to rely on unsupervised learning models. We will also rely on a new dataset which receives the descriptive features of a transaction and returns it as both input and output

In [168]:
class FraudDataset(torch.utils.data.Dataset):
    
    def __init__(self, x,output=True):
        'Initialization'
        self.x = x
        self.output = output

    def __len__(self):
        'Returns the total number of samples'
        return len(self.x)

    def __getitem__(self, index):
        'Generates one sample of data'
        item = self.x[index].to(DEVICE)
        if self.output:
            return item, item
        else:
            return item

In [169]:
train_set = FraudDataset(X_train_torch)
val_set = FraudDataset(X_val_torch)

In [170]:
train_loader_params = {'batch_size': 64,
              'shuffle': True,
              'num_workers': 0}
valid_loader_params = {'batch_size': 64,
              'num_workers': 0}
    
training_generator = torch.utils.data.DataLoader(train_set, **train_loader_params)
valid_generator = torch.utils.data.DataLoader(val_set, **valid_loader_params)

#### We resorted to a regular feed-forward autoencoder with the following architecture:
* A first input layer with ReLu activation (input_size, intermediate_size)

* A second layer with ReLu activation (intermediate_size, code_size)

* A third layer with ReLu activation (code_size, intermediate_size)

* An output layer with linear activation (intermediate_size, input_size)

In [171]:
class Autoencoder(torch.nn.Module):
    
        def __init__(self, input_size, intermediate_size, code_size):
            super(Autoencoder, self).__init__()
            # parameters
            self.input_size = input_size
            self.intermediate_size = intermediate_size           
            self.code_size  = code_size
            
            self.relu = torch.nn.ReLU()   
            
            #encoder
            self.fc1 = torch.nn.Linear(self.input_size, self.intermediate_size)
            self.fc2 = torch.nn.Linear(self.intermediate_size, self.code_size)
            
            #decoder 
            self.fc3 = torch.nn.Linear(self.code_size, self.intermediate_size)            
            self.fc4 = torch.nn.Linear(self.intermediate_size, self.input_size)
            

            
            
        def forward(self, x):
            
            hidden = self.fc1(x)
            hidden = self.relu(hidden)
            
            code = self.fc2(hidden)
            code = self.relu(code)
 
            hidden = self.fc3(code)
            hidden = self.relu(hidden)
            
            output = self.fc4(hidden)
            
            return output

In [172]:
criterion = torch.nn.MSELoss().to(DEVICE)

#### Our goal is to predict the input from the input. Therefore, one cannot directly use its prediction for fraud detection. Instead, the idea is to use its reconstruction error between the input and the output.

#### the reconstruction error can be considered as predicted fraud score, and therfore the higher the error, the higher the risk.

In [173]:
def per_sample_mse(model, generator):
    
    model.eval()
    criterion = torch.nn.MSELoss(reduction="none")
    batch_losses = []
    
    for x_batch, y_batch in generator:
        # Forward pass
        y_pred = model(x_batch)
        # Compute Loss
        loss = criterion(y_pred.squeeze(), y_batch)
        loss_app = list(torch.mean(loss,axis=1).detach().cpu().numpy())
        batch_losses.extend(loss_app)
    
    return batch_losses

In [174]:
import time
def training_loop(model,training_generator,valid_generator,optimizer,criterion,max_epochs=100,apply_early_stopping=True,patience=4,verbose=False):
    #Setting the model in training mode
    model.train()

    if apply_early_stopping:
        early_stopping = EarlyStopping(verbose=verbose,patience=patience)
    
    all_train_losses = []
    all_valid_losses = []
    
    #Training loop
    start_time=time.time()
    for epoch in range(max_epochs):
        model.train()
        train_loss=[]
        for x_batch, y_batch in training_generator:
            optimizer.zero_grad()
            # Forward pass
            y_pred = model(x_batch)
            # Compute Loss
            loss = criterion(y_pred.squeeze(), y_batch)
            # Backward pass
            loss.backward()
            optimizer.step()   
            train_loss.append(loss.item())
        
        #showing last training loss after each epoch
        all_train_losses.append(np.mean(train_loss))
        if verbose:
            print('')
            print('Epoch {}: train loss: {}'.format(epoch, np.mean(train_loss)))
        #evaluating the model on the test set after each epoch    
        valid_loss = evaluate_model(model,valid_generator,criterion)
        all_valid_losses.append(valid_loss)
        if verbose:
            print('valid loss: {}'.format(valid_loss))
        if apply_early_stopping:
            if not early_stopping.continue_training(valid_loss):
                if verbose:
                    print("Early stopping")
                break
        
    training_execution_time=time.time()-start_time
    return model,training_execution_time,all_train_losses,all_valid_losses


In [175]:
class EarlyStopping:
    
    def __init__(self, patience=4, verbose=False):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = np.Inf
    
    def continue_training(self,current_score):
        if self.best_score > current_score:
            self.best_score = current_score
            self.counter = 0
            if self.verbose:
                print("New best score:", current_score)
        else:
            self.counter+=1
            if self.verbose:
                print(self.counter, " iterations since best score.")
                
        return self.counter <= self.patience 


In [176]:
def evaluate_model(model,generator,criterion):
    model.eval()
    batch_losses = []
    for x_batch, y_batch in generator:
        # Forward pass
        y_pred = model(x_batch)
        # Compute Loss
        loss = criterion(y_pred.squeeze(), y_batch)
        batch_losses.append(loss.item())
    mean_loss = np.mean(batch_losses)    
    return mean_loss

In [177]:
model = Autoencoder(len(input_features), 200,12).to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)

#### Let's start out training, we set our max epochs to 200, and patience to 1 to stop optimization with early stopping using validation data

In [178]:
model,training_execution_time,train_losses,valid_losses = training_loop(model,
                                                                        training_generator,
                                                                        valid_generator,
                                                                        optimizer,
                                                                        criterion,
                                                                        max_epochs=200,
                                                                        patience=1,
                                                                        verbose=True)


Epoch 0: train loss: 0.03401328959178411
valid loss: 0.0008575627067988737
New best score: 0.0008575627067988737

Epoch 1: train loss: 0.00045932396342689635
valid loss: 0.0001560099980369813
New best score: 0.0001560099980369813

Epoch 2: train loss: 0.00013935975172911624
valid loss: 3.8717436811905794e-05
New best score: 3.8717436811905794e-05

Epoch 3: train loss: 8.777027623722631e-05
valid loss: 1.420720746735616e-05
New best score: 1.420720746735616e-05

Epoch 4: train loss: 4.7720780716988864e-05
valid loss: 8.61651375510294e-05
1  iterations since best score.

Epoch 5: train loss: 4.503266017964341e-05
valid loss: 5.232725792191601e-06
New best score: 5.232725792191601e-06

Epoch 6: train loss: 4.83975129707015e-05
valid loss: 4.047768751283106e-06
New best score: 4.047768751283106e-06

Epoch 7: train loss: 3.755724020006684e-05
valid loss: 4.486010807356494e-06
1  iterations since best score.

Epoch 8: train loss: 0.00012961280025080733
valid loss: 2.307089901065077e-06
New 

In [183]:
losses = per_sample_mse(model, valid_generator)
print(np.mean(losses))

0.000182184


In [193]:
print(X_train_torch[0])
print(model(X_train_torch[0].to(DEVICE)))

tensor([-0.0580, -0.2947, -0.9100,  0.3662,  0.5107, -0.2778, -1.7565,  0.8776,
         0.0288,  0.3861, -0.0321, -0.9196])
tensor([-0.0595, -0.2905, -0.9045,  0.3668,  0.5084, -0.2832, -1.7517,  0.8791,
         0.0308,  0.3974, -0.0345, -0.9281], device='cuda:0',
       grad_fn=<AddBackward0>)


In [194]:
genuine_losses = np.array(losses)[y_val_torch.cpu().numpy() == 0]
fraud_losses = np.array(losses)[y_val_torch.cpu().numpy() == 1]
print("Average fraud reconstruction error:", np.mean(fraud_losses))
print("Average genuine reconstruction error:", np.mean(genuine_losses))

Average fraud reconstruction error: 0.0005219125
Average genuine reconstruction error: 0.00018020491


#### After comparing the reconstructed error between fraud and genuine transactions, it appears that fraud are less well reconstructed than genuine transactions

In [195]:
predictions_df_AE=df_val
predictions_df_AE['predictions_loss']=losses

In [196]:
threshold_AE = np.percentile(predictions_df['predictions_loss'],95)

In [197]:
predictions_df_AE['predictions'] = [1 if x > threshold_AE else 0 for x in predictions_df_AE['predictions_loss']]
predictions_df_AE

Unnamed: 0,category_encoded,amt,gender_encoded,city_encoded,state_encoded,city_pop,job_encoded,age,hour,daily,day,month,is_fraud,predictions_loss,predictions
295133,-0.568910,0.216224,-0.909987,-0.337550,0.929411,-0.248573,-1.206866,1.395357,-0.851290,1.065584,0.877664,-0.334296,0,0.000021,0
309036,0.197485,-0.231450,1.098917,1.251784,-1.164268,-0.264084,1.555412,0.244683,1.348840,1.518580,-0.486978,-0.334296,0,0.000092,0
877558,-1.590770,0.604477,-0.909987,1.429666,1.138779,0.183478,1.291318,-0.618322,1.202164,0.612587,0.877664,1.714365,0,0.000090,0
696676,-0.313445,-0.098344,-0.909987,0.215430,1.138779,-0.285640,0.634652,0.187150,0.615463,0.952334,-0.032098,1.129033,0,0.000067,0
351866,-1.079840,-0.087470,1.098917,1.688754,1.557515,-0.275749,-0.457411,-0.388187,-0.411264,-0.180158,0.422783,-0.041631,0,0.000040,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1115045,-1.590770,0.142411,-0.909987,1.124173,-0.466375,-0.292969,-0.007738,-0.158052,0.908814,-0.859653,-0.486978,-0.626962,0,0.000099,0
81445,-0.057980,0.372293,-0.909987,0.996563,1.138779,-0.233963,-0.086252,-0.963524,1.495515,0.159590,1.332544,-1.212294,0,0.000026,0
1105080,-0.568910,0.283193,1.098917,0.192228,-0.187218,-0.290813,1.583962,-0.042985,-1.291316,-1.199400,1.332544,-0.626962,0,0.000031,0
838805,0.452949,-0.405876,-0.909987,-1.478312,0.999200,-0.267485,-0.136216,1.913160,-0.704615,-0.293407,0.422783,1.714365,0,0.000024,0


In [198]:
from sklearn import metrics
def performance_assessment(predictions_df, output_feature='is_fraud', 
                           prediction_feature='predictions', rounded=True):
    
    AUC_ROC = metrics.roc_auc_score(predictions_df[output_feature], predictions_df[prediction_feature])
    AP = metrics.average_precision_score(predictions_df[output_feature], predictions_df[prediction_feature])
    
    performances = pd.DataFrame([[AUC_ROC, AP]], 
                           columns=['AUC ROC','Average precision'])
    performances = performances.round(3)
    
    return performances

In [200]:
performance_assessment(predictions_df)

Unnamed: 0,AUC ROC,Average precision
0,0.749,0.037


## Isolation Forest VS AutoEncoder

#### Isolation forests are an ensemble machine learning algorithm used for anomaly detection. They work by randomly partitioning the feature space and then making a decision tree. Points that are easier to isolate (i.e., require fewer splits) are considered to be anomalies. This approach is effective when the anomalies are well separated from the normal data points, and the anomalies are more scattered throughout the feature space.

In [154]:
from sklearn.ensemble import IsolationForest

anomalyclassifier = IsolationForest(random_state=SEED, n_estimators=40)
anomalyclassifier.fit(df_train[input_features])

IsolationForest(n_estimators=40, random_state=42)

In [159]:
predictions_df_IF = df_val
predictions_df_IF['predictions_prob'] = -anomalyclassifier.score_samples(df_val[input_features])

In [164]:
threshold = np.percentile(predictions_df_IF['predictions_prob'],95)

In [165]:
predictions_df_IF['predictions'] = [1 if x > threshold else 0 for x in predictions_df_IF['predictions_prob']]
predictions_df_IF

Unnamed: 0,category_encoded,amt,gender_encoded,city_encoded,state_encoded,city_pop,job_encoded,age,hour,daily,day,month,is_fraud,predictions_prob,predictions
295133,-0.568910,0.216224,-0.909987,-0.337550,0.929411,-0.248573,-1.206866,1.395357,-0.851290,1.065584,0.877664,-0.334296,0,0.479600,0
309036,0.197485,-0.231450,1.098917,1.251784,-1.164268,-0.264084,1.555412,0.244683,1.348840,1.518580,-0.486978,-0.334296,0,0.533265,0
877558,-1.590770,0.604477,-0.909987,1.429666,1.138779,0.183478,1.291318,-0.618322,1.202164,0.612587,0.877664,1.714365,0,0.570037,1
696676,-0.313445,-0.098344,-0.909987,0.215430,1.138779,-0.285640,0.634652,0.187150,0.615463,0.952334,-0.032098,1.129033,0,0.472817,0
351866,-1.079840,-0.087470,1.098917,1.688754,1.557515,-0.275749,-0.457411,-0.388187,-0.411264,-0.180158,0.422783,-0.041631,0,0.473646,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1115045,-1.590770,0.142411,-0.909987,1.124173,-0.466375,-0.292969,-0.007738,-0.158052,0.908814,-0.859653,-0.486978,-0.626962,0,0.467666,0
81445,-0.057980,0.372293,-0.909987,0.996563,1.138779,-0.233963,-0.086252,-0.963524,1.495515,0.159590,1.332544,-1.212294,0,0.521957,0
1105080,-0.568910,0.283193,1.098917,0.192228,-0.187218,-0.290813,1.583962,-0.042985,-1.291316,-1.199400,1.332544,-0.626962,0,0.484813,0
838805,0.452949,-0.405876,-0.909987,-1.478312,0.999200,-0.267485,-0.136216,1.913160,-0.704615,-0.293407,0.422783,1.714365,0,0.490951,0


In [166]:
performance_assessment(predictions_df_IF)

Unnamed: 0,AUC ROC,Average precision
0,0.713,0.029


# Conclusion

### Both autoencoders and isolation forests have their strengths and weaknesses, and the best choice for a particular problem will depend on the characteristics of the data and the specific requirements of the problem. In some cases, a combination of both techniques might produce the best results.

### In conclusion, for the problem at hand, AutoEncoder was able to detect fraud more accuractly than our Isolation Forest (AUC_ROC Score of 0.75 vs 0.71)