In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fraud-detection/fraudTest.csv
/kaggle/input/fraud-detection/fraudTrain.csv


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

import torch
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', None)
sns.set(rc = {'figure.figsize':(15,8)})

In [3]:
SEED = 42

if torch.cuda.is_available():
    DEVICE = "cuda" 
else:
    DEVICE = "cpu"
print("Selected device is",DEVICE)

Selected device is cuda


In [4]:
df=pd.read_csv('../input/fraud-detection/fraudTrain.csv')
df.drop_duplicates(inplace=True)
df = df.drop('Unnamed: 0', axis=1)
df['age']=dt.date.today().year-pd.to_datetime(df['dob']).dt.year
df['hour']=pd.to_datetime(df['trans_date_trans_time']).dt.hour
df['daily']=pd.to_datetime(df['trans_date_trans_time']).dt.day
df['day']=pd.to_datetime(df['trans_date_trans_time']).dt.dayofweek
df['month']=pd.to_datetime(df['trans_date_trans_time']).dt.month

In [5]:
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()
df['category_encoded'] = labelencoder.fit_transform(df['category'])
df['gender_encoded'] = labelencoder.fit_transform(df['gender'])
df['city_encoded'] = labelencoder.fit_transform(df['city'])
df['state_encoded'] =labelencoder.fit_transform(df['state'])
df['job_encoded'] = labelencoder.fit_transform(df['job'])

In [6]:
X = df[['category_encoded', 'amt', 'gender_encoded', 'city_encoded', 'state_encoded', 'city_pop', 'job_encoded', 'age', 'hour', 'daily', 'day', 'month', 'is_fraud']]
input_features = ['category_encoded', 'amt', 'gender_encoded', 'city_encoded', 'state_encoded', 'city_pop', 'job_encoded', 'age', 'hour', 'daily', 'day', 'month']

In [7]:
df_train, df_val = train_test_split(X, test_size=0.1, random_state=42, stratify=X['is_fraud'])

In [8]:
X_train = df_train.iloc[:,:-1]
y_train = df_train.iloc[:,-1]
X_val = df_val.iloc[:,:-1]
y_val = df_val.iloc[:,-1]

In [9]:
X_train_torch = torch.FloatTensor(X_train.values)
X_val_torch = torch.FloatTensor(X_val.values)
y_train_torch = torch.FloatTensor(y_train.values)
y_val_torch = torch.FloatTensor(y_val.values)

In [10]:
class FraudDataset(torch.utils.data.Dataset):
    
    def __init__(self, x,output=True):
        'Initialization'
        self.x = x
        self.output = output

    def __len__(self):
        'Returns the total number of samples'
        return len(self.x)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample index
        item = self.x[index].to(DEVICE)
        if self.output:
            return item, item
        else:
            return item

In [11]:
train_set = FraudDataset(X_train_torch)
val_set = FraudDataset(X_val_torch)

In [12]:
train_loader_params = {'batch_size': 64,
              'shuffle': True,
              'num_workers': 0}
valid_loader_params = {'batch_size': 64,
              'num_workers': 0}
    
training_generator = torch.utils.data.DataLoader(train_set, **train_loader_params)
valid_generator = torch.utils.data.DataLoader(val_set, **valid_loader_params)

In [58]:
class Autoencoder(torch.nn.Module):
    
        def __init__(self, input_size, intermediate_size, code_size):
            super(Autoencoder, self).__init__()
            # parameters
            self.input_size = input_size
            self.intermediate_size = intermediate_size           
            self.code_size  = code_size
            
            self.relu = torch.nn.ReLU()
            self.segmoid = torch.nn.Sigmoid()
            
            #encoder
            self.fc1 = torch.nn.Linear(self.input_size, self.intermediate_size)
            self.fc2 = torch.nn.Linear(self.intermediate_size, self.code_size)
            
            #decoder 
            self.fc3 = torch.nn.Linear(self.code_size, self.intermediate_size)            
            self.fc4 = torch.nn.Linear(self.intermediate_size, self.input_size)
            
            
        def forward(self, x):
            
            hidden = self.fc1(x)
            hidden = self.relu(hidden)
            
            code = self.fc2(hidden)
            code = self.relu(code)
 
            hidden = self.fc3(code)
            hidden = self.relu(hidden)
            
            output = self.fc4(hidden)
            #linear activation in final layer)
            
            return output

In [59]:
criterion = torch.nn.MSELoss().to(DEVICE)

In [60]:
def per_sample_mse(model, generator):
    
    model.eval()
    criterion = torch.nn.MSELoss(reduction="none")
    batch_losses = []
    
    for x_batch, y_batch in generator:
        # Forward pass
        y_pred = model(x_batch)
        # Compute Loss
        loss = criterion(y_pred.squeeze(), y_batch)
        loss_app = list(torch.mean(loss,axis=1).detach().cpu().numpy())
        batch_losses.extend(loss_app)
    
    return batch_losses

In [61]:
model = Autoencoder(X_train.shape[1], 200, 12).to(DEVICE)
losses = per_sample_mse(model, valid_generator)

In [62]:
print(losses[0:5])
print(np.mean(losses))

[15471310.0, 6767908.0, 1707036500.0, 554758.0, 2566253.2]
8162682400.0


In [63]:
model = Autoencoder(len(input_features), 200, 12).to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)

In [64]:
import time
def training_loop(model,training_generator,valid_generator,optimizer,criterion,max_epochs=100,apply_early_stopping=True,patience=2,verbose=False):
    #Setting the model in training mode
    model.train()

    if apply_early_stopping:
        early_stopping = EarlyStopping(verbose=verbose,patience=patience)
    
    all_train_losses = []
    all_valid_losses = []
    
    #Training loop
    start_time=time.time()
    for epoch in range(max_epochs):
        model.train()
        train_loss=[]
        for x_batch, y_batch in training_generator:
            optimizer.zero_grad()
            # Forward pass
            y_pred = model(x_batch)
            # Compute Loss
            loss = criterion(y_pred.squeeze(), y_batch)
            # Backward pass
            loss.backward()
            optimizer.step()   
            train_loss.append(loss.item())
        
        #showing last training loss after each epoch
        all_train_losses.append(np.mean(train_loss))
        if verbose:
            print('')
            print('Epoch {}: train loss: {}'.format(epoch, np.mean(train_loss)))
        #evaluating the model on the test set after each epoch    
        valid_loss = evaluate_model(model,valid_generator,criterion)
        all_valid_losses.append(valid_loss)
        if verbose:
            print('valid loss: {}'.format(valid_loss))
        if apply_early_stopping:
            if not early_stopping.continue_training(valid_loss):
                if verbose:
                    print("Early stopping")
                break
        
    training_execution_time=time.time()-start_time
    return model,training_execution_time,all_train_losses,all_valid_losses

class EarlyStopping:
    
    def __init__(self, patience=2, verbose=False):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = np.Inf
    
    def continue_training(self,current_score):
        if self.best_score > current_score:
            self.best_score = current_score
            self.counter = 0
            if self.verbose:
                print("New best score:", current_score)
        else:
            self.counter+=1
            if self.verbose:
                print(self.counter, " iterations since best score.")
                
        return self.counter <= self.patience 

def evaluate_model(model,generator,criterion):
    model.eval()
    batch_losses = []
    for x_batch, y_batch in generator:
        # Forward pass
        y_pred = model(x_batch)
        # Compute Loss
        loss = criterion(y_pred.squeeze(), y_batch)
        batch_losses.append(loss.item())
    mean_loss = np.mean(batch_losses)    
    return mean_loss

In [65]:
model,training_execution_time,train_losses,valid_losses = training_loop(model,
                                                                        training_generator,
                                                                        valid_generator,
                                                                        optimizer,
                                                                        criterion,
                                                                        max_epochs=500,
                                                                        verbose=True)


Epoch 0: train loss: 76734301.46514669
valid loss: 970.482580266801
New best score: 970.482580266801

Epoch 1: train loss: 51919.14849871632
valid loss: 863.8479537926229
New best score: 863.8479537926229

Epoch 2: train loss: 44586.37486433652
valid loss: 480.12227180642805
New best score: 480.12227180642805

Epoch 3: train loss: 36664.359404450675
valid loss: 820.0335963211097
1  iterations since best score.

Epoch 4: train loss: 31559.604333010757
valid loss: 9486.942418362392
2  iterations since best score.

Epoch 5: train loss: 30447.72239967704
valid loss: 343.18090905188103
New best score: 343.18090905188103

Epoch 6: train loss: 32415.342806842826
valid loss: 370.6929340955361
1  iterations since best score.

Epoch 7: train loss: 32921.89269580344
valid loss: 102.1667294918638
New best score: 102.1667294918638

Epoch 8: train loss: 22484.838664798826
valid loss: 64216.99990807088
1  iterations since best score.

Epoch 9: train loss: 28819.596525011304
valid loss: 2125.62932362

In [66]:
losses = per_sample_mse(model, valid_generator)
print(losses[0:5])
print(np.mean(losses))

[10.739586, 10.080604, 62.831764, 3.895421, 41.650272]
123.954735


In [67]:
genuine_losses = np.array(losses)[y_val_torch.cpu().numpy() == 0]
fraud_losses = np.array(losses)[y_val_torch.cpu().numpy() == 1]
print("Average fraud reconstruction error:", np.mean(fraud_losses))
print("Average genuine reconstruction error:", np.mean(genuine_losses))

Average fraud reconstruction error: 294.92673
Average genuine reconstruction error: 122.958755


In [68]:
predictions_df=df_val
predictions_df['predictions']=losses

In [70]:
from sklearn import metrics
metrics.roc_auc_score(predictions_df["is_fraud"], predictions_df["predictions"])

0.7724108649598525

In [71]:
metrics.average_precision_score(predictions_df["is_fraud"], predictions_df["predictions"])

0.015667193300778058

In [72]:
metrics.classification_report(predictions_df["is_fraud"], predictions_df["predictions"])

ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [74]:
predictions_df['predictions'].apply(torch.nn.Sigmoid())

TypeError: sigmoid(): argument 'input' (position 1) must be Tensor, not float

In [115]:
from sklearn.ensemble import IsolationForest

anomalyclassifier = IsolationForest(random_state=SEED, n_estimators=40)
anomalyclassifier.fit(df_train[input_features])

IsolationForest(n_estimators=40, random_state=42)

In [116]:
predictions_df = df_val
predictions_df['predictions'] = -anomalyclassifier.score_samples(df_val[input_features])

In [117]:
metrics.roc_auc_score(predictions_df["is_fraud"], predictions_df["predictions"])

0.8566069207897851