In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fraud-detection/fraudTest.csv
/kaggle/input/fraud-detection/fraudTrain.csv


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

import torch
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', None)
sns.set(rc = {'figure.figsize':(15,8)})

In [3]:
SEED = 42

if torch.cuda.is_available():
    DEVICE = "cuda" 
else:
    DEVICE = "cpu"
print("Selected device is",DEVICE)

Selected device is cuda


In [51]:
df=pd.read_csv('../input/fraud-detection/fraudTrain.csv')
df.drop_duplicates(inplace=True)
df = df.drop('Unnamed: 0', axis=1)
df['age']=dt.date.today().year-pd.to_datetime(df['dob']).dt.year
df['hour']=pd.to_datetime(df['trans_date_trans_time']).dt.hour
df['daily']=pd.to_datetime(df['trans_date_trans_time']).dt.day
df['day']=pd.to_datetime(df['trans_date_trans_time']).dt.dayofweek
df['month']=pd.to_datetime(df['trans_date_trans_time']).dt.month

In [52]:
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()
df['category_encoded'] = labelencoder.fit_transform(df['category'])
df['gender_encoded'] = labelencoder.fit_transform(df['gender'])
df['city_encoded'] = labelencoder.fit_transform(df['city'])
df['state_encoded'] =labelencoder.fit_transform(df['state'])
df['job_encoded'] = labelencoder.fit_transform(df['job'])

In [53]:
X = df[['category_encoded', 'amt', 'gender_encoded', 'city_encoded', 'state_encoded', 'city_pop', 'job_encoded', 'age', 'hour', 'daily', 'day', 'month', 'is_fraud']]
input_features = ['category_encoded', 'amt', 'gender_encoded', 'city_encoded', 'state_encoded', 'city_pop', 'job_encoded', 'age', 'hour', 'daily', 'day', 'month']

In [54]:
df_train, df_val = train_test_split(X, test_size=0.1, random_state=42, stratify=X['is_fraud'])

In [55]:
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
scaler.fit(df_train[input_features])

df_train[input_features]=scaler.transform(df_train[input_features])
df_val[input_features]=scaler.transform(df_val[input_features])

In [56]:
X_train = df_train.iloc[:,:-1]
y_train = df_train.iloc[:,-1]
X_val = df_val.iloc[:,:-1]
y_val = df_val.iloc[:,-1]

In [57]:
X_train_torch = torch.FloatTensor(X_train.values)
X_val_torch = torch.FloatTensor(X_val.values)
y_train_torch = torch.FloatTensor(y_train.values)
y_val_torch = torch.FloatTensor(y_val.values)

In [58]:
class FraudDataset(torch.utils.data.Dataset):
    
    def __init__(self, x,output=True):
        'Initialization'
        self.x = x
        self.output = output

    def __len__(self):
        'Returns the total number of samples'
        return len(self.x)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample index
        item = self.x[index].to(DEVICE)
        if self.output:
            return item, item
        else:
            return item

In [59]:
train_set = FraudDataset(X_train_torch)
val_set = FraudDataset(X_val_torch)

In [60]:
train_loader_params = {'batch_size': 64,
              'shuffle': True,
              'num_workers': 0}
valid_loader_params = {'batch_size': 64,
              'num_workers': 0}
    
training_generator = torch.utils.data.DataLoader(train_set, **train_loader_params)
valid_generator = torch.utils.data.DataLoader(val_set, **valid_loader_params)

In [61]:
class Autoencoder(torch.nn.Module):
    
        def __init__(self, input_size, intermediate_size, code_size):
            super(Autoencoder, self).__init__()
            # parameters
            self.input_size = input_size
            self.intermediate_size = intermediate_size           
            self.code_size  = code_size
            
            self.relu = torch.nn.ReLU()   
            
            #encoder
            self.fc1 = torch.nn.Linear(self.input_size, self.intermediate_size)
            self.fc2 = torch.nn.Linear(self.intermediate_size, self.code_size)
            
            #decoder 
            self.fc3 = torch.nn.Linear(self.code_size, self.intermediate_size)            
            self.fc4 = torch.nn.Linear(self.intermediate_size, self.input_size)
            
            
        def forward(self, x):
            
            hidden = self.fc1(x)
            hidden = self.relu(hidden)
            
            code = self.fc2(hidden)
            code = self.relu(code)
 
            hidden = self.fc3(code)
            hidden = self.relu(hidden)
            
            output = self.fc4(hidden)
            #linear activation in final layer)            
            
            return output

In [62]:
criterion = torch.nn.MSELoss().to(DEVICE)

In [63]:
def per_sample_mse(model, generator):
    
    model.eval()
    criterion = torch.nn.MSELoss(reduction="none")
    batch_losses = []
    
    for x_batch, y_batch in generator:
        # Forward pass
        y_pred = model(x_batch)
        # Compute Loss
        loss = criterion(y_pred.squeeze(), y_batch)
        loss_app = list(torch.mean(loss,axis=1).detach().cpu().numpy())
        batch_losses.extend(loss_app)
    
    return batch_losses

In [64]:
model = Autoencoder(X_train.shape[1], 200, 20).to(DEVICE)
losses = per_sample_mse(model, valid_generator)

In [65]:
print(losses[0:5])
print(np.mean(losses))

[0.6762004, 0.98505133, 1.1826674, 0.44613016, 0.6240214]
1.0510426


In [66]:
model = Autoencoder(len(input_features), 200, 20).to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)

In [67]:
import time
def training_loop(model,training_generator,valid_generator,optimizer,criterion,max_epochs=100,apply_early_stopping=True,patience=4,verbose=False):
    #Setting the model in training mode
    model.train()

    if apply_early_stopping:
        early_stopping = EarlyStopping(verbose=verbose,patience=patience)
    
    all_train_losses = []
    all_valid_losses = []
    
    #Training loop
    start_time=time.time()
    for epoch in range(max_epochs):
        model.train()
        train_loss=[]
        for x_batch, y_batch in training_generator:
            optimizer.zero_grad()
            # Forward pass
            y_pred = model(x_batch)
            # Compute Loss
            loss = criterion(y_pred.squeeze(), y_batch)
            # Backward pass
            loss.backward()
            optimizer.step()   
            train_loss.append(loss.item())
        
        #showing last training loss after each epoch
        all_train_losses.append(np.mean(train_loss))
        if verbose:
            print('')
            print('Epoch {}: train loss: {}'.format(epoch, np.mean(train_loss)))
        #evaluating the model on the test set after each epoch    
        valid_loss = evaluate_model(model,valid_generator,criterion)
        all_valid_losses.append(valid_loss)
        if verbose:
            print('valid loss: {}'.format(valid_loss))
        if apply_early_stopping:
            if not early_stopping.continue_training(valid_loss):
                if verbose:
                    print("Early stopping")
                break
        
    training_execution_time=time.time()-start_time
    return model,training_execution_time,all_train_losses,all_valid_losses

class EarlyStopping:
    
    def __init__(self, patience=6, verbose=False):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = np.Inf
    
    def continue_training(self,current_score):
        if self.best_score > current_score:
            self.best_score = current_score
            self.counter = 0
            if self.verbose:
                print("New best score:", current_score)
        else:
            self.counter+=1
            if self.verbose:
                print(self.counter, " iterations since best score.")
                
        return self.counter <= self.patience 

def evaluate_model(model,generator,criterion):
    model.eval()
    batch_losses = []
    for x_batch, y_batch in generator:
        # Forward pass
        y_pred = model(x_batch)
        # Compute Loss
        loss = criterion(y_pred.squeeze(), y_batch)
        batch_losses.append(loss.item())
    mean_loss = np.mean(batch_losses)    
    return mean_loss

In [69]:
model,training_execution_time,train_losses,valid_losses = training_loop(model,
                                                                        training_generator,
                                                                        valid_generator,
                                                                        optimizer,
                                                                        criterion,
                                                                        max_epochs=500,
                                                                        patience=2,
                                                                        verbose=True)


Epoch 0: train loss: 0.0008532465994741911
valid loss: 0.00014257329165789982
New best score: 0.00014257329165789982

Epoch 1: train loss: 0.0001308609646210364
valid loss: 2.1291473124453633e-05
New best score: 2.1291473124453633e-05


KeyboardInterrupt: 

In [70]:
losses = per_sample_mse(model, valid_generator)
print(losses[0:5])
print(np.mean(losses))

[3.0029973e-06, 2.7826652e-06, 9.516395e-06, 1.2725758e-06, 4.191075e-06]
6.0822986e-06


In [71]:
genuine_losses = np.array(losses)[y_val_torch.cpu().numpy() == 0]
fraud_losses = np.array(losses)[y_val_torch.cpu().numpy() == 1]
print("Average fraud reconstruction error:", np.mean(fraud_losses))
print("Average genuine reconstruction error:", np.mean(genuine_losses))

Average fraud reconstruction error: 4.955523e-05
Average genuine reconstruction error: 5.8290498e-06


In [72]:
predictions_df=df_val
predictions_df['predictions']=losses

In [73]:
predictions_df

Unnamed: 0,category_encoded,amt,gender_encoded,city_encoded,state_encoded,city_pop,job_encoded,age,hour,daily,day,month,is_fraud,predictions
295133,-0.568910,0.216224,-0.909987,-0.337550,0.929411,-0.248573,-1.206866,1.395357,-0.851290,1.065584,0.877664,-0.334296,0,0.000003
309036,0.197485,-0.231450,1.098917,1.251784,-1.164268,-0.264084,1.555412,0.244683,1.348840,1.518580,-0.486978,-0.334296,0,0.000003
877558,-1.590770,0.604477,-0.909987,1.429666,1.138779,0.183478,1.291318,-0.618322,1.202164,0.612587,0.877664,1.714365,0,0.000010
696676,-0.313445,-0.098344,-0.909987,0.215430,1.138779,-0.285640,0.634652,0.187150,0.615463,0.952334,-0.032098,1.129033,0,0.000001
351866,-1.079840,-0.087470,1.098917,1.688754,1.557515,-0.275749,-0.457411,-0.388187,-0.411264,-0.180158,0.422783,-0.041631,0,0.000004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1115045,-1.590770,0.142411,-0.909987,1.124173,-0.466375,-0.292969,-0.007738,-0.158052,0.908814,-0.859653,-0.486978,-0.626962,0,0.000001
81445,-0.057980,0.372293,-0.909987,0.996563,1.138779,-0.233963,-0.086252,-0.963524,1.495515,0.159590,1.332544,-1.212294,0,0.000004
1105080,-0.568910,0.283193,1.098917,0.192228,-0.187218,-0.290813,1.583962,-0.042985,-1.291316,-1.199400,1.332544,-0.626962,0,0.000001
838805,0.452949,-0.405876,-0.909987,-1.478312,0.999200,-0.267485,-0.136216,1.913160,-0.704615,-0.293407,0.422783,1.714365,0,0.000003


In [74]:
from sklearn import metrics
print(metrics.roc_auc_score(predictions_df["is_fraud"], predictions_df["predictions"]))

print(metrics.average_precision_score(predictions_df["is_fraud"], predictions_df["predictions"]))

0.8805869551365572
0.11018754150552329


In [75]:
metrics.average_precision_score(predictions_df["is_fraud"], predictions_df["predictions"])

0.11018754150552329

In [77]:
predictions_df[predictions_df['is_fraud'] == 1]

Unnamed: 0,category_encoded,amt,gender_encoded,city_encoded,state_encoded,city_pop,job_encoded,age,hour,daily,day,month,is_fraud,predictions
1062627,0.452949,4.428480,1.098917,0.838016,-0.815321,-0.293552,0.249218,0.359751,1.495515,0.046341,-1.396740,-0.919628,1,0.000031
4808,-0.824375,-0.357712,1.098917,1.499272,-1.862160,-0.293526,-1.727916,1.970694,-1.878017,-1.312649,0.422783,-1.504960,1,0.000003
207924,1.219344,5.054546,1.098917,-0.588904,1.138779,-0.137939,-0.785743,0.474818,1.495515,-0.066908,-1.396740,-0.626962,1,0.000049
93788,-0.568910,1.602485,-0.909987,0.752942,0.510675,-0.279330,-1.064112,2.776166,-1.437991,1.065584,-1.396740,-1.212294,1,0.000036
197243,-0.568910,1.596217,-0.909987,0.161292,1.138779,-0.288074,-0.999873,0.474818,1.495515,-0.519905,-0.032098,-0.626962,1,0.000016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
975620,1.219344,6.462298,1.098917,1.491538,-1.583003,-0.293582,-0.771468,0.417285,1.495515,1.631829,-0.032098,-1.504960,1,0.000202
554183,-0.568910,1.445329,-0.909987,0.942425,0.929411,-0.283526,-0.421722,0.704953,-1.878017,1.065584,1.332544,0.543701,1,0.000014
522167,1.219344,5.613003,1.098917,1.112572,0.789833,-0.291280,-0.314657,-0.503254,1.348840,-0.406656,-1.396740,0.543701,1,0.000065
908990,0.197485,-0.407987,1.098917,-1.296564,0.650254,-0.292446,1.319869,1.855627,1.348840,1.405331,0.877664,1.714365,1,0.000005


In [32]:
metrics.classification_report(predictions_df["is_fraud"], predictions_df["predictions"])

ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [33]:
predictions_df['predictions'].apply(torch.nn.Sigmoid())

TypeError: sigmoid(): argument 'input' (position 1) must be Tensor, not float

In [34]:
from sklearn.ensemble import IsolationForest

anomalyclassifier = IsolationForest(random_state=SEED, n_estimators=40)
anomalyclassifier.fit(df_train[input_features])

IsolationForest(n_estimators=40, random_state=42)

In [35]:
predictions_df = df_val
predictions_df['predictions'] = -anomalyclassifier.score_samples(df_val[input_features])

In [36]:
metrics.roc_auc_score(predictions_df["is_fraud"], predictions_df["predictions"])

0.8566069207897851