# Initialization

In [None]:
def IsNotebook():
    """
    Detects the kind of interpreter the code is running in.

    Returns a tuple (shell, isnotebook, isgooglecolab), where shell is the class name of the
    IPython shell object, or None when get_ipython is not available.
    """
    #Shell class name -> (isnotebook, isgooglecolab)
    known_shells = {
        'ZMQInteractiveShell': (True, False),       #Jupyter notebook or qtconsole
        "Shell": (True, True),                      #Google Colab
        'TerminalInteractiveShell': (False, False), #Terminal running IPython
    }

    try:
        shell = get_ipython().__class__.__name__
    except NameError:
        return None, False, False #Probably standard Python interpreter

    #Any other shell type is treated as "not a notebook"
    isnotebook, isgooglecolab = known_shells.get(shell, (False, False))
    return shell, isnotebook, isgooglecolab
shell, isnotebook, isgooglecolab = IsNotebook()

if isnotebook:
    from IPython.core.display import display, HTML
    display(HTML("<style>.container { width:99% !important; }</style>"))
    if not isgooglecolab:
        try: #the jedi completer takes too long to complete words
            %config Completer.use_jedi = False
        except:
            pass

if isgooglecolab:
    try:
        from google.colab import drive
        drive.mount('/content/drive')
    except Error as e:
        print(e)

In [None]:
#######################
### General Imports ###
import os #Making sure we're using all CPU cores for faster calculations
os.environ["OMP_NUM_THREADS"] = str(os.cpu_count())
os.environ["OPENBLAS_NUM_THREADS"] = str(os.cpu_count())
os.environ["MKL_NUM_THREADS"] = str(os.cpu_count())
os.environ["VECLIB_MAXIMUM_THREADS"] = str(os.cpu_count())
os.environ["NUMEXPR_NUM_THREADS"] = str(os.cpu_count())

import sys #Printing version for posterity
print("Python version:", sys.version)

try: #Allows saving and loading of variables
    import pickle5 as pickle
except ImportError: #pickle5 is optional; falling back to the standard library module
    import pickle
#Printing version for posterity (using format_version when the module exposes no __version__)
print("Pickle version:", getattr(pickle, "__version__", pickle.format_version))

import gc #We can force garbage collection to free up RAM
import random #Enables use of random number of random choices
import warnings #Ability to create custom warnings, like warnings.warn("deprecated", DeprecationWarning)
import itertools #Needed for Confusion Matrix

if os.name == 'nt':
    from winsound import Beep #Uses the computer's speakers to alert you (e.g. when training is done)
from tqdm import tqdm #Iterations can show a progress bar (like in Training)
from collections import Counter #Allows for frequency counting similar with R's "table"
#######################


#####################
### Date and Time ###
import time #Gets the current time
import dateutil.parser #Allows for Date objects like dateutil.parser.parse("24/05/2021")
from pytz import timezone #Allows for timezones to be set. #pytz.all_timezones
from datetime import datetime #Allows for Datetime objects like current Datetime. #datetime.fromisoformat('2021-05-24')
#####################


###################
### Mathematics ###
import numpy as np #Working with numeric arrays
print("Numpy version:", np.__version__)
###################


#######################################
### Statistics and Machine Learning ###
#Utility
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler, MinMaxScaler #Various ways of scaling the data
from sklearn.model_selection import train_test_split #Functions for splitting datasets

#Metrics
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
#######################################


##################
### Dataframes ###
import pandas as pd
##################


#############
### Plots ###
import matplotlib.cm as cmap #Importing colormap for plots
import matplotlib.pyplot as plt #Allows use of Pyplot plots

import seaborn as sns #Allows use of Seaborn plots
sns.set() #Sets default plot theme
#############


######################
### String or Text ###
######################
import json #Can encode or decode JSON string objects


###################################
### Files, Directories, Folders ###
from pathlib import Path
###################################


################################
### Neural Network Libraries ###
#General
import torch
import torch.nn as nn
import torch.nn.functional as F
# from torchsummary import summary
if isgooglecolab:
    !pip install torchinfo
from torchinfo import summary #Needs +1 number before conv_input_size

#Data
from torch.utils.data import Dataset, DataLoader, TensorDataset

#Info and configuration
print()
print("PyTorch v" + torch.__version__)
IS_GPU_AVAILABLE = torch.cuda.is_available()
print(f"CUDA device available: {IS_GPU_AVAILABLE}")
if (torch.cuda.is_available()):
    print(str(torch.cuda.device_count()) + " devices available")
    for n in range(torch.cuda.device_count()):
        print("\t" + torch.cuda.get_device_name(n))
    print("cuda:", torch.cuda.current_device()) #It can give you information like the GPU is not supported
print("Num threads set to:", os.cpu_count())
torch.set_num_threads(os.cpu_count())
################################


########################
### Useful functions ###
def ZeroANumber(Number, MaxLength, ForceMaxLength = False):
    """
    Returns Number as a string, left-padded with zeros up to MaxLength characters.
    With ForceMaxLength the result is also cut down to its first MaxLength characters.
    """
    padded = str(Number).zfill(MaxLength)
    return padded[:MaxLength] if ForceMaxLength else padded

def SpaceAString(CurString, MaxLength, SpaceTheFront = True, ForceMaxLength = False, ForceRemoveFromFront = False):
    """
    Pads CurString with spaces up to MaxLength characters, and optionally truncates it.

    SpaceTheFront: pad on the left when True, on the right otherwise.
    ForceMaxLength: when the string is longer than MaxLength, cut it down to MaxLength.
    ForceRemoveFromFront: when cutting, drop characters from the start instead of the end.
    """
    excess = len(CurString) - MaxLength

    if excess < 0: #Shorter than requested: pad with spaces
        return CurString.rjust(MaxLength) if SpaceTheFront else CurString.ljust(MaxLength)

    if excess == 0 or not ForceMaxLength: #Exact fit, or truncation was not requested
        return CurString

    #Longer than requested: drop the surplus characters from the chosen side
    return CurString[excess:] if ForceRemoveFromFront else CurString[:-excess]

def SaveVariable(Variable, FileName):
    """Pickles Variable into FileName, creating the parent directory of FileName when it is missing."""
    Path(FileName).parent.absolute().mkdir(parents = True, exist_ok = True)

    with open(FileName, 'wb') as handle:
        handle.write(pickle.dumps(Variable))
    
def LoadVariable(FileName):
    """
    Reads back and returns the object that was pickled into FileName.
    Note: unpickling can execute arbitrary code, so only load files that come from a trusted source.
    """
    with open(FileName, "rb") as handle:
        return pickle.load(handle)

def init_seeds(seed, ForceCudaDeterministic = False):
    """
    Seeds the random number generators of Python, NumPy and PyTorch.

    seed: the integer seed handed to every generator.
    ForceCudaDeterministic: when True, also switches PyTorch to deterministic algorithms
        (operations without a deterministic implementation will then raise an error).
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed) #Seeds every CUDA device, not only the current one (silently ignored without CUDA)
    if ForceCudaDeterministic:
        torch.backends.cudnn.benchmark = False #The cuDNN autotuner may pick a different algorithm on each run
        torch.backends.cudnn.deterministic = True
        torch.use_deterministic_algorithms(True)


def plot_confusion_matrix(cm, classes, normalise = False, title = 'Confusion matrix', cmap = plt.cm.Blues):
    """
    Prints the confusion matrix `cm` and draws it as an annotated image.

    classes: the tick labels used on both axes.
    normalise: when True, every row is divided by its row sum before printing and plotting.
    title, cmap: the title and the colormap of the plot.
    """
    if not normalise:
        print('Confusion matrix, without normalisation')
        cell_format = 'd'
    else:
        row_totals = cm.sum(axis = 1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
        cell_format = '.2f'

    print(cm)

    plt.imshow(cm, interpolation = 'nearest', cmap = cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation = 45)
    plt.yticks(positions, classes)

    #Writing every cell's value on top of the image; cells above half of the maximum get white text
    half_of_max = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            text_colour = "white" if value > half_of_max else "black"
            plt.text(col, row, format(value, cell_format), horizontalalignment = "center", color = text_colour)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
    

device = "cuda" if torch.cuda.is_available() else "cpu"
# device = "cpu" #To FORCE CPU
print("device=", device)

# Data

In [None]:
path_root = f"{os.getcwd()}"
path_data = f"{path_root}/Data"
path_models = f"{path_root}/Models"
print(path_data, "\n")
print(path_root)
print(path_models, "\n")

●	Programmatically download and load into your favorite analytical tool the transactions data. 

In [None]:
#Downloading and extracting the transactions only when the extracted file is not already there
if not os.path.exists(f"{path_data}/transactions.txt"):
    #Using the standard library instead of the curl/wget/unzip shell commands, so that the cell
    #behaves the same on every operating system and does not depend on external tools being installed
    import urllib.request
    import zipfile

    os.makedirs(f"{path_data}", exist_ok = True)

    transactions_url = "https://raw.githubusercontent.com/CapitalOneRecruiting/DS/master/transactions.zip"
    transactions_zip = f"{path_data}/transactions.zip"
    urllib.request.urlretrieve(transactions_url, transactions_zip)

    with zipfile.ZipFile(transactions_zip) as archive:
        archive.extractall(path_data)

In [None]:
def GetDataFromJSONFile(FilePath):
    """
    Reads a file that holds one JSON document per line and returns the parsed lines as a DataFrame.

    Lines that cannot be parsed are reported (with their 0-based line index) and skipped.
    """
    records = []
    with open(FilePath) as fileinput: #Opening the file
        #Iterating over the file object streams it line by line instead of loading every line in memory first
        for i, line in enumerate(fileinput):
            try:
                records.append(json.loads(line)) #using JSON library to load the data
            except ValueError as exc: #json.JSONDecodeError is a subclass of ValueError
                print(f"Exception on line {i}:\n'{line}'\n\n{exc}")

    return pd.DataFrame(records)

In [None]:
XY = GetDataFromJSONFile(f"{path_data}/transactions.txt")

XY["transactionDateTime"] = pd.to_datetime(XY["transactionDateTime"], format="%Y-%m-%dT%H:%M:%S")

#currentExpDate is parsed as day/month/year, after prefixing the raw value with the first day of the month
XY["currentExpDay"] = "01/"
XY["currentExpDate"] = XY["currentExpDay"].str.cat(XY["currentExpDate"], sep = "")
XY["currentExpDate"] = pd.to_datetime(XY["currentExpDate"], format="%d/%m/%Y")
XY = XY.drop(["currentExpDay"], axis = 1)

XY["accountOpenDate"] = pd.to_datetime(XY["accountOpenDate"], format="%Y-%m-%d")
XY["dateOfLastAddressChange"] = pd.to_datetime(XY["dateOfLastAddressChange"], format="%Y-%m-%d")

#DataFrame.apply hands whole columns (Series) to the function, so an isinstance(x, bool) / isinstance(x, str) test on
#the column itself is never true. The conversions are therefore decided per column dtype and per element instead.
BoolColumns = XY.select_dtypes(include = "bool").columns
XY = XY.astype({Col: "int" for Col in BoolColumns}) #Booleans become 0/1
XY = XY.apply(lambda x: x.map(lambda v: v.strip() if isinstance(v, str) else v) if x.dtype == object else x) #Trimming the strings
XY = XY.replace('', np.nan) #Empty strings are treated as missing values

●	Please describe the structure of the data. Number of records and fields in each record?
●	Please provide some additional basic summary statistics for each field. Be sure to include a count of null, minimum, maximum, and unique values where appropriate.

In [None]:
XY[XY.columns.values[:15]].describe(include = 'all', datetime_is_numeric = True)

In [None]:
XY[XY.columns.values[15:]].describe(include = 'all', datetime_is_numeric = True)

In [None]:
#Percentage of missing values per column
nObservations = XY.count() #Number of non-missing values of every column (the "count" row of describe)
MissingPercent = ((len(XY) - nObservations) / len(XY) * 100).values
for ColumnName, Percent in zip(XY.columns.values, MissingPercent):
    print(f"{ColumnName}: {Percent:.2f}% missing")

In [None]:
#These columns are 100% missing, so let's drop them
XY = XY.drop(["echoBuffer", "merchantCity", "merchantState", "merchantZip", "posOnPremises", "recurringAuthInd"], axis = 1)
XY

●	Can you programmatically identify reversed and multi-swipe transactions?

●	What total number of transactions and total dollar amount do you estimate for the reversed transactions? For the multi-swipe transactions? (please consider the first transaction to be "normal" and exclude it from the number of transaction and dollar amount counts)

In [None]:
XY.dtypes

In [None]:
#This is one example of a Reversal.
#We can clearly see that there is 1 row for the actual purchase (the first transaction that is considered "normal")
#Therefore, this kind of duplicate can be programmatically caught by filtering for transactionType == 'REVERSAL'
XY.iloc[38:40, :]

In [None]:
ReversedTransactions = XY[XY["transactionType"] == 'REVERSAL']
print(f"total number of REVERSAL transactions:        {len(ReversedTransactions)},     {(len(ReversedTransactions) / len(XY) * 100):.2f}%")
print(f"total dollar amount of REVERSAL transactions: {ReversedTransactions['transactionAmount'].sum()}, {(ReversedTransactions['transactionAmount'].sum() / XY['transactionAmount'].sum() * 100):.2f}%")

In [None]:
#To capture the duplicate purchases for multi-swipe where one was charged twice, we'll focus on:
#  transactionType=="PURCHASES" and cardPresent==True and transactionAmount>0 where transactionDateTime between 2 consecutive purchases is small

#It seems there are 839 duplicate transactions, which amount to 0.2% of all purchases
#Rows are ordered by the identifying fields first and by time last, so that the row right above a purchase is the
#previous purchase of the same account/customer/card/merchant/amount (if there is one). Ordering by time alone lets
#purchases of other accounts fall between the two swipes, and the comparison with the previous row then misses the duplicate.
#The number of duplicates found is printed by the Counter below.
MultiSwipeKeys = ["accountNumber", "customerId", "cardLast4Digits", "merchantName", "transactionAmount"]
XY2 = XY.loc[(XY["transactionAmount"] > 0) & (XY["transactionType"] == 'PURCHASE') & (XY["cardPresent"] == True)][["transactionDateTime", "accountNumber", "customerId", "transactionAmount", "cardLast4Digits", "merchantName"]].sort_values(by = MultiSwipeKeys + ["transactionDateTime"])

#Creating the "Previous_" versions of each variable in question using the shift() function
XY2["Prev_transactionDateTime"] = XY2["transactionDateTime"].shift()
XY2["Prev_accountNumber"] = XY2["accountNumber"].shift()
XY2["Prev_customerId"] = XY2["customerId"].shift()
XY2["Prev_transactionAmount"] = XY2["transactionAmount"].shift()
XY2["Prev_cardLast4Digits"] = XY2["cardLast4Digits"].shift()
XY2["Prev_merchantName"] = XY2["merchantName"].shift()
XY2 = XY2.dropna() #The first row has no previous row to compare against

#Creating Duplicate status as discussed above: same identifying fields as the previous row, and less than 60 seconds apart
XY2["IsDuplicate"] = ((XY2["transactionDateTime"] - XY2["Prev_transactionDateTime"]) / pd.Timedelta(seconds = 1) < 60) &\
    (XY2["accountNumber"] == XY2["Prev_accountNumber"]) &\
    (XY2["customerId"] == XY2["Prev_customerId"]) &\
    (XY2["transactionAmount"] == XY2["Prev_transactionAmount"]) &\
    (XY2["cardLast4Digits"] == XY2["Prev_cardLast4Digits"]) &\
    (XY2["merchantName"] == XY2["Prev_merchantName"])

#Counting how many duplicates there are
counter = Counter(XY2["IsDuplicate"].values)
print(counter)

#Seeing the data we used to calculate things for a sanity test
XY2[XY2["IsDuplicate"] == True]

●	Did you find anything interesting about either kind of transaction?

In [None]:
#There seems to be another category of duplicates, where the same customer makes the same purchase again on the same day. These are not double-swipes: a double-swipe would be repeated within seconds, whilst some of these purchases are minutes, hours, or more apart.

#The amount of reversal (2.58%) is not negligible
#Multi-swipe transactions also seem to occur at a rate that is small, but noticeably above 0%.

●	Plot a histogram of the processed amounts of each transaction, the transactionAmount column.

In [None]:
plt.rcParams['figure.figsize'] = [8, 5.5]
print("transactionAmount")
sns.histplot(x = "transactionAmount", data = XY)
plt.show()

The transaction amount seems to follow a kind of power-law distribution

●	Report any structure you find and any hypotheses you have about that structure.

In [None]:
#############################
### Data Hyperparameters ####
Seed = 42 #The answer to Life, the Universe, and Everything.
CustomNAString = None
ImputationType = None #"Zero" "Univariate" "Multivariate"
batch_size = 65536
#############################


#############################
### Reading From One File ###
#Already done above, in Data exploration

XYColumns = XY.columns.values
#################


##################
### Imputation ###
#==For time series: Fill NAs with direction down-up, or mean of previous and next
#Optional imputation of missing values, selected by ImputationType ("Zero", "Univariate" or "Multivariate").
#Fitted imputers are cached under path_models and reused when the cached file already exists.
#NOTE(review): ImputationType is set to None above, so none of these branches currently runs.
#NOTE(review): X_NAs, RandomState, SimpleImputer and IterativeImputer are not defined or imported in the visible part of
#              this notebook - presumably left over from another project. They must be provided before enabling a branch.
if ImputationType == "Zero":
    #Trying filling NAs with 0
    X_NAs = np.nan_to_num(X_NAs)

elif ImputationType == "Univariate":
    #Trying Univariate Imputation
    if os.path.exists(f"{path_models}/UnivImput_NAs"):
        UnivImput_NAs = LoadVariable(f"{path_models}/UnivImput_NAs")
    else:
        UnivImput_NAs = SimpleImputer().fit(X_NAs)
        SaveVariable(UnivImput_NAs, f"{path_models}/UnivImput_NAs")
    X_NAs = UnivImput_NAs.transform(X_NAs)

elif ImputationType == "Multivariate":
    #Trying Multivariate Imputation
    if os.path.exists(f"{path_models}/MultivImput_NAs"):
        MultivImput_NAs = LoadVariable(f"{path_models}/MultivImput_NAs")
    else:
        MultivImput_NAs = IterativeImputer(random_state = RandomState, max_iter = 5, n_nearest_features = 200).fit(X_NAs)
        SaveVariable(MultivImput_NAs, f"{path_models}/MultivImput_NAs")
    X_NAs = MultivImput_NAs.transform(X_NAs)
####################


####################
### Handling NAs ###
#Dropping every row that has at least one missing value, and reporting how many rows were lost
NBeforeNADrop = len(XY)
XY = XY.dropna()
DroppedNARows = NBeforeNADrop - len(XY)
if DroppedNARows > 0: print("Dropped NA rows count:", DroppedNARows)

#Starting from 0 on every run, so that there is no need to look the name up in locals()/globals()
#(where a value left over from an earlier run of the cell could still be found)
DroppedCustomNARows = 0
if CustomNAString is not None:
    #Rows holding the custom "missing" marker are dropped as well
    NBeforeCustomNADrop = len(XY)
    XY = XY.replace(CustomNAString, np.nan, regex = False).dropna()
    DroppedCustomNARows = NBeforeCustomNADrop - len(XY)
    if DroppedCustomNARows > 0: print("Dropped custom NA rows count:", DroppedCustomNARows)
if DroppedNARows > 0 or DroppedCustomNARows > 0:
    print() #Blank line after the report(s)
####################


##########################################
### Dummy Variables / One-Hot-Encoding ###
acqCountry_DF = pd.get_dummies(XY["acqCountry"].str.strip().str.lower(), prefix = 'acqCountry')
merchantCountryCode_DF = pd.get_dummies(XY["merchantCountryCode"].str.strip().str.lower(), prefix = 'merchantCountryCode')
merchantCategoryCode_DF = pd.get_dummies(XY["merchantCategoryCode"].str.strip().str.lower(), prefix = 'merchantCategoryCode')
transactionType_DF = pd.get_dummies(XY["transactionType"].str.strip().str.lower(), prefix = 'transactionType')
XY = pd.concat([XY.drop(["acqCountry", "merchantCountryCode", "merchantCategoryCode", "transactionType"], axis = 1), acqCountry_DF, merchantCountryCode_DF, merchantCategoryCode_DF, transactionType_DF], axis = 1)
##########################################


##########################################
### Keeping only the Variables we need ###
DependentVarName = "isFraud"
OtherDependentVarName = list(set([]) - set([DependentVarName]))
# accountNumber #Irrelevant
# customerId #Irrelevant
# transactionDateTime #Useless (by its own)
# merchantName #Too many levels
# currentExpDate #Useless (by its own)
# accountOpenDate #Useless (by its own)
# dateOfLastAddressChange #Useless (by its own)
# cardCVV #Irrelevant
# enteredCVV #Irrelevant
# cardLast4Digits #Irrelevant
OtherVariablesToBeDropped = ["accountNumber", "customerId", "transactionDateTime", "merchantName", "currentExpDate", "accountOpenDate", "dateOfLastAddressChange", "cardCVV", "enteredCVV", "cardLast4Digits"]
ColumnsToBeDropped = list(set([DependentVarName] + OtherDependentVarName + OtherVariablesToBeDropped))
ColumnsToBeDropped = [Col for Col in ColumnsToBeDropped if (Col in XY.columns.values)]
ColumnsToKeepX = XY.columns.values[~pd.Series(XY.columns.values).isin(ColumnsToBeDropped).values]
ColumnsToKeepY = XY.columns.values[XY.columns.values == DependentVarName]
##########################################


################################
### Getting a Train/Test set ###
X = XY.loc[:, ColumnsToKeepX].values.astype(np.float32)
Y = XY.loc[:, ColumnsToKeepY].values.astype(np.float32)
if len(Y.shape) == 2 and Y.shape[1] == 1: #This is univariate, so let's get a vector for Y instead of a matrix
    Y = Y.squeeze()

N = X.shape[0]

TrainPerc = 0.8
#==Stratified Split
init_seeds(Seed)
TrainIndx, TestIndx = train_test_split(np.arange(X.shape[0]), test_size = 1 - TrainPerc, shuffle = True, stratify = Y, random_state = Seed)
X_Train = X[TrainIndx]
Y_Train = Y[TrainIndx]
X_Test = X[TestIndx]
Y_Test = Y[TestIndx]
################################


########################
### Scaling the Data ###
if os.path.exists(f"{path_models}/scaler"):
    print("!!\n!! Using saved scaler.\n!!\n")
    scaler = LoadVariable(f"{path_models}/scaler")
else:
    scaler = StandardScaler(with_mean = True, with_std = True).fit(X_Train)
    SaveVariable(scaler, f"{path_models}/scaler")
    
X_Train = scaler.transform(X_Train)
X_Test = scaler.transform(X_Test)
########################


##########################################
### Taking care of the Class Imbalance ###
X_Train = np.concatenate((X_Train, np.tile(X_Train[Y_Train == 1], (20, 1))))
Y_Train = np.concatenate((Y_Train, np.tile(Y_Train[Y_Train == 1], (20))))
##########################################


###################################
### Creating Dataset/Dataloader ###
TrainDataset = TensorDataset(torch.from_numpy(X_Train), torch.from_numpy(Y_Train))
train_loader = torch.utils.data.DataLoader(
    dataset = TrainDataset,
    batch_size = batch_size,
    shuffle = True,
    pin_memory = False
)

TestDataset = TensorDataset(torch.from_numpy(X_Test), torch.from_numpy(Y_Test))
test_loader = torch.utils.data.DataLoader(
    dataset = TestDataset,
    batch_size = batch_size,
    shuffle = False,
    pin_memory = False
)
###################################

#Problem dimensions. NTrain and D are read from the (oversampled) training matrix instead of being hard-coded,
#so that they stay correct when the data or the selected columns change.
K = 1 #A single output unit (binary classification)
NTrain, D = X_Train.shape
H1, W1 = 0, 0

print(f"\n{DependentVarName}") #The f prefix is needed for the variable name to be interpolated
sns.countplot(x = DependentVarName, data = XY)
plt.show()

XY

# Model

## Functional API

●	Each of the transactions in the dataset has a field called isFraud. Please build a predictive model to determine whether a given transaction will be fraudulent or not. Use as much of the data as you like (or all of it).

### Non Dynamic (Static)

In [None]:
#Static
class Net(nn.Module):
    """
    Static fully connected network: five blocks of Linear -> Tanh -> BatchNorm1d -> Dropout,
    followed by a final Linear layer with K outputs.

    K: number of output units.
    num_units: layer widths; num_units[0] is the input width and num_units[1..5] are the hidden widths.
    dropout: dropout probability of each of the five blocks.
    batchnorm_momentum: BatchNorm momentum of each of the five blocks.

    The bias switches are read from the module-level list `usebias` (six entries, the last one for the output layer).
    """
    def __init__(self, K, num_units, dropout, batchnorm_momentum):
        super().__init__()

        stack = []
        for idx in range(5): #The architecture is fixed to five hidden blocks
            width_in, width_out = num_units[idx], num_units[idx + 1]
            stack += [
                nn.Linear(in_features=width_in, out_features=width_out, bias=usebias[idx]),
                nn.Tanh(),
                nn.BatchNorm1d(width_out, eps=1e-05, momentum=batchnorm_momentum[idx], affine=True, track_running_stats=True),
                nn.Dropout(p=dropout[idx], inplace=False),
            ]
        stack.append(nn.Linear(in_features=num_units[5], out_features=K, bias=usebias[5])) #Output layer

        self.layers = nn.ModuleList(stack)

    def forward(self, x):
        #Feeding the input through the 21 layers, in the order they were registered
        out = x
        for layer in self.layers:
            out = layer(out)

        return out
print("Done")

## Hyper Parameters

In [None]:
conv_input_size = X_Train[0].shape if X_Train is not None else X[0].shape
input_size = np.prod(conv_input_size)
output_size = K
hn1 = D
print("conv_input_size: " + str(conv_input_size) + ", input_size: " + str(input_size) + ", D: " + str(D) + ", output_size: " + str(output_size))

ReluAlpha = 0.0 #0.01 def leakyRelu
EluAlpha = 0.8

layer_type = ['dense', 'dense', 'dense', 'dense', 'dense'] #"dense", "rnn", "gru", "lstm", "conv", "stridedconv", "convpool"
###
NUM = 1
num_units = [hn1, 50, 50, 10, 25, 25]
num_units = [num_units[0], *[n_unit * NUM for n_unit in num_units[1:]]]
###
activation = ["tanh"] + ["tanh"] * (len(layer_type)-1) #Avoid "relu" on RNN #None, "relu6" "relu", "elu", "softplus", "tanh", "sigmoid" #For RNNs (LSTM etc) tanh or relu only [check compatibility]
###
dropout    = [0.3] + [0.3] * (len(layer_type)-1) #Might be a bad idea on CNN as we're trying to find patterns
###
batchnorm = [True] * len(layer_type) #DOESNT WORK with Multistep-forecast (batch of 1) #Batchnorm already does regularization, so we usually don't need to add dropout as well
batchnorm_momentum = [0.1] * len(layer_type) #0.99 default Tensorflow, 0.1 Pytorch #used for the running_mean and running_var computation
###
usebias = [not batchnormlayer for batchnormlayer in batchnorm] + [True] #Length +1 because of the Output layer
#usebias = [True] * len(layer_type) + [True] #For CNN perhaps we want to have both BatchNorm + a Bias
###
l2_lamda = 0.005
mu = 0.8 #Momentum
print()
print("num_units", num_units)

## Optimization

### Structure

In [None]:
#Building the network and moving it to the selected device
model = Net(K, num_units, dropout, batchnorm_momentum).to(device)
print(device)
print(model)

#Training history and best values seen so far
train_losses = np.array([])
test_losses = np.array([])
train_best_loss = np.inf #np.inf instead of the np.Inf alias, which was removed in NumPy 2.0
test_best_loss = np.inf
Metric1 = 0 #Initialising as the worst possible value
Metric2 = np.nan
Metric3 = np.nan

In [None]:
summary(model, input_size = tuple([2, *conv_input_size]), device = device, verbose = 2, col_names = ["kernel_size", "output_size", "num_params", "mult_adds"])
pass

In [None]:
criterion = nn.BCEWithLogitsLoss() #Using Binary Cross Entropy loss function
print("Binary Classification")

learning_rate = 1e-3

optimiser = torch.optim.AdamW(model.parameters(), lr = learning_rate, betas = (mu, 0.999), weight_decay = l2_lamda, amsgrad = False) #LR=0.001 (0.9, 0.999)

### Prerequisite Functions

In [None]:
def AUCCalculation(Targets, Y_Prob, Y_Hat, K):
    """
    Calculates the ROC AUC of the predictions (binary when K == 1, weighted one-vs-rest otherwise).

    Targets, Y_Prob, Y_Hat: tensors or arrays; tensors are moved to the CPU and converted to NumPy first.
    Returns the AUC, or NaN (after a warning and a print of the observed classes) when the calculation fails.
    """
    def _as_numpy(values):
        return values.cpu().numpy() if isinstance(values, torch.Tensor) else values

    Targets, Y_Prob, Y_Hat = _as_numpy(Targets), _as_numpy(Y_Prob), _as_numpy(Y_Hat)
    is_binary = (K == 1)

    try:
        #Cares for performance both in Positives and Negatives (but may not fare well with heavy class imbalance)
        if is_binary:
            return roc_auc_score(Targets, Y_Prob) #Calculating AUC
        return roc_auc_score(Targets, Y_Prob, multi_class = "ovr", average = 'weighted') #Calculating Weighted AUC
    except Exception as exc:
        if is_binary:
            warnings.warn(f"\nAn error occurred in AUC calculation (probably because the random batch of data includes only 1 of the 2 classes?).\nThe error reads: {exc}")
        else:
            warnings.warn(f"\nAn error occurred in AUC calculation (probably because of missing classes in the random batch of data?).\nThe error reads: {exc}")
        print("set(Targets): ", list(set(Targets.reshape(-1))), "set(Outputs): ", list(set(Y_Hat.reshape(-1))))
        return np.nan

In [None]:
def F1ScoreCalculation(Targets, Y_Hat, K):
    """
    Calculates the F1 score of the predictions (binary when K == 1, weighted average otherwise).

    Targets, Y_Hat: tensors or NumPy arrays; tensors are moved to the CPU and converted to NumPy first,
        in the same way as AUCCalculation does.
    Returns the F1 score, or NaN (after a warning) when the calculation fails.
    """
    is_binary = (K == 1)

    try:
        #Converting inside the try block, so that a failed conversion is reported as a warning like any other failure
        if isinstance(Targets, torch.Tensor):
            Targets = Targets.cpu().numpy()
        if isinstance(Y_Hat, torch.Tensor):
            Y_Hat = Y_Hat.cpu().numpy()

        #Cares about balance between Precision and Recall (Sensitivity)
        if is_binary:
            CurMetric3 = f1_score(Targets, Y_Hat) #Calculating F1
        else:
            CurMetric3 = f1_score(Targets, Y_Hat, average = 'weighted') #Calculating Weighted F1
    except Exception as exc:
        CurMetric3 = np.nan
        if is_binary:
            warnings.warn(f"\nAn error occurred in F1 score calculation (probably because the random batch of data includes only 1 of the 2 classes?).\nThe error reads: {exc}")
        else:
            warnings.warn(f"\nAn error occurred in F1 score calculation (probably because of missing classes in the random batch of data?).\nThe error reads: {exc}")

    return CurMetric3

In [None]:
def PrintIterationMetrics(it, epochs, t0, train_loss, test_loss, first_metric, first_metric_Name, second_metric, second_metric_Name, third_metric, third_metric_Name, MaxTrainLossLen, MaxTestLossLen, MaxMetric1Len, MaxMetric2Len, MaxMetric3Len):
    """
    Prints one aligned progress line for epoch `it` (0-based) out of `epochs`.

    t0: the datetime the iteration started at; the printed duration is the time elapsed since then.
    train_loss, test_loss, *_metric, *_metric_Name: the values and the metric labels to print.
    Max*Len: the column widths used to right-align the values. On the first epoch (it == 0) they are
        re-initialised from the lengths of the formatted values.

    Returns the five column widths, so that the caller can pass them back in on the next epoch.
    """
    dt = datetime.now() - t0

    strTrainLoss = f"{train_loss:.4f}"
    strTestLoss = f"{test_loss:.4f}"
    strMetric1 = f'{first_metric:.3f}'
    strMetric2 = f'{second_metric:.3f}'
    strMetric3 = f'{third_metric:.3f}'
    if it == 0:
        MaxTrainLossLen = len(strTrainLoss)
        MaxTestLossLen = len(strTestLoss)
        MaxMetric1Len = len(strMetric1)
        MaxMetric2Len = len(strMetric2)
        MaxMetric3Len = len(strMetric3)
    #Every metric is padded to its own width (the second and third metric used to be padded to the width of the first one)
    print(f'Epoch {ZeroANumber(it+1, len(str(epochs)))}/{epochs}, Train Loss: {SpaceAString(strTrainLoss, MaxTrainLossLen)}, Test Loss: {SpaceAString(strTestLoss, MaxTestLossLen)}, {first_metric_Name}: {SpaceAString(strMetric1, MaxMetric1Len)}, {second_metric_Name}: {SpaceAString(strMetric2, MaxMetric2Len)}, {third_metric_Name}: {SpaceAString(strMetric3, MaxMetric3Len)}, Duration: {dt}')
    return MaxTrainLossLen, MaxTestLossLen, MaxMetric1Len, MaxMetric2Len, MaxMetric3Len

In [None]:
def UpdateMetricsAndSaveModel(model, train_loss, test_loss, train_best_loss, test_best_loss, CurMetric1, Metric1, CurMetric2, Metric2, CurMetric3, Metric3):
    """
    Keeps track of the best losses/metrics and saves the model's state dictionary when it improves.

    When test_loss is lower than test_best_loss, the state dictionary is saved to "model_dict.pt" and the
    current losses become the best ones. If, in addition, CurMetric1 is finite and at least as good as Metric1,
    the state dictionary is also saved to "acc_model_dict.pt" and the three current metrics become the best ones.

    Returns (train_best_loss, test_best_loss, Metric1, Metric2, Metric3) after the update.
    """
    if not (test_loss < test_best_loss): #No improvement in the test loss: nothing changes
        return train_best_loss, test_best_loss, Metric1, Metric2, Metric3

    torch.save(model.state_dict(), "model_dict.pt") #Saving Model's Dictionary

    if np.isfinite(CurMetric1) and CurMetric1 >= Metric1:
        torch.save(model.state_dict(), "acc_model_dict.pt") #Saving Model's Dictionary
        Metric1, Metric2, Metric3 = CurMetric1, CurMetric2, CurMetric3

    return train_loss, test_loss, Metric1, Metric2, Metric3

In [None]:
def PrintFinishingInformation(start_time, JustCalculateElapsedTime = False):
    """
    Returns the seconds elapsed since start_time (a time.time() timestamp).
    Unless JustCalculateElapsedTime is set, it also prints the current Europe/Athens date and time
    together with the elapsed seconds.
    """
    elapsed_time = time.time() - start_time
    if JustCalculateElapsedTime:
        return elapsed_time

    athens_now = datetime.now(timezone('Europe/Athens'))
    FinishedOn = athens_now.strftime("%a, %Y-%m-%d %H:%M %Z %z")
    print(f"\nDone ({FinishedOn}) Elapsed time: {round(elapsed_time, 1)} seconds")

    return elapsed_time

In [None]:
def TrainModel(model, optimiser, criterion, X_Train, Y_Train, K):
    """
    Performs a single optimisation step on the batch (X_Train, Y_Train).

    Returns the optimiser, the model outputs for the batch (flattened to a vector when K == 1) and the loss tensor.
    """
    model.train() #Training mode, so that things like dropout() are active
    optimiser.zero_grad() #Clearing the gradients of the previous step

    predictions = model(X_Train) #Forward pass
    #Target is a vector when K == 1, so the predictions are flattened to a vector too
    outputs = predictions.view(-1) if K == 1 else predictions

    loss = criterion(outputs, Y_Train) #Loss of this batch
    loss.backward() #Gradient of the loss with respect to the parameters
    optimiser.step() #Updating the parameters

    return optimiser, outputs, loss

In [None]:
def EvaluateModel(model, criterion, Inputs, Targets, K, ScalerToInverseTransform = None, numPredictors = len(ColumnsToKeepY)):
    """Evaluates model on one batch without tracking gradients.

    The loss is calculated on the raw model outputs (flattened to a vector when K == 1).
    Probabilities are then obtained with a sigmoid (K == 1) or a softmax over dim 1 (otherwise),
    and predictions with a 0.5 threshold (K == 1) or the argmax over dim 1 (otherwise).

    ScalerToInverseTransform and numPredictors are not used in the body.
    The default of numPredictors is evaluated when this function is defined, so ColumnsToKeepY
    must already exist at that point.

    Returns the tuple (Y_Prob, Y_Hat, loss_scalar, CurMetric1, CurMetric2, CurMetric3), where
    CurMetric1 is the accuracy and CurMetric2/CurMetric3 come from AUCCalculation/F1ScoreCalculation.
    """
    IsBinary = (K == 1)

    model.eval() #Evaluation mode, so that things like dropout() are deactivated
    with torch.no_grad(): #No gradient tracking outside the training part
        Logits = model(Inputs) #Forward direction of the Neural Net
        if IsBinary:
            Logits = Logits.view(-1) #Target is a vector, so predictions become a vector too

        #Loss on the raw outputs — assumes the criterion applies its own sigmoid/softmax; confirm against the criterion in use
        loss_scalar = criterion(Logits, Targets).item()

        if IsBinary:
            Y_Prob = torch.sigmoid(Logits) #Probability of the positive class
            Y_Hat = (Y_Prob >= 0.5) #Whether or not the probability reaches the threshold
        else:
            Y_Prob = torch.softmax(Logits, dim = 1) #Every slice along dim 1 sums to 1
            Y_Hat = torch.max(Y_Prob, 1)[1] #torch.max returns (max, argmax); the argmax is the prediction

        CurMetric1 = (Y_Hat == Targets).float().mean().cpu().numpy().squeeze() #Accuracy
        CurMetric2 = AUCCalculation(Targets, Y_Prob, Y_Hat, K)
        CurMetric3 = F1ScoreCalculation(Targets, Y_Hat, K)

    return Y_Prob, Y_Hat, loss_scalar, CurMetric1, CurMetric2, CurMetric3

In [None]:
def FixFormatAndDTypes(device, H1, W1, D, K, Inputs, Targets):
    """Returns Inputs and Targets as float Tensors located on device.

    NumPy arrays are first converted with torch.from_numpy; anything else is used as it is.
    H1, W1, D and K are accepted but not used in the body.
    """
    def _AsFloatTensorOnDevice(Data):
        #NumPy array -> Tensor (if needed), then move to the device, then cast to float
        AsTensor = torch.from_numpy(Data) if isinstance(Data, np.ndarray) else Data
        return AsTensor.to(device).float()

    #Targets are cast to float too: for Binary Classification float calculations take place on them, even though they technically are integers
    return _AsFloatTensorOnDevice(Inputs), _AsFloatTensorOnDevice(Targets)

### Stochastic Gradient Descent (Dataset)

#### Function

In [None]:
def batch_gd(model, criterion, optimiser, scheduler, train_loader, test_loader, epochs, PrintInfoEverynEpochs):
    """Trains model for `epochs` epochs on train_loader, evaluating it on test_loader after every epoch.

    In every epoch:
      * each training batch goes through FixFormatAndDTypes and then TrainModel (one optimiser step per batch);
      * each test batch goes through FixFormatAndDTypes and then EvaluateModel;
      * per-batch losses and metrics are combined into one value per epoch using an average weighted by the number of samples of each batch;
      * scheduler.step() is called once, if a scheduler was given;
      * UpdateMetricsAndSaveModel is called, which may save the model and update the best losses/metrics.
    Iteration metrics are printed through PrintIterationMetrics on the first epoch and on every PrintInfoEverynEpochs-th epoch.

    Besides its parameters, this function relies on module-level names:
      * device, H1, W1, D and K are read;
      * train_best_loss, test_best_loss, Metric1, Metric2 and Metric3 are declared global: they must already exist
        before the call (they are passed to UpdateMetricsAndSaveModel) and they are overwritten by it.

    Returns the tuple (train_losses, test_losses, train_best_loss, test_best_loss, Metric1, Metric2, Metric3, elapsed_time),
    where train_losses/test_losses are arrays with one value per epoch and elapsed_time comes from PrintFinishingInformation.
    """
    global train_best_loss #Assigning to globals()
    global test_best_loss #Assigning to globals()
    global Metric1 #Assigning to globals() #Accuracy for Classification or R2 for Regression
    global Metric2 #Assigning to globals() #AUC for Classification or Adj.R2 for Regression
    global Metric3 #Assigning to globals() #F1 Score for Classification or MAE for Regression
    MaxTrainLossLen, MaxTestLossLen, MaxMetric1Len, MaxMetric2Len, MaxMetric3Len = None, None, None, None, None #For output text formatting; fed back into PrintIterationMetrics on every printed epoch
    
    start_time = time.time() #To calculate the duration of the whole learning procedure
    model = model.to(device) #If there is a GPU, let's ensure model is sent to the GPU (device is a module-level name)
    
    train_losses = np.zeros(epochs) #Initialising the losses (one entry per epoch)
    test_losses = np.zeros(epochs)  #Initialising the losses (one entry per epoch)
    for it in range(epochs):
        t0 = datetime.now() #To calculate the duration of the current epoch
        train_loss = [] #Initialising the loss for current epoch (one entry per batch)
        train_weights = [] #Number of samples of every batch, used as weights when averaging
        
        #== Training ==#
        #for inputs, targets in train_loader:
        for inputs, targets in tqdm(train_loader, total = len(train_loader), leave = False):
            inputs, targets = FixFormatAndDTypes(device, H1, W1, D, K, inputs, targets) #Making sure we have Tensors of the correct Format and Data Type
            optimiser, outputs, loss = TrainModel(model, optimiser, criterion, inputs, targets, K) #Training the model on Train set
            train_loss.append(loss.item())
            train_weights.append(targets.shape[0])
        train_loss = np.average(train_loss, weights = train_weights) #Weighted average based on number of samples #Alternatively we could add all Iteration Losses instead of 1 Epoch loss, but the array length will be higher than Validation's
        
        #== Evaluation ==#
        test_loss = [] #One entry per batch, like the metrics below
        test_metric1 = []
        test_metric2 = []
        test_metric3 = []
        test_weights = [] #Number of samples of every batch, used as weights when averaging
        #for inputs, targets in test_loader:
        for inputs, targets in tqdm(test_loader, total = len(test_loader), leave = False):
            inputs, targets = FixFormatAndDTypes(device, H1, W1, D, K, inputs, targets) #Making sure we have Tensors of the correct Format and Data Type
            _, _, cur_test_loss, cur_test_metric1, cur_test_metric2, cur_test_metric3 = EvaluateModel(model, criterion, inputs, targets, K) #Evaluating the model on Evaluation set
            test_loss.append(cur_test_loss)
            test_metric1.append(cur_test_metric1)
            test_metric2.append(cur_test_metric2)
            test_metric3.append(cur_test_metric3)
            test_weights.append(targets.shape[0])
        test_loss = np.average(test_loss, weights = test_weights) #Weighted average based on number of samples
        CurMetric1 = np.average(test_metric1, weights = test_weights) #Weighted average based on number of samples
        CurMetric2 = np.average(test_metric2, weights = test_weights) #Weighted average based on number of samples
        CurMetric3 = np.average(test_metric3, weights = test_weights) #Weighted average based on number of samples

        if scheduler is not None:
            scheduler.step() #One scheduler step per epoch
        
        train_losses[it] = train_loss
        test_losses[it] = test_loss
        
        if (it + 1) % PrintInfoEverynEpochs == 0 or it == 0: #Printing information about the Loss and metric
            MaxTrainLossLen, MaxTestLossLen, MaxMetric1Len, MaxMetric2Len, MaxMetric3Len = PrintIterationMetrics( #Prints Iteration Metrics
                it, epochs, t0, train_loss, test_loss,
                CurMetric1, "Acc",
                CurMetric2, "AUC",
                CurMetric3, "F1",
                MaxTrainLossLen, MaxTestLossLen,
                MaxMetric1Len, MaxMetric2Len, MaxMetric3Len
            )
        
        train_best_loss, test_best_loss, Metric1, Metric2, Metric3 = UpdateMetricsAndSaveModel(model, train_loss, test_loss, train_best_loss, test_best_loss, CurMetric1, Metric1, CurMetric2, Metric2, CurMetric3, Metric3) #Updating Metrics and Saving the model if it outperforms previous iteration's model
    
    elapsed_time = PrintFinishingInformation(start_time) #Prints finishing information
    return train_losses, test_losses, train_best_loss, test_best_loss, Metric1, Metric2, Metric3, elapsed_time

#### Procedure

In [None]:
#Settings of this training run
GDType = "stochastic"
Epochs = 10
PrintInfoEverynEpochs = 1

#No learning rate scheduler by default; the alternative below can be enabled instead
scheduler = None
# scheduler = torch.optim.lr_scheduler.StepLR(optimiser, step_size = Epochs // 10 if Epochs > 10 else 3, gamma = 0.8)

#batch_gd reads the globals train_best_loss, test_best_loss, Metric1, Metric2 and Metric3, so they need to exist before this call
(
    new_train_losses, new_test_losses,
    train_best_loss, test_best_loss,
    Metric1, Metric2, Metric3,
    elapsed_time,
) = batch_gd(
    model, criterion, optimiser, scheduler, train_loader, test_loader,
    epochs = Epochs, PrintInfoEverynEpochs = PrintInfoEverynEpochs
)

#Extending the loss histories (train_losses and test_losses need to exist already) so that consecutive runs of this cell accumulate
train_losses, test_losses = np.append(train_losses, new_train_losses), np.append(test_losses, new_test_losses)
train_loss, test_loss = train_losses[-1], test_losses[-1] #Losses of the last epoch
print("\ntrain_best_loss:", train_best_loss, "test_best_loss:", test_best_loss, "Acc:", Metric1, "AUC:", Metric2, "F1:", Metric3)

In [None]:
#Restoring the parameters saved in "model_dict.pt", i.e. the ones written by UpdateMetricsAndSaveModel the last time the test loss improved
#map_location places the loaded tensors on the current device, so a checkpoint written on a GPU can still be loaded when only the CPU is available
model.load_state_dict(torch.load("model_dict.pt", map_location = device))
model.eval() #Putting model in evaluation mode so that things like dropout() are deactivated
pass #Presumably here so that the cell doesn't end on an expression and the notebook doesn't print the model

In [None]:
#(Re)initialising the loss histories as two separate empty arrays; the training cell appends the losses of every run to them
#NOTE(review): running this cell after training discards the history that the loss plot shows — confirm it is meant to be run before training
train_losses, test_losses = np.array([]), np.array([])

# Evaluation

## Binary Classification

In [None]:
#Plotting the train and test loss histories on the same axes
for LossHistory, LossLabel in ((train_losses, 'train loss'), (test_losses, 'test loss')):
    plt.plot(LossHistory, label = LossLabel)
plt.legend()
plt.show()

●	Provide an estimate of performance using an appropriate sample, and show your work.

In [None]:
#Final evaluation of the binary classifier: loss, Accuracy, AUC and F1 on the Train and Test loaders, plus the confusion matrix of the Test set
model.eval() #Putting model in evaluation mode so that things like dropout() are deactivated
with torch.no_grad(): #No gradients are needed for evaluation
    #== Train set ==#
    Actual_Y = np.array([])
    Y_Prob = np.array([])
    Y_Hat = np.array([])

    Weights = [] #Number of samples of every batch, used as weights when averaging
    train_loss = []
    train_Acc = []
    train_AUC = []
    train_F1s = []

    print("Stochastic GD")
    for inputs, targets in tqdm(train_loader, total = len(train_loader), leave = False):
        inputs, targets = FixFormatAndDTypes(device, H1, W1, D, K, inputs, targets)
        Actual_Y = np.append(Actual_Y, targets.cpu().numpy(), axis = 0)

        cur_Y_Prob, cur_Y_Hat, cur_train_loss, cur_train_Acc, cur_train_AUC, cur_train_F1 = EvaluateModel(model, criterion, inputs, targets, K)
        Y_Prob = np.append(Y_Prob, cur_Y_Prob.cpu().numpy(), axis = 0) #Prediction probability.
        Y_Hat = np.append(Y_Hat, cur_Y_Hat.cpu().numpy(), axis = 0) #Prediction.

        Weights.append(cur_Y_Prob.shape[0])
        train_loss.append(cur_train_loss)
        train_Acc.append(cur_train_Acc)
        train_AUC.append(cur_train_AUC)
        train_F1s.append(cur_train_F1)

    #Per-batch values are averaged with the batch sizes as weights
    train_loss = np.average(train_loss, weights = Weights)
    train_Acc = np.average(train_Acc, weights = Weights)
    train_AUC = np.average(train_AUC, weights = Weights)
    train_F1s = np.average(train_F1s, weights = Weights)

    #== Test set ==#
    #Probabilities are collected over the whole set, so Accuracy, AUC and F1 are calculated once on all samples instead of per batch
    Test_Actual_Y = np.array([])
    Pred = np.array([]) #Predicted probability of the positive class
    Test_Weights = [] #Number of samples of every batch, used as weights when averaging the loss
    test_loss = []
    for inputs, targets in tqdm(test_loader, total = len(test_loader), leave = False):
        inputs, targets = FixFormatAndDTypes(device, H1, W1, D, K, inputs, targets)

        #EvaluateModel already returns the probabilities and the loss of the batch, so the model is run only once per batch
        cur_Y_Prob, cur_Y_Hat, cur_test_loss, cur_test_Acc, cur_test_AUC, cur_test_F1 = EvaluateModel(model, criterion, inputs, targets, K)

        Test_Actual_Y = np.append(Test_Actual_Y, targets.cpu().numpy(), axis = 0)
        Pred = np.append(Pred, cur_Y_Prob.cpu().numpy(), axis = 0) #Prediction probability.
        test_loss.append(cur_test_loss)
        Test_Weights.append(targets.shape[0])

    test_loss = np.average(test_loss, weights = Test_Weights) #Weighted by batch size like the Train loss, so that a smaller last batch doesn't count as much as a full one
    Test_Y_Hat = (Pred >= 0.5) #One decision threshold shared by the Accuracy, the F1 score and the confusion matrix
    test_Acc = np.mean(Test_Y_Hat == Test_Actual_Y)
    test_AUC = roc_auc_score(Test_Actual_Y, Pred)
    test_F1 = f1_score(Test_Actual_Y, Test_Y_Hat.astype(float)) #Not np.round(Pred): it rounds half to even, so a probability of exactly 0.5 would become 0 and disagree with the threshold above

    print(f"Train loss: {train_loss:.3f}. Acc: {(train_Acc * 100.):.2f}%. AUC: {train_AUC:.3f}. F1: {train_F1s:.3f}")
    print(f"Test  loss: {test_loss:.3f}. Acc: {(test_Acc * 100.):.2f}%. AUC: {test_AUC:.3f}. F1: {test_F1:.3f}")
    print()

    #Confusion Matrix
    cm = confusion_matrix(Test_Actual_Y, Test_Y_Hat) #This should not use Y_Test on Stochastic
    plot_confusion_matrix(cm, [False, True])

●	Please explain your methodology (modeling algorithm/method used and why, what features/data you found useful, what questions you have, and what you would do next with more time)

In [None]:
#The first step was to transform the data from an unstructured JSON format into a structured tabular format that Machine Learning algorithms can understand.
#
#Preprocessing included things like transforming string fields representing datetimes into actual datetime fields, replacing missing values encoded as empty strings with actual missing values (NaNs), and so on.
#There is code for univariate and multivariate imputation, as well as replacing with 0s, controlled by the value of the variable "ImputationType". In this iteration, however, no imputation was used, and rows with missing values were removed since there is a vast amount of data.
#Categorical variables were one-hot-encoded, and these variables were removed since they were completely empty: echoBuffer, merchantCity, merchantState, merchantZip, posOnPremises, recurringAuthInd.
#
#merchantName simply has too many levels to be of any use, so it's been discarded.
#accountNumber, customerId, cardCVV, enteredCVV and cardLast4Digits are irrelevant and they change by person (many times they even change by purchase even for the same person), so they too were dropped.
#transactionDateTime, currentExpDate, accountOpenDate and dateOfLastAddressChange are dates and are useless for machine learning, at least in their current form. They can be used for feature engineering, however. For instance, engineering a feature of seconds passed since the last purchase.
#The rest of the variables were used on the model
#
#The Training/Testing set splitting was done in a stratified manner on account of the significant class-imbalance of the dataset. There is a 80% training set and 20% testing set.
#On the training set, there is also heavy oversampling of the minority class, so as to aid the training part to recognise it better.
#The dataset is also normalised before being passed to the model, transformed into a PyTorch dataset, and subsequently a data loader is created to yield batches of it in a random fashion.
#
#The model architecture is an instance of a Deep Feed-Forward Neural Network with Tanh activation functions, dropout regularization to avoid overfitting, batch-normalization which also aids in regularisation,
#    and 1 neuron with a sigmoid activation at the end to produce the probability of the class being the positive one. It should be noted, in PyTorch, the sigmoid is part of the loss function, which is why it's not included in the code.
#The loss function is the Binary Cross Entropy, and the optimiser is the AdamW variant of Stochastic Gradient Descent. There is also an L2 regularization to help with overfitting.
#
#The cost is plotted after training so we can observe its behaviour and tweak hyperparameters accordingly, and the evaluation includes the Accuracy (which is somewhat meaningless for class-imbalanced datasets), the Area Under the ROC Curve, and the F1 score.
#There is also a confusion matrix which shows the exact number of True Positives/Negatives and False Positives/Negatives.
#
#Lastly, there is a section for Saving and Loading the trained model, as well as visualising it.

# Saving the Model

In [None]:
print("path_root:", path_root, "\n")

#The folder name is made of the current date/time ('Europe/Athens') followed by the loss, Accuracy and AUC currently held in test_loss, Metric1 and Metric2
SaveFolderNameParts = [
    datetime.now(timezone('Europe/Athens')).strftime("%Y-%m-%d %H-%M"),
    "loss " + "{0:.2f}".format(test_loss),
    "Acc " + "{0:.2f}".format(Metric1),
    "AUC " + "{0:.2f}".format(Metric2),
]
SaveFolder = f"{path_root}/Models/" + ", ".join(SaveFolderNameParts)
print(SaveFolder)
os.makedirs(SaveFolder, exist_ok = True) #Creating the folder (and any missing parents); no error if it exists already

In [None]:
### Saving the Model ###
#Writing the Model's Dictionary (its parameters) as "model_dict.pt" inside SaveFolder
torch.save(model.state_dict(), f"{SaveFolder}/model_dict.pt")

# Loading the Model

In [None]:
#Hard-coded folder of a previously saved model, relative to path_root
#NOTE(review): this path uses backslashes and a month without zero-padding ("2022-4-22"), whereas the saving cell builds SaveFolder with forward slashes and strftime("%Y-%m-%d") — confirm it matches the folder that is actually on disk
SaveFolder = fr"{path_root}\Models\2022-4-22 12-57, loss 0.26, Acc 0.92, AUC 0.78"
print(f"Using explicitly defined SaveFolder = {SaveFolder}")

#Creating a new instance of the network and sending it to the device
model = Net(K, num_units, dropout, batchnorm_momentum).to(device)
#NOTE(review): the loading line below is commented out, so model keeps the parameters Net was constructed with instead of the saved ones — presumably disabled on purpose; confirm before relying on this model
#model.load_state_dict(torch.load(SaveFolder + "/model_dict.pt"))
model.eval() #Putting model in evaluation mode
pass #Presumably here so that the cell doesn't end on an expression and the notebook doesn't print the model

# Visualization

In [None]:
#conda install graphviz python-graphviz
!pip install hiddenlayer
import hiddenlayer as hl
if "TrainDataset" in locals() or "TrainDataset" in globals():
    hl_graph = hl.build_graph(model, torch.zeros(list([2, *conv_input_size]), device = device), )
else:
    hl_graph = hl.build_graph(model, torch.zeros(list(conv_input_size), device = device), )
hl_graph.theme = hl.graph.THEMES["blue"].copy()
display(hl_graph)

In [None]:
!pip install torchviz
from torchviz import make_dot
if "TrainDataset" in locals() or "TrainDataset" in globals():
    display(make_dot(model(next(iter(TrainDataset))[0][np.newaxis].float().to(device)), params = dict(list(model.named_parameters())), show_attrs = False, show_saved = False)) #.render("NN", format = "png")
else:
    display(make_dot(model(torch.from_numpy(X_Test).to(device)), params = dict(list(model.named_parameters())), show_attrs = False, show_saved = False)) #.render("NN", format = "png")