In [1]:
# https://arxiv.org/pdf/2010.06479.pdf
# Credit card fraud detection using machine learning: A survey

# https://arxiv.org/pdf/2001.08922.pdf

In [2]:
%load_ext autoreload
%autoreload 2

In [2]:
import time
import pandas as pd
import numpy as np

from tqdm import tqdm
from multiprocessing import Pool

import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import roc_auc_score, f1_score, roc_curve, auc, confusion_matrix, precision_recall_curve

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import LSTM, BatchNormalization, Dense, Dropout
from tensorflow.keras import Model, Input, Sequential

In [3]:
RANDOM_STATE = 35

mpl.rcParams['figure.figsize'] = (12, 10)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

In [4]:
def filter_accounts(account_id):

    filtered_account = data_sorted[data_sorted[:,0]==account_id]
    
    sample_size = filtered_account.shape[0]
    
    value_to_pad = max_sample_size - sample_size
    
    filtered_account = np.pad(filtered_account, ((0,value_to_pad),(0,0)))   #padding after
    
    filtered_account = filtered_account.astype(np.float16)  

    return filtered_account

In [5]:
def group_by_time_sort(df, group_column="account_id", debug=False, parallelism=True):
    
    global data_sorted
    global max_sample_size
    
    unique_accounts = df[group_column].unique()
    len_unique_accounts = len(unique_accounts)
    max_sample_size = max(df.groupby(group_column)[group_column].count())
    
    account_index, processed_index = [df.columns.get_loc("account_id"), df.columns.get_loc("processed_at")]

    data_array = np.array(df, dtype=np.float32)
    data_sorted = data_array[np.lexsort((data_array[:,4],data_array[:,0]))]
        
    p = Pool(4, maxtasksperchild=1000)
        
    result = tqdm(p.imap(filter_accounts, unique_accounts), total=len_unique_accounts)
    
    grouped_sorted = np.array(list(result), dtype=np.float16)
    
    p.close()
    p.join()
    
    grouped_sorted = np.delete(grouped_sorted, (processed_index, account_index), axis=2)
    
    del data_sorted
              
    return grouped_sorted

In [6]:
# we cannot intefer with the time sequence, only with the sample

def split_train_test(data_array, test_size):
    
    np.random.shuffle(data_array)
    size_sample = data_array.shape[0]
    
    train_size = int(size_sample*(1-test_size))
    
    train = data_array[:train_size]
    test = data_array[train_size:]
    
    return train, test

In [7]:
data_ts = pd.read_csv("dataset/processed_features.csv")


In [8]:
data_ts.head()

Unnamed: 0,account_id,device_id,balance,processed_at,is_fraud,age_range,number_of_selfies_sent,time_client,cash_out_type_1,cash_out_type_2,cash_out_type_3,cash_out_type_6
0,0,0,0.000167,1542826000.0,0,0.0,5.0,878346.3,1,0,0,0
1,1,0,0.000533,1540263000.0,0,1.0,5.0,9689721.0,0,0,1,0
2,1,0,0.000347,1542655000.0,0,1.0,5.0,12081820.0,0,0,1,0
3,1,0,0.000572,1540837000.0,0,1.0,5.0,10264350.0,0,0,1,0
4,1,0,0.000346,1540998000.0,0,1.0,5.0,10425470.0,0,0,1,0


In [9]:
# seconds to days

data_ts.time_client = data_ts.time_client / (60*60*24)

data_ts.head()

Unnamed: 0,account_id,device_id,balance,processed_at,is_fraud,age_range,number_of_selfies_sent,time_client,cash_out_type_1,cash_out_type_2,cash_out_type_3,cash_out_type_6
0,0,0,0.000167,1542826000.0,0,0.0,5.0,10.166046,1,0,0,0
1,1,0,0.000533,1540263000.0,0,1.0,5.0,112.149552,0,0,1,0
2,1,0,0.000347,1542655000.0,0,1.0,5.0,139.835918,0,0,1,0
3,1,0,0.000572,1540837000.0,0,1.0,5.0,118.800374,0,0,1,0
4,1,0,0.000346,1540998000.0,0,1.0,5.0,120.665177,0,0,1,0


In [10]:
columns=['account_id', 'is_fraud', 'device_id', 'balance', 'processed_at',
       'age_range', 'number_of_selfies_sent', 'time_client', 'cash_out_type_1',
       'cash_out_type_2', 'cash_out_type_3', 'cash_out_type_6']

data_ts = data_ts[columns]

data_ts.head()

Unnamed: 0,account_id,is_fraud,device_id,balance,processed_at,age_range,number_of_selfies_sent,time_client,cash_out_type_1,cash_out_type_2,cash_out_type_3,cash_out_type_6
0,0,0,0,0.000167,1542826000.0,0.0,5.0,10.166046,1,0,0,0
1,1,0,0,0.000533,1540263000.0,1.0,5.0,112.149552,0,0,1,0
2,1,0,0,0.000347,1542655000.0,1.0,5.0,139.835918,0,0,1,0
3,1,0,0,0.000572,1540837000.0,1.0,5.0,118.800374,0,0,1,0
4,1,0,0,0.000346,1540998000.0,1.0,5.0,120.665177,0,0,1,0


In [11]:
data_ts.dtypes

account_id                  int64
is_fraud                    int64
device_id                   int64
balance                   float64
processed_at              float64
age_range                 float64
number_of_selfies_sent    float64
time_client               float64
cash_out_type_1             int64
cash_out_type_2             int64
cash_out_type_3             int64
cash_out_type_6             int64
dtype: object

In [12]:
data_ts[data_ts.device_id >= 60000]["device_id"].count()

41702

In [13]:
# np.float16 is in the range 0-65535 before turn to this dtype we have to see 
# if any os non binary columns go beyond this range

for column in data_ts.columns:
    
    max_columns_value = data_ts[column].max()
    
    if max_columns_value > 60000:
        
        print(f"Column {column} have the max value equals to {max_columns_value}")

Column account_id have the max value equals to 63722
Column device_id have the max value equals to 74927
Column processed_at have the max value equals to 1543114734.0


In [14]:
# the columns that pass the float16 limite, we gonna take them and turn and something
# that we can work using min max scaler
data_to_scale = data_ts[["account_id","device_id", "processed_at"]]
columns = data_to_scale.columns

scaler = MinMaxScaler()

columns_transformed = pd.DataFrame(scaler.fit_transform(data_to_scale), columns=columns)


In [15]:
display(columns_transformed.head())


Unnamed: 0,account_id,device_id,processed_at
0,0.0,0.0,0.911958
1,1.6e-05,0.0,0.131276
2,1.6e-05,0.0,0.859885
3,1.6e-05,0.0,0.306302
4,1.6e-05,0.0,0.355377


In [16]:
columns_transformed_back = pd.DataFrame(scaler.inverse_transform(columns_transformed), columns=columns)

display(columns_transformed_back.head())


Unnamed: 0,account_id,device_id,processed_at
0,0.0,0.0,1542826000.0
1,1.0,0.0,1540263000.0
2,1.0,0.0,1542655000.0
3,1.0,0.0,1540837000.0
4,1.0,0.0,1540998000.0


In [17]:
data_ts[["account_id","device_id", "processed_at"]] = columns_transformed_back

for column in data_ts.columns:

    max_columns_value = data_ts[column].max()

    if max_columns_value > 60000:

        print(f"Column {column} have the max value equals to {max_columns_value}")

Column account_id have the max value equals to 63722.0
Column device_id have the max value equals to 74927.0
Column processed_at have the max value equals to 1543114734.0


In [18]:
data_ts[["account_id","device_id", "processed_at"]] = columns_transformed

data_ts.head()

Unnamed: 0,account_id,is_fraud,device_id,balance,processed_at,age_range,number_of_selfies_sent,time_client,cash_out_type_1,cash_out_type_2,cash_out_type_3,cash_out_type_6
0,0.0,0,0.0,0.000167,0.911958,0.0,5.0,10.166046,1,0,0,0
1,1.6e-05,0,0.0,0.000533,0.131276,1.0,5.0,112.149552,0,0,1,0
2,1.6e-05,0,0.0,0.000347,0.859885,1.0,5.0,139.835918,0,0,1,0
3,1.6e-05,0,0.0,0.000572,0.306302,1.0,5.0,118.800374,0,0,1,0
4,1.6e-05,0,0.0,0.000346,0.355377,1.0,5.0,120.665177,0,0,1,0


In [19]:
data_ts.tail()

Unnamed: 0,account_id,is_fraud,device_id,balance,processed_at,age_range,number_of_selfies_sent,time_client,cash_out_type_1,cash_out_type_2,cash_out_type_3,cash_out_type_6
940930,0.999937,0,0.999947,1.328266e-05,0.484146,2.0,6.0,250.80859,0,1,0,0
940931,0.999953,0,0.99996,1.657759e-06,0.405173,2.0,10.0,219.574801,0,1,0,0
940932,0.999969,0,0.999973,6.087884e-07,0.960566,2.0,6.0,99.834953,0,1,0,0
940933,0.999984,0,0.999987,5.089921e-05,0.938725,0.0,5.0,65.747836,0,1,0,0
940934,1.0,0,1.0,7.265997e-05,0.915419,2.0,6.0,25.968988,0,1,0,0


In [20]:
data_ts[data_ts.is_fraud == 1].head()

Unnamed: 0,account_id,is_fraud,device_id,balance,processed_at,age_range,number_of_selfies_sent,time_client,cash_out_type_1,cash_out_type_2,cash_out_type_3,cash_out_type_6
1752,0.000879,1,2.7e-05,0.000152,0.695656,3.0,5.0,514.590262,0,0,1,0
1958,0.001161,1,2.7e-05,0.000117,0.017906,0.0,5.0,308.839157,0,1,0,0
2837,0.00193,1,2.7e-05,1.6e-05,0.30191,2.0,6.0,146.375151,0,1,0,0
3480,0.003092,1,2.7e-05,7e-06,0.120927,2.0,6.0,135.934172,0,1,0,0
3481,0.003092,1,2.7e-05,2.3e-05,0.119085,2.0,6.0,135.864195,0,1,0,0


In [21]:
for column in data_ts.columns:

    max_columns_value = data_ts[column].max()

    if max_columns_value > 60000:

        print(f"Column {column} have the max value equals to {max_columns_value}")


In [22]:
def split_indices_stritify(df, proportion, rate_positive):
 
    """
    Args:
    
    proportion - is the proportion of the original data we want to slice
    rate_positive - rate of the positive label of the imbaleced data
    
    """
    array_size = len(df)
    
    new_size = proportion*array_size
    
    positive_size = int(rate_positive * new_size)
    negative_size = int((1 - rate_positive) * new_size)
    
    positive_indicies = df.index[df["is_fraud"] == 1].tolist()[:positive_size]
    negative_indices = df.index[df["is_fraud"] == 0].tolist()[:negative_size]
    
    return positive_indicies, negative_indices

In [23]:
size_pos = len(data_ts[data_ts.is_fraud==1])
size_neg = len(data_ts[data_ts.is_fraud==0])

post_proportion =  size_pos/ (size_neg + size_pos)

post_proportion

0.0016313560447852402

In [24]:
positive_indicies, negative_indices = split_indices_stritify(data_ts, 0.2, post_proportion)

In [25]:
data_test = pd.concat([data_ts.iloc[negative_indices],
                       data_ts.iloc[positive_indicies]])

size_pos_new = len(data_test[data_test.is_fraud==1])
size_neg_new = len(data_test[data_test.is_fraud==0])

post_proportion_new =  size_pos_new/ (size_neg_new + size_pos_new)
neg_proportion_new = size_neg_new/ (size_neg_new + size_pos_new)

print("size", len(data_test), "postive rate", post_proportion_new)
data_test.head()

size 188187 postive rate 0.0016313560447852402


Unnamed: 0,account_id,is_fraud,device_id,balance,processed_at,age_range,number_of_selfies_sent,time_client,cash_out_type_1,cash_out_type_2,cash_out_type_3,cash_out_type_6
0,0.0,0,0.0,0.000167,0.911958,0.0,5.0,10.166046,1,0,0,0
1,1.6e-05,0,0.0,0.000533,0.131276,1.0,5.0,112.149552,0,0,1,0
2,1.6e-05,0,0.0,0.000347,0.859885,1.0,5.0,139.835918,0,0,1,0
3,1.6e-05,0,0.0,0.000572,0.306302,1.0,5.0,118.800374,0,0,1,0
4,1.6e-05,0,0.0,0.000346,0.355377,1.0,5.0,120.665177,0,0,1,0


In [26]:
data_array = group_by_time_sort(data_test, "account_id")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8431/8431 [00:11<00:00, 708.61it/s]


In [27]:
train, test = split_train_test(data_array, 0.2)
train, val = split_train_test(train, 0.2)

train_labels = train[:,:,0]
val_labels = val[:,:,0]
test_labels = test[:,:,0]

train_features = np.delete(train, 0, axis=2)
val_features = np.delete(val, 0, axis=2)
test_features = np.delete(test, 0, axis=2)

print('Training labels shape:', train_labels.shape)
print('Validation labels shape:', val_labels.shape)
print('Test labels shape:', test_labels.shape)

print('Training features shape:', train_features.shape)
print('Validation features shape:', val_features.shape)
print('Test features shape:', test_features.shape)

Training labels shape: (5395, 8427)
Validation labels shape: (1349, 8427)
Test labels shape: (1687, 8427)
Training features shape: (5395, 8427, 9)
Validation features shape: (1349, 8427, 9)
Test features shape: (1687, 8427, 9)


In [35]:
train_positive = len(train_labels[train_labels==1])
train_negative = len(train_labels[train_labels==0])

train_proportion = train_positive / (train_positive + train_negative)

val_positive = len(val_labels[val_labels==1])
val_negative = len(val_labels[val_labels==0])

val_proportion = val_positive / (val_positive + val_negative)

test_positive = len(test_labels[test_labels==1])
test_negative = len(test_labels[test_labels==0])

test_proportion = test_positive / (test_positive + test_negative)

print("Train proportion", train_proportion)
print("Test proportion", test_proportion)
print("Val proportion", val_proportion)

Train proportion 4.3331306439988945e-06
Test proportion 5.134933026756729e-06
Val proportion 3.254743590860082e-06


In [36]:
metrics = [tf.keras.metrics.BinaryAccuracy(name="ba"),
           tf.keras.metrics.FalsePositives(name="fp"),
           tf.keras.metrics.FalseNegative(name="fn"),
           tf.keras.metrics.AUC(name="ROCauc"),
           tf.keras.metrics.AUC(name="PRcurve", curve='PR')]

In [46]:
weigth_1 = (1/size_neg_new) * (size_pos_new+size_neg_new)/2
weigth_0 = (1/size_pos_new) * (size_pos_new+size_neg_new)/2

train_size = train_labels.shape[0]
val_size = val_labels.shape[0]

batch_size = 1024
epochs = 100
class_weight = {0:weigth_0, 1: weigth_1}
steps_per_epoch = None
validation_batch_size = None
validation_steps = None
workers = 0
use_multiprocessing=False


print("Class weigth:", class_weight)

Class weigth: {0: 306.49348534201954, 1: 0.5008170108579945}


In [47]:
def weighted_binary_cross_entropy(weights: dict, from_logits: bool = False):
    
    assert 0 in weights
    assert 1 in weights

    def weighted_cross_entropy_fn(y_true, y_pred):
        tf_y_true = tf.cast(y_true, dtype=y_pred.dtype)
        tf_y_pred = tf.cast(y_pred, dtype=y_pred.dtype)

        weights_v = tf.where(tf.equal(tf_y_true, 1), weights[1], weights[0])
        ce = K.binary_crossentropy(tf_y_true, tf_y_pred, from_logits=from_logits)
        loss = K.mean(tf.multiply(ce, weights_v))
        return loss

    return weighted_cross_entropy_fn

In [48]:

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_PRcurve', 
    verbose=1,
    patience=10,
    mode='max',
    restore_best_weights=True)

model = Sequential([
    LSTM(9, activation="sigmoid", return_sequences=True, input_shape=train_features.shape[1:3]),
    Dense(1, activation="sigmoid") 
])

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-2),
              loss=weighted_binary_cross_entropy(weights= class_weight,),
              metrics=metrics
)

model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_5 (LSTM)               (None, 8427, 9)           684       
                                                                 
 dense_3 (Dense)             (None, 8427, 1)           10        
                                                                 
Total params: 694
Trainable params: 694
Non-trainable params: 0
_________________________________________________________________


In [49]:
history = model.fit(
    x=train_features, y=train_labels, batch_size=batch_size, epochs=epochs, verbose='auto', 
    validation_data=(val_features, val_labels), shuffle=True, 
    steps_per_epoch=steps_per_epoch, validation_steps=validation_steps,
    validation_batch_size=validation_batch_size,
    workers=workers, use_multiprocessing=use_multiprocessing, 
    callbacks=[early_stopping]
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100

KeyboardInterrupt: 