In [45]:
import os
import torch
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import math
from transformers import AutoTokenizer, BertTokenizer
from torch.utils.data import Dataset, DataLoader
from torch import nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, precision_recall_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
# Move to the project root so the relative paths used below (`src/...`) resolve.
# The location can be overridden through the TRANSFORMERS_IMPLEMENTATION_ROOT
# environment variable; the original checkout path is kept as the fallback so
# existing setups behave exactly as before.
os.chdir(
    os.environ.get(
        'TRANSFORMERS_IMPLEMENTATION_ROOT',
        '/Users/bachirzerroug/Documents/transformers-implementation',
    )
)

In [3]:
from src.local_transformers import Encoder
from src.utils import EarlyStopping, CustomDataset, PreTrainedTokenizer

# Import data

In [208]:
# Load the raw transactions and lower-case every column name in one pass
tr_raw = pd.read_csv('src/data/transactions.csv').rename(columns=str.lower)


In [209]:
# Preview the raw frame (the notebook display truncates it to the first and last rows)
tr_raw

Unnamed: 0,customer_id,tx_datetime,tx_amount,sector_id,tx_fraud
0,2541,2023-01-01 00:21:42,499.210846,11,False
1,1735,2023-01-01 00:21:59,58.824293,3,False
2,669,2023-01-01 00:36:40,91.967189,12,False
3,4235,2023-01-01 00:51:34,836.571053,16,False
4,2162,2023-01-01 01:15:23,1537.707662,19,False
...,...,...,...,...,...
1229916,3367,2023-06-18 13:37:35,1688.437989,19,False
1229917,488,2023-06-18 13:37:40,679.409104,8,False
1229918,292,2023-06-18 13:37:42,128.156666,2,False
1229919,2036,2023-06-18 13:37:42,953.330547,15,False


In [211]:
# Parse the transaction time strings into real datetimes (updates tr_raw in place)
tr_raw['tx_datetime'] = pd.to_datetime(tr_raw['tx_datetime'])

# Chronologically ordered copy; all later feature engineering works on df_tr
df_tr = tr_raw.sort_values('tx_datetime')

# Turn the boolean fraud flag into 0 / 1 integers
df_tr['tx_fraud'] = df_tr['tx_fraud'].mul(1)

df_tr

Unnamed: 0,customer_id,tx_datetime,tx_amount,sector_id,tx_fraud
0,2541,2023-01-01 00:21:42,499.210846,11,0
1,1735,2023-01-01 00:21:59,58.824293,3,0
2,669,2023-01-01 00:36:40,91.967189,12,0
3,4235,2023-01-01 00:51:34,836.571053,16,0
4,2162,2023-01-01 01:15:23,1537.707662,19,0
...,...,...,...,...,...
1229916,3367,2023-06-18 13:37:35,1688.437989,19,0
1229917,488,2023-06-18 13:37:40,679.409104,8,0
1229918,292,2023-06-18 13:37:42,128.156666,2,0
1229919,2036,2023-06-18 13:37:42,953.330547,15,0


## Create time features

In [212]:
# Create timestamp column (Unix time in float seconds) that will be used in the model.
# Computed by subtracting the epoch instead of `astype(int) / 10 ** 9`: the integer
# cast silently assumes nanosecond resolution and a 64-bit default integer, neither
# of which holds on every pandas / platform combination.
# NOTE(review): assumes tx_datetime is timezone-naive, as parsed from the CSV strings.
df_tr['timestamp'] = (df_tr['tx_datetime'] - pd.Timestamp('1970-01-01')) / pd.Timedelta(seconds=1)

In [213]:
# Seconds elapsed since the same customer's previous transaction
# (df_tr is already sorted by tx_datetime, so `diff` compares consecutive events).
# `.dt.total_seconds()` is required: `.dt.seconds` only yields the seconds
# *component* (0-86399) of each timedelta and silently drops whole days.
# A customer's first transaction has no predecessor and is flagged with -1.
df_tr['date_diff'] = (
    df_tr.groupby('customer_id')['tx_datetime']
    .diff()
    .dt.total_seconds()
    .fillna(-1)
)

In [214]:
# Cyclical (sin / cos) encoding for periodic time features
def cyclical_encoding(hour_of_day, day_of_week, month_of_year):
    """Project three period fractions onto the unit circle.

    Each argument is the position inside its period expressed as a fraction
    of one full turn (scalar or NumPy array); it is multiplied by 2*pi and
    passed through sine and cosine.

    Returns:
        A 6-tuple ``(hour_sin, hour_cos, day_sin, day_cos, month_sin, month_cos)``.
    """
    full_turn = 2 * np.pi

    encoded = []
    for fraction in (hour_of_day, day_of_week, month_of_year):
        angle = full_turn * fraction
        # sine first, cosine second, to keep the documented output order
        encoded.extend((np.sin(angle), np.cos(angle)))

    return tuple(encoded)

# Extract cyclical time-related features and encode them
def extract_cyclical_time_features(datetime):
    """Return the sin/cos encoding of a single timestamp's hour, weekday and month.

    Args:
        datetime: object exposing ``.hour``, ``.weekday()`` and ``.month``.

    Returns:
        The 6-tuple produced by ``cyclical_encoding``:
        ``(hour_sin, hour_cos, day_sin, day_cos, month_sin, month_cos)``.
    """
    # 24 distinct hours (0-23) share one turn of the circle. Dividing by 23 would
    # send hour 23 to a full turn, making it indistinguishable from hour 0.
    hour_of_day = datetime.hour / 24
    # weekday() is 0-6, so seven equally spaced positions
    day_of_week = datetime.weekday() / 7
    # month is 1-12; December lands on a full turn (same angle as 0), which still
    # leaves twelve distinct, equally spaced positions
    month_of_year = datetime.month / 12

    # Encode and return them
    return cyclical_encoding(hour_of_day, day_of_week, month_of_year)

# Build the full matrix of cyclical time encodings
def encode_time_related_features(df_tr_datetime):
    """Encode every timestamp of a Series and stack the results row-wise.

    Args:
        df_tr_datetime: pandas Series of timestamps; each element is passed to
            ``extract_cyclical_time_features``.

    Returns:
        A 2-D NumPy array with one row per input element and one column per
        value returned by ``extract_cyclical_time_features``.
    """
    # One tuple of encoded values per timestamp (object-dtype Series)
    per_row_encodings = df_tr_datetime.apply(extract_cyclical_time_features)
    # Stack the tuples into a bidimensional array
    return np.stack(per_row_encodings.to_numpy())

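# NOTE(review): commented-out draft that also appended an interval column to the
# encoded matrix; the names it uses are not defined in this part of the notebook.
# return np.concatenate((array_cyclical_time_features_stacked, array_interval[:, np.newaxis]), axis=1)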

# Normalize interval features

In [216]:
# Encode every transaction timestamp; the shape check below shows one row per
# transaction and six sin/cos columns
A = encode_time_related_features(df_tr['tx_datetime'])
A.shape

(1229921, 6)

## Preprocessing

In [203]:
# Normalize AMOUNT feature
def normalize_amount(df_tr, scaler=None, return_scaler=False):
    """Scale the ``tx_amount`` column and return it as a flat array.

    Args:
        df_tr: DataFrame containing a ``tx_amount`` column.
        scaler: already-fitted scaler exposing ``transform``. When ``None`` a new
            ``StandardScaler`` is fitted on ``df_tr`` itself.
        return_scaler: when True, also return the scaler that was used. This is
            the only way to recover the scaler fitted in the ``scaler is None``
            branch so that the same statistics can be applied to other splits.

    Returns:
        The scaled amounts as a 1-D array, or ``(scaled_amounts, scaler)`` when
        ``return_scaler`` is True.
    """
    amounts = df_tr[['tx_amount']]

    if scaler is None:
        # No scaler supplied: fit a fresh one on the data passed in
        scaler = StandardScaler()
        scaled = scaler.fit_transform(amounts)
    else:
        # Reuse the caller's fitted scaler without refitting
        scaled = scaler.transform(amounts)

    scaled = scaled.flatten()
    if return_scaler:
        return scaled, scaler
    return scaled


# Normalize datediff feature
def normalize_datediff(df_tr, scaler=None, return_scaler=False):
    """Scale the ``date_diff`` column and return it as a flat array.

    Args:
        df_tr: DataFrame containing a ``date_diff`` column.
        scaler: already-fitted scaler exposing ``transform``. When ``None`` a new
            ``MinMaxScaler`` is fitted on ``df_tr`` itself.
        return_scaler: when True, also return the scaler that was used. This is
            the only way to recover the scaler fitted in the ``scaler is None``
            branch so that the same range can be applied to other splits.

    Returns:
        The scaled intervals as a 1-D array, or ``(scaled_intervals, scaler)``
        when ``return_scaler`` is True.
    """
    intervals = df_tr[['date_diff']]

    if scaler is None:
        # No scaler supplied: fit a fresh min-max scaler on the data passed in
        scaler = MinMaxScaler()
        scaled = scaler.fit_transform(intervals)
    else:
        # Reuse the caller's fitted scaler without refitting
        scaled = scaler.transform(intervals)

    scaled = scaled.flatten()
    if return_scaler:
        return scaled, scaler
    return scaled
    

# Train Val Test split

In [233]:
# Chronological split (no shuffling):
#   train : strictly before VAL_START
#   val   : from VAL_START up to and including TEST_START (midnight)
#   test  : strictly after TEST_START, i.e. the most recent transactions
VAL_START = '2023-05-01'
TEST_START = '2023-05-24'

test_set = df_tr[df_tr['tx_datetime'] > TEST_START]
val_set = df_tr[(df_tr['tx_datetime'] >= VAL_START) & (df_tr['tx_datetime'] <= TEST_START)]
train_set = df_tr[df_tr['tx_datetime'] < VAL_START]

# The three windows must cover every transaction exactly once
assert len(train_set) + len(val_set) + len(test_set) == len(df_tr), (
    "train/val/test do not partition df_tr (missing tx_datetime values?)"
)

# Sanity check: fraud prevalence should be comparable across the splits.
# An absolute tolerance is used instead of comparing values rounded to 3 decimals,
# which fails spuriously when two nearly equal rates straddle a rounding boundary.
FRAUD_RATE_TOLERANCE = 1e-3
train_fraud_rate = train_set['tx_fraud'].mean()
val_fraud_rate = val_set['tx_fraud'].mean()
test_fraud_rate = test_set['tx_fraud'].mean()

assert abs(test_fraud_rate - train_fraud_rate) <= FRAUD_RATE_TOLERANCE, (
    f"test fraud rate {test_fraud_rate:.4f} differs from train fraud rate {train_fraud_rate:.4f}"
)
assert abs(val_fraud_rate - train_fraud_rate) <= FRAUD_RATE_TOLERANCE, (
    f"val fraud rate {val_fraud_rate:.4f} differs from train fraud rate {train_fraud_rate:.4f}"
)

In [None]:
# Columns selected from each split to build the X_* frames
features = ['customer_id', 'sector_id', 'tx_datetime', 'tx_amount', 'timestamp', 'date_diff']
# Label column (fraud flag, converted to 0/1 earlier) used to build the y_* series
target = 'tx_fraud'

In [None]:
# Separate every split into its input columns (X_*) and its label column (y_*)
X_train, y_train = train_set[features], train_set[target]
X_val, y_val = val_set[features], val_set[target]
X_test, y_test = test_set[features], test_set[target]