In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder



# Load the transactions export (';'-separated fields, ',' as decimal mark),
# parse the timestamp column, and derive two calendar features from it:
# the hour of the day and the day of the week.
df = (
    pd.read_csv(
        "../data/transactions_training_sept_oct_2023.csv",
        sep=';',
        decimal=',',
    )
    .assign(DATETIME_GMT=lambda frame: pd.to_datetime(frame['DATETIME_GMT']))
    .assign(
        hour=lambda frame: frame['DATETIME_GMT'].dt.hour,
        day_of_week=lambda frame: frame['DATETIME_GMT'].dt.dayofweek,
    )
)

# Display the resulting column names.
df.columns

Index(['ID_TRX', 'ID_CARD', 'DATETIME_GMT', 'AMOUNT', 'Anomaly_amount_1',
       'Anomaly_amount_2', 'Anomaly_amount_3', 'Anomaly_amount_4',
       'Anomaly_amount_5', 'Anomaly_amount_6', 'Anomaly_amount_7',
       'Anomaly_amount_8', 'FLAG_BEHAVIOUR_Anomaly_1',
       'FLAG_BEHAVIOUR_Anomaly_2', 'FLAG_BEHAVIOUR_Anomaly_3',
       'FLAG_BEHAVIOUR_Anomaly_4', 'FLAG_BEHAVIOUR_Anomaly_5',
       'FLAG_BEHAVIOUR_Anomaly_6', 'FLAG_BEHAVIOUR_Anomaly7',
       'FLAG_BEHAVIOUR_Anomaly_8', 'Anomaly_amount_9', 'Population_Anomaly_1',
       'Population_Anomaly_2', 'Population_Anomaly_3', 'Population_Anomaly_4',
       'Population_Anomaly_5', 'Population_Anomaly_6', 'Population_Anomaly_7',
       'Population_Anomaly_8', 'FLAG_FRAUD', 'hour', 'day_of_week'],
      dtype='object')

In [216]:
# Order rows by card and then by time so that "previous row within the card"
# means "previous transaction of that card".
df = df.sort_values(by=['ID_CARD', 'DATETIME_GMT'])

# Row-to-row differences inside each card; the first row of every card has
# no predecessor and therefore gets NaN in both new columns.
card_groups = df.groupby('ID_CARD')
seconds_since_previous = card_groups['DATETIME_GMT'].diff().dt.total_seconds()
amount_change = card_groups['AMOUNT'].diff()

df['time_since_last_transaction'] = seconds_since_previous
df['diff_to_last_transaction'] = amount_change
df

Unnamed: 0,ID_TRX,ID_CARD,DATETIME_GMT,AMOUNT,Anomaly_amount_1,Anomaly_amount_2,Anomaly_amount_3,Anomaly_amount_4,Anomaly_amount_5,Anomaly_amount_6,...,Population_Anomaly_4,Population_Anomaly_5,Population_Anomaly_6,Population_Anomaly_7,Population_Anomaly_8,FLAG_FRAUD,hour,day_of_week,time_since_last_transaction,diff_to_last_transaction
1890,196436685,17225409,2023-09-17 15:21:00,42.0,0.190476,0.020408,0.251323,0.763393,0.440917,,...,1.911244,1.796971,1.810102,-0.134921,-0.1,0,15,6,,
6479,196673339,17257728,2023-09-19 13:20:00,50.0,0.958333,0.761905,0.612500,0.346131,0.169171,,...,1.445445,1.349456,1.360486,1.333333,1.0,0,13,1,,
6649,196681534,17257728,2023-09-19 14:28:00,100.0,-0.020833,-0.119048,-0.193750,-0.326934,-0.415414,,...,0.222722,0.174728,0.180243,0.166667,0.0,0,14,1,4080.0,50.0
8076,196751655,17257728,2023-09-20 11:35:00,50.0,0.958333,0.761905,0.612500,0.346131,0.169171,,...,1.445445,1.349456,1.360486,0.333333,1.0,0,11,2,76020.0,-50.0
25807,197669941,17257728,2023-09-27 10:04:00,50.0,0.958333,0.761905,0.612500,0.346131,0.169171,-0.137061,...,1.445445,1.349456,1.360486,0.333333,0.8,0,10,2,599340.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78245,200401267,62196750,2023-10-16 15:36:00,20.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,,...,5.718217,5.540399,5.541565,-1.000000,-1.0,0,15,0,,
78014,200389215,62202209,2023-10-16 14:40:00,100.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,,...,0.343643,0.308080,0.308313,-1.000000,-1.0,0,14,0,,
79910,200487322,62203543,2023-10-17 10:57:00,20.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,1.000000,...,18.741509,18.612993,18.914795,-1.000000,-1.0,0,10,1,,
79991,200491787,62207463,2023-10-17 11:32:00,100.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,,...,2.948302,2.922599,2.982959,-1.000000,-1.0,0,11,1,,


In [217]:
# Columns to force to a numeric dtype: the amount, the two per-card deltas,
# the nine Anomaly_amount_* scores and the eight Population_Anomaly_* scores.
num_cols = (
    ['AMOUNT', 'time_since_last_transaction', 'diff_to_last_transaction']
    + [f'Anomaly_amount_{k}' for k in range(1, 10)]
    + [f'Population_Anomaly_{k}' for k in range(1, 9)]
)

# errors='coerce' turns any value that cannot be parsed into NaN instead of raising.
for column_name in num_cols:
    df[column_name] = pd.to_numeric(df[column_name], errors='coerce')

print(df['Population_Anomaly_8'])


1890    -0.1
6479     1.0
6649     0.0
8076     1.0
25807    0.8
        ... 
78245   -1.0
78014   -1.0
79910   -1.0
79991   -1.0
79990   -1.0
Name: Population_Anomaly_8, Length: 80000, dtype: float64


In [218]:
# Target vector.
y = df['FLAG_FRAUD']

# Re-derive the calendar features as categoricals, then map each distinct
# value to an ordinal code. OrdinalEncoder is imported in the first cell.
df['day_of_week'] = df['DATETIME_GMT'].dt.dayofweek.astype('category')
df['hour'] = df['DATETIME_GMT'].dt.hour.astype('category')

ordinal = OrdinalEncoder()
df[['day_of_week', 'hour']] = ordinal.fit_transform(df[['day_of_week', 'hour']])

# Model inputs. ID_CARD stays first: the scaling cell skips it by slicing
# selected_cols[1:].
# 'FLAG_BEHAVIOUR_Anomaly_6' used to be spelled as two adjacent string
# fragments split across lines, which only worked through implicit literal
# concatenation; it is now a single literal (same effective list).
# NOTE(review): FLAG_BEHAVIOUR_Anomaly_4 and FLAG_BEHAVIOUR_Anomaly_5 exist in
# df.columns but are not selected here - confirm this omission is intentional.
selected_cols = [
    'ID_CARD',
    'time_since_last_transaction', 'diff_to_last_transaction', 'AMOUNT',
    'Anomaly_amount_1', 'Anomaly_amount_2', 'Anomaly_amount_3', 'Anomaly_amount_4',
    'Anomaly_amount_5', 'Anomaly_amount_6', 'Anomaly_amount_7', 'Anomaly_amount_8',
    'FLAG_BEHAVIOUR_Anomaly_1', 'FLAG_BEHAVIOUR_Anomaly_2', 'FLAG_BEHAVIOUR_Anomaly_3',
    'FLAG_BEHAVIOUR_Anomaly_6', 'FLAG_BEHAVIOUR_Anomaly7', 'FLAG_BEHAVIOUR_Anomaly_8',
    'Anomaly_amount_9',
    'Population_Anomaly_1', 'Population_Anomaly_2', 'Population_Anomaly_3',
    'Population_Anomaly_4', 'Population_Anomaly_5', 'Population_Anomaly_6',
    'Population_Anomaly_7', 'Population_Anomaly_8',
    'hour', 'day_of_week',
]
X = df[selected_cols]


In [219]:
from sklearn.preprocessing import StandardScaler

# Standardize every selected feature except the first entry (ID_CARD),
# writing the scaled values back into df.
# NOTE(review): the scaler is fitted on all rows, before any train/test split
# is made, so statistics of the later test rows influence the scaling.
# NOTE(review): X was sliced from df before this cell, so it presumably still
# holds the unscaled values - confirm which version each model should use.
cols_to_scale = selected_cols[1:]

scaler = StandardScaler()
df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])


In [220]:
from sklearn.model_selection import train_test_split

# Random hold-out split of the feature frame X and the FLAG_FRAUD target y:
# 20% of the rows go to the test set, stratified on y so that both parts keep
# the fraud / non-fraud ratio, with a fixed seed.
# NOTE(review): the next cell reassigns X_train, X_test, y_train and y_test
# from a chronological split, so the result of this split is overwritten.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)



In [221]:
# Chronological hold-out: order all rows by timestamp, then train on the
# first 80% of rows and test on the remaining, most recent rows.
df = df.sort_values('DATETIME_GMT')
split_idx = int(len(df) * 0.8)

train_df, test_df = df.iloc[:split_idx], df.iloc[split_idx:]

X_train, y_train = train_df[selected_cols], train_df['FLAG_FRAUD']
X_test, y_test = test_df[selected_cols], test_df['FLAG_FRAUD']

In [222]:
import xgboost as xgb
from sklearn.metrics import f1_score

# 1. Class-imbalance weight: number of non-fraud rows per fraud row in the
#    training set, passed to XGBoost as scale_pos_weight.
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()
if n_positive == 0:
    # Dividing by a zero count would produce inf/nan instead of a usable
    # weight; fail early with an explicit message instead.
    raise ValueError(
        "y_train contains no fraud cases (FLAG_FRAUD == 1); "
        "cannot compute scale_pos_weight"
    )
scale_pos_weight = n_negative / n_positive

# 2. Define the model.
model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight,
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    random_state=1
)

# 3. Fit on the training data.
model.fit(X_train, y_train)

# 4. Predicted probability of the positive (fraud) class on the test set.
y_pred_proba = model.predict_proba(X_test)[:, 1]

# 5. Turn probabilities into class labels with a 0.5 threshold.
y_pred = (y_pred_proba > 0.5).astype(int)

# 6. Report the F1 score on the test set.
score = f1_score(y_test, y_pred)
print(f"Test F1 score: {score:.4f}")


Test F1 score: 0.5641


In [223]:
from sklearn.metrics import f1_score

# Sweep decision thresholds starting at 0.1 in steps of 0.01 (stopping before
# 0.9) and keep the first threshold that reaches the highest F1 score.
# If no threshold scores above 0, the defaults (0.5, 0) are kept.
# NOTE(review): the threshold is chosen on y_test, the same labels used for
# the reported score, so "Best F1" is an optimistic estimate.
candidate_thresholds = np.arange(0.1, 0.9, 0.01)
best_threshold, best_f1 = 0.5, 0

for t in candidate_thresholds:
    preds = (y_pred_proba > t).astype(int)
    f1 = f1_score(y_test, preds)
    if f1 > best_f1:
        best_threshold, best_f1 = t, f1

print(f"Best threshold: {best_threshold:.2f}, Best F1: {best_f1:.4f}")


Best threshold: 0.16, Best F1: 0.6087


In [224]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import numpy as np
import xgboost as xgb

# 5-fold stratified cross-validation over X / y with seeded shuffling.
# NOTE(review): inside the loop X_train, y_train, model, y_pred_proba,
# best_threshold and best_f1 are reassigned; earlier cells define variables
# with the same names, which therefore hold last-fold values afterwards.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Hyper-parameters shared by every fold; only scale_pos_weight is per-fold.
xgb_params = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    max_depth=5,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42,
)

f1_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # Class-imbalance weight: non-fraud rows per fraud row in this fold's training part.
    scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

    model = xgb.XGBClassifier(scale_pos_weight=scale_pos_weight, **xgb_params)
    model.fit(X_train, y_train)

    # Predicted probability of the positive (fraud) class on the validation part.
    y_pred_proba = model.predict_proba(X_val)[:, 1]

    # Keep the first threshold that reaches the highest F1 on this fold.
    # NOTE(review): the threshold is tuned on the same validation fold that is
    # scored, so the per-fold F1 values are optimistic.
    best_threshold, best_f1 = 0.5, 0
    for t in np.arange(0.1, 0.9, 0.01):
        preds = (y_pred_proba > t).astype(int)
        f1 = f1_score(y_val, preds)
        if f1 > best_f1:
            best_threshold, best_f1 = t, f1

    print(f"Fold {fold + 1}: Best threshold = {best_threshold:.2f}, F1 score = {best_f1:.4f}")
    f1_scores.append(best_f1)

# Summary across folds.
print(f"\nMean F1 score: {np.mean(f1_scores):.4f}")
print(f"Std F1 score: {np.std(f1_scores):.4f}")


Fold 1: Best threshold = 0.55, F1 score = 0.4348
Fold 2: Best threshold = 0.85, F1 score = 0.4545
Fold 3: Best threshold = 0.88, F1 score = 0.5455
Fold 4: Best threshold = 0.25, F1 score = 0.4865
Fold 5: Best threshold = 0.66, F1 score = 0.8696

Mean F1 score: 0.5582
Std F1 score: 0.1601
