In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, roc_curve, auc, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [2]:
data_path = 'D:/University documents/Year 4/Data Science Challange/transactions_training_sept_oct_2023(1).csv'
train = pd.read_csv(
    data_path,
    sep=";",               # séparateur de colonnes
    decimal=",",           # virgule comme séparateur décimal
    na_values="NA",        # valeur manquante
    parse_dates=["DATETIME_GMT"]  # convertir la colonne date
)

test_data_path = 'D:/University documents/Year 4/Data Science Challange/transactions_unclassified_2023(1).csv'
test = pd.read_csv(
    test_data_path,
    sep=";",               # séparateur de colonnes
    decimal=",",           # virgule comme séparateur décimal
    na_values="NA",        # valeur manquante
    parse_dates=["DATETIME_GMT"]  # convertir la colonne date
)

In [3]:
print(train["FLAG_FRAUD"].value_counts())
print(train["FLAG_FRAUD"].value_counts(normalize=True) * 100)

FLAG_FRAUD
0    79908
1       92
Name: count, dtype: int64
FLAG_FRAUD
0    99.885
1     0.115
Name: proportion, dtype: float64


In [4]:
train.isna().sum()

ID_TRX                          0
ID_CARD                         0
DATETIME_GMT                    0
AMOUNT                          0
Anomaly_amount_1                0
Anomaly_amount_2                0
Anomaly_amount_3                0
Anomaly_amount_4                0
Anomaly_amount_5                0
Anomaly_amount_6                0
Anomaly_amount_7                0
Anomaly_amount_8                0
FLAG_BEHAVIOUR_Anomaly_1        0
FLAG_BEHAVIOUR_Anomaly_2        0
FLAG_BEHAVIOUR_Anomaly_3        0
FLAG_BEHAVIOUR_Anomaly_4        0
FLAG_BEHAVIOUR_Anomaly_5        0
FLAG_BEHAVIOUR_Anomaly_6        0
FLAG_BEHAVIOUR_Anomaly7         0
FLAG_BEHAVIOUR_Anomaly_8        0
Anomaly_amount_9            19816
Population_Anomaly_1            0
Population_Anomaly_2            0
Population_Anomaly_3            0
Population_Anomaly_4            0
Population_Anomaly_5            0
Population_Anomaly_6            0
Population_Anomaly_7            0
Population_Anomaly_8            0
FLAG_FRAUD    

In [5]:
test.isna().sum()

ID_TRX                         0
ID_CARD                        0
DATETIME_GMT                   0
AMOUNT                         0
Anomaly_amount_1               0
Anomaly_amount_2               0
Anomaly_amount_3               0
Anomaly_amount_4               0
Anomaly_amount_5               0
Anomaly_amount_6               0
Anomaly_amount_7               0
Anomaly_amount_8               0
FLAG_BEHAVIOUR_Anomaly_1       0
FLAG_BEHAVIOUR_Anomaly_2       0
FLAG_BEHAVIOUR_Anomaly_3       0
FLAG_BEHAVIOUR_Anomaly_4       0
FLAG_BEHAVIOUR_Anomaly_5       0
FLAG_BEHAVIOUR_Anomaly_6       0
FLAG_BEHAVIOUR_Anomaly7        0
FLAG_BEHAVIOUR_Anomaly_8       0
Anomaly_amount_9            2839
Population_Anomaly_1           0
Population_Anomaly_2           0
Population_Anomaly_3           0
Population_Anomaly_4           0
Population_Anomaly_5           0
Population_Anomaly_6           0
Population_Anomaly_7           0
Population_Anomaly_8           0
dtype: int64

In [6]:
# On trie par carte + date pour bien aligner les historiques
train = train.sort_values(by=["ID_CARD", "DATETIME_GMT"])

# Calcul du temps écoulé depuis la dernière transaction (en secondes)
train["time_since_last_txn"] = train.groupby("ID_CARD")["DATETIME_GMT"].diff().dt.total_seconds()

# Calcul de la différence d'amount depuis la dernière transaction
train["amount_diff_since_last_txn"] = train.groupby("ID_CARD")["AMOUNT"].diff()

train["hour"] = train["DATETIME_GMT"].dt.hour
train["dayofweek"] = train["DATETIME_GMT"].dt.dayofweek
train["is_night"] = train["hour"].apply(lambda h: 1 if h < 6 or h >= 22 else 0)

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, classification_report

In [8]:
# 1. Séparation X / y
X = train.drop(columns=["ID_TRX", "ID_CARD", "FLAG_FRAUD","DATETIME_GMT"])
y = train["FLAG_FRAUD"].astype(int)  # au cas où c’est encore un str
weight = (y == 0).sum() / (y == 1).sum()

# Tri du dataset par date croissante
sorted_df = train.sort_values("DATETIME_GMT").reset_index(drop=True)

# Taille du split (20% en test)
split_idx = int(len(sorted_df) * 0.8)

# Séparation manuelle
train_df = sorted_df.iloc[:split_idx]
test_df  = sorted_df.iloc[split_idx:]

# Séparation X / y
X_train = train_df.drop(columns=["ID_TRX", "ID_CARD", "FLAG_FRAUD","DATETIME_GMT"])
y_train = train_df["FLAG_FRAUD"].astype(int)

X_test = test_df.drop(columns=["ID_TRX", "ID_CARD", "FLAG_FRAUD","DATETIME_GMT"])
y_test = test_df["FLAG_FRAUD"].astype(int)

In [9]:
# 3. Initialise the XGBoost model
model = XGBClassifier(
    objective="binary:logistic",
    use_label_encoder=False,
    eval_metric="logloss",  # évite warning
    n_estimators=1000,
    scale_pos_weight=weight,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

In [10]:
from sklearn.model_selection import RandomizedSearchCV

# 4. Hyperparameter tuning with RandomizedSearchCV
param_dist = {
    'max_depth': [5, 6, 7, 8, 9, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

search = RandomizedSearchCV(
    model, param_dist,
    n_iter=50, scoring='f1', cv=5, n_jobs=-1, random_state=42
)

search.fit(X_train, y_train)
print("Best params:", search.best_params_)
print("Best CV F1:", search.best_score_)

Parameters: { "use_label_encoder" } are not used.



Best params: {'subsample': 0.8, 'max_depth': 5, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
Best CV F1: 0.6647619047619048


In [11]:
best_model = XGBClassifier(
    objective="binary:logistic",
    use_label_encoder=False,
    eval_metric="aucpr",  # for imbalanced datasets
    n_estimators=1000,
    scale_pos_weight=weight,
    random_state=42,
    **search.best_params_,   # automatically apply best found hyperparameters
    early_stopping_rounds=50  # stop early if no improvement
)

best_model.fit(X_train, y_train, eval_set=[(X_test, y_test)])

# 5. Prédictions (avec seuil par défaut = 0.5)
y_pred = best_model.predict(X_test)

[0]	validation_0-aucpr:0.08119
[1]	validation_0-aucpr:0.22355
[2]	validation_0-aucpr:0.26481
[3]	validation_0-aucpr:0.42998
[4]	validation_0-aucpr:0.43051
[5]	validation_0-aucpr:0.42869
[6]	validation_0-aucpr:0.43085
[7]	validation_0-aucpr:0.46262
[8]	validation_0-aucpr:0.46440
[9]	validation_0-aucpr:0.46361
[10]	validation_0-aucpr:0.45376
[11]	validation_0-aucpr:0.45417
[12]	validation_0-aucpr:0.45373
[13]	validation_0-aucpr:0.45374
[14]	validation_0-aucpr:0.45339
[15]	validation_0-aucpr:0.45503
[16]	validation_0-aucpr:0.47769
[17]	validation_0-aucpr:0.47770
[18]	validation_0-aucpr:0.47802
[19]	validation_0-aucpr:0.48874
[20]	validation_0-aucpr:0.49008
[21]	validation_0-aucpr:0.48922
[22]	validation_0-aucpr:0.50895
[23]	validation_0-aucpr:0.50956
[24]	validation_0-aucpr:0.51179
[25]	validation_0-aucpr:0.51212
[26]	validation_0-aucpr:0.51240
[27]	validation_0-aucpr:0.51171
[28]	validation_0-aucpr:0.51098
[29]	validation_0-aucpr:0.74277
[30]	validation_0-aucpr:0.74276
[31]	validation_0-

Parameters: { "use_label_encoder" } are not used.



[34]	validation_0-aucpr:0.74182
[35]	validation_0-aucpr:0.74317
[36]	validation_0-aucpr:0.74320
[37]	validation_0-aucpr:0.74262
[38]	validation_0-aucpr:0.74257
[39]	validation_0-aucpr:0.74236
[40]	validation_0-aucpr:0.74380
[41]	validation_0-aucpr:0.74444
[42]	validation_0-aucpr:0.74020
[43]	validation_0-aucpr:0.73974
[44]	validation_0-aucpr:0.73947
[45]	validation_0-aucpr:0.74340
[46]	validation_0-aucpr:0.74443
[47]	validation_0-aucpr:0.74423
[48]	validation_0-aucpr:0.74462
[49]	validation_0-aucpr:0.59109
[50]	validation_0-aucpr:0.59206
[51]	validation_0-aucpr:0.64673
[52]	validation_0-aucpr:0.64686
[53]	validation_0-aucpr:0.64719
[54]	validation_0-aucpr:0.64856
[55]	validation_0-aucpr:0.64853
[56]	validation_0-aucpr:0.64800
[57]	validation_0-aucpr:0.68617
[58]	validation_0-aucpr:0.71769
[59]	validation_0-aucpr:0.76757
[60]	validation_0-aucpr:0.77264
[61]	validation_0-aucpr:0.81733
[62]	validation_0-aucpr:0.81773
[63]	validation_0-aucpr:0.81800
[64]	validation_0-aucpr:0.80172
[65]	val

In [12]:
# 6. Évaluation
print(classification_report(y_test, y_pred, digits=4))
print("F1-score:", f1_score(y_test, y_pred))

y_train_pred = best_model.predict(X_train)

# Évaluation sur train
print("\n=== Performance sur le train set ===")
print(classification_report(y_train, y_train_pred, digits=4))
print("F1-score (train) :", f1_score(y_train, y_train_pred))

              precision    recall  f1-score   support

           0     0.9997    0.9997    0.9997     15972
           1     0.8214    0.8214    0.8214        28

    accuracy                         0.9994     16000
   macro avg     0.9106    0.9106    0.9106     16000
weighted avg     0.9994    0.9994    0.9994     16000

F1-score: 0.8214285714285714

=== Performance sur le train set ===
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000     63936
           1     0.9552    1.0000    0.9771        64

    accuracy                         1.0000     64000
   macro avg     0.9776    1.0000    0.9885     64000
weighted avg     1.0000    1.0000    1.0000     64000

F1-score (train) : 0.9770992366412213


In [14]:
# Proportion de la classe 1 dans le train
target_rate = (y_train == 1).mean()

# Probabilités sur test
y_test_proba = best_model.predict_proba(X_test)[:, 1]

# Seuil tel que top X% deviennent 1
threshold_quantile = np.quantile(y_test_proba, 1 - target_rate)
y_test_pred_forced_rate = (y_test_proba > threshold_quantile).astype(int)

print(f"Seuil forcé pour respecter {target_rate:.4%} : {threshold_quantile:.4f}")
print("F1-score :", f1_score(y_test, y_test_pred_forced_rate))

Seuil forcé pour respecter 0.1000% : 0.9997
F1-score : 0.7272727272727273
