In [3]:
import pandas as pd
from tensorflow import keras

In [4]:
dataset = pd.read_csv('Financial_datasets_log.csv')

In [5]:
dataset['isFraud'].value_counts()

Unnamed: 0_level_0,count
isFraud,Unnamed: 1_level_1
0.0,422827
1.0,217


In [None]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 436601 entries, 0 to 436600
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   step            436601 non-null  int64  
 1   type            436601 non-null  object 
 2   amount          436601 non-null  float64
 3   nameOrig        436601 non-null  object 
 4   oldbalanceOrg   436601 non-null  float64
 5   newbalanceOrig  436601 non-null  float64
 6   nameDest        436600 non-null  object 
 7   oldbalanceDest  436600 non-null  float64
 8   newbalanceDest  436600 non-null  float64
 9   isFraud         436600 non-null  float64
 10  isFlaggedFraud  436600 non-null  float64
dtypes: float64(7), int64(1), object(3)
memory usage: 36.6+ MB


In [None]:
dataset.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,436601.0,436601.0,436601.0,436601.0,436600.0,436600.0,436600.0,436600.0
mean,13.160291,170559.2,903422.9,923038.5,982574.0,1172844.0,0.000499,0.0
std,3.395281,280524.5,2971132.0,3008353.0,2344070.0,2530473.0,0.02234,0.0
min,1.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0
25%,11.0,13508.12,0.0,0.0,0.0,0.0,0.0,0.0
50%,13.0,81950.82,18342.43,0.0,109838.6,218101.0,0.0,0.0
75%,16.0,227637.4,173069.0,214205.5,888925.8,1213958.0,0.0,0.0
max,18.0,10000000.0,38939420.0,38946230.0,41482700.0,41482700.0,1.0,0.0


In [6]:
dataset = dataset.drop_duplicates()

In [7]:
dataset = dataset.dropna()

In [8]:
dataset = dataset[~dataset['nameDest'].str.startswith('M')]

In [9]:
dataset = dataset.drop(columns=['nameOrig', 'nameDest'])

In [10]:
dataset['isFraud'] = dataset['isFraud'].astype('int')
dataset['isFlaggedFraud'] = dataset['isFlaggedFraud'].astype('int')
dataset['type'] = dataset['type'].astype('category')

In [11]:
type_mapping = {
    'TRANSFER': 0,
    'CASH_OUT': 1,
    'DEBIT': 2,
    'CASH_IN': 3
}

In [13]:
dataset['type'] = dataset['type'].replace(type_mapping).astype(int)

In [14]:
dataset = dataset.drop(columns=['isFlaggedFraud'])

In [15]:
X = dataset.drop(columns=['isFraud'])
y = dataset['isFraud']

In [21]:
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import classification_report, precision_recall_curve

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=30)

In [22]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [23]:
smote = SMOTE(sampling_strategy=0.5, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

**Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [161]:
randomForest = RandomForestClassifier(
    n_estimators=300,
    max_depth=30,
    min_samples_split=25,
    min_samples_leaf=10,
    max_features='log2',
    criterion='gini',
    random_state=42
)

In [162]:
randomForest.fit(X_resampled, y_resampled)

In [164]:
y_scores = randomForest.predict_proba(X_test)[:, 1]

# Calculate the precision-recall curve
precision, recall, thresholds = precision_recall_curve(y_test, y_scores)

best_threshold = thresholds[np.argmax(precision * recall)]

y_pred_adjusted = (y_scores >= best_threshold).astype(int)

In [165]:
print(classification_report(y_test, y_pred_adjusted))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     37416
           1       0.75      0.43      0.55        35

    accuracy                           1.00     37451
   macro avg       0.87      0.71      0.77     37451
weighted avg       1.00      1.00      1.00     37451



**XGBoost**

In [24]:
from xgboost import XGBClassifier

In [25]:
model = XGBClassifier(
    n_estimators=300,
    max_depth=35,
    learning_rate=0.4,
    subsample=1.0,
    colsample_bytree=0.8,
    gamma=0.3,
    random_state=42
)

In [26]:
model.fit(X_resampled, y_resampled)

In [27]:
y_scores = model.predict_proba(X_test)[:, 1]

# Calculate the precision-recall curve
precision, recall, thresholds = precision_recall_curve(y_test, y_scores)

best_threshold = thresholds[np.argmax(precision * recall)]

y_pred_adjusted = (y_scores >= best_threshold).astype(int)

In [29]:
print("Classification Report:\n", classification_report(y_test, y_pred_adjusted))

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56427
           1       0.92      0.56      0.70        43

    accuracy                           1.00     56470
   macro avg       0.96      0.78      0.85     56470
weighted avg       1.00      1.00      1.00     56470



In [30]:
import pickle

with open("xgboost_model.pkl", "wb") as f:
    pickle.dump(model, f)

In [31]:
from google.colab import files

files.download("xgboost_model.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Autoencoder**

In [171]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [172]:
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [173]:
X_normal = X_scaled[dataset["isFraud"] == 0]
X_fraud = X_scaled[dataset['isFraud'] == 1]

In [174]:
X_train, X_test = train_test_split(X_normal, test_size=0.2, random_state=42)

In [175]:
# Definition of the autoencoder architecture
input_dim = X_train.shape[1]

autoencoder = keras.Sequential([
    keras.layers.Input(shape=(input_dim,)),
    keras.layers.Dense(16, activation="relu"),
    keras.layers.Dense(8, activation="relu"),  # Bottleneck layer (compression)
    keras.layers.Dense(16, activation="relu"),
    keras.layers.Dense(input_dim, activation="sigmoid")  # Reconstruction
])

In [176]:
autoencoder.compile(optimizer="adam", loss="mse")

In [177]:
autoencoder.fit(X_train, X_train,
                          epochs=50, batch_size=256,
                          validation_split=0.1, verbose=0)

<keras.src.callbacks.history.History at 0x787121633210>

In [178]:
# Evaluation of reconstruction error on normal and fraudulent transactions
X_test_pred = autoencoder.predict(X_test)
mse_normal = np.mean(np.power(X_test - X_test_pred, 2), axis=1)

X_fraud_pred = autoencoder.predict(X_fraud)
mse_fraud = np.mean(np.power(X_fraud - X_fraud_pred, 2), axis=1)

[1m1170/1170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 


In [219]:
threshold = np.percentile(mse_normal, 80)
print(f"Seuil d'anomalie: {threshold}")

Seuil d'anomalie: 2.799924787254589e-05


In [220]:
# Fraud detection
fraud_detected = mse_fraud > threshold

In [221]:
print(f"Nombre de fraudes détectées: {np.sum(fraud_detected)} sur {len(X_fraud)} transactions frauduleuses")

Nombre de fraudes détectées: 99 sur 177 transactions frauduleuses


In [230]:
y_pred_normal = (mse_normal > threshold).astype(int)
y_pred_fraud = (mse_fraud > threshold).astype(int)

In [232]:
y_true_normal = np.zeros_like(y_pred_normal)  # 0 for normal transactions
y_true_fraud = np.ones_like(y_pred_fraud)     # 1 for fraud

In [235]:
y_true = np.concatenate([y_true_normal, y_true_fraud])  # Real labels
y_pred = np.concatenate([y_pred_normal, y_pred_fraud])  # Model predictions

In [237]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
accuracy = accuracy_score(y_true, y_pred)

print(f'Précision: {precision:.4f}')
print(f'Rappel: {recall:.4f}')
print(f'F1-score: {f1:.4f}')
print(f'Accuracy: {accuracy:.4f}')

Précision: 0.0131
Rappel: 0.5593
F1-score: 0.0255
Accuracy: 0.7989
