In [None]:
import pandas as pd
from tensorflow import keras

In [9]:
# Load the transaction log from a CSV file located relative to the working directory.
dataset = pd.read_csv('Financial_datasets_log.csv')

**Data Exploration and Manipulation**

In [11]:
# Preview the first rows to check the column layout.
dataset.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0.0,0.0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0.0,0.0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1.0,0.0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1.0,0.0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0.0,0.0


In [12]:
# Count the rows per value of the target label `isFraud` (class balance).
dataset['isFraud'].value_counts()

Unnamed: 0_level_0,count
isFraud,Unnamed: 1_level_1
0.0,69750
1.0,107


In [13]:
# Column dtypes and non-null counts; used to spot missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69858 entries, 0 to 69857
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   step            69858 non-null  int64  
 1   type            69858 non-null  object 
 2   amount          69858 non-null  float64
 3   nameOrig        69858 non-null  object 
 4   oldbalanceOrg   69858 non-null  float64
 5   newbalanceOrig  69858 non-null  float64
 6   nameDest        69858 non-null  object 
 7   oldbalanceDest  69858 non-null  float64
 8   newbalanceDest  69857 non-null  float64
 9   isFraud         69857 non-null  float64
 10  isFlaggedFraud  69857 non-null  float64
dtypes: float64(7), int64(1), object(3)
memory usage: 5.9+ MB


In [14]:
# Summary statistics of the numeric columns.
dataset.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,69858.0,69858.0,69858.0,69858.0,69858.0,69857.0,69857.0,69857.0
mean,7.893326,164017.6,904306.6,920298.0,854352.2,1185141.0,0.001532,0.0
std,1.874512,331373.0,2791616.0,2829881.0,2403000.0,2871791.0,0.039107,0.0
min,1.0,0.63,0.0,0.0,0.0,0.0,0.0,0.0
25%,8.0,9011.012,0.0,0.0,0.0,0.0,0.0,0.0
50%,8.0,44049.18,19908.5,0.0,14160.5,21130.68,0.0,0.0
75%,9.0,199130.7,172117.5,194020.5,518607.3,982411.3,0.0,0.0
max,9.0,10000000.0,33797390.0,34008740.0,31306920.0,31976990.0,1.0,0.0


In [15]:
# Remove fully duplicated rows, keeping the first occurrence of each.
dataset = dataset.drop_duplicates(keep='first')

In [17]:
# Discard every row that has at least one missing value.
dataset = dataset.dropna(axis=0, how='any')

In [18]:
# Keep only transactions whose destination account name does not start with 'M'.
dest_starts_with_m = dataset['nameDest'].str.startswith('M')
dataset = dataset[~dest_starts_with_m]

In [19]:
# The account identifiers are not used as model features; drop both name columns.
dataset = dataset.drop(['nameOrig', 'nameDest'], axis=1)

In [24]:
# `isFlaggedFraud` is constant in this data (describe() shows mean, std and max of 0),
# so it is removed from the feature set.
dataset = dataset.drop('isFlaggedFraud', axis=1)

In [20]:
# Cast the target label to integer and the transaction type to a categorical dtype.
# `isFlaggedFraud` is deliberately not cast here: the preceding cell has already
# dropped that column, so touching it raised a KeyError on Restart & Run All.
dataset = dataset.astype({'isFraud': 'int', 'type': 'category'})

**Categorical Encoding**

In [21]:
# Integer code assigned to each transaction type that is encoded below.
type_mapping = dict(
    TRANSFER=0,
    CASH_OUT=1,
    DEBIT=2,
    CASH_IN=3,
)

In [23]:
# Encode the transaction type with the integer codes defined in `type_mapping`.
# `Series.map` is used instead of `Series.replace` because `type` is a categorical
# column at this point and `replace` on categoricals is deprecated in recent pandas.
# The integer-dtype guard makes the cell a no-op when it is re-run.
if not pd.api.types.is_integer_dtype(dataset['type']):
    type_codes = dataset['type'].astype(str).map(type_mapping)
    if type_codes.isna().any():
        # Fail loudly with the offending labels rather than on an opaque cast error.
        unmapped = sorted(dataset.loc[type_codes.isna(), 'type'].astype(str).unique())
        raise ValueError(f"Transaction types missing from type_mapping: {unmapped}")
    dataset['type'] = type_codes.astype(int)

**Data partitioning**

In [25]:
# Target vector: the fraud label. Feature matrix: every remaining column.
y = dataset['isFraud']
X = dataset.drop('isFraud', axis=1)

In [26]:
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import classification_report, precision_recall_curve

In [27]:
# Hold out 20% of the rows for testing; stratifying on y keeps the fraud ratio
# the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=30,
    stratify=y,
)

**Feature Scaling**

In [28]:
# Learn the standardization statistics on the training split only, then apply
# the same transformation to both splits.
# Note: X_train / X_test are rebound to the scaled arrays, so re-running this
# cell without re-running the split would scale already-scaled data.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

**Oversampling**

In [29]:
# Oversample the fraud class in the training split only; the test split is left untouched.
# sampling_strategy=0.5 asks for a minority/majority ratio of 0.5 after resampling.
smote = SMOTE(
    sampling_strategy=0.5,
    random_state=42,
)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

**Random Forest**

In [30]:
from sklearn.ensemble import RandomForestClassifier

In [31]:
# Hyper-parameters of the random-forest baseline, gathered in one place.
rf_params = {
    'n_estimators': 300,
    'max_depth': 30,
    'min_samples_split': 25,
    'min_samples_leaf': 10,
    'max_features': 'log2',
    'criterion': 'gini',
    'random_state': 42,
}
randomForest = RandomForestClassifier(**rf_params)

In [32]:
# Train the forest on the SMOTE-resampled training data.
randomForest.fit(X_resampled, y_resampled)

In [33]:
# Predicted probability of the positive (fraud) class for every test row.
y_scores = randomForest.predict_proba(X_test)[:, 1]

# Precision and recall for each candidate decision threshold.
precision, recall, thresholds = precision_recall_curve(y_test, y_scores)

# Use the threshold whose precision * recall product is largest.
# NOTE(review): the threshold is chosen with the test labels and the same test
# split is then used for the report, so the reported scores are optimistic.
pr_product = precision * recall
best_threshold = thresholds[pr_product.argmax()]

# Binarize the scores with the selected threshold.
y_pred_adjusted = (y_scores >= best_threshold).astype(int)

In [34]:
# Per-class precision / recall / F1 on the test split for the threshold-adjusted predictions.
print(classification_report(y_test, y_pred_adjusted))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8204
           1       0.91      0.48      0.62        21

    accuracy                           1.00      8225
   macro avg       0.95      0.74      0.81      8225
weighted avg       1.00      1.00      1.00      8225



**XGBoost**

In [35]:
from xgboost import XGBClassifier

In [36]:
# Hyper-parameters of the gradient-boosted classifier, gathered in one place.
xgb_params = {
    'n_estimators': 300,
    'max_depth': 35,
    'learning_rate': 0.4,
    'subsample': 1.0,
    'colsample_bytree': 0.8,
    'gamma': 0.3,
    'random_state': 42,
}
model = XGBClassifier(**xgb_params)

In [37]:
# Train XGBoost on the same SMOTE-resampled training data as the random forest.
model.fit(X_resampled, y_resampled)

In [38]:
# Predicted probability of the positive (fraud) class for every test row.
y_scores = model.predict_proba(X_test)[:, 1]

# Precision and recall for each candidate decision threshold.
precision, recall, thresholds = precision_recall_curve(y_test, y_scores)

# Use the threshold whose precision * recall product is largest.
# NOTE(review): the threshold is chosen with the test labels and the same test
# split is then used for the report, so the reported scores are optimistic.
pr_product = precision * recall
best_threshold = thresholds[pr_product.argmax()]

# Binarize the scores with the selected threshold.
y_pred_adjusted = (y_scores >= best_threshold).astype(int)

In [40]:
# Per-class precision / recall / F1 on the test split for the threshold-adjusted predictions.
# sep="" stops print() from inserting a space after the newline, which otherwise
# shifts the report's header row one column to the right of the data rows.
print("Classification Report:\n", classification_report(y_test, y_pred_adjusted), sep="")

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8204
           1       1.00      0.43      0.60        21

    accuracy                           1.00      8225
   macro avg       1.00      0.71      0.80      8225
weighted avg       1.00      1.00      1.00      8225



In [None]:
import pickle

# Serialize the fitted XGBoost classifier to a pickle file in the working directory.
# NOTE(review): the model was trained on StandardScaler-transformed features, but
# the fitted scaler is not saved here; confirm how inference code will scale inputs.
with open("xgboost_model.pkl", mode="wb") as model_file:
    pickle.dump(model, model_file)

In [None]:
# Offer the pickled model as a browser download when running inside Google Colab.
# The import is guarded so that Restart & Run All still completes in a local
# Jupyter environment, where the `google.colab` package does not exist.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download("xgboost_model.pkl")
else:
    print("Not running in Google Colab; xgboost_model.pkl was left in the working directory.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Autoencoder**

In [41]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [42]:
# Rescale every feature to [0, 1] so it matches the sigmoid output range of the autoencoder.
# Note: this rebinds `scaler`, which previously held the StandardScaler used for
# the supervised models.
# NOTE(review): the scaler is fitted on all rows, including the fraud rows and
# the normal rows that are held out later, so the evaluation data influences
# the scaling; consider fitting on the training rows only.
scaler = MinMaxScaler().fit(X)
X_scaled = scaler.transform(X)

In [43]:
# Separate the scaled rows by label using boolean masks aligned with `dataset`.
normal_mask = dataset['isFraud'].eq(0).to_numpy()
fraud_mask = dataset['isFraud'].eq(1).to_numpy()
X_normal = X_scaled[normal_mask]
X_fraud = X_scaled[fraud_mask]

In [44]:
# Split the non-fraud rows only: 80% to train the autoencoder, 20% held out.
# Note: this rebinds X_train / X_test, which previously held the supervised splits.
X_train, X_test = train_test_split(
    X_normal,
    test_size=0.2,
    random_state=42,
)

In [54]:
# Definition of the autoencoder architecture.
# Seed Python, NumPy and the Keras backend before any weights are created so that
# weight initialisation and batch shuffling are repeatable between runs (as far
# as the backend allows); without it, the scores below change on every run.
keras.utils.set_random_seed(42)

input_dim = X_train.shape[1]

# Symmetric dense autoencoder: input -> 16 -> 8 -> 16 -> input.
autoencoder = keras.Sequential([
    keras.layers.Input(shape=(input_dim,)),
    keras.layers.Dense(16, activation="relu"),
    keras.layers.Dense(8, activation="relu"),  # Bottleneck layer (compression)
    keras.layers.Dense(16, activation="relu"),
    # Sigmoid keeps outputs in [0, 1], the range of the min-max scaled features.
    keras.layers.Dense(input_dim, activation="sigmoid")  # Reconstruction
])

In [56]:
# Adam optimizer with mean-squared-error loss (reconstruction error).
autoencoder.compile(optimizer="adam", loss="mse")

In [57]:
# Train the network to reproduce its own input. X_train holds non-fraud rows
# only, and 10% of it is set aside by Keras for validation.
autoencoder.fit(
    x=X_train,
    y=X_train,
    batch_size=256,
    epochs=50,
    validation_split=0.1,
    verbose=0,
)

<keras.src.callbacks.history.History at 0x7f5f08568650>

In [58]:
# Per-row reconstruction error (mean squared error across features) for the
# held-out normal transactions ...
X_test_pred = autoencoder.predict(X_test)
mse_normal = np.square(X_test - X_test_pred).mean(axis=1)

# ... and for all fraudulent transactions.
X_fraud_pred = autoencoder.predict(X_fraud)
mse_fraud = np.square(X_fraud - X_fraud_pred).mean(axis=1)

[1m257/257[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step


In [59]:
# Anomaly threshold: the 80th percentile of the reconstruction error on the
# held-out normal transactions.
# NOTE(review): by construction about 20% of those held-out normal rows exceed
# this threshold, and it is derived from the same rows that are scored later;
# consider deriving it from training or validation errors instead.
threshold = np.quantile(mse_normal, 0.80)
print(f"Seuil d'anomalie: {threshold}")

Seuil d'anomalie: 0.00030078709354727594


In [60]:
# Fraud detection: a fraud row counts as detected when its reconstruction error
# is strictly above the anomaly threshold.
fraud_detected = np.greater(mse_fraud, threshold)

In [61]:
# Report how many of the fraudulent transactions exceed the anomaly threshold.
print(f"Nombre de fraudes détectées: {np.sum(fraud_detected)} sur {len(X_fraud)} transactions frauduleuses")

Nombre de fraudes détectées: 46 sur 107 transactions frauduleuses


In [62]:
# Binary predictions: 1 (flagged as fraud) where the reconstruction error
# exceeds the threshold, else 0.
y_pred_normal = np.greater(mse_normal, threshold).astype(int)
y_pred_fraud = np.greater(mse_fraud, threshold).astype(int)

In [63]:
# Ground-truth labels with the same shape and dtype as the matching predictions:
# 0 for the held-out normal transactions, 1 for the fraudulent ones.
y_true_normal = np.zeros(y_pred_normal.shape, dtype=y_pred_normal.dtype)
y_true_fraud = np.ones(y_pred_fraud.shape, dtype=y_pred_fraud.dtype)

In [64]:
# Stack the normal rows first and the fraud rows second, in the same order for
# labels and predictions so the two arrays stay aligned.
y_true = np.concatenate((y_true_normal, y_true_fraud), axis=0)  # Real labels
y_pred = np.concatenate((y_pred_normal, y_pred_fraud), axis=0)  # Model predictions

In [65]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Score the autoencoder as a binary classifier (positive class = fraud).
# Note: `precision` and `recall` are rebound here to scalars; earlier cells used
# the same names for the arrays returned by precision_recall_curve.
# NOTE(review): y_true combines only the held-out 20% of normal rows with all
# fraud rows, so the class ratio differs from the full dataset; confirm that
# this is the intended evaluation set.
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
accuracy = accuracy_score(y_true, y_pred)

# Print each metric rounded to four decimals.
print(f'Précision: {precision:.4f}')
print(f'Rappel: {recall:.4f}')
print(f'F1-score: {f1:.4f}')
print(f'Accuracy: {accuracy:.4f}')

Précision: 0.0273
Rappel: 0.4299
F1-score: 0.0513
Accuracy: 0.7952
