In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Load dataset
file_path = "/home/ducanh/Credit Card Transactions Fraud Detection/fraudTrain.csv"  # Replace with your actual file path
data = pd.read_csv(file_path)

# Preprocessing
data['trans_date_trans_time'] = pd.to_datetime(data['trans_date_trans_time']).apply(lambda x: x.timestamp())
label_encoder = LabelEncoder()
data['category'] = label_encoder.fit_transform(data['category'])
scaler = MinMaxScaler()
data[['amt']] = scaler.fit_transform(data[['amt']])

# Create sequences and compute aggregated features
seq_len = 5
sequences = []
labels = []

grouped = data.groupby('cc_num')
for _, group in grouped:
    group = group[['category', 'amt', 'is_fraud', 'trans_date_trans_time']].values
    for i in range(len(group)):
        if i < seq_len - 1:
            padding = [group[0]] * (seq_len - i - 1)
            seq = padding + group[:i + 1].tolist()
        else:
            seq = group[i - seq_len + 1:i + 1].tolist()

        label = group[i, -2]  # Fraud label of the current transaction
        time_intervals = np.diff([s[-1] for s in seq], prepend=seq[0][-1]).reshape(-1, 1)
        seq_features = np.array([s[:-1] for s in seq])
        seq_features = np.concatenate((seq_features, time_intervals), axis=1)

        category_mean = seq_features[:, 0].mean()
        amt_mean = seq_features[:, 1].mean()
        delta_t_mean = seq_features[:, 2].mean()
        sequences.append([category_mean, amt_mean, delta_t_mean])
        labels.append(label)

# Create DataFrame
processed_data = pd.DataFrame(sequences, columns=['category_mean', 'amt_mean', 'delta_t_mean'])
processed_data['label'] = labels

# Split data
X = processed_data[['category_mean', 'amt_mean', 'delta_t_mean']]
y = processed_data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Dictionary to store model results
results = {}

# 1. Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
results['Random Forest'] = accuracy_score(y_test, y_pred_rf)

# 2. Logistic Regression
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
results['Logistic Regression'] = accuracy_score(y_test, y_pred_lr)

# 3. Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
results['Decision Tree'] = accuracy_score(y_test, y_pred_dt)

# Print model accuracies
print("Model Accuracy Results:")
for model, acc in results.items():
    print(f"{model}: {acc:.4f}")

# Print detailed classification report for Random Forest
print("\nClassification Report for Random Forest:")
print(classification_report(y_test, y_pred_rf))

Model Accuracy Results:
Random Forest: 0.9983
Logistic Regression: 0.9979
Decision Tree: 0.9982

Classification Report for Random Forest:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    257786
         1.0       0.85      0.86      0.86      1549

    accuracy                           1.00    259335
   macro avg       0.93      0.93      0.93    259335
weighted avg       1.00      1.00      1.00    259335

