In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, auc, classification_report


In [29]:

# Load preprocessed datasets
fraud_data = pd.read_csv('../data/preprocessed/Fraud_Data_preprocessed.csv')
creditcard_data = pd.read_csv('../data/preprocessed/creditcard_preprocessed.csv')


  fraud_data = pd.read_csv('../data/preprocessed/Fraud_Data_preprocessed.csv')


In [30]:
df_orig = pd.read_csv('../data/creditcard.csv')
creditcard_data = pd.read_csv('../data/preprocessed/creditcard_preprocessed.csv')

# Add the 'Class' column from the original to the preprocessed
creditcard_data['Class'] = df_orig['Class']

# Save the updated preprocessed file
creditcard_data.to_csv('../data/preprocessed/creditcard_preprocessed.csv', index=False)


In [31]:
print(creditcard_data.columns.tolist())

['Time', 'Amount', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Class']


In [33]:
# Drop rows where y_fraud is NaN
fraud_data_clean = fraud_data.dropna(subset=['class'])
fraud_features = [col for col in fraud_data_clean.columns if col not in ['class', 'user_id', 'device_id', 'signup_time', 'purchase_time', 'ip_address', 'ip_address_int']]
X_fraud = fraud_data_clean[fraud_features]
y_fraud = fraud_data_clean['class']

X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(X_fraud, y_fraud, test_size=0.2, stratify=y_fraud, random_state=42)


In [35]:
# --- Creditcard_Data ---
credit_features = [col for col in creditcard_data.columns if col != 'Class']
X_credit = creditcard_data[credit_features]
y_credit = creditcard_data['Class']

X_train_credit, X_test_credit, y_train_credit, y_test_credit = train_test_split(X_credit, y_credit, test_size=0.2, stratify=y_credit, random_state=42)


In [36]:

def evaluate_model(model, X_test, y_test, dataset_name):
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    precision, recall, _ = precision_recall_curve(y_test, y_proba)
    auc_pr = auc(recall, precision)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    print(f"\n--- {dataset_name} ---")
    print(f"AUC-PR: {auc_pr:.4f}")
    print(f"F1-score: {f1:.4f}")
    print("Confusion Matrix:\n", cm)
    print("Classification Report:\n", classification_report(y_test, y_pred))


In [38]:
X_train_fraud = X_train_fraud.fillna(0)
X_test_fraud = X_test_fraud.fillna(0)
X_train_credit = X_train_credit.fillna(0)
X_test_credit = X_test_credit.fillna(0)


In [39]:

# --- Random Forest ---
print("\nTraining Random Forest...")
rf_fraud = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
rf_fraud.fit(X_train_fraud, y_train_fraud)
evaluate_model(rf_fraud, X_test_fraud, y_test_fraud, "Fraud_Data (Random Forest)")

rf_credit = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
rf_credit.fit(X_train_credit, y_train_credit)
evaluate_model(rf_credit, X_test_credit, y_test_credit, "Creditcard_Data (Random Forest)")
print("\nModel comparison complete. Choose the model with the highest AUC-PR and F1-score for best performance on your task.")


Training Random Forest...

--- Fraud_Data (Random Forest) ---
AUC-PR: 0.9270
F1-score: 0.8714
Confusion Matrix:
 [[26386  1006]
 [ 4348 18132]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.86      0.96      0.91     27392
         1.0       0.95      0.81      0.87     22480

    accuracy                           0.89     49872
   macro avg       0.90      0.88      0.89     49872
weighted avg       0.90      0.89      0.89     49872


--- Creditcard_Data (Random Forest) ---
AUC-PR: 0.0029
F1-score: 0.0000
Confusion Matrix:
 [[56644     4]
 [   98     0]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56648
           1       0.00      0.00      0.00        98

    accuracy                           1.00     56746
   macro avg       0.50      0.50      0.50     56746
weighted avg       1.00      1.00      1.00     56746


Model comparison complete. Ch