<a href="https://colab.research.google.com/github/ang-bill/IU-DLMDSME01-Credit-Card-Fraud-Detection/blob/main/Task1_Credit_Card_Fraud_Detection_Classifier_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Section 2. Classifier 1

## Section 2A. Retrieve Dataset from Kaggle Hub
On the first run, the dataset is downloaded from Kaggle via KaggleHub and copied to Google Drive. On subsequent runs, the copy on Google Drive is found and loaded directly, so no new download is needed.
See: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud/data


In [1]:
import os
import pandas as pd # Pandas dataframe
import kagglehub # Kagglehub to access dataset
import shutil # Util for copying files
from google.colab import drive # Import Google Drive utilities

# Google Drive serves as the persistent storage location for the dataset.
drive.mount('/content/drive')
local_storage_base_dir = "/content/drive/MyDrive/Colab_Kaggle_Data"

# Kaggle dataset identifier ("<owner>/<dataset>") and the CSV file inside it.
kaggle_dataset_id = "mlg-ulb/creditcardfraud"
file_name_in_dataset = "creditcard.csv"

# The local copy lives at <base dir>/<owner>/<dataset>/<file>; the directory
# is derived from the full file path.
full_local_file_path = os.path.join(
    local_storage_base_dir,
    *kaggle_dataset_id.split('/'),
    file_name_in_dataset,
)
local_dataset_dir = os.path.dirname(full_local_file_path)

# Create the storage directory (and any missing parents) if necessary.
os.makedirs(local_dataset_dir, exist_ok=True)

df = None # Placeholder until the CSV has been read at the end of this cell

if not os.path.exists(full_local_file_path):
    # No local copy yet: resolve the dataset through KaggleHub and copy the CSV over.
    print(f"'{file_name_in_dataset}' not found locally. Attempting to download from KaggleHub and store it.")

    downloaded_source_root = kagglehub.dataset_download(kaggle_dataset_id)
    source_file_path = os.path.join(downloaded_source_root, file_name_in_dataset)

    # Guard clause: stop if the expected CSV is not in the resolved location.
    if not os.path.exists(source_file_path):
        raise FileNotFoundError(f"Failed to find '{file_name_in_dataset}' at source '{source_file_path}' after KaggleHub download resolution.")

    print(f"Dataset found at KaggleHub resolved location: '{source_file_path}'.")
    print(f"Copying '{file_name_in_dataset}' to local path: '{full_local_file_path}'.")
    shutil.copy(source_file_path, full_local_file_path)
else:
    # A local copy already exists: skip the download entirely.
    print(f"'{file_name_in_dataset}' found locally at '{full_local_file_path}'. Loading from there.")

# In both cases the CSV is read from the local storage path.
df = pd.read_csv(full_local_file_path)


Mounted at /content/drive
'creditcard.csv' found locally at '/content/drive/MyDrive/Colab_Kaggle_Data/mlg-ulb/creditcardfraud/creditcard.csv'. Loading from there.


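## Section 2B. Implementation of Classifier 1

The cell below demonstrates the pipeline (outlier-score features → random undersampling → XGBoost) on synthetic imbalanced data generated with `make_classification`; it does not use the `df` loaded in Section 2A.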


In [None]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import fbeta_score

from sklearn.datasets import make_classification

# One seed for every stochastic step (data generation, split, undersampling,
# XGBoost) so the cell can be reproduced or re-seeded from a single place.
RANDOM_STATE = 42

# 1. SETUP: Generate synthetic imbalanced data for demonstration
# Stand-in for a fraud dataset: 10,000 rows, about 1% anomalies (class 1).
# NOTE(review): this cell works on synthetic data only; it does not read `df`.
X, y = make_classification(n_samples=10000, n_features=10, n_informative=5,
                           n_redundant=0, weights=[0.99, 0.01], random_state=RANDOM_STATE)

# Stratified split into train and test so both keep the same class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE)

print(f"Original Training Distribution: {np.bincount(y_train)}")

# ==============================================================================
# PHASE 1: Unsupervised Representation Learning (Generating TOS)
# ==============================================================================
# We train unsupervised detectors on the FULL imbalanced X_train.
# They need the density of the majority class to understand what is "normal."

print("Step 1: Generating Transformed Outlier Scores (TOS)...")

# Define the unsupervised detectors (Components of XGBOD)
detectors = [KNN(n_neighbors=5), LOF(n_neighbors=5), KNN(n_neighbors=20)]
X_train_tos_list = []
X_test_tos_list = []

for clf in detectors:
    clf.fit(X_train) # Fit on the full training split; no labels are passed

    # decision_scores_ holds the outlier scores of the samples the detector was
    # fitted on; decision_function scores samples it has not seen (the test split).
    # Both are reshaped to (n_samples, 1) so they can be stacked as columns later.
    train_scores = clf.decision_scores_.reshape(-1, 1)
    test_scores = clf.decision_function(X_test).reshape(-1, 1)

    X_train_tos_list.append(train_scores)
    X_test_tos_list.append(test_scores)

# Concatenate original features with new TOS features (one extra column per detector)
X_train_augmented = np.hstack([X_train] + X_train_tos_list)
X_test_augmented = np.hstack([X_test] + X_test_tos_list)

# Train and test must share the same feature layout for the classifier below
assert X_train_augmented.shape[1] == X_test_augmented.shape[1], "train/test feature count mismatch"

print(f"Feature Space Augmented: {X_train.shape[1]} -> {X_train_augmented.shape[1]} features")

# ==============================================================================
# PHASE 2: The "Inter-Phase" (Applying RUS)
# ==============================================================================
# NOW we apply Random Undersampling.
# We do this AFTER the outlier scores are generated, but BEFORE the classifier learns.

print("Step 2: Applying Random Undersampling (RUS)...")

rus = RandomUnderSampler(random_state=RANDOM_STATE, sampling_strategy=1.0) # 1:1 ratio
X_train_rus, y_train_rus = rus.fit_resample(X_train_augmented, y_train)

print(f"Resampled Training Distribution: {np.bincount(y_train_rus)}")

# ==============================================================================
# PHASE 3: Supervised Learning (XGBoost)
# ==============================================================================
# Train the final classifier on the balanced, feature-rich data

print("Step 3: Training XGBoost...")

# Note: scale_pos_weight is usually not needed if we use RUS to 1:1 balance,
# but checking hyperparams is always good.
# random_state is set explicitly so the classifier shares the cell-wide seed.
xgb = XGBClassifier(n_estimators=100, max_depth=3, eval_metric='logloss',
                    random_state=RANDOM_STATE)
xgb.fit(X_train_rus, y_train_rus)

# ==============================================================================
# EVALUATION (F2 Score)
# ==============================================================================
# Predict on the test set (which is still imbalanced and un-sampled!)
y_pred = xgb.predict(X_test_augmented)

# Calculate F2 Score (beta=2 favors Recall)
f2 = fbeta_score(y_test, y_pred, beta=2)

print("--- Results ---")
print(f"Final F2-Score: {f2:.4f}")