<a href="https://colab.research.google.com/github/ang-bill/IU-DLMDSME01-Credit-Card-Fraud-Detection/blob/main/Task1_Credit_Card_Fraud_Detection_Classifier_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Section 2. Classifier 1

## Section 2A. Retrieve Dataset from Kaggle Hub
On the first run, the dataset is downloaded from Kaggle and copied to Google Drive. Every run first checks whether the file already exists there and only downloads it when it is missing.
See: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud/data


In [8]:
import os
import pandas as pd # Pandas dataframe
import kagglehub # Kagglehub to access dataset
import shutil # Util for copying files
from google.colab import drive # Import Google Drive utilities

# Google Drive is mounted so the dataset can be kept in persistent storage
drive.mount('/content/drive')
local_storage_base_dir = "/content/drive/MyDrive/Colab_Kaggle_Data"

# Kaggle coordinates of the dataset and the file of interest inside it
kaggle_dataset_id = "mlg-ulb/creditcardfraud"
file_name_in_dataset = "creditcard.csv"

# The "<owner>/<dataset>" identifier is mirrored as nested folders below the base directory
local_dataset_dir = os.path.join(local_storage_base_dir, *kaggle_dataset_id.split('/'))
full_local_file_path = os.path.join(local_dataset_dir, file_name_in_dataset)

# Create the target folder up front (no error if it is already there)
os.makedirs(local_dataset_dir, exist_ok=True)

df = None # Placeholder until the CSV has been read

if not os.path.exists(full_local_file_path):
    # No stored copy yet: resolve the dataset through KaggleHub and keep a copy
    print(f"'{file_name_in_dataset}' not found locally. Attempting to download from KaggleHub and store it.")

    # dataset_download returns the root folder of the resolved dataset
    downloaded_source_root = kagglehub.dataset_download(kaggle_dataset_id)
    source_file_path = os.path.join(downloaded_source_root, file_name_in_dataset)

    # Fail loudly when the expected file is missing from the resolved location
    if not os.path.exists(source_file_path):
        raise FileNotFoundError(f"Failed to find '{file_name_in_dataset}' at source '{source_file_path}' after KaggleHub download resolution.")

    print(f"Dataset found at KaggleHub resolved location: '{source_file_path}'.")
    print(f"Copying '{file_name_in_dataset}' to local path: '{full_local_file_path}'.")
    shutil.copy(source_file_path, full_local_file_path)
else:
    # A stored copy exists, so no download is needed
    print(f"'{file_name_in_dataset}' found locally at '{full_local_file_path}'. Loading from there.")

# In both branches the CSV is read from the stored copy
df = pd.read_csv(full_local_file_path)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
'creditcard.csv' found locally at '/content/drive/MyDrive/Colab_Kaggle_Data/mlg-ulb/creditcardfraud/creditcard.csv'. Loading from there.


## Section 2B. Implementation of Classifier 1


### 1. Required Packages
(not included in default Colab Notebook)

In [3]:
# Install pyod via the %pip magic so the package lands in the running kernel's environment.
# The version is pinned to the one this notebook was last executed with, for reproducibility.
%pip install -q pyod==2.0.6

Collecting pyod
  Downloading pyod-2.0.6-py3-none-any.whl.metadata (46 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/46.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.3/46.3 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Downloading pyod-2.0.6-py3-none-any.whl (204 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m204.7/204.7 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyod
Successfully installed pyod-2.0.6


### 2. Pipeline Smoke Test
(fits a minimal logistic-regression pipeline on the raw features)

In [6]:
# Imports are repeated here so this cell also runs on a fresh kernel; it previously relied on
# `np` and `LogisticRegression` being imported by a cell further down.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

X = df.drop('Class', axis=1)  # features
y = df['Class'] # Labels

print(f"Dataset Shape: {X.shape}, Fraud Ratio: {np.mean(y):.4%}")

# Minimal pipeline containing only the classifier step
pipe = Pipeline(steps=[
   ('clf', LogisticRegression())])
pipe.fit(X, y)
# NOTE(review): pipe[:-1] drops the only step, so there is no transformer left to report
# feature names -- confirm what this call is expected to show.
pipe[:-1].get_feature_names_out()

array(['x2', 'x3'], dtype=object)

In [9]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import make_scorer, fbeta_score, f1_score, precision_score, recall_score, brier_score_loss
from sklearn.preprocessing import RobustScaler
#from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline # Supports resampling inside CV
from imblearn.under_sampling import RandomUnderSampler
from scipy.stats import loguniform

# --- 1. SETUP: Features and labels from the loaded credit card dataframe ---
# An earlier prototype generated a synthetic make_classification dataset (28 'V' features,
# a 'Time' column in 0-172800 seconds, an exponentially distributed 'Amount' column and a
# 0.172% minority class). The dataframe `df` is used instead.
X = df.drop('Class', axis=1)  # Feature matrix: every column except the target
y = df['Class'] # Target column; its mean is printed below as the fraud ratio

print(f"Dataset Shape: {X.shape}, Fraud Ratio: {np.mean(y):.4%}")

# --- 2. CUSTOM COMPONENTS ---

class HourExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that derives an 'Hour' column from the 'Time' column.

    'Time' is interpreted as seconds; 'Hour' is the hour of day (0-23) obtained by
    wrapping the value at 24 hours. The 'Time' column itself is kept.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of X, with an added 'Hour' column when X is a DataFrame.

        Inputs that are not DataFrames are copied and returned without changes.
        """
        result = X.copy()
        if not isinstance(result, pd.DataFrame):
            return result

        seconds_per_hour = 60 * 60
        seconds_per_day = 24 * seconds_per_hour

        # Wrap to a single day first, then count the full hours within that day
        second_of_day = result['Time'] % seconds_per_day
        result['Hour'] = second_of_day // seconds_per_hour
        return result

def pozzolo_correction(probs, prior_pos_orig, prior_pos_sampled):
    """
    Applies the Dal Pozzolo et al. (2015) prior correction to probabilities that were
    estimated on a resampled (e.g. undersampled) training set.

    Formula: P_calib = (gamma * P_s) / (gamma * P_s + (1 - P_s)), where P_s is the
    probability predicted on the resampled distribution and gamma is the ratio of the
    original positive-class odds to the resampled positive-class odds.

    Parameters
    ----------
    probs : float or array-like of float
        Positive-class probabilities predicted by the model trained on resampled data.
    prior_pos_orig : float
        Positive-class prior in the original (unsampled) data.
    prior_pos_sampled : float
        Positive-class prior after resampling.

    Returns
    -------
    float or numpy.ndarray
        Corrected probabilities with the same shape as `probs`. When `prior_pos_sampled`
        is 0 or 1 no correction is defined and `probs` is returned unchanged.
    """
    # Degenerate resampled prior: the resampled odds are 0 or undefined, so leave input as is
    if prior_pos_sampled == 0 or prior_pos_sampled == 1:
        return probs

    # Accept lists/tuples as well as arrays and scalars
    p = np.asarray(probs, dtype=float)

    # Degenerate original priors are the limits of the formula (gamma -> 0 or gamma -> inf);
    # handling them explicitly avoids division by zero and NaN results.
    if prior_pos_orig <= 0:
        return np.zeros_like(p)
    if prior_pos_orig >= 1:
        return np.ones_like(p)

    odds_orig = prior_pos_orig / (1 - prior_pos_orig)
    odds_sampled = prior_pos_sampled / (1 - prior_pos_sampled)
    gamma = odds_orig / odds_sampled
    return (gamma * p) / ((gamma * p) + (1 - p))

# --- 3. ABLATION CONFIGURATIONS ---
# Each entry switches pipeline components on or off; every configuration enables one
# more component than the previous one.
#   scale: add the scaling step        fe:  add the 'Hour' feature-engineering step
#   rus:   add random undersampling    opt: tune hyper-parameters in an inner CV
configs = {
    "1. Naive (No Preproc)": dict(scale=False, fe=False, rus=False, opt=False),
    "2. + Scaling (Robust)": dict(scale=True, fe=False, rus=False, opt=False),
    "3. + Feature Eng (Hour)": dict(scale=True, fe=True, rus=False, opt=False),
    "4. + RUS (Calibrated)": dict(scale=True, fe=True, rus=True, opt=False),
    "5. + Optimization (Full)": dict(scale=True, fe=True, rus=True, opt=True),
}

# --- 4. EXPERIMENTAL LOOP (Nested CV) ---
# Outer loop: stratified 5-fold CV repeated 5 times -> 25 train/test splits per configuration
outer_cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=42)
# Inner loop: stratified 4-fold CV (used inside RandomizedSearchCV)
inner_cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

results_table = []

print("Starting Ablation Study (this may take a moment)...")

for name, cfg in configs.items():
    print(f"Running Configuration: {name}")

    # Per-fold scores of this configuration, aggregated after the outer loop
    fold_metrics = {'f2': [], 'f1': [], 'rec': [], 'prec': []}

    # Outer CV loop (split training and test set)
    for i, (train_idx, test_idx) in enumerate(outer_cv.split(X, y)):
        # The split yields positional indices, so features AND labels are selected with
        # .iloc (plain y[...] would do a label lookup and only works with a default index).
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # --- A. Build Pipeline Steps ---
        steps = []

        # 1. Feature Engineering (Hour)
        if cfg['fe']:
            steps.append(('fe', HourExtractor()))

        # 2. Scaling (RobustScaler)
        if cfg['scale']:
            # TODO: restrict the scaler to 'Amount' and pass the other columns through.
            # For now it is applied to every column coming out of the previous step.
            steps.append(('scaler', RobustScaler()))

        # 3. Resampling (RUS)
        # Resampling inside the pipeline prevents data leakage:
        # it is only applied to the training fold during fit
        # (https://imbalanced-learn.org/stable/common_pitfalls.html)
        if cfg['rus']:
            steps.append(('rus', RandomUnderSampler(sampling_strategy=1.0, random_state=42)))

        # 4. Classifier
        # SciKit-Learn LogisticRegression
        # (https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)
        # Regularization is applied by default
        # Solver liblinear: supports L1 and L2 regularization
        # class_weight is left at its default; class balance is handled by the RUS step when enabled.
        clf = LogisticRegression(solver='liblinear', random_state=42)

        steps.append(('clf', clf))

        # Create pipeline from steps
        pipeline = ImbPipeline(steps)

        # --- B. Optimization (Inner Loop) ---
        if cfg['opt']:
            # Inner CV for optimisation
            # (CV uses a fold of the train set for validation)
            # Optimize for F2 Score
            #
            # NOTE(review): the search scores the pipeline's own predict() output, whereas the
            # outer evaluation below thresholds the prior-corrected probabilities. C is therefore
            # tuned for a different decision rule than the one evaluated -- confirm this is intended.

            # Define distribution of the tuneable parameters
            # Naming convention of parameter names: stepname__parameter
            # Here, the 'clf' step (the classifier of the pipeline) is tuned
            param_dist = {
                'clf__C': loguniform(1e-4, 1e2) # Inverse of regularization strength
            }
            search = RandomizedSearchCV(pipeline, param_dist, n_iter=50,
                                        scoring=make_scorer(fbeta_score, beta=2),
                                        cv=inner_cv, n_jobs=-1, random_state=42)
            search.fit(X_train, y_train)
            model = search.best_estimator_
        else:
            pipeline.fit(X_train, y_train)
            model = pipeline

        # --- C. Prediction & Calibration ---
        # Get raw probabilities (biased if RUS was used)
        probs = model.predict_proba(X_test)[:, 1]

        # Apply Pozzolo Calibration ONLY if RUS was used
        if cfg['rus']:
            prior_pos_orig = np.mean(y_train == 1)
            prior_pos_sampled = 0.5 # We forced sampling_strategy=1.0
            probs = pozzolo_correction(probs, prior_pos_orig, prior_pos_sampled)

        # Convert to Hard Predictions (Threshold = 0.5)
        y_pred = (probs > 0.5).astype(int)

        # --- D. Record Metrics ---
        fold_metrics['f2'].append(fbeta_score(y_test, y_pred, beta=2))
        fold_metrics['f1'].append(f1_score(y_test, y_pred))
        fold_metrics['rec'].append(recall_score(y_test, y_pred))
        fold_metrics['prec'].append(precision_score(y_test, y_pred, zero_division=0))

    # Aggregate results for this configuration (mean over all outer folds)
    results_table.append({
        'Configuration': name,
        'F2 Score (Mean ± SD)': f"{np.mean(fold_metrics['f2']):.4f} ± {np.std(fold_metrics['f2']):.4f}",
        'F1 Score': f"{np.mean(fold_metrics['f1']):.4f}",
        'Recall': f"{np.mean(fold_metrics['rec']):.4f}",
        'Precision': f"{np.mean(fold_metrics['prec']):.4f}"
    })

# --- 5. OUTPUT ---
df_results = pd.DataFrame(results_table)
print("\n=== Ablation Study Results (Baseline: Logistic Regression) ===")
print(df_results.to_markdown(index=False))

Dataset Shape: (284807, 30), Fraud Ratio: 0.1727%
Starting Ablation Study (this may take a moment)...
Running Configuration: 1. Naive (No Preproc)
Running Configuration: 2. + Scaling (Robust)
Running Configuration: 3. + Feature Eng (Hour)
Running Configuration: 4. + RUS (Calibrated)
Running Configuration: 5. + Optimization (Full)

=== Ablation Study Results (Baseline: Logistic Regression) ===
| Configuration            | F2 Score (Mean ± SD)   |   F1 Score |   Recall |   Precision |
|:-------------------------|:-----------------------|-----------:|---------:|------------:|
| 1. Naive (No Preproc)    | 0.6195 ± 0.0538        |     0.6599 |   0.5959 |      0.7465 |
| 2. + Scaling (Robust)    | 0.6569 ± 0.0464        |     0.7238 |   0.6191 |      0.8756 |
| 3. + Feature Eng (Hour)  | 0.6576 ± 0.0444        |     0.7249 |   0.6195 |      0.8778 |
| 4. + RUS (Calibrated)    | 0.6978 ± 0.0669        |     0.5995 |   0.7967 |      0.4988 |
| 5. + Optimization (Full) | 0.6052 ± 0.0515        

2025-11-06:


```
Dataset Shape: (284807, 30), Fraud Ratio: 0.1727%
Starting Ablation Study (this may take a moment)...
Running Configuration: 1. Naive (No Preproc)
Running Configuration: 2. + Scaling (Robust)
Running Configuration: 3. + Feature Eng (Hour)
Running Configuration: 4. + RUS (Calibrated)
Running Configuration: 5. + Optimization (Full)
```


=== Ablation Study Results (Baseline: Logistic Regression) ===
| Configuration            | F2 Score (Mean ± SD)   |   F1 Score |   Recall |   Precision |
|:-------------------------|:-----------------------|-----------:|---------:|------------:|
| 1. Naive (No Preproc)    | 0.6195 ± 0.0538        |     0.6599 |   0.5959 |      0.7465 |
| 2. + Scaling (Robust)    | 0.6569 ± 0.0464        |     0.7238 |   0.6191 |      0.8756 |
| 3. + Feature Eng (Hour)  | 0.6576 ± 0.0444        |     0.7249 |   0.6195 |      0.8778 |
| 4. + RUS (Calibrated)    | 0.6978 ± 0.0669        |     0.5995 |   0.7967 |      0.4988 |
| 5. + Optimization (Full) | 0.6052 ± 0.0515        |     0.6251 |   0.5946 |      0.6721 |

In [2]:
from sklearn.linear_model import LogisticRegression
# Scratch cell: a LogisticRegression created with its default arguments
clf = LogisticRegression()
# Prints the dictionary returned by get_params(), i.e. the parameter names a search can tune
print(clf.get_params())

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [3]:
# Sanity check: seconds in one day, the modulus used in HourExtractor.transform
print(60*60*24)

86400


In [1]:
# Best hyper-parameters of the most recent RandomizedSearchCV fit. Fitted attributes carry a
# trailing underscore; `search` only exists after the ablation cell has run a tuned configuration.
search.best_params_


NameError: name 'search' is not defined