<a href="https://colab.research.google.com/github/abinayanand7896-cloud/Abinaya_Anand/blob/main/data_augmentation_imbalanced_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **1. Importing Required Libraries**

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE, ADASYN

# CTGAN
try:
    from ctgan import CTGAN
except ImportError:
    print("Installing CTGAN...")
    !pip install ctgan
    from ctgan import CTGAN

# Set random seed for reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

print("All libraries imported successfully!")

All libraries imported successfully!


# **2. Preparing and loading datasets**

**2.1. Dataset 1: Breast Cancer**

In [13]:
# Dataset 1: Breast Cancer -- read the CSV, derive a binary target, one-hot encode.
print(" LOADING DATASET 1: BREAST CANCER")

df_breast = pd.read_csv('/content/Breast_Cancer.csv')

# Quick inspection of the raw table: size, column names, preview, total null count.
print(f"\nDataset Shape: {df_breast.shape}")
print(f"\nColumns: {df_breast.columns.tolist()}")
print(f"\nFirst few rows:")
print(df_breast.head())
print(f"\nMissing values:\n{df_breast.isnull().sum().sum()} total")

# Work on a copy so the raw frame is left untouched.
df_breast_clean = df_breast.copy()

# Binary target built from 'Status': 1 where the value is 'Dead', 0 otherwise.
df_breast_clean['target'] = df_breast_clean['Status'].eq('Dead').astype(int)

print(f"\nTarget distribution (Status):")
print(df_breast_clean['target'].value_counts())
print(f"Original imbalance ratio: 1:{df_breast_clean['target'].value_counts()[0] / df_breast_clean['target'].value_counts()[1]:.2f}")

# Features are every column except the raw label and the derived target.
y_breast = df_breast_clean['target']
X_breast = df_breast_clean.drop(columns=['Status', 'target'])

# One-hot encode the object-typed columns, dropping the first level of each.
categorical_cols = X_breast.select_dtypes(include=['object']).columns
print(f"\nCategorical columns: {list(categorical_cols)}")

X_breast_encoded = pd.get_dummies(
    X_breast,
    columns=categorical_cols,
    drop_first=True,
)

# Final shape and class counts of the prepared dataset.
print(f"\n Breast Cancer Dataset")
print(f"  Features: {X_breast_encoded.shape[1]}")
print(f"  Samples: {len(y_breast)}")
print(f"  Positive class (Dead): {(y_breast == 1).sum()}")
print(f"  Negative class (Alive): {(y_breast == 0).sum()}")

 LOADING DATASET 1: BREAST CANCER

Dataset Shape: (4024, 16)

Columns: ['Age', 'Race', 'Marital Status', 'T Stage ', 'N Stage', '6th Stage', 'differentiate', 'Grade', 'A Stage', 'Tumor Size', 'Estrogen Status', 'Progesterone Status', 'Regional Node Examined', 'Reginol Node Positive', 'Survival Months', 'Status']

First few rows:
   Age   Race Marital Status T Stage  N Stage 6th Stage  \
0   68  White        Married       T1      N1       IIA   
1   50  White        Married       T2      N2      IIIA   
2   58  White       Divorced       T3      N3      IIIC   
3   58  White        Married       T1      N1       IIA   
4   47  White        Married       T2      N1       IIB   

               differentiate Grade   A Stage  Tumor Size Estrogen Status  \
0      Poorly differentiated     3  Regional           4        Positive   
1  Moderately differentiated     2  Regional          35        Positive   
2  Moderately differentiated     2  Regional          63        Positive   
3      Poo

**2.2. Dataset 2: Credit card fraud**

In [14]:
# Dataset 2: Credit Card Fraud -- read the CSV, pick the target column, encode if needed.
print(" LOADING DATASET 2: CREDIT CARD FRAUD")

df_credit = pd.read_csv('/content/creditcard.csv')

# Quick inspection of the raw table: size, column names, preview, total null count.
print(f"\nDataset Shape: {df_credit.shape}")
print(f"\nColumns: {df_credit.columns.tolist()}")
print(f"\nFirst few rows:")
print(df_credit.head())
print(f"\nMissing values:\n{df_credit.isnull().sum().sum()} total")

# Target column: prefer 'Class', then 'class'; otherwise fall back to the last column.
# The class-count printout below labels 1 as fraud and 0 as legitimate.
target_col = next(
    (name for name in ('Class', 'class') if name in df_credit.columns),
    None,
)
if target_col is None:
    target_col = df_credit.columns[-1]
    print(f"\nNote: Using '{target_col}' as target column")

y_credit = df_credit[target_col]
X_credit = df_credit.drop(columns=target_col)

print(f"\nTarget distribution ({target_col}):")
print(y_credit.value_counts())
if y_credit.nunique() > 1:
    print(f"Original imbalance ratio: 1:{y_credit.value_counts()[0] / y_credit.value_counts()[1]:.2f}")

# One-hot encode object-typed columns when there are any; otherwise just copy.
categorical_cols = X_credit.select_dtypes(include=['object']).columns
if len(categorical_cols) == 0:
    X_credit_encoded = X_credit.copy()
    print("\nNo categorical columns found - all features are numeric")
else:
    print(f"\nCategorical columns: {list(categorical_cols)}")
    X_credit_encoded = pd.get_dummies(
        X_credit,
        columns=categorical_cols,
        drop_first=True,
    )

# Final shape and class counts of the prepared dataset.
print(f"\n Credit Card Dataset")
print(f"  Features: {X_credit_encoded.shape[1]}")
print(f"  Samples: {len(y_credit)}")
print(f"  Positive class (Fraud): {(y_credit == 1).sum()}")
print(f"  Negative class (Legitimate): {(y_credit == 0).sum()}")

 LOADING DATASET 2: CREDIT CARD FRAUD

Dataset Shape: (284807, 31)

Columns: ['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class']

First few rows:
   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.6386

# **3. Helper Functions**

In [15]:
def create_imbalanced_dataset(X, y, ratio, minority_class=1, random_state=42):
    """
    Create an imbalanced dataset with a specified majority:minority ratio.

    Rows whose label equals `minority_class` form the minority; every other
    row is treated as majority. The minority is subsampled to
    max(50, 30% of the available minority rows), or kept whole when fewer
    rows than that exist. The majority is then sampled to `ratio` times the
    minority size. If the majority has fewer rows than requested it is sampled
    WITH replacement, which duplicates rows; a notice is printed in that case.

    Parameters:
    -----------
    X : array-like
        Feature matrix (pandas DataFrame, NumPy array, or nested sequence)
    y : array-like
        Target vector (pandas Series, NumPy array, or sequence)
    ratio : int
        Imbalance ratio (e.g., 10 for 1:10, 100 for 1:100)
    minority_class : int
        Label of minority class (default=1)
    random_state : int
        Random seed. Note: this reseeds NumPy's global RNG.

    Returns:
    --------
    X_imb, y_imb : Imbalanced dataset, shuffled. pandas inputs come back as
        pandas objects with a fresh 0..n-1 index; other inputs come back as
        NumPy arrays.

    Raises:
    -------
    ValueError
        If majority samples are required but `y` contains none.
    """
    np.random.seed(random_state)

    # Accept plain sequences as well as NumPy / pandas containers.
    if not hasattr(X, 'iloc'):
        X = np.asarray(X)
    if not hasattr(y, 'iloc'):
        y = np.asarray(y)

    def _take(obj, selector):
        # Positional selection that works for pandas and NumPy alike, so X and
        # y may be of different container types.
        return obj.iloc[selector] if hasattr(obj, 'iloc') else obj[selector]

    def _stack(parts):
        # Row-wise concatenation for pandas or NumPy pieces.
        if hasattr(parts[0], 'iloc'):
            return pd.concat(parts, axis=0)
        return np.concatenate(parts, axis=0)

    # Separate majority and minority classes (plain boolean arrays, so the
    # selection is positional and independent of any pandas index).
    minority_mask = np.asarray(y == minority_class)
    majority_mask = ~minority_mask

    X_minority, y_minority = _take(X, minority_mask), _take(y, minority_mask)
    X_majority, y_majority = _take(X, majority_mask), _take(y, majority_mask)

    # Calculate desired number of minority samples
    n_minority_desired = max(50, int(len(y_minority) * 0.3))  # At least 50 samples
    n_majority_desired = n_minority_desired * ratio

    # Sample minority class
    if len(y_minority) > n_minority_desired:
        minority_indices = np.random.choice(
            len(y_minority),
            size=n_minority_desired,
            replace=False
        )
    else:
        # Not enough minority rows: keep them all and shrink the majority target.
        minority_indices = np.arange(len(y_minority))
        n_minority_desired = len(y_minority)
        n_majority_desired = n_minority_desired * ratio

    # Sample majority class
    if n_majority_desired > 0 and len(y_majority) == 0:
        raise ValueError(
            f"Cannot draw {n_majority_desired} majority samples: "
            f"no rows with a label other than {minority_class!r}"
        )

    # Replacement is only needed when the majority is strictly too small; an
    # exactly-sized majority is taken whole without duplicating any row.
    needs_replacement = len(y_majority) < n_majority_desired
    if needs_replacement:
        print(
            f"Note: only {len(y_majority)} majority samples available for "
            f"{n_majority_desired} requested; sampling with replacement, so the "
            f"result contains duplicated rows (a later train/test split can place "
            f"copies of the same row on both sides)."
        )
    majority_indices = np.random.choice(
        len(y_majority),
        size=n_majority_desired,
        replace=needs_replacement
    )

    # Combine minority and majority
    X_imb = _stack([_take(X_minority, minority_indices), _take(X_majority, majority_indices)])
    y_imb = _stack([_take(y_minority, minority_indices), _take(y_majority, majority_indices)])

    # Shuffle X and y with the same permutation so rows stay paired with labels
    shuffle_idx = np.random.permutation(len(y_imb))
    X_imb = _take(X_imb, shuffle_idx)
    y_imb = _take(y_imb, shuffle_idx)
    if hasattr(X_imb, 'iloc'):
        X_imb = X_imb.reset_index(drop=True)
    if hasattr(y_imb, 'iloc'):
        y_imb = y_imb.reset_index(drop=True)

    # Report counts relative to the requested minority label (not a fixed 0/1).
    n_minority_final = int((y_imb == minority_class).sum())
    n_majority_final = int(len(y_imb) - n_minority_final)
    actual_ratio = n_majority_final / max(n_minority_final, 1)
    print(f"Created dataset with ratio 1:{actual_ratio:.2f}")
    print(f"Minority class: {n_minority_final} samples")
    print(f"Majority class: {n_majority_final} samples")
    print(f"Total samples: {len(y_imb)}")

    return X_imb, y_imb

In [16]:
def evaluate_model(model, X_train, y_train, X_test, y_test, model_name="Model"):
    """
    Fit `model` on the training data and score it on the test data.

    The model is fitted in place. Scores for AUC-ROC come from the second
    column of `predict_proba` when the model exposes it, otherwise from the
    hard predictions. AUC-ROC is reported as 0.0 when `y_test` holds a single
    class.

    Returns a dict with keys 'model', 'precision', 'recall', 'f1', 'auc_roc'.
    """
    # Fit, then collect hard predictions and ranking scores.
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    if hasattr(model, 'predict_proba'):
        scores = model.predict_proba(X_test)[:, 1]
    else:
        scores = predictions

    # Threshold-based metrics first (undefined cases are reported as 0).
    results = {
        'model': model_name,
        'precision': precision_score(y_test, predictions, zero_division=0),
        'recall': recall_score(y_test, predictions, zero_division=0),
        'f1': f1_score(y_test, predictions, zero_division=0),
    }

    # AUC-ROC needs both classes present in y_test.
    if len(np.unique(y_test)) > 1:
        results['auc_roc'] = roc_auc_score(y_test, scores)
    else:
        results['auc_roc'] = 0.0

    return results

def print_metrics(metrics_dict):
    """
    Render metrics as a plain-text table (no index column) and return them.

    `metrics_dict` is anything accepted by the `pd.DataFrame` constructor;
    the constructed DataFrame is returned after printing.
    """
    table = pd.DataFrame(metrics_dict)
    rendered = table.to_string(index=False)
    print(rendered)
    return table

# **4. Main Experiment**

In [17]:
def _evaluate_technique(classifiers, X_train, y_train, X_test, y_test,
                        technique, var_idx, results):
    """
    Fit and score every classifier in `classifiers` on one training set.

    Each metrics dict returned by `evaluate_model` is tagged with the
    technique name and variation index and appended to `results` as soon as
    it is produced, so metrics gathered before a failure are kept.
    """
    for clf_name, clf in classifiers.items():
        metrics = evaluate_model(
            clf, X_train, y_train, X_test, y_test,
            model_name=f"{technique}-{clf_name}"
        )
        metrics['technique'] = technique
        metrics['variation'] = var_idx
        results.append(metrics)


def run_experiment(X, y, ratio, variation_params, experiment_name, dataset_name):
    """
    Run a complete experiment for one imbalance ratio across all variations.

    Steps: build a 1:`ratio` dataset with `create_imbalanced_dataset`, make a
    stratified 70/30 train/test split, standardise features (scaler fitted on
    the training split only), then for every variation evaluate its
    classifiers on the un-augmented training data (Baseline) and on training
    data augmented with SMOTE, ADASYN and CTGAN. A failure inside one
    augmentation technique is printed and that technique is skipped.

    Parameters:
    -----------
    X, y : features and binary target (label 1 is treated as the minority)
    ratio : int
        Majority:minority ratio passed to `create_imbalanced_dataset`
    variation_params : list of dict
        Each dict provides 'name', 'smote_k', 'adasyn_k', 'ctgan_epochs' and
        'classifiers' (a dict mapping a short name to an estimator)
    experiment_name, dataset_name : str
        Labels used in printed headers; `dataset_name` is also stored in the
        'dataset' column of the result

    Returns:
    --------
    pandas.DataFrame with one row per (variation, technique, classifier).
    """
    print("="*100)
    print(f" {experiment_name} | {dataset_name}")
    print("="*100)

    # Create imbalanced dataset
    print(f"\n Creating 1:{ratio} imbalanced dataset...\n")
    X_imb, y_imb = create_imbalanced_dataset(X, y, ratio=ratio, random_state=RANDOM_STATE)

    # Split into train and test
    X_train, X_test, y_train, y_test = train_test_split(
        X_imb, y_imb, test_size=0.3, random_state=RANDOM_STATE, stratify=y_imb
    )

    # Scale features (fit on the training split only, reuse for the test split)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    print(f"\nTrain set: {X_train_scaled.shape}")
    print(f"Test set: {X_test_scaled.shape}")
    print(f"Train class distribution: {np.bincount(y_train)}")
    print(f"Test class distribution: {np.bincount(y_test)}")

    # Class counts in the training split; used to clamp neighbour counts and
    # to size the CTGAN sample.
    n_minority = (y_train == 1).sum()
    n_majority = (y_train == 0).sum()

    # Store all results
    all_results = []

    # Run every variation
    for var_idx, params in enumerate(variation_params, 1):
        print("\n" + "="*100)
        print(f"🔬 VARIATION {var_idx}/{len(variation_params)}: {params['name']}")
        print("="*100)
        print(f"Parameters: SMOTE_k={params['smote_k']}, ADASYN_k={params['adasyn_k']}, CTGAN_epochs={params['ctgan_epochs']}")

        classifiers = params['classifiers']
        variation_results = []

        # ------------------------
        # 1. BASELINE (No augmentation)
        # ------------------------
        print("\n[1/4] Running BASELINE...")
        _evaluate_technique(
            classifiers, X_train_scaled, y_train, X_test_scaled, y_test,
            'Baseline', var_idx, variation_results
        )

        # ------------------------
        # 2. SMOTE
        # ------------------------
        print("\n[2/4] Running SMOTE...")
        try:
            # Clamp k_neighbors to [1, n_minority - 1]
            k_neighbors = max(1, min(params['smote_k'], n_minority - 1))

            smote = SMOTE(
                k_neighbors=k_neighbors,
                random_state=RANDOM_STATE
            )
            X_smote, y_smote = smote.fit_resample(X_train_scaled, y_train)
            print(f"SMOTE generated: {X_smote.shape[0]} samples (k_neighbors={k_neighbors})")

            _evaluate_technique(
                classifiers, X_smote, y_smote, X_test_scaled, y_test,
                'SMOTE', var_idx, variation_results
            )
        except Exception as e:
            print(f"SMOTE failed: {e}")

        # ------------------------
        # 3. ADASYN
        # ------------------------
        print("\n[3/4] Running ADASYN...")
        try:
            # Clamp n_neighbors to [1, n_minority - 1]
            n_neighbors = max(1, min(params['adasyn_k'], n_minority - 1))

            adasyn = ADASYN(
                n_neighbors=n_neighbors,
                random_state=RANDOM_STATE
            )
            X_adasyn, y_adasyn = adasyn.fit_resample(X_train_scaled, y_train)
            print(f"ADASYN generated: {X_adasyn.shape[0]} samples (n_neighbors={n_neighbors})")

            _evaluate_technique(
                classifiers, X_adasyn, y_adasyn, X_test_scaled, y_test,
                'ADASYN', var_idx, variation_results
            )
        except Exception as e:
            print(f"ADASYN failed: {e}")

        # ------------------------
        # 4. CTGAN
        # ------------------------
        print("\n[4/4] Running CTGAN...")
        try:
            # Prepare data for CTGAN (needs original un-scaled data)
            train_df = X_train.copy()
            if isinstance(train_df, np.ndarray):
                train_df = pd.DataFrame(train_df)
            y_train_values = y_train.values if hasattr(y_train, 'values') else y_train
            train_df['target'] = y_train_values

            minority_data = train_df[train_df['target'] == 1].drop('target', axis=1)

            if len(minority_data) > 0:
                # Train CTGAN on minority rows only.
                # NOTE(review): every column is passed as continuous
                # (discrete_columns=[]); one-hot/binary features may be better
                # declared discrete -- confirm against the CTGAN docs.
                ctgan = CTGAN(epochs=params['ctgan_epochs'], verbose=False)
                ctgan.fit(minority_data, discrete_columns=[])

                # Generate just enough synthetic minority rows to balance the classes
                n_synthetic = n_majority - n_minority

                if n_synthetic > 0:
                    synthetic_data = ctgan.sample(n_synthetic)

                    # Combine original + synthetic
                    X_ctgan = pd.concat([
                        train_df.drop('target', axis=1),
                        synthetic_data
                    ], ignore_index=True)

                    y_ctgan = np.concatenate([
                        y_train_values,
                        np.ones(n_synthetic, dtype=int)
                    ])

                    # Scale with the scaler fitted on the original training split
                    X_ctgan_scaled = scaler.transform(X_ctgan)

                    print(f"CTGAN generated: {X_ctgan_scaled.shape[0]} samples")

                    _evaluate_technique(
                        classifiers, X_ctgan_scaled, y_ctgan, X_test_scaled, y_test,
                        'CTGAN', var_idx, variation_results
                    )
                else:
                    print("CTGAN skipped: Already balanced")
            else:
                print("CTGAN skipped: No minority samples")
        except Exception as e:
            print(f"CTGAN failed: {e}")

        # Print variation results
        print(f"\n Variation {var_idx} Results:")
        print("-" * 100)
        print_metrics(variation_results)

        all_results.extend(variation_results)

    # Convert to DataFrame
    df_results = pd.DataFrame(all_results)
    df_results['dataset'] = dataset_name

    # Print summary
    print("\n" + "="*100)
    print(f" {experiment_name} - {dataset_name} - SUMMARY")
    print("="*100)

    if len(df_results) > 0:
        summary = df_results.groupby('technique')[[ 'precision', 'recall', 'f1', 'auc_roc']].agg(['mean', 'std'])
        print(summary)

    return df_results

# **5. Defining 5 variations**

In [18]:
# Define 5 variations with different parameters
variations = [
    # Variation 1: Conservative sampling + Simple models
    {
        'name': 'Conservative-Simple',
        'smote_k': 3,
        'adasyn_k': 3,
        'ctgan_epochs': 100,
        'classifiers': {
            'LogReg': LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
            'RF': RandomForestClassifier(n_estimators=50, max_depth=5, random_state=RANDOM_STATE)
        }
    },

    # Variation 2: Moderate sampling + Balanced models
    {
        'name': 'Moderate-Balanced',
        'smote_k': 5,
        'adasyn_k': 5,
        'ctgan_epochs': 150,
        'classifiers': {
            'LogReg': LogisticRegression(max_iter=1000, C=0.5, random_state=RANDOM_STATE),
            'XGBoost': XGBClassifier(n_estimators=100, max_depth=3, random_state=RANDOM_STATE, eval_metric='logloss')
        }
    },

    # Variation 3: Aggressive sampling + Complex models
    {
        'name': 'Aggressive-Complex',
        'smote_k': 7,
        'adasyn_k': 7,
        'ctgan_epochs': 200,
        'classifiers': {
            'RF': RandomForestClassifier(n_estimators=100, max_depth=10, random_state=RANDOM_STATE),
            'XGBoost': XGBClassifier(n_estimators=150, max_depth=5, random_state=RANDOM_STATE, eval_metric='logloss')
        }
    },

    # Variation 4: High neighbors + Deep models
    {
        'name': 'HighK-Deep',
        'smote_k': 10,
        'adasyn_k': 10,
        'ctgan_epochs': 250,
        'classifiers': {
            'RF': RandomForestClassifier(n_estimators=200, max_depth=15, random_state=RANDOM_STATE),
            'XGBoost': XGBClassifier(n_estimators=200, max_depth=7, learning_rate=0.1, random_state=RANDOM_STATE, eval_metric='logloss')
        }
    },

    # Variation 5: Mixed approach with regularization
    {
        'name': 'Mixed-Regularized',
        'smote_k': 5,
        'adasyn_k': 8,
        'ctgan_epochs': 300,
        'classifiers': {
            'LogReg': LogisticRegression(max_iter=1000, C=0.1, penalty='l2', random_state=RANDOM_STATE),
            'RF': RandomForestClassifier(n_estimators=150, max_depth=8, min_samples_split=10, random_state=RANDOM_STATE)
        }
    }
]

print("5 Variations defined:")
for i, var in enumerate(variations, 1):
    print(f"  {i}. {var['name']}: SMOTE_k={var['smote_k']}, ADASYN_k={var['adasyn_k']}, CTGAN_epochs={var['ctgan_epochs']}")

5 Variations defined:
  1. Conservative-Simple: SMOTE_k=3, ADASYN_k=3, CTGAN_epochs=100
  2. Moderate-Balanced: SMOTE_k=5, ADASYN_k=5, CTGAN_epochs=150
  3. Aggressive-Complex: SMOTE_k=7, ADASYN_k=7, CTGAN_epochs=200
  4. HighK-Deep: SMOTE_k=10, ADASYN_k=10, CTGAN_epochs=250
  5. Mixed-Regularized: SMOTE_k=5, ADASYN_k=8, CTGAN_epochs=300


# **6. Breast Cancer Dataset Experiments**

**6.1. Breast Cancer - Moderate Imbalance (1:10)**

In [19]:
# Breast Cancer, experiment 1: evaluate every variation at a 1:10 class ratio.
results_breast_exp1 = run_experiment(
    X_breast_encoded,
    y_breast,
    10,
    variations,
    experiment_name="EXPERIMENT 1: Moderate Imbalance (1:10)",
    dataset_name="Breast Cancer",
)

# Persist the per-model metrics for this experiment.
results_breast_exp1.to_csv('breast_cancer_exp1_1to10.csv', index=False)
print("\n Results saved to 'breast_cancer_exp1_1to10.csv'")

 EXPERIMENT 1: Moderate Imbalance (1:10) | Breast Cancer

 Creating 1:10 imbalanced dataset...

Created dataset with ratio 1:10.00
Minority class: 184 samples
Majority class: 1840 samples
Total samples: 2024

Train set: (1416, 29)
Test set: (608, 29)
Train class distribution: [1287  129]
Test class distribution: [553  55]

🔬 VARIATION 1/5: Conservative-Simple
Parameters: SMOTE_k=3, ADASYN_k=3, CTGAN_epochs=100

[1/4] Running BASELINE...

[2/4] Running SMOTE...
SMOTE generated: 2574 samples (k_neighbors=3)

[3/4] Running ADASYN...
ADASYN generated: 2612 samples (n_neighbors=3)

[4/4] Running CTGAN...
CTGAN generated: 2574 samples

 Variation 1 Results:
----------------------------------------------------------------------------------------------------
          model  precision   recall       f1  auc_roc technique  variation
Baseline-LogReg   0.787879 0.472727 0.590909 0.863094  Baseline          1
    Baseline-RF   0.916667 0.200000 0.328358 0.870656  Baseline          1
   SMOTE-Lo

**6.2. Breast Cancer - High Imbalance (1:100)**

In [20]:
# Breast Cancer, experiment 2: evaluate every variation at a 1:100 class ratio.
results_breast_exp2 = run_experiment(
    X=X_breast_encoded,
    y=y_breast,
    ratio=100,
    variation_params=variations,
    experiment_name="EXPERIMENT 2: High Imbalance (1:100)",
    dataset_name="Breast Cancer"
)

# Persist the per-model metrics for this experiment.
results_breast_exp2.to_csv('breast_cancer_exp2_1to100.csv', index=False)
# The check mark was mis-encoded (UTF-8 bytes read as cp1252); restored here.
print("\n✓ Results saved to 'breast_cancer_exp2_1to100.csv'")

 EXPERIMENT 2: High Imbalance (1:100) | Breast Cancer

 Creating 1:100 imbalanced dataset...

Created dataset with ratio 1:100.00
Minority class: 184 samples
Majority class: 18400 samples
Total samples: 18584

Train set: (13008, 29)
Test set: (5576, 29)
Train class distribution: [12879   129]
Test class distribution: [5521   55]

🔬 VARIATION 1/5: Conservative-Simple
Parameters: SMOTE_k=3, ADASYN_k=3, CTGAN_epochs=100

[1/4] Running BASELINE...

[2/4] Running SMOTE...
SMOTE generated: 25758 samples (k_neighbors=3)

[3/4] Running ADASYN...
ADASYN generated: 25788 samples (n_neighbors=3)

[4/4] Running CTGAN...
CTGAN generated: 25758 samples

 Variation 1 Results:
----------------------------------------------------------------------------------------------------
          model  precision   recall       f1  auc_roc technique  variation
Baseline-LogReg   1.000000 0.127273 0.225806 0.838880  Baseline          1
    Baseline-RF   1.000000 0.036364 0.070175 0.839726  Baseline          1
 

# **7. Credit Card Dataset Experiments**

**7.1. Credit Card - Moderate Imbalance (1:10)**

In [21]:
# Credit Card, experiment 1: evaluate every variation at a 1:10 class ratio.
results_credit_exp1 = run_experiment(
    X=X_credit_encoded,
    y=y_credit,
    ratio=10,
    variation_params=variations,
    experiment_name="EXPERIMENT 1: Moderate Imbalance (1:10)",
    dataset_name="Credit Card"
)

# Persist the per-model metrics for this experiment.
results_credit_exp1.to_csv('credit_card_exp1_1to10.csv', index=False)
# The check mark was mis-encoded (UTF-8 bytes read as cp1252); restored here.
print("\n✓ Results saved to 'credit_card_exp1_1to10.csv'")

 EXPERIMENT 1: Moderate Imbalance (1:10) | Credit Card

 Creating 1:10 imbalanced dataset...

Created dataset with ratio 1:10.00
Minority class: 147 samples
Majority class: 1470 samples
Total samples: 1617

Train set: (1131, 30)
Test set: (486, 30)
Train class distribution: [1028  103]
Test class distribution: [442  44]

🔬 VARIATION 1/5: Conservative-Simple
Parameters: SMOTE_k=3, ADASYN_k=3, CTGAN_epochs=100

[1/4] Running BASELINE...

[2/4] Running SMOTE...
SMOTE generated: 2056 samples (k_neighbors=3)

[3/4] Running ADASYN...
ADASYN generated: 2051 samples (n_neighbors=3)

[4/4] Running CTGAN...
CTGAN generated: 2056 samples

 Variation 1 Results:
----------------------------------------------------------------------------------------------------
          model  precision   recall       f1  auc_roc technique  variation
Baseline-LogReg   0.950000 0.863636 0.904762 0.955522  Baseline          1
    Baseline-RF   1.000000 0.840909 0.913580 0.951897  Baseline          1
   SMOTE-LogR

**7.2. Credit Card - High Imbalance (1:100)**

In [22]:
# Credit Card, experiment 2: evaluate every variation at a 1:100 class ratio.
results_credit_exp2 = run_experiment(
    X=X_credit_encoded,
    y=y_credit,
    ratio=100,
    variation_params=variations,
    experiment_name="EXPERIMENT 2: High Imbalance (1:100)",
    dataset_name="Credit Card"
)

# Persist the per-model metrics for this experiment.
results_credit_exp2.to_csv('credit_card_exp2_1to100.csv', index=False)
# The check mark was mis-encoded (UTF-8 bytes read as cp1252); restored here.
print("\n✓ Results saved to 'credit_card_exp2_1to100.csv'")

 EXPERIMENT 2: High Imbalance (1:100) | Credit Card

 Creating 1:100 imbalanced dataset...

Created dataset with ratio 1:100.00
Minority class: 147 samples
Majority class: 14700 samples
Total samples: 14847

Train set: (10392, 30)
Test set: (4455, 30)
Train class distribution: [10289   103]
Test class distribution: [4411   44]

🔬 VARIATION 1/5: Conservative-Simple
Parameters: SMOTE_k=3, ADASYN_k=3, CTGAN_epochs=100

[1/4] Running BASELINE...

[2/4] Running SMOTE...
SMOTE generated: 20578 samples (k_neighbors=3)

[3/4] Running ADASYN...
ADASYN generated: 20577 samples (n_neighbors=3)

[4/4] Running CTGAN...
CTGAN generated: 20578 samples

 Variation 1 Results:
----------------------------------------------------------------------------------------------------
          model  precision   recall       f1  auc_roc technique  variation
Baseline-LogReg   0.973684 0.840909 0.902439 0.997841  Baseline          1
    Baseline-RF   1.000000 0.818182 0.900000 0.990316  Baseline          1
   

# **8. Comprehensive Analysis - Both datasets**

In [24]:
# Tag each per-experiment result frame with its imbalance ratio, then stack
# the four frames into a single table covering both datasets.
for frame, label in (
    (results_breast_exp1, '1:10'),
    (results_breast_exp2, '1:100'),
    (results_credit_exp1, '1:10'),
    (results_credit_exp2, '1:100'),
):
    frame['experiment'] = label

all_results = pd.concat(
    [
        results_breast_exp1,
        results_breast_exp2,
        results_credit_exp1,
        results_credit_exp2,
    ],
    ignore_index=True,
)

# Persist the combined table.
all_results.to_csv('ALL_EXPERIMENTS_COMBINED.csv', index=False)


# Row counts: overall and per dataset (one row per evaluated model).
print(" ALL RESULTS COMBINED")
print("="*100)
print(f"\n All results saved to 'ALL_EXPERIMENTS_COMBINED.csv'")
print(f"\nTotal experiments run: {len(all_results)}")
print(f"\nBreakdown:")
print(f"  Breast Cancer experiments: {len(results_breast_exp1) + len(results_breast_exp2)}")
print(f"  Credit Card experiments: {len(results_credit_exp1) + len(results_credit_exp2)}")

 ALL RESULTS COMBINED

 All results saved to 'ALL_EXPERIMENTS_COMBINED.csv'

Total experiments run: 160

Breakdown:
  Breast Cancer experiments: 80
  Credit Card experiments: 80


In [25]:
# Mean and standard deviation of every metric for each
# (dataset, experiment, technique) group, rounded to 4 decimals.
print("\n" + "="*100)
print(" OVERALL STATISTICAL SUMMARY - BY DATASET AND TECHNIQUE")
print("="*100)

summary_by_dataset_technique = (
    all_results
    .groupby(['dataset', 'experiment', 'technique'])
    .agg({
        metric: ['mean', 'std']
        for metric in ('precision', 'recall', 'f1', 'auc_roc')
    })
    .round(4)
)

print(summary_by_dataset_technique)


 OVERALL STATISTICAL SUMMARY - BY DATASET AND TECHNIQUE
                                   precision          recall              f1  \
                                        mean     std    mean     std    mean   
dataset       experiment technique                                             
Breast Cancer 1:10       ADASYN       0.4825  0.1548  0.6345  0.0884  0.5222   
                         Baseline     0.7719  0.0654  0.4236  0.1007  0.5357   
                         CTGAN        0.7809  0.0639  0.4273  0.1040  0.5407   
                         SMOTE        0.4796  0.1396  0.6327  0.0746  0.5251   
              1:100      ADASYN       0.3560  0.3714  0.5109  0.1520  0.2843   
                         Baseline     0.9699  0.0484  0.1818  0.1121  0.2903   
                         CTGAN        0.8718  0.3095  0.1764  0.1314  0.2772   
                         SMOTE        0.3628  0.3900  0.5073  0.1481  0.2803   
Credit Card   1:10       ADASYN       0.7854  0.1267  0.9023  0

In [26]:
# For every dataset and imbalance ratio, report the single row with the
# highest F1 score. Ratios with no recorded results are skipped silently.
print("\n" + "="*100)
print(" BEST PERFORMING TECHNIQUES (by F1-Score)")
print("="*100)

for dataset in ['Breast Cancer', 'Credit Card']:
    print(f"\n{'='*100}")
    print(f" {dataset.upper()}")
    print(f"{'='*100}")

    dataset_results = all_results.loc[all_results['dataset'] == dataset]

    for exp in ['1:10', '1:100', '1:1000']:
        exp_data = dataset_results.loc[dataset_results['experiment'] == exp]
        if exp_data.empty:
            continue

        # Row label of the top F1 score within this dataset / ratio slice.
        best_idx = exp_data['f1'].idxmax()
        best_row = exp_data.loc[best_idx]
        print(f"\n  {exp}:")
        print(f"    Best: {best_row['technique']} - {best_row['model']}")
        print(f"    F1: {best_row['f1']:.4f} | Recall: {best_row['recall']:.4f} | AUC-ROC: {best_row['auc_roc']:.4f}")


 BEST PERFORMING TECHNIQUES (by F1-Score)

 BREAST CANCER

  1:10:
    Best: CTGAN - CTGAN-XGBoost
    F1: 0.6458 | Recall: 0.5636 | AUC-ROC: 0.8745

  1:100:
    Best: ADASYN - ADASYN-XGBoost
    F1: 0.5750 | Recall: 0.4182 | AUC-ROC: 0.8622

 CREDIT CARD

  1:10:
    Best: CTGAN - CTGAN-LogReg
    F1: 0.9286 | Recall: 0.8864 | AUC-ROC: 0.9575

  1:100:
    Best: SMOTE - SMOTE-RF
    F1: 0.9398 | Recall: 0.8864 | AUC-ROC: 0.9949
