<a href="https://colab.research.google.com/github/abinayanand7896-cloud/Abinaya_Anand/blob/main/data_augmentation_comparison_(2).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Augmentation Techniques Comparison Pipeline

This notebook compares SMOTE, ADASYN, and CTGAN data augmentation techniques on imbalanced classification datasets.

**Datasets:**
1. Breast Cancer (sklearn)
2. Credit Card Fraud (uploaded CSV)

**Imbalance Ratios:** 1:10, 1:100, 1:1000

**Classifiers:** Logistic Regression, Random Forest, XGBoost

**Augmentation Variations:** Default parameters vs Tuned parameters

In [1]:
!pip install scikit-learn imbalanced-learn ctgan sdv xgboost pandas numpy openpyxl

Collecting ctgan
  Downloading ctgan-0.12.1-py3-none-any.whl.metadata (10 kB)
Collecting sdv
  Downloading sdv-1.34.1-py3-none-any.whl.metadata (14 kB)
Collecting rdt>=1.14.0 (from ctgan)
  Downloading rdt-1.20.0-py3-none-any.whl.metadata (11 kB)
Collecting boto3<2.0.0,>=1.28 (from sdv)
  Downloading boto3-1.42.54-py3-none-any.whl.metadata (6.7 kB)
Collecting botocore<2.0.0,>=1.31 (from sdv)
  Downloading botocore-1.42.54-py3-none-any.whl.metadata (5.9 kB)
Collecting copulas>=0.12.1 (from sdv)
  Downloading copulas-0.14.1-py3-none-any.whl.metadata (9.7 kB)
Collecting deepecho>=0.7.0 (from sdv)
  Downloading deepecho-0.8.1-py3-none-any.whl.metadata (11 kB)
Collecting sdmetrics>=0.21.0 (from sdv)
  Downloading sdmetrics-0.27.1-py3-none-any.whl.metadata (10.0 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3<2.0.0,>=1.28->sdv)
  Downloading jmespath-1.1.0-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.17.0,>=0.16.0 (from boto3<2.0.0,>=1.28->sdv)
  Downloading s3transfer-0.16.0

## Import Dependencies

In [2]:
import pandas as pd
import numpy as np
import os
import json
import warnings
from collections import defaultdict
import zipfile

# ML Libraries
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Augmentation techniques
from imblearn.over_sampling import SMOTE, ADASYN
from ctgan import CTGAN

# Metrics
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score

# Google Colab utilities
from google.colab import files
from google.colab import drive

warnings.filterwarnings('ignore')
np.random.seed(42)

## Data Loading and Setup

In [3]:
cancer_df = pd.read_csv('/content/Breast_Cancer.csv')
print(f"Cancer Diagnosis shape: {cancer_df.shape}")

cancer_target = 'Status' if 'Status' in cancer_df.columns else cancer_df.columns[-1]
print(f"Cancer target: '{cancer_target}'")
print(f"Class distribution:\n{cancer_df[cancer_target].value_counts()}")

Cancer Diagnosis shape: (4024, 16)
Cancer target: 'Status'
Class distribution:
Status
Alive    3408
Dead      616
Name: count, dtype: int64


In [4]:
fraud_df = pd.read_csv('/content/creditcard.csv')
print(f"Fraud Detection shape: {fraud_df.shape}")

# Identify target column (last column or 'Class')
fraud_target = 'Class' if 'Class' in fraud_df.columns else fraud_df.columns[-1]
print(f"Fraud target: '{fraud_target}'")
print(f"Class distribution:\n{fraud_df[fraud_target].value_counts()}")

Fraud Detection shape: (15936, 31)
Fraud target: 'Class'
Class distribution:
Class
0.0    15862
1.0       73
Name: count, dtype: int64


## Step 1: Create Imbalanced Datasets

In [5]:
def create_imbalanced_dataset(df, target_col, ratio, dataset_name):
    """
    Create imbalanced dataset with specified minority:majority ratio
    """
    minority_class = df[target_col].value_counts().idxmin()
    majority_class = df[target_col].value_counts().idxmax()

    minority_samples = df[df[target_col] == minority_class]
    majority_samples = df[df[target_col] == majority_class]

    n_minority = len(minority_samples)
    n_majority_target = n_minority * ratio

    print(f"\nCreating {dataset_name} with 1:{ratio} ratio")
    print(f"Minority class ({minority_class}): {n_minority} samples")
    print(f"Target majority samples: {n_majority_target}")

    # Handle different scenarios based on available majority samples
    if len(majority_samples) >= n_majority_target:
        # Subsample majority class
        majority_selected = majority_samples.sample(n=n_majority_target, random_state=42)
        print(f"Subsampled majority class to {len(majority_selected)} samples")
    else:
        # Bootstrap/duplicate majority class (for breast cancer with high ratios)
        n_repeats = int(np.ceil(n_majority_target / len(majority_samples)))
        majority_repeated = pd.concat([majority_samples] * n_repeats, ignore_index=True)
        majority_selected = majority_repeated.sample(n=n_majority_target, random_state=42)
        print(f"Bootstrapped majority class to {len(majority_selected)} samples (repeated {n_repeats} times)")

    # Combine minority and majority samples
    imbalanced_df = pd.concat([minority_samples, majority_selected], ignore_index=True)
    imbalanced_df = imbalanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

    print(f"Final dataset shape: {imbalanced_df.shape}")
    print(f"Final class distribution: {imbalanced_df[target_col].value_counts().to_dict()}")

    return imbalanced_df

# Create imbalanced datasets
ratios = [10, 100]

# Create directory if it doesn't exist
os.makedirs('imbalanced_datasets', exist_ok=True)

print("=" * 50)
print("CREATING BREAST CANCER IMBALANCED DATASETS")
print("=" * 50)

for ratio in ratios:
    imbalanced_bc = create_imbalanced_dataset(cancer_df, 'Status', ratio, f"Breast Cancer 1:{ratio}")
    filename = f"imbalanced_datasets/breast_cancer_1_{ratio}.csv"
    imbalanced_bc.to_csv(filename, index=False)
    print(f"Saved: {filename}")

print("\n" + "=" * 50)
print("CREATING CREDIT CARD IMBALANCED DATASETS")
print("=" * 50)

for ratio in ratios:
    imbalanced_cc = create_imbalanced_dataset(fraud_df, 'Class', ratio, f"Credit Card 1:{ratio}")
    filename = f"imbalanced_datasets/creditcard_1_{ratio}.csv"
    imbalanced_cc.to_csv(filename, index=False)
    print(f"Saved: {filename}")

CREATING BREAST CANCER IMBALANCED DATASETS

Creating Breast Cancer 1:10 with 1:10 ratio
Minority class (Dead): 616 samples
Target majority samples: 6160
Bootstrapped majority class to 6160 samples (repeated 2 times)
Final dataset shape: (6776, 16)
Final class distribution: {'Alive': 6160, 'Dead': 616}
Saved: imbalanced_datasets/breast_cancer_1_10.csv

Creating Breast Cancer 1:100 with 1:100 ratio
Minority class (Dead): 616 samples
Target majority samples: 61600
Bootstrapped majority class to 61600 samples (repeated 19 times)
Final dataset shape: (62216, 16)
Final class distribution: {'Alive': 61600, 'Dead': 616}
Saved: imbalanced_datasets/breast_cancer_1_100.csv

CREATING CREDIT CARD IMBALANCED DATASETS

Creating Credit Card 1:10 with 1:10 ratio
Minority class (1.0): 73 samples
Target majority samples: 730
Subsampled majority class to 730 samples
Final dataset shape: (803, 31)
Final class distribution: {0.0: 730, 1.0: 73}
Saved: imbalanced_datasets/creditcard_1_10.csv

Creating Credit 

## Step 2: Define Fixed Classifiers

In [6]:
def get_classifiers():
    """
    Return fixed classifiers with specified parameters
    """
    return {
        'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
        'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
        'XGBoost': XGBClassifier(n_estimators=100, eval_metric='logloss',
                                random_state=42, use_label_encoder=False)
    }

print("Fixed classifiers defined:")
for name, clf in get_classifiers().items():
    print(f"- {name}: {clf}")

Fixed classifiers defined:
- LogisticRegression: LogisticRegression(max_iter=1000, random_state=42)
- RandomForest: RandomForestClassifier(random_state=42)
- XGBoost: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, feature_weights=None, gamma=None,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, ...)


## Step 3: Define Augmentation Parameters

In [7]:
def get_augmentation_configs():
    """
    Return augmentation configurations for V1 (Default) and V2 (Tuned)
    """
    return {
        'V1_Default': {
            'SMOTE': {'k_neighbors': 5, 'sampling_strategy': 'auto', 'random_state': 42},
            'ADASYN': {'n_neighbors': 5, 'sampling_strategy': 'auto', 'random_state': 42},
            'CTGAN': {'epochs': 10, 'batch_size': 500}
        },
        'V2_Tuned': {
            'SMOTE': {'k_neighbors': 3, 'sampling_strategy': 0.5, 'random_state': 42},
            'ADASYN': {'n_neighbors': 10, 'sampling_strategy': 0.5, 'random_state': 42},
            'CTGAN': {'epochs': 20, 'batch_size': 100}
        }
    }

configs = get_augmentation_configs()
print("Augmentation configurations:")
for variation, methods in configs.items():
    print(f"\n{variation}:")
    for method, params in methods.items():
        print(f"  {method}: {params}")

Augmentation configurations:

V1_Default:
  SMOTE: {'k_neighbors': 5, 'sampling_strategy': 'auto', 'random_state': 42}
  ADASYN: {'n_neighbors': 5, 'sampling_strategy': 'auto', 'random_state': 42}
  CTGAN: {'epochs': 10, 'batch_size': 500}

V2_Tuned:
  SMOTE: {'k_neighbors': 3, 'sampling_strategy': 0.5, 'random_state': 42}
  ADASYN: {'n_neighbors': 10, 'sampling_strategy': 0.5, 'random_state': 42}
  CTGAN: {'epochs': 20, 'batch_size': 100}


## Step 4: Training and Evaluation Pipeline

In [8]:
def evaluate_model(clf, X_test, y_test):
    """
    Evaluate model and return metrics for minority class (always class 1)
    """
    try:
        y_pred = clf.predict(X_test)
        y_pred_proba = clf.predict_proba(X_test)[:, 1]

        # Verify minority class is 1
        class_counts = np.bincount(y_test)
        minority_class = np.argmin(class_counts)
        if minority_class != 1:
            print(f"Warning: Expected minority class to be 1, but found {minority_class}")

        metrics = {
            'F1': round(f1_score(y_test, y_pred, pos_label=1), 4),
            'Precision': round(precision_score(y_test, y_pred, pos_label=1), 4),
            'Recall': round(recall_score(y_test, y_pred, pos_label=1), 4),
            'AUC-ROC': round(roc_auc_score(y_test, y_pred_proba), 4)
        }
        return metrics
    except Exception as e:
        print(f"Error in evaluation: {str(e)}")
        return {'F1': np.nan, 'Precision': np.nan, 'Recall': np.nan, 'AUC-ROC': np.nan}

def run_experiment(df, target_col, dataset_name, ratio):
    """
    Run complete experiment for one dataset and ratio
    """
    print(f"\n{'='*60}")
    print(f"RUNNING EXPERIMENT: {dataset_name} - Ratio 1:{ratio}")
    print(f"{'='*60}")

    # Prepare features and target
    X = df.drop(target_col, axis=1).values
    y = df[target_col].values

    # Verify minority class is 1
    class_counts = np.bincount(y)
    minority_class = np.argmin(class_counts)
    print(f"Dataset class distribution: {dict(enumerate(class_counts))}")
    print(f"Minority class: {minority_class} ({class_counts[minority_class]} samples)")
    print(f"Majority class: {1-minority_class} ({class_counts[1-minority_class]} samples)")

    if minority_class != 1:
        print(f"ERROR: Expected minority class to be 1, but found {minority_class}")
        print("Please check dataset preprocessing!")

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # Standardize features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    print(f"Train shape: {X_train_scaled.shape}, Test shape: {X_test_scaled.shape}")
    print(f"Train class distribution: {np.bincount(y_train)}")
    print(f"Test class distribution: {np.bincount(y_test)}")

    # Initialize results storage
    results = defaultdict(dict)
    classifiers = get_classifiers()
    augmentation_configs = get_augmentation_configs()

    # Baseline (no augmentation) - compute once, use for both variations
    print("\n--- Running Baseline (No Augmentation) ---")
    baseline_results = {}
    for clf_name, clf in classifiers.items():
        print(f"Training {clf_name}...")
        clf.fit(X_train_scaled, y_train)
        metrics = evaluate_model(clf, X_test_scaled, y_test)
        baseline_results[clf_name] = metrics
        print(f"  {clf_name} - F1: {metrics['F1']}, Precision: {metrics['Precision']}, Recall: {metrics['Recall']}, AUC: {metrics['AUC-ROC']}")

    # Store baseline for both variations
    results['V1_Default']['Baseline'] = baseline_results.copy()
    results['V2_Tuned']['Baseline'] = baseline_results.copy()

    # Run augmentation methods for each variation
    for variation_name, aug_config in augmentation_configs.items():
        print(f"\n--- Running {variation_name} ---")

        for aug_method, aug_params in aug_config.items():
            print(f"\nApplying {aug_method} augmentation...")

            # Apply augmentation
            X_train_aug, y_train_aug = apply_augmentation(
                X_train_scaled, y_train, aug_method, aug_params
            )

            print(f"  Augmented training shape: {X_train_aug.shape}")
            print(f"  Augmented class distribution: {np.bincount(y_train_aug)}")

            # Train classifiers on augmented data
            method_results = {}
            for clf_name, clf in classifiers.items():
                print(f"  Training {clf_name}...")
                try:
                    clf.fit(X_train_aug, y_train_aug)
                    metrics = evaluate_model(clf, X_test_scaled, y_test)
                    method_results[clf_name] = metrics
                    print(f"    {clf_name} - F1: {metrics['F1']}, Precision: {metrics['Precision']}, Recall: {metrics['Recall']}, AUC: {metrics['AUC-ROC']}")
                except Exception as e:
                    print(f"    Error training {clf_name}: {str(e)}")
                    method_results[clf_name] = {'F1': np.nan, 'Precision': np.nan, 'Recall': np.nan, 'AUC-ROC': np.nan}

            results[variation_name][aug_method] = method_results

    return results

In [9]:
def apply_augmentation(X_train, y_train, method, params):
    """
    Apply specified augmentation method to training data
    """
    if method == 'SMOTE':
        smote = SMOTE(**params)
        X_res, y_res = smote.fit_resample(X_train, y_train)
    elif method == 'ADASYN':
        adasyn = ADASYN(**params)
        X_res, y_res = adasyn.fit_resample(X_train, y_train)
    elif method == 'CTGAN':
        # CTGAN requires dataframe and specific data types
        # Convert numpy arrays back to DataFrame for CTGAN
        # Assign string column names to prevent TypeError in rdt library
        feature_cols = [f'feature_{i}' for i in range(X_train.shape[1])]
        X_train_df = pd.DataFrame(X_train, columns=feature_cols)
        y_train_df = pd.DataFrame(y_train, columns=['target'])
        # Combine X and y for CTGAN training
        data_for_ctgan = pd.concat([X_train_df, y_train_df], axis=1)

        # Define categorical features for CTGAN (only target in this case)
        categorical_features = ['target']
        discrete_columns = [col for col in categorical_features if col in data_for_ctgan.columns]

        # Ensure target column is treated as categorical by CTGAN
        ctgan_model = CTGAN(epochs=params['epochs'], batch_size=params['batch_size'])
        ctgan_model.fit(data_for_ctgan, discrete_columns=discrete_columns)

        # Generate samples for the minority class
        # Determine how many samples to generate to balance the dataset
        minority_class = y_train_df['target'].value_counts().idxmin()
        majority_class_count = y_train_df['target'].value_counts().max()
        minority_class_count = y_train_df['target'].value_counts().min()

        num_samples_to_generate = majority_class_count - minority_class_count

        # Generate synthetic data with specific conditions if possible (e.g., for minority class)
        # Note: CTGAN's generate method doesn't directly support generating only a specific class easily.
        # A common approach is to oversample the original minority class within the CTGAN training data
        # or filter generated data. For simplicity here, we'll generate and then filter/select.
        # A more robust solution might involve conditional GANs or generating more data than needed and filtering.
        synthetic_data = ctgan_model.sample(num_samples_to_generate)

        # Filter synthetic data to primarily get minority class samples
        # This is an approximation; ideally, CTGAN would be trained with a focus on minority class
        synthetic_minority_samples = synthetic_data[synthetic_data['target'] == minority_class]

        # If not enough minority samples generated, supplement with random samples from all generated
        if len(synthetic_minority_samples) < num_samples_to_generate:
            remaining_needed = num_samples_to_generate - len(synthetic_minority_samples)
            # Take random samples from the rest of synthetic data until target count is met
            # This might not be ideal as it could include majority class samples
            synthetic_minority_samples = pd.concat([
                synthetic_minority_samples,
                synthetic_data[synthetic_data['target'] != minority_class].sample(n=remaining_needed, replace=True, random_state=42)
            ]).reset_index(drop=True)

        # Combine original data with synthetic minority samples
        X_res_df = pd.concat([X_train_df, synthetic_minority_samples.drop(columns=['target'])], ignore_index=True)
        y_res_df = pd.concat([y_train_df, synthetic_minority_samples[['target']]], ignore_index=True)

        X_res = X_res_df.values
        y_res = y_res_df['target'].values
    else:
        raise ValueError(f"Unknown augmentation method: {method}")

    return X_res, y_res

## Step 5: Execute Experiments and Generate Comparison Tables

In [10]:
def create_comparison_table(results, dataset_name, ratio):
    """
    Create comparison table combining V1 and V2 results for one ratio
    """
    methods = ['Baseline', 'SMOTE', 'ADASYN', 'CTGAN']
    classifiers = ['LogisticRegression', 'RandomForest', 'XGBoost']
    metrics = ['F1', 'Precision', 'Recall', 'AUC-ROC']

    # Create multi-index columns
    columns = []
    for variation in ['V1_Default', 'V2_Tuned']:
        for metric in metrics:
            columns.append((variation, metric))

    multi_index = pd.MultiIndex.from_tuples(columns, names=['Variation', 'Metric'])

    # Create row index
    row_index = []
    for method in methods:
        for clf in classifiers:
            row_index.append((method, clf))

    multi_row_index = pd.MultiIndex.from_tuples(row_index, names=['Method', 'Classifier'])

    # Create DataFrame
    comparison_df = pd.DataFrame(index=multi_row_index, columns=multi_index)

    # Fill data
    for variation in ['V1_Default', 'V2_Tuned']:
        for method in methods:
            for clf in classifiers:
                for metric in metrics:
                    try:
                        value = results[variation][method][clf][metric]
                        comparison_df.loc[(method, clf), (variation, metric)] = value
                    except KeyError:
                        comparison_df.loc[(method, clf), (variation, metric)] = np.nan

    return comparison_df

# Run all experiments
all_results = {}
datasets_info = [
    ('breast_cancer', 'Status', 'Breast Cancer'),
    ('creditcard', 'Class', 'Credit Card')
]

for dataset_prefix, target_col, dataset_display_name in datasets_info:
    all_results[dataset_prefix] = {}

    for ratio in [10, 100]:
        # Load imbalanced dataset
        filename = f"imbalanced_datasets/{dataset_prefix}_1_{ratio}.csv"
        df = pd.read_csv(filename)

        # Ensure target column is numerical (0 and 1) and encode categorical features
        if dataset_prefix == 'breast_cancer':
            df[target_col] = df[target_col].map({'Alive': 0, 'Dead': 1}).astype(int)
            # Dynamically identify categorical features (object dtype)
            categorical_cols_to_encode = df.select_dtypes(include='object').columns.tolist()
            # Ensure target_col is not in the list to be encoded if it somehow got there
            if target_col in categorical_cols_to_encode:
                categorical_cols_to_encode.remove(target_col)
            df = pd.get_dummies(df, columns=categorical_cols_to_encode, drop_first=True)
        elif dataset_prefix == 'creditcard':
            df[target_col] = df[target_col].astype(int) # Ensure it's int just in case
            # Credit card data is already numerical, no additional feature encoding needed.

        # Run experiment
        results = run_experiment(df, target_col, dataset_display_name, ratio)
        all_results[dataset_prefix][f"1_{ratio}"] = results

        # Create and display comparison table
        comparison_table = create_comparison_table(results, dataset_display_name, ratio)

        print(f"\n\n{'='*80}")
        print(f"COMPARISON TABLE: {dataset_display_name} - Ratio 1:{ratio}")
        print(f"{'='*80}")
        print(comparison_table.to_string())
        print(f"{'='*80}")


RUNNING EXPERIMENT: Breast Cancer - Ratio 1:10
Dataset class distribution: {0: np.int64(6160), 1: np.int64(616)}
Minority class: 1 (616 samples)
Majority class: 0 (6160 samples)
Train shape: (5420, 29), Test shape: (1356, 29)
Train class distribution: [4927  493]
Test class distribution: [1233  123]

--- Running Baseline (No Augmentation) ---
Training LogisticRegression...
  LogisticRegression - F1: 0.5169, Precision: 0.8364, Recall: 0.374, AUC: 0.8684
Training RandomForest...
  RandomForest - F1: 0.6023, Precision: 1.0, Recall: 0.4309, AUC: 0.9258
Training XGBoost...
  XGBoost - F1: 0.6489, Precision: 0.9385, Recall: 0.4959, AUC: 0.8642

--- Running V1_Default ---

Applying SMOTE augmentation...
  Augmented training shape: (9854, 29)
  Augmented class distribution: [4927 4927]
  Training LogisticRegression...
    LogisticRegression - F1: 0.4099, Precision: 0.2835, Recall: 0.7398, AUC: 0.8603
  Training RandomForest...
    RandomForest - F1: 0.6893, Precision: 0.8554, Recall: 0.5772, 

## Step 6: Save Results

In [14]:
def save_results_to_excel(all_results):
    """
    Save results to Excel files with multiple sheets
    """
    # Create the 'results' directory if it doesn't exist
    os.makedirs('results', exist_ok=True)

    # Save individual dataset results
    for dataset_name, dataset_results in all_results.items():
        filename = f"results/{dataset_name}_results.xlsx"

        with pd.ExcelWriter(filename, engine='openpyxl') as writer:
            for ratio_name, results in dataset_results.items():
                ratio_num = ratio_name.split('_')[1]
                dataset_display = 'Breast Cancer' if dataset_name == 'breast_cancer' else 'Credit Card'

                comparison_table = create_comparison_table(results, dataset_display, int(ratio_num))
                comparison_table.to_excel(writer, sheet_name=f"Ratio_1_{ratio_num}")

        print(f"Saved: {filename}")

    # Save combined results
    with pd.ExcelWriter('results/all_results_summary.xlsx', engine='openpyxl') as writer:
        for dataset_name, dataset_results in all_results.items():
            for ratio_name, results in dataset_results.items():
                ratio_num = ratio_name.split('_')[1]
                dataset_display = 'Breast Cancer' if dataset_name == 'breast_cancer' else 'Credit Card'

                comparison_table = create_comparison_table(results, dataset_display, int(ratio_num))
                sheet_name = f"{dataset_name}_1_{ratio_num}"
                comparison_table.to_excel(writer, sheet_name=sheet_name)

    print("Saved: results/all_results_summary.xlsx")

def save_experiment_config():
    """
    Save experiment configuration to JSON
    """
    config = {
        'datasets': ['breast_cancer', 'creditcard'],
        'imbalance_ratios': [10, 100],
        'classifiers': {
            'LogisticRegression': {'max_iter': 1000, 'random_state': 42},
            'RandomForest': {'n_estimators': 100, 'random_state': 42},
            'XGBoost': {'n_estimators': 100, 'eval_metric': 'logloss', 'random_state': 42, 'use_label_encoder': False}
        },
        'augmentation_configs': get_augmentation_configs(),
        'train_test_split': {'test_size': 0.2, 'stratify': True, 'random_state': 42},
        'metrics': ['F1', 'Precision', 'Recall', 'AUC-ROC'],
        'pos_label': 1
    }

    with open('experiment_config.json', 'w') as f:
        json.dump(config, f, indent=2)

    print("Saved: experiment_config.json")

# Save all results
print("\n" + "=" * 60)
print("SAVING RESULTS")
print("=" * 60)

save_results_to_excel(all_results)
save_experiment_config()

print("\nAll results saved successfully!")


SAVING RESULTS
Saved: results/breast_cancer_results.xlsx
Saved: results/creditcard_results.xlsx
Saved: results/all_results_summary.xlsx
Saved: experiment_config.json

All results saved successfully!


## Download Results

In [19]:
from IPython.display import display


results_df = pd.DataFrame(results)
display(results_df)

Unnamed: 0,V1_Default,V2_Tuned
Baseline,"{'LogisticRegression': {'F1': 0.8667, 'Precisi...","{'LogisticRegression': {'F1': 0.8667, 'Precisi..."
SMOTE,"{'LogisticRegression': {'F1': 0.625, 'Precisio...","{'LogisticRegression': {'F1': 0.6818, 'Precisi..."
ADASYN,"{'LogisticRegression': {'F1': 0.4407, 'Precisi...","{'LogisticRegression': {'F1': 0.625, 'Precisio..."
CTGAN,"{'LogisticRegression': {'F1': 0.7692, 'Precisi...","{'LogisticRegression': {'F1': 0.7692, 'Precisi..."


**Dataset Summary:**
- **Breast Cancer**: 0=alive (majority), 1=dead (minority) - using `status` column
- **Credit Card**: 0=normal (majority), 1=fraud (minority) - using `Class` column