In [1]:
import pandas as pd
from imblearn.combine import SMOTETomek
from collections import Counter

# Read the cleaned dataset from disk.
data = pd.read_csv("data_cln.csv")

# Everything except the 'Outcome' column is a feature; 'Outcome' is the label.
X = data.drop(columns='Outcome')
y = data.loc[:, 'Outcome']

# Ratios consumed by the resampling loop: for each ratio the class-0 target
# size is computed as (class-1 count / ratio).
ratios = [
    0.01,
    0.05,
    0.10,
    0.20,
    0.30,
    0.40,
    0.50,
]

# Number of rows labelled 1 (0 if that label never occurs); the code below
# treats label 1 as the minority class.
label_counts = Counter(y)
n_minority_samples = label_counts.get(1, 0)

# Resample and save one dataset per target ratio.
#
# For each ratio, class 0 is requested at about (n_minority_samples / ratio)
# rows while class 1 is requested at its current size. The recorded output
# shows that the final class counts can come out slightly below the requested
# ones, so the saved files only approximate the nominal ratio.
#
# NOTE(review): SMOTE only adds rows. If a requested class-0 count falls below
# the class-0 count already present in `y`, fit_resample is expected to raise
# ValueError -- confirm the ratios stay valid when the input data changes.
for ratio in ratios:
    # Whole-number percentage used in the file name and the log lines.
    # round() rather than int(): ratio * 100 is a float product
    # (0.29 * 100 == 28.999999999999996), and truncating would label that
    # ratio as 28 and could make two ratios share one output file.
    ratio_pct = round(ratio * 100)

    # Class-0 row count that gives class-1 / class-0 == ratio, rounded to the
    # nearest integer so float error cannot drop the target by one row.
    n_majority_samples = round(n_minority_samples / ratio)

    # Requested absolute row count per class label.
    sampling_strategy = {0: n_majority_samples, 1: n_minority_samples}

    # Apply SMOTETomek with a fixed seed so reruns give the same datasets.
    smote_tomek = SMOTETomek(sampling_strategy=sampling_strategy, random_state=42)
    X_resampled, y_resampled = smote_tomek.fit_resample(X, y)

    # Put features and label back into a single frame with the original
    # column names.
    data_resampled = pd.concat(
        [
            pd.DataFrame(X_resampled, columns=X.columns),
            pd.DataFrame(y_resampled, columns=['Outcome']),
        ],
        axis=1,
    )

    # Save to CSV
    file_name = f"data_resampled_{ratio_pct}.csv"
    data_resampled.to_csv(file_name, index=False)
    print(f"Saved resampled data with {ratio_pct}% imbalance to {file_name}")

    # Report the class counts actually obtained after resampling.
    class_distribution = Counter(y_resampled)
    print(f"Class distribution for {ratio_pct}% imbalance: {class_distribution}")


Saved resampled data with 1% imbalance to data_resampled_1.csv
Class distribution for 1% imbalance: Counter({0: 25199, 1: 251})
Saved resampled data with 5% imbalance to data_resampled_5.csv
Class distribution for 5% imbalance: Counter({0: 5037, 1: 249})
Saved resampled data with 10% imbalance to data_resampled_10.csv
Class distribution for 10% imbalance: Counter({0: 2516, 1: 248})
Saved resampled data with 20% imbalance to data_resampled_20.csv
Class distribution for 20% imbalance: Counter({0: 1259, 1: 251})
Saved resampled data with 30% imbalance to data_resampled_30.csv
Class distribution for 30% imbalance: Counter({0: 832, 1: 244})
Saved resampled data with 40% imbalance to data_resampled_40.csv
Class distribution for 40% imbalance: Counter({0: 611, 1: 233})
Saved resampled data with 50% imbalance to data_resampled_50.csv
Class distribution for 50% imbalance: Counter({0: 478, 1: 226})
