In [13]:


import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split

# Load the dataset
url = "/content/Creditcard_data.csv"
data = pd.read_csv(url)

# Check class imbalance
print(data['Class'].value_counts())

# Separate features and labels
X = data.drop(columns=['Class'])
y = data['Class']

# Apply SMOTE (Oversampling)
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)

# Verify balanced dataset
print(y_balanced.value_counts())


Class
0    763
1      9
Name: count, dtype: int64
Class
0    763
1    763
Name: count, dtype: int64


In [7]:
# Example of creating samples with different sizes
sample_sizes = [0.2, 0.4, 0.6, 0.8, None]  # Using None for 100% of the data
samples = []

for size in sample_sizes:
    if size is None:
        # Use the entire dataset
        samples.append((X_balanced, y_balanced))
    else:
        # Split dataset
        sample_X, _, sample_y, _ = train_test_split(X_balanced, y_balanced, train_size=size, random_state=42)
        samples.append((sample_X, sample_y))


In [8]:
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Initialize sampling techniques
sampling_methods = {
    "RandomOversampling": RandomOverSampler(),
    "SMOTE": SMOTE(),
    "RandomUndersampling": RandomUnderSampler(),
    "NearMiss": NearMiss(),
}

# Apply sampling techniques
sampled_data = {}

for method, sampler in sampling_methods.items():
    X_sampled, y_sampled = sampler.fit_resample(X_balanced, y_balanced)
    sampled_data[method] = (X_sampled, y_sampled)


In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

models = {
    "M1": RandomForestClassifier(),
    # Add other models like SVM, Logistic Regression, etc., for M2, M3, M4, M5
}

# Train and evaluate models on sampled data
results = {}

for model_name, model in models.items():
    for method, (X_sampled, y_sampled) in sampled_data.items():
        # Split sampled data into train and test sets
        X_train, X_test, y_train, y_test = train_test_split(X_sampled, y_sampled, test_size=0.2, random_state=42)

        # Train the model
        model.fit(X_train, y_train)

        # Evaluate accuracy
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)

        # Store results
        results[(model_name, method)] = acc


In [10]:
# Find the best sampling technique for each model
best_results = {}

for model_name in models.keys():
    best_method = max(
        [(method, acc) for (m, method), acc in results.items() if m == model_name],
        key=lambda x: x[1]
    )
    best_results[model_name] = best_method

# Display best results
for model, (method, acc) in best_results.items():
    print(f"Model {model}: Best Sampling Technique = {method}, Accuracy = {acc:.2f}")


Model M1: Best Sampling Technique = RandomUndersampling, Accuracy = 1.00
