# In [None]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, r2_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# Name of the label column in the CSV.
TARGET_COLUMN = 'Class'

# Read the raw data from disk.
dataset = pd.read_csv('/Creditcard_data.csv')

# Pull the labels out first so the class balance can be reported before any
# resampling takes place.
y = dataset[TARGET_COLUMN]
class_counts = y.value_counts()
print(f"Class distribution before balancing:\n{class_counts}\n")

# Every column other than the label is treated as a feature.
X = dataset.drop(TARGET_COLUMN, axis='columns')

# Oversample with SMOTE (seeded for reproducibility) to address the imbalance.
oversampler = SMOTE(random_state=101)
X_resampled, y_resampled = oversampler.fit_resample(X, y)

# Recombine the resampled features and labels into a single frame so that
# subsets can be drawn from it row-wise.
balanced_data = pd.DataFrame(X_resampled, columns=X.columns).assign(
    **{TARGET_COLUMN: y_resampled}
)

# Seed shared by the subset sampling and the seeded classifiers below.
SEED = 101

# Fractions of the balanced data to draw, largest subset first.
subset_ratios = [0.9, 0.7, 0.5, 0.3, 0.1]


def _draw_subset(fraction):
    """Return a seeded random sample holding *fraction* of the balanced rows."""
    return balanced_data.sample(frac=fraction, random_state=SEED)


# One sampled frame per requested fraction, in the same order as subset_ratios.
sampled_datasets = [_draw_subset(fraction) for fraction in subset_ratios]

# Classifiers under comparison, keyed by the label reported in the results.
models_to_test = {
    'GradientBoosting': GradientBoostingClassifier(random_state=SEED),
    'NaiveBayes': GaussianNB(),
    'DecisionTree': DecisionTreeClassifier(random_state=SEED),
}

# Collect one result row per (subset, model) pair.
model_performance = []

# Evaluate each model on each subset.
# NOTE(review): balanced_data was oversampled with SMOTE before this split, so
# synthetic rows interpolated from rows that land in the test fold may sit in
# the training fold; the scores below are likely optimistic. Resampling only
# the training fold would avoid that -- confirm whether that is wanted here.
for sample_idx, sampled_data in enumerate(sampled_datasets, start=1):
    X_sample = sampled_data.drop(columns=['Class'])
    y_sample = sampled_data['Class']

    # Hold out 25% of the subset for testing; the fixed seed keeps the split
    # repeatable between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X_sample, y_sample, test_size=0.25, random_state=101
    )

    for model_name, model in models_to_test.items():
        # Train on the training fold, then predict labels for the held-out fold.
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        # 'Class' is a categorical label, so the fraction of correct
        # predictions is the meaningful score. R^2 is a regression metric; it
        # is still recorded so that anything reading the 'R2_Score' column
        # keeps working, but it is no longer used for ranking.
        accuracy_value = accuracy_score(y_test, predictions)
        r2_value = r2_score(y_test, predictions)

        model_performance.append({
            'Subset': f'Subset{sample_idx}',
            'Model': model_name,
            'R2_Score': r2_value,
            'Accuracy': accuracy_value,
        })

# Convert results into a DataFrame for analysis
performance_df = pd.DataFrame(model_performance)

# Print the evaluation results
print("Model Evaluation Results:\n")
print(performance_df)

# For each model, keep the row of the subset on which it classified best
# (highest accuracy).
best_configurations = performance_df.loc[performance_df.groupby('Model')['Accuracy'].idxmax()]
print("\nBest Configurations by Model:\n")
print(best_configurations)
