In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from imblearn.over_sampling import SMOTE

# Load the dataset
data = pd.read_csv('/content/Creditcard_data.csv')

# Check for class imbalance
print(data['Class'].value_counts())

# Balance the dataset using SMOTE
X = data.drop('Class', axis=1)
y = data['Class']

smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)

# Create a balanced dataset DataFrame
balanced_data = pd.DataFrame(X_balanced, columns=X.columns)
balanced_data['Class'] = y_balanced

# Define sampling fractions
sampling_fractions = [0.8, 0.6, 0.4, 0.3, 0.2]

# Generate samples
samples = [balanced_data.sample(frac=fraction, random_state=42) for fraction in sampling_fractions]

# Models to evaluate
models = {
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='rbf', random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=500, random_state=42)
}

# Evaluate R^2 scores for each sampling technique and model
results = []

for i, sample in enumerate(samples, start=1):
    X_sample = sample.drop('Class', axis=1)
    y_sample = sample['Class']

    # Split the sample into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.3, random_state=42)

    for model_name, model in models.items():
        # Train the model
        model.fit(X_train, y_train)

        # Make predictions
        y_pred = model.predict(X_test)

        # Calculate R^2 score
        r2 = r2_score(y_test, y_pred)
        results.append({
            'Sampling': f'Sampling{i}',
            'Model': model_name,
            'R2': r2
        })

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Print results
print(results_df)

# Determine the best sampling technique for each model
best_results = results_df.loc[results_df.groupby('Model')['R2'].idxmax()]
print("\nBest results for each model:")
print(best_results)


Class
0    763
1      9
Name: count, dtype: int64


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

     Sampling               Model        R2
0   Sampling1        RandomForest  0.989100
1   Sampling1                 SVM -0.329789
2   Sampling1  LogisticRegression  0.651203
3   Sampling2        RandomForest  0.985385
4   Sampling2                 SVM -0.432292
5   Sampling2  LogisticRegression  0.590774
6   Sampling3        RandomForest  0.956251
7   Sampling3                 SVM -0.531198
8   Sampling3  LogisticRegression  0.453144
9   Sampling4        RandomForest  0.883838
10  Sampling4                 SVM -0.277778
11  Sampling4  LogisticRegression  0.535354
12  Sampling5        RandomForest  0.869318
13  Sampling5                 SVM -0.263258
14  Sampling5  LogisticRegression  0.564394

Best results for each model:
     Sampling               Model        R2
2   Sampling1  LogisticRegression  0.651203
0   Sampling1        RandomForest  0.989100
13  Sampling5                 SVM -0.263258


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
