In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss, TomekLinks


*STEP 1: LOAD DATASET* 

In [2]:
data = pd.read_csv("Creditcard_data.csv")

print("Original Class Distribution:")
print(data['Class'].value_counts())

Original Class Distribution:
Class
0    763
1      9
Name: count, dtype: int64


*STEP 2: BALANCE DATASET USING SMOTE*

In [3]:
X = data.drop('Class', axis=1)
y = data['Class']

smote = SMOTE(random_state=42)
X_bal, y_bal = smote.fit_resample(X, y)

balanced_data = pd.concat([pd.DataFrame(X_bal), pd.DataFrame(y_bal, columns=['Class'])], axis=1)

print("\nBalanced Class Distribution:")
print(balanced_data['Class'].value_counts())


Balanced Class Distribution:
Class
0    763
1    763
Name: count, dtype: int64


*STEP 3: CREATE FIVE SAMPLES*

In [4]:
samples = []

for i in range(5):
    sample = balanced_data.sample(frac=0.2, random_state=i)
    samples.append(sample)

print("\nFive samples created.")


Five samples created.


*STEP 4: DEFINE SAMPLING TECHNIQUE*

In [5]:
sampling_methods = {
    "Sampling1": RandomUnderSampler(random_state=1),
    "Sampling2": RandomOverSampler(random_state=1),
    "Sampling3": SMOTE(random_state=1),
    "Sampling4": TomekLinks(),
    "Sampling5": NearMiss()
}

*STEP 5: DEFINE MACHINE LEARNING MODELS*

In [6]:
models = {
    "M1": LogisticRegression(max_iter=1000),
    "M2": DecisionTreeClassifier(),
    "M3": RandomForestClassifier(),
    "M4": SVC(),
    "M5": KNeighborsClassifier()
}

*STEP 6: TRAIN MODELS AND EVALUATE*

In [7]:
results = {}

for s_name, sampler in sampling_methods.items():

    results[s_name] = {}

    for m_name, model in models.items():

        # Using first sample for demonstration
        sample = samples[0]

        X_sample = sample.drop('Class', axis=1)
        y_sample = sample['Class']

        # Apply Sampling Technique
        try:
            X_res, y_res = sampler.fit_resample(X_sample, y_sample)
        except:
            X_res, y_res = X_sample, y_sample

        # Train-Test Split
        X_train, X_test, y_train, y_test = train_test_split(
            X_res, y_res, test_size=0.3, random_state=42
        )

        # Train Model
        model.fit(X_train, y_train)

        # Predict
        y_pred = model.predict(X_test)

        # Accuracy
        acc = accuracy_score(y_test, y_pred)

        results[s_name][m_name] = round(acc * 100, 2)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


*STEP 7: DISPLAY RESULTS*

In [8]:
result_df = pd.DataFrame(results)

print("\nFinal Accuracy Table:")
print(result_df)


Final Accuracy Table:
    Sampling1  Sampling2  Sampling3  Sampling4  Sampling5
M1      94.19      87.76      87.76      87.36      86.05
M2      94.19      93.88      93.88      88.51      95.35
M3     100.00      97.96      97.96      97.70      97.67
M4      66.28      51.02      52.04      60.92      59.30
M5      77.91      74.49      68.37      80.46      69.77


*STEP 8: SAVE RESULTS*

In [9]:
result_df.to_csv("sampling_results.csv")

print("\nResults saved to sampling_results.csv")


Results saved to sampling_results.csv
