In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('Creditcard_data.csv')
print(df.shape)

(772, 31)


In [3]:
print(df['Class'].value_counts())

Class
0    763
1      9
Name: count, dtype: int64


In [4]:
majority = df[df['Class'] == 0]
minority = df[df['Class'] == 1]

print(len(majority), len(minority))

763 9


Method used For Balancing the Dataset - Undersampling the majority class

In [None]:
X = df.drop('Class', axis=1)
y = df['Class']

In [None]:
ros = RandomOverSampler(random_state=42)
X_bal, y_bal = ros.fit_resample(X, y)

In [7]:
balanced_df = pd.concat([X_bal, y_bal], axis=1)
print(balanced_df['Class'].value_counts())

Class
0    763
1    763
Name: count, dtype: int64


Sampling 1: Simple Random Sampling

In [8]:
def simple_random_sampling(df, frac=0.7):
    return df.sample(frac=frac, random_state=42)

Sampling 2: Stratified Sampling

In [None]:
def stratified_sampling(df, target="Class", frac=0.7):
    return df.groupby(target, group_keys=False).apply(
        lambda x: x.sample(frac=frac, random_state=42)
    )

Sampling 3: Systematic Sampling

In [None]:
def systematic_sampling(df, step=2):
    return df.iloc[::step]

Sampling 4: Bootstrap Sampling

In [11]:
def bootstrap_sampling(df):
    return df.sample(frac=1, replace=True, random_state=42)

Sampling 5: Cluster Sampling

In [None]:
def cluster_sampling(df, cluster_col="Time"):
    df["cluster"] = pd.qcut(df[cluster_col], q=5, labels=False)
    chosen_cluster = np.random.choice(df["cluster"].unique())
    return df[df["cluster"] == chosen_cluster].drop("cluster", axis=1)

In [13]:
sampling_methods = {
    "SimpleRandom": simple_random_sampling(balanced_df),
    "Systematic": systematic_sampling(balanced_df),
    "Stratified": stratified_sampling(balanced_df),
    "Cluster": cluster_sampling(balanced_df),
    "Bootstrap": bootstrap_sampling(balanced_df)
}

  return df.groupby(target, group_keys=False).apply(


In [None]:
models = {
    "Gaussian": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "SVC": SVC()
}

In [None]:
results = {}

for samp_name, samp_df in sampling_methods.items():
    X_s = samp_df.drop("Class", axis=1)
    y_s = samp_df["Class"]

    X_train, X_test, y_train, y_test = train_test_split(
        X_s, y_s, test_size=0.3, stratify=y_s, random_state=42
    )

    results[samp_name] = {}

    for model_name, model in models.items():
        pipe = Pipeline([
            ("scaler", StandardScaler()),
            ("model", model)
        ])

        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        results[samp_name][model_name] = round(acc * 100, 2)


results_df = pd.DataFrame(results).T
print("\nAccuracy Table (%):\n")
print(results_df)


Accuracy Table (%):

              Gaussian  Decision Tree  Random Forest    KNN     SVC
SimpleRandom     77.26          99.38          100.0  97.51   97.82
Systematic       80.79          98.25          100.0  96.51   96.94
Stratified       79.75          97.82          100.0  95.64   97.51
Cluster         100.00          98.70          100.0  98.70  100.00
Bootstrap        85.81          99.56          100.0  98.69   98.69


In [16]:
results_df.to_csv("result.csv")
print("Results saved to result.csv")

Results saved to result.csv
