In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier


from sklearn.utils import resample


In [2]:
data = pd.read_csv("Creditcard_data.csv")

X = data.drop(columns=["Class"])
y = data["Class"]


In [4]:
from imblearn.over_sampling import RandomOverSampler

yayy = RandomOverSampler(random_state=42)
X_bal, y_bal = yayy.fit_resample(X, y)

balanced_data = pd.concat([X_bal, y_bal], axis=1)


In [5]:
srs = balanced_data.sample(frac=0.5, random_state=42)
X_srs = srs.drop(columns=["Class"])
y_srs = srs["Class"]
#doing it with srs

In [13]:
from sklearn.model_selection import StratifiedShuffleSplit #statified

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)

for train_idx, _ in sss.split(X_bal, y_bal):
    X_strat = X_bal.iloc[train_idx]
    y_strat = y_bal.iloc[train_idx]


In [11]:
balanced_data["cluster"] = pd.qcut(balanced_data.index, q=5, labels=False) #cluster sampling

sel= np.random.choice(balanced_data["cluster"].unique(), size=3, replace=False)
cluster = balanced_data[balanced_data["cluster"].isin(sel)]

X_cluster = cluster.drop(columns=["Class", "cluster"])
y_cluster = cluster["Class"]


In [12]:
k = 5
systematic_sample = balanced_data.iloc[::k]

X_sys = systematic_sample.drop(columns=["Class"])
y_sys = systematic_sample["Class"]
#systematic sampling

In [10]:
boot = resample(balanced_data, replace=True, n_samples=len(balanced_data), random_state=42)

X_boot = boot.drop(columns=["Class"])
y_boot = boot["Class"]
#bootstrap sampling

In [22]:
models = {
    "M1": LogisticRegression(max_iter=10000),
    "M2": DecisionTreeClassifier(),
    "M3": RandomForestClassifier(),
    "M4": GaussianNB(),
    "M5": KNeighborsClassifier(n_neighbors=5)

}


In [23]:
samples = {
    "sample1": (X_srs, y_srs),
    "sample2": (X_strat, y_strat),
    "sample3": (X_cluster, y_cluster),
    "sample4": (X_sys, y_sys),
    "sample5": (X_boot, y_boot)
}

results = pd.DataFrame(index=models.keys(), columns=samples.keys())

for samp_name, (Xs, ys) in samples.items():
    X_train, X_test, y_train, y_test = train_test_split(
        Xs, ys, test_size=0.3, random_state=42
    )

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred) * 100
        results.loc[model_name, samp_name] = round(acc, 2)


In [24]:
print(results)


   sample1 sample2 sample3 sample4 sample5
M1   87.77   90.83   86.55   93.48   99.13
M2   98.69   97.38   99.27   97.83   99.56
M3   100.0   100.0   100.0   97.83   99.56
M4   77.29   75.11   86.55   90.22   92.79
M5   97.38    95.2   98.55   89.13   99.34


| Model                     | Best Sample                   | Accuracy   |
| ------------------------- | ----------------------------- | ---------- |
| M1   Logistic Regression  | **sample5**                   | **99.13%** |
| M2   Decision Tree        | **sample5**                   | **99.56%** |
| M3   Random Forest.       | **sample1, sample2, sample3** | **100%**   |
| M4   Naive Bayes          | **sample5**                   | **92.79%** |
| M5   KNN                  | **sample5**                   | **99.34%** |
