In [1]:
pip install pandas numpy scikit-learn imbalanced-learn



In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from imblearn.over_sampling import SMOTE

In [4]:
df = pd.read_csv("Creditcard_data.csv")

X = df.drop("Class", axis=1)
y = df["Class"]

In [5]:
df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
767,575,-0.572263,0.731748,1.541254,0.150506,1.108974,0.372152,1.084879,-0.146329,-0.274447,...,-0.143508,-0.107582,-0.418263,-0.731029,0.877525,-0.364150,-0.177509,-0.256545,26.72,0
768,579,-1.296845,-0.511605,2.404726,-0.310762,-0.319551,-0.542842,-0.173310,0.260423,-1.202688,...,-0.071270,-0.161175,0.088496,0.285390,0.281069,-0.370130,0.043410,0.092318,80.00,0
769,579,1.214170,0.210481,0.484651,0.479768,-0.261955,-0.527039,0.021782,-0.106888,-0.037631,...,-0.224292,-0.594609,0.159877,0.091873,0.140964,0.227406,-0.017389,0.016030,5.98,0
770,580,1.267030,-0.071114,0.037680,0.512683,0.242392,0.705212,-0.226582,0.109483,0.657565,...,-0.164468,-0.177225,-0.222918,-1.245505,0.678360,0.525059,0.002920,-0.003333,12.36,0


In [6]:
smote = SMOTE(random_state=42)
X_bal, y_bal = smote.fit_resample(X, y)

balanced_df = pd.concat([X_bal, y_bal], axis=1)

In [7]:
def simple_random_sampling(df, frac=0.7):
    return df.sample(frac=frac, random_state=42)

In [8]:
def stratified_sampling(X, y, test_size=0.3):
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=42
    )
    return pd.concat([X_train, y_train], axis=1)

In [9]:
def systematic_sampling(df, k=10):
    return df.iloc[::k, :]

In [10]:
from sklearn.cluster import KMeans

def cluster_sampling(df, n_clusters=5):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    df["cluster"] = kmeans.fit_predict(df.drop("Class", axis=1))

    selected_clusters = np.random.choice(df["cluster"].unique(), size=2, replace=False)
    sampled_df = df[df["cluster"].isin(selected_clusters)]

    return sampled_df.drop("cluster", axis=1)

In [11]:
def bootstrap_sampling(df):
    return df.sample(n=len(df), replace=True, random_state=42)

In [12]:
samples = {
    "Simple Random": simple_random_sampling(balanced_df),
    "Stratified": stratified_sampling(X_bal, y_bal),
    "Systematic": systematic_sampling(balanced_df),
    "Cluster": cluster_sampling(balanced_df.copy()),
    "Bootstrap": bootstrap_sampling(balanced_df)
}

In [13]:
models = {
    "M1_Logistic": LogisticRegression(max_iter=1000),
    "M2_DecisionTree": DecisionTreeClassifier(),
    "M3_RandomForest": RandomForestClassifier(),
    "M4_KNN": KNeighborsClassifier(),
    "M5_SVM": SVC()
}

In [14]:
results = pd.DataFrame(index=models.keys(), columns=samples.keys())

for samp_name, samp_df in samples.items():
    X_s = samp_df.drop("Class", axis=1)
    y_s = samp_df["Class"]

    X_train, X_test, y_train, y_test = train_test_split(
        X_s, y_s, test_size=0.3, random_state=42
    )

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)

        results.loc[model_name, samp_name] = round(acc * 100, 2)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [15]:
print("Accuracy Comparison Table (%)")
print(results)

Accuracy Comparison Table (%)
                Simple Random Stratified Systematic Cluster Bootstrap
M1_Logistic              89.1      91.59      80.43    95.2     94.54
M2_DecisionTree         96.88      96.26      93.48    97.6     99.78
M3_RandomForest         99.38      100.0      93.48    99.2     100.0
M4_KNN                  82.87       81.0       58.7    83.2     89.52
M5_SVM                  64.17      66.67      71.74    64.0     69.21
