**Assignment**

1. Import data

In [18]:
import pandas as pd

url = "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"
data = pd.read_csv(url)

print(data['Class'].value_counts())
df = data.copy()

Class
0    763
1      9
Name: count, dtype: int64


2. Divide into X and Y

In [19]:
x = data.drop('Class', axis=1)
y = data['Class']

3. SMOTE resampling

In [20]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
x_balanced, y_balanced = smote.fit_resample(x,y)

In [21]:
print(y_balanced.value_counts())

Class
0    763
1    763
Name: count, dtype: int64


4. Train Test Split

In [22]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_balanced, y_balanced, test_size=0.2, random_state=42, stratify=y_balanced)

5. Make one dataframe

In [23]:
import numpy as np
df = x_balanced.copy()
df['Class'] = y_balanced

6. Apply different sampling methods

Simple Random Sampling

In [24]:
sampling1 = df.sample(frac=0.5,random_state=42)

Systematic Sampling

In [25]:
k = 2
sampling2 = df.iloc[::k]

Stratified Sampling

In [26]:
x_s,_,y_s,_ = train_test_split(x,y, test_size=0.5, random_state=42, stratify=y)
sampling3 = x_s.copy()
sampling3['Class'] = y_s

Cluster Sampling


In [27]:
df['Cluster'] = df.index % 5
sampling4 = df[df['Cluster'] == 0].drop('Cluster', axis=1)

Bootstrap Sampling

In [28]:
sampling5 = df.sample(frac = 1, replace=True,random_state=42)

7. Model Selection

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

models = {
    "Model 1_LogisticRegression": LogisticRegression(max_iter=2000),
    "Model 2_DecisionTree": DecisionTreeClassifier(),
    "Model 3_RandomForest": RandomForestClassifier(),
    "Model 4_KNN": KNeighborsClassifier(),
    "Model 5_SVM": SVC()
}
samples = {
    "Sampling1": sampling1,
    "Sampling2": sampling2,
    "Sampling3": sampling3,
    "Sampling4": sampling4,
    "Sampling5": sampling5
}

8. Training on sampled data

In [30]:
from sklearn.metrics import accuracy_score

performance_table = {}

for sampling_label, data_part in samples.items():

    features = data_part.drop(columns=['Class'])
    target = data_part['Class']

    # skip if only one class present after sampling
    if target.nunique() < 2:
        print(f"Skipping {sampling_label}: only one class present in sample")
        performance_table[sampling_label] = {label: None for label in models.keys()}
        continue

    X_tr, X_te, y_tr, y_te = train_test_split(
        features,
        target,
        test_size=0.3,
        random_state=42,
        stratify=target
    )

    model_scores = {}

    for model_label, clf in models.items():
        clf.fit(X_tr, y_tr)

        predictions = clf.predict(X_te)

        score = accuracy_score(y_te, predictions)    
        performance_table[sampling_label] = model_scores


        model_scores[model_label] = score

9. Results

In [31]:
final_results = pd.DataFrame(performance_table).T
final_results

Unnamed: 0,Model 1_LogisticRegression,Model 2_DecisionTree,Model 3_RandomForest,Model 4_KNN,Model 5_SVM
Sampling1,0.912664,0.951965,1.0,0.816594,0.668122
Sampling2,0.868996,0.943231,0.978166,0.838428,0.637555
Sampling3,0.982759,0.974138,0.982759,0.982759,0.982759
Sampling4,0.902174,0.902174,0.978261,0.73913,0.652174
Sampling5,0.958515,0.99345,0.997817,0.884279,0.71179
