In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from imblearn.over_sampling import SMOTE

# Source CSV for the credit-card data set; the label column is 'Class'.
url = "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"
df = pd.read_csv(filepath_or_buffer=url)

# Show how many rows each class has before any resampling is applied.
print("Original class distribution:", df['Class'].value_counts(), sep="\n")












Original class distribution:
Class
0    763
1      9
Name: count, dtype: int64


In [None]:
# 2. Oversampling technique: SMOTE
#
# NOTE(review): the scaler and SMOTE are both fitted on the full data set here,
# and the later cells cross-validate on samples drawn from `balanced_df`.
# Validation folds can therefore contain synthetic points interpolated from
# training rows, so the reported accuracies are likely optimistic. Consider
# moving scaling + SMOTE into an imblearn Pipeline evaluated inside the CV loop.

X = df.drop("Class", axis=1)
y = df["Class"]

# Standardise the features before resampling.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

smote = SMOTE(random_state=42)
X_balanced, Y_balanced = smote.fit_resample(X_scaled, y)

# fit_transform returns a plain ndarray, so the feature names are re-attached
# here; without `columns=` the balanced frame would get anonymous 0..k-1 labels.
balanced_df = pd.DataFrame(X_balanced, columns=X.columns)
balanced_df['Class'] = Y_balanced

print("\nBalanced class distribution:")
print(balanced_df['Class'].value_counts())


Balanced class distribution:
Class
0    763
1    763
Name: count, dtype: int64


In [None]:
# 3. Create Sampling Techniques


def simple_random_sampling(df, n):
    """Return n rows of df drawn uniformly at random without replacement.

    The draw uses a fixed seed (42), so repeated calls on the same frame
    return the same rows.
    """
    drawn = df.sample(n=n, random_state=42)
    return drawn

def systematic_sampling(df, k=5):
    """Return every k-th row of df, starting with the first row.

    Rows are selected by position, so the result keeps the original order
    and the original index labels.
    """
    every_kth = slice(None, None, k)
    return df.iloc[every_kth]

def stratified_sampling(df, n):
    """Return a sample of df of size n, stratified on the 'Class' column.

    A single StratifiedShuffleSplit (seed 42) is run with train_size=n and
    the rows of its training side are returned.
    """
    from sklearn.model_selection import StratifiedShuffleSplit

    splitter = StratifiedShuffleSplit(n_splits=1, train_size=n, random_state=42)
    # n_splits=1, so the generator yields exactly one (train, test) index pair.
    kept_positions, _ = next(splitter.split(df, df['Class']))
    return df.iloc[kept_positions]

def cluster_sampling(df, n_clusters=5, random_state=42):
    """Return the rows of one randomly chosen cluster of df.

    Rows are assigned at random to `n_clusters` groups whose sizes differ by
    at most one row, and one non-empty group is then picked at random.

    Clusters are deliberately NOT contiguous index ranges: when the frame is
    ordered by class (e.g. resampled data with the synthetic minority rows
    appended at the end), contiguous blocks can contain a single class, which
    makes the sample unusable for classification.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from. It is not modified. Any index is accepted,
        including non-numeric or duplicated labels.
    n_clusters : int, default 5
        Number of clusters to partition the rows into.
    random_state : int or None, default 42
        Seed for both the cluster assignment and the cluster choice. The
        default matches the fixed seed used by the other samplers; pass
        None for a different draw on every call.

    Returns
    -------
    pandas.DataFrame
        Rows of the chosen cluster, in their original order and with the
        same columns as `df`.

    Raises
    ------
    ValueError
        If `n_clusters` is smaller than 1.
    """
    if n_clusters < 1:
        raise ValueError("n_clusters must be at least 1")
    if len(df) == 0:
        return df.copy()

    # Local generator: reproducible, and leaves the global NumPy RNG untouched.
    rng = np.random.default_rng(random_state)

    # A random permutation of row positions taken modulo n_clusters gives a
    # random assignment with balanced cluster sizes.
    cluster_ids = rng.permutation(len(df)) % n_clusters

    # Choose among clusters that actually received rows (matters only when
    # n_clusters exceeds the number of rows).
    chosen_cluster = rng.choice(np.unique(cluster_ids))
    return df.iloc[cluster_ids == chosen_cluster]

def bootstrap_sampling(df, n):
    """Return n rows of df drawn at random with replacement.

    Because rows are replaced after each draw, the same row may appear more
    than once and n may exceed len(df). The seed is fixed at 42.
    """
    resampled = df.sample(n=n, replace=True, random_state=42)
    return resampled


In [None]:
# Draw one sample per technique from the balanced data.
# sample_size applies to the simple-random, stratified and bootstrap samples;
# the systematic and cluster samples are sized by their own default arguments.
sample_size = 1000

S1, S2, S3, S4, S5 = samples = [
    simple_random_sampling(balanced_df, sample_size),
    systematic_sampling(balanced_df),
    stratified_sampling(balanced_df, sample_size),
    cluster_sampling(balanced_df),
    bootstrap_sampling(balanced_df, sample_size),
]


In [None]:
# The five classifiers compared on every sample.
# The tree-based models are randomised, so they get a fixed seed; without it
# the accuracy table changes from one run of the notebook to the next.
models = {
    "M1_Logistic": LogisticRegression(max_iter=1000),
    "M2_DecisionTree": DecisionTreeClassifier(random_state=42),
    "M3_RandomForest": RandomForestClassifier(random_state=42),
    "M4_KNN": KNeighborsClassifier(),
    "M5_SVM": SVC()
}

In [None]:
# Cross-validation: mean 5-fold accuracy (in %) of every model on every sample.
# Rows are model names, columns are "Sampling1".."SamplingN" in the order of
# `samples`. The frame starts out filled with NaN, so a skipped sample simply
# keeps NaN in its column.
sampling_columns = [f"Sampling{pos + 1}" for pos in range(len(samples))]
results = pd.DataFrame(index=list(models), columns=sampling_columns, dtype=float)

for column, sample in zip(sampling_columns, samples):
    X_s = sample.drop("Class", axis=1)
    y_s = sample["Class"]

    # A classifier cannot be trained or scored on a single-class target, so
    # such a sample is reported and left as NaN.
    if y_s.nunique() < 2:
        print(f"Warning: {column} (created by a sampling method) contains only one class. Skipping cross-validation for this sample.")
        continue

    for model_name, model in models.items():
        scores = cross_val_score(model, X_s, y_s, cv=5, scoring="accuracy")
        # Address the cell by label instead of looking up positions.
        results.loc[model_name, column] = round(scores.mean() * 100, 2)



In [None]:
# Show the accuracy table, then for each model the sampling column with the
# highest accuracy (NaN columns are ignored by idxmax).
print("\nFinal Accuracy Table:", results, sep="\n")

print(
    "\nBest sampling technique per model:",
    results.astype(float).idxmax(axis=1),
    sep="\n",
)


Final Accuracy Table:
                Sampling1 Sampling2 Sampling3 Sampling4 Sampling5
M1_Logistic          91.2      85.3      92.2       NaN      94.2
M2_DecisionTree      97.6     86.91      98.4       NaN      98.1
M3_RandomForest      99.6     98.04     100.0       NaN      99.3
M4_KNN               92.4     78.11      92.6       NaN      92.9
M5_SVM               97.7     95.09      97.8       NaN      98.8

Best sampling technique per model:
M1_Logistic        Sampling5
M2_DecisionTree    Sampling3
M3_RandomForest    Sampling3
M4_KNN             Sampling5
M5_SVM             Sampling5
dtype: object
