In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler
import math

df = pd.read_csv('Creditcard_data.csv')


X = df.drop('Class', axis=1)
y = df['Class']
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)
balanced_df = pd.concat([X_resampled, y_resampled], axis=1)

# Formula for Sample Size (n) based on Z-score, Margin of Error (e), and Proportion (p)
# n = (Z^2 * p * (1-p)) / e^2
Z = 1.96 # 95% Confidence Level
p = 0.5
e = 0.05
sample_size = math.ceil((Z**2 * p * (1-p)) / (e**2))



# Sample 1: Simple Random Sampling
s1 = balanced_df.sample(n=sample_size, random_state=1)

# Sample 2: Systematic Sampling
k = len(balanced_df) // sample_size
s2 = balanced_df.iloc[::k][:sample_size]

# Sample 3: Stratified Sampling
# We use 'Class' as the strata
s3 = balanced_df.groupby('Class', group_keys=False).apply(lambda x: x.sample(sample_size//2, random_state=3))

# Sample 4: Cluster Sampling
cluster_size = 10
balanced_df['Cluster'] = np.arange(len(balanced_df)) // cluster_size
selected_clusters = np.random.choice(balanced_df['Cluster'].unique(), size=sample_size // cluster_size, replace=False)
s4 = balanced_df[balanced_df['Cluster'].isin(selected_clusters)].drop('Cluster', axis=1)
balanced_df.drop('Cluster', axis=1, inplace=True)

# Sample 5: Bootstrap Sampling (Sampling with replacement)
s5 = balanced_df.sample(n=sample_size, replace=True, random_state=5)

samples = [s1, s2, s3, s4, s5]

models = {
    "M1 (Logistic Regression)": LogisticRegression(max_iter=1000),
    "M2 (Decision Tree)": DecisionTreeClassifier(),
    "M3 (Random Forest)": RandomForestClassifier(),
    "M4 (SVM)": SVC(),
    "M5 (KNN)": KNeighborsClassifier()
}

results = {}

for i, sample in enumerate(samples, 1):
    sample_accuracies = []
    X_s = sample.drop('Class', axis=1)
    y_s = sample['Class']

    X_train, X_test, y_train, y_test = train_test_split(X_s, y_s, test_size=0.2, random_state=42)

    for name, model in models.items():
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        acc = accuracy_score(y_test, pred) * 100
        sample_accuracies.append(round(acc, 2))

    results[f"Sampling{i}"] = sample_accuracies

final_table = pd.DataFrame(results, index=models.keys())
print(final_table)

  s3 = balanced_df.groupby('Class', group_keys=False).apply(lambda x: x.sample(sample_size//2, random_state=3))
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scik

                          Sampling1  Sampling2  Sampling3  Sampling4  \
M1 (Logistic Regression)      93.51      89.61      89.61      85.53   
M2 (Decision Tree)            98.70      97.40      92.21     100.00   
M3 (Random Forest)           100.00     100.00      98.70      98.68   
M4 (SVM)                      67.53      75.32      64.94      71.05   
M5 (KNN)                      94.81      93.51      87.01      97.37   

                          Sampling5  
M1 (Logistic Regression)      93.51  
M2 (Decision Tree)            98.70  
M3 (Random Forest)           100.00  
M4 (SVM)                      72.73  
M5 (KNN)                      93.51  
