# **Sampling**

# Importing libraries

In [3]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load dataset

In [5]:
df = pd.read_csv('Creditcard_data.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


# Convert dataset into balanced class dataset by Oversampling minority class

In [6]:
df_majority = df[df.Class == 0]
df_minority = df[df.Class == 1]

df_minority_upsampled = resample(df_minority,
                                 replace=True,
                                 n_samples=len(df_majority),
                                 random_state=42)

df_balanced = pd.concat([df_majority, df_minority_upsampled])

# Feature engineering

In [7]:
# Feature Scaling
# Logistic Regression (M1) and SVC (M4) require scaled data for convergence
scaler = StandardScaler()
X_full = df_balanced.drop('Class', axis=1)
y_full = df_balanced['Class']
X_scaled = scaler.fit_transform(X_full)

# Sampling data

## **Calculate sample size**

In [9]:
df_balanced_scaled = pd.DataFrame(X_scaled, columns=X_full.columns)
df_balanced_scaled['Class'] = y_full.values

# Sample Size
# n = (Z^2 * p * (1-p)) / (E^2)
Z = 1.96  # 95% confidence
p = 0.5   # Expected proportion
E = 0.05  # Margin of error
sample_size = int(np.ceil((Z**2 * p * (1-p)) / (E**2)))

## **Sampling techniques**

In [10]:
def get_samples(df, n):
    samples = {}

    # Sampling 1: Simple Random Sampling
    samples['Sampling1'] = df.sample(n=n, random_state=1)

    # Sampling 2: Systematic Sampling
    k = len(df) // n
    samples['Sampling2'] = df.iloc[::k][:n]

    # Sampling 3: Stratified Sampling (Manually ensuring class balance)
    s3_0 = df[df.Class == 0].sample(n // 2, random_state=3)
    s3_1 = df[df.Class == 1].sample(n // 2, random_state=3)
    samples['Sampling3'] = pd.concat([s3_0, s3_1])

    # Sampling 4: Cluster Sampling
    num_clusters = 20
    df_cluster = df.copy()
    df_cluster['cluster'] = np.repeat(np.arange(num_clusters), len(df_cluster)//num_clusters + 1)[:len(df_cluster)]
    np.random.seed(42)
    selected_clusters = np.random.choice(np.arange(num_clusters), size=num_clusters//2, replace=False)
    cluster_sample = df_cluster[df_cluster['cluster'].isin(selected_clusters)]
    samples['Sampling4'] = cluster_sample.sample(n=n, random_state=4)

    # Sampling 5: Bootstrap Sampling (Random sampling with replacement)
    samples['Sampling5'] = df.sample(n=n, replace=True, random_state=5)

    return samples

all_samples = get_samples(df_balanced_scaled, sample_size)

# ML Models

In [11]:
models = {
    'M1': LogisticRegression(max_iter=2000, random_state=42),
    'M2': RandomForestClassifier(random_state=42),
    'M3': DecisionTreeClassifier(random_state=42),
    'M4': SVC(random_state=42),
    'M5': KNeighborsClassifier()
}

# Model evaluation

In [12]:
results = {}

for s_name, s_data in all_samples.items():
    # Split features and target
    X = s_data.drop(['Class', 'cluster'] if 'cluster' in s_data.columns else ['Class'], axis=1)
    y = s_data['Class']

    # Split into Train/Test for internal validation of the sample
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    sample_accuracies = {}
    for m_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        sample_accuracies[m_name] = round(acc * 100, 2)

    results[s_name] = sample_accuracies

# Output

In [13]:
final_results_df = pd.DataFrame(results)
print("Accuracy Results Matrix (%):")
print(final_results_df)

best_tech = final_results_df.idxmax(axis=1)
print("\nBest Sampling Technique per Model:")
print(best_tech)

Accuracy Results Matrix (%):
    Sampling1  Sampling2  Sampling3  Sampling4  Sampling5
M1      89.61      85.71      88.31      89.61      96.10
M2     100.00     100.00      98.70     100.00     100.00
M3     100.00     100.00      87.01      94.81      94.81
M4      98.70      96.10      97.40      94.81      98.70
M5      98.70      90.91      92.21      87.01      98.70

Best Sampling Technique per Model:
M1    Sampling5
M2    Sampling1
M3    Sampling1
M4    Sampling1
M5    Sampling1
dtype: object
