In [2]:
pip install imbalanced-learn


Collecting imbalanced-learn
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn)
  Downloading sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Downloading imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
Downloading sklearn_compat-0.1.3-py3-none-any.whl (18 kB)
Installing collected packages: sklearn-compat, imbalanced-learn
Successfully installed imbalanced-learn-0.13.0 sklearn-compat-0.1.3
Note: you may need to restart the kernel to use updated packages.


In [8]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from collections import Counter
from sklearn.preprocessing import StandardScaler

# Load the dataset
file_path = "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"  # Path to your uploaded file
data = pd.read_csv(file_path)

# Ensure target column 'Class' exists
if "Class" not in data.columns:
    raise KeyError("The dataset does not have a 'Class' column. Please verify the target variable.")

# Split features and target
X = data.drop("Class", axis=1)
y = data["Class"]

# Balance the dataset using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Confirm the balancing
print(f"Before SMOTE: {Counter(y)}")
print(f"After SMOTE: {Counter(y_resampled)}")

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define sampling techniques
def random_sampling(data, n):
    return data.sample(n, random_state=42).reset_index(drop=True)

def systematic_sampling(data, n):
    step = len(data) // n
    indices = np.arange(0, len(data), step)[:n]
    return data.iloc[indices].reset_index(drop=True)

def stratified_sampling(X, y, n):
    X_sample, _, y_sample, _ = train_test_split(X, y, test_size=1 - (n / len(y)), stratify=y, random_state=42)
    return X_sample.reset_index(drop=True), y_sample.reset_index(drop=True)

def cluster_sampling(data, clusters):
    data = data.copy()
    data['cluster'] = np.random.randint(0, clusters, len(data))
    cluster_data = data[data['cluster'] == 0].drop("cluster", axis=1)
    return cluster_data.reset_index(drop=True)

# Define sample size
sample_size = 1000
X_train_df = pd.DataFrame(X_train_scaled)
y_train_df = pd.Series(y_train).reset_index(drop=True)

# Generate samples
random_sample = random_sampling(X_train_df, sample_size)
systematic_sample = systematic_sampling(X_train_df, sample_size)
stratified_sample_X, stratified_sample_y = stratified_sampling(X_train_df, y_train_df, sample_size)
cluster_sample = cluster_sampling(X_train_df, clusters=10)

# Define models
models = {
    "M1": LogisticRegression(max_iter=5000),
    "M2": RandomForestClassifier(),
    "M3": GradientBoostingClassifier(),
    "M4": SVC(),
    "M5": DecisionTreeClassifier(),
}

# Apply models on samples
results = {}
samples = [
    (random_sample, y_train_df.iloc[random_sample.index]),
    (systematic_sample, y_train_df.iloc[systematic_sample.index]),
    (stratified_sample_X, stratified_sample_y),
    (cluster_sample, y_train_df.iloc[cluster_sample.index]),
]

for i, (sample, sample_y) in enumerate(samples):
    results[f"Sampling{i+1}"] = {}
    for model_name, model in models.items():
        model.fit(sample, sample_y)  # Fit model on the sample
        accuracy = model.score(X_test_scaled, y_test)
        results[f"Sampling{i+1}"][model_name] = accuracy

# Print results
for sampling, model_results in results.items():
    print(f"{sampling}:")
    for model, acc in model_results.items():
        print(f"  {model}: {acc:.2f}")

Before SMOTE: Counter({0: 763, 1: 9})
After SMOTE: Counter({0: 763, 1: 763})
Sampling1:
  M1: 0.42
  M2: 0.52
  M3: 0.49
  M4: 0.47
  M5: 0.54
Sampling2:
  M1: 0.91
  M2: 0.99
  M3: 0.98
  M4: 0.96
  M5: 0.97
Sampling3:
  M1: 0.92
  M2: 0.99
  M3: 0.99
  M4: 0.96
  M5: 0.99
Sampling4:
  M1: 0.49
  M2: 0.42
  M3: 0.48
  M4: 0.51
  M5: 0.49
