In [3]:
import numpy as np
import pandas as pd
import requests
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler, NearMiss

# Download dataset from GitHub
url = "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"
file_path = "Creditcard_data.csv"
response = requests.get(url)
with open(file_path, "wb") as file:
    file.write(response.content)

# Load dataset
df = pd.read_csv(file_path)

# Assuming 'Class' is the target variable
X = df.drop(columns=['Class'])
y = df['Class']

# Standardizing features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Sample size detection formula
def compute_sample_size(p=0.5, Z=1.96, E=0.05):
    return int((Z**2 * p * (1 - p)) / (E**2))

sample_size = compute_sample_size()

def balance_dataset(X, y, technique):
    if technique == "Sampling1":
        sampler = RandomUnderSampler()
    elif technique == "Sampling2":
        sampler = RandomOverSampler()
    elif technique == "Sampling3":
        sampler = SMOTE()
    elif technique == "Sampling4":
        sampler = SMOTE(k_neighbors=3)
    elif technique == "Sampling5":
        sampler = NearMiss()
    else:
        return X, y  # No resampling

    X_res, y_res = sampler.fit_resample(X, y)
    return X_res, y_res

# Define models
models = {
    "M1": RandomForestClassifier(),
    "M2": LogisticRegression(max_iter=500, solver='saga'),  # Increased iterations and changed solver
    "M3": SVC(),
    "M4": DecisionTreeClassifier(),
    "M5": KNeighborsClassifier()
}

# Sampling techniques
sampling_techniques = ["Sampling1", "Sampling2", "Sampling3", "Sampling4", "Sampling5"]

# Store results
results = {}

# Train and evaluate models
for model_name, model in models.items():
    results[model_name] = {}
    for technique in sampling_techniques:
        X_res, y_res = balance_dataset(X, y, technique)
        X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        results[model_name][technique] = accuracy

# Convert results to DataFrame
results_df = pd.DataFrame(results)
print("\nModel Performance (Accuracy):\n")
print(results_df.T)

# Save results for GitHub submission
results_df.to_csv("Model_Accuracy_Results.csv", index=True)





Model Performance (Accuracy):

    Sampling1  Sampling2  Sampling3  Sampling4  Sampling5
M1       0.50   1.000000   0.993464   0.993464       0.75
M2       0.75   0.931373   0.928105   0.931373       1.00
M3       0.00   0.960784   0.967320   0.970588       1.00
M4       0.50   0.996732   0.967320   0.983660       0.75
M5       0.00   0.964052   0.944444   0.954248       1.00


In [4]:
import numpy as np
import pandas as pd
import requests
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler, NearMiss

# Download dataset from GitHub
url = "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"
file_path = "Creditcard_data.csv"
response = requests.get(url)
with open(file_path, "wb") as file:
    file.write(response.content)

# Load dataset
df = pd.read_csv(file_path)

# Assuming 'Class' is the target variable
X = df.drop(columns=['Class'])
y = df['Class']

# Standardizing features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Sample size detection formula
def compute_sample_size(p=0.5, Z=1.96, E=0.05):
    return int((Z**2 * p * (1 - p)) / (E**2))

sample_size = compute_sample_size()

def balance_dataset(X, y, technique):
    if technique == "Sampling1":
        sampler = RandomUnderSampler()
    elif technique == "Sampling2":
        sampler = RandomOverSampler()
    elif technique == "Sampling3":
        sampler = SMOTE()
    elif technique == "Sampling4":
        sampler = SMOTE(k_neighbors=3)
    elif technique == "Sampling5":
        sampler = NearMiss()
    else:
        return X, y  # No resampling

    X_res, y_res = sampler.fit_resample(X, y)
    return X_res, y_res

# Define models
models = {
    "M1": RandomForestClassifier(),
    "M2": LogisticRegression(max_iter=500, solver='saga'),  # Increased iterations and changed solver
    "M3": SVC(),
    "M4": DecisionTreeClassifier(),
    "M5": KNeighborsClassifier()
}

# Sampling techniques
sampling_techniques = ["Sampling1", "Sampling2", "Sampling3", "Sampling4", "Sampling5"]

# Store results
results = {}

# Train and evaluate models
for model_name, model in models.items():
    results[model_name] = {}
    for technique in sampling_techniques:
        X_res, y_res = balance_dataset(X, y, technique)
        X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        results[model_name][technique] = accuracy

# Convert results to DataFrame
results_df = pd.DataFrame(results)
print("\nModel Performance (Accuracy):\n")
print(results_df.T)

# Determine the best sampling technique for each model
best_techniques = results_df.idxmax()
print("\nBest Sampling Technique per Model:\n")
print(best_techniques)

# Save results for GitHub submission
results_df.to_csv("Model_Accuracy_Results.csv", index=True)
best_techniques.to_csv("Best_Sampling_Techniques.csv", index=True)





Model Performance (Accuracy):

    Sampling1  Sampling2  Sampling3  Sampling4  Sampling5
M1       0.25   1.000000   0.990196   0.993464       0.75
M2       0.00   0.931373   0.931373   0.928105       1.00
M3       0.00   0.960784   0.967320   0.970588       1.00
M4       0.00   0.993464   0.980392   0.986928       0.75
M5       0.25   0.964052   0.944444   0.954248       1.00

Best Sampling Technique per Model:

M1    Sampling2
M2    Sampling5
M3    Sampling5
M4    Sampling2
M5    Sampling5
dtype: object
