<a href="https://colab.research.google.com/github/arpit-devop/sampling-assignment/blob/main/predictive_sampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:


# Install imbalanced-learn if not installed
!pip install imbalanced-learn

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from imblearn.combine import SMOTEENN


# 1. Load Dataset

df = pd.read_csv('/content/Creditcard_data.csv')  # Upload file in Colab first

print("Original Class Distribution:")
print(df['Class'].value_counts())

X = df.drop('Class', axis=1)
y = df['Class']


# 2. Define Sampling Techniques

sampling_methods = {
    "Sampling1_RandomOver": RandomOverSampler(),
    "Sampling2_RandomUnder": RandomUnderSampler(),
    "Sampling3_SMOTE": SMOTE(),
    "Sampling4_NearMiss": NearMiss(),
    "Sampling5_SMOTEENN": SMOTEENN()
}


# 3. Define ML Models

models = {
    "M1_Logistic": LogisticRegression(max_iter=1000),
    "M2_DecisionTree": DecisionTreeClassifier(),
    "M3_RandomForest": RandomForestClassifier(),
    "M4_SVM": SVC(),
    "M5_KNN": KNeighborsClassifier()
}


# 4. Apply Sampling + Train Models

results = pd.DataFrame(index=models.keys(), columns=sampling_methods.keys())

for s_name, sampler in sampling_methods.items():

    X_res, y_res = sampler.fit_resample(X, y)

    X_train, X_test, y_train, y_test = train_test_split(
        X_res, y_res, test_size=0.3, random_state=42
    )

    for m_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        results.loc[m_name, s_name] = round(acc * 100, 2)


# 5. Show Accuracy Table

print("\nAccuracy Table (%):")
print(results)


# 6. Best Sampling Technique for Each Model

print("\nBest Sampling Technique for Each Model:")
for model in results.index:
    best_sampling = results.loc[model].astype(float).idxmax()
    best_score = results.loc[model].astype(float).max()
    print(f"{model} → {best_sampling} ({best_score}%)")


Original Class Distribution:
Class
0    763
1      9
Name: count, dtype: int64


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


Accuracy Table (%):
                Sampling1_RandomOver Sampling2_RandomUnder Sampling3_SMOTE  \
M1_Logistic                    91.92                 33.33           90.83   
M2_DecisionTree                98.91                 33.33           97.16   
M3_RandomForest                100.0                 16.67           99.13   
M4_SVM                          73.8                 16.67           68.56   
M5_KNN                         98.47                  50.0           81.88   

                Sampling4_NearMiss Sampling5_SMOTEENN  
M1_Logistic                   50.0              97.07  
M2_DecisionTree              16.67              99.71  
M3_RandomForest              16.67              100.0  
M4_SVM                       16.67              73.02  
M5_KNN                       83.33               95.6  

Best Sampling Technique for Each Model:
M1_Logistic → Sampling5_SMOTEENN (97.07%)
M2_DecisionTree → Sampling5_SMOTEENN (99.71%)
M3_RandomForest → Sampling1_RandomOver (100.0