In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss

from sklearn.tree import DecisionTreeClassifier             #   Decision Tree Classifier 
from sklearn.ensemble import RandomForestClassifier         #   Random Forest Classifier
from sklearn.naive_bayes import GaussianNB                  #   Gaussian Naive Bayes'
from sklearn.svm import SVC                                 #   Support Vector Classifier
from xgboost import XGBClassifier                           #   XGBoost Classifier

import numpy as np
import pandas as pd

In [3]:
url = 'https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv'
data = pd.read_csv(url)

# Split the data into features and target
X = data.drop('Class', axis=1)
y = data['Class']

# Split the data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the z-value and margin of error for each sampling technique

z = 1.96  # 95% confidence interval
m = 0.05  # margin of error

In [4]:
X_train

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
352,259,-0.363608,1.104008,1.300678,0.070314,0.075180,-1.010396,0.683396,-0.073378,-0.401866,...,0.123176,-0.261158,-0.685436,-0.013662,0.336313,-0.157577,0.071699,0.245539,0.099037,0.99
60,41,0.986063,-0.202965,-0.492768,0.407691,0.305660,-0.230529,0.585028,-0.208225,-0.247503,...,0.263505,-0.305874,-1.216555,-0.077602,-0.741341,0.286881,0.200347,-0.075203,0.027271,169.05
558,417,-0.473731,0.697340,2.279600,1.359875,0.342429,1.392886,0.289971,0.170677,0.578966,...,0.330993,-0.462425,-0.486810,-0.235667,-0.726568,0.085981,-0.351095,0.289067,-0.043030,8.61
350,259,1.095067,-0.014393,1.408552,1.266546,-0.944751,0.029578,-0.598515,0.175291,0.485231,...,-0.072113,0.013107,0.248009,-0.002564,0.570100,0.387137,-0.442319,0.074531,0.032215,9.99
696,525,-0.755011,-0.517761,1.760091,-0.654206,-0.039143,-0.492847,-0.047345,0.118936,0.734444,...,0.239908,-0.049608,-0.200904,0.268931,0.108087,-0.468660,0.729549,-0.017462,0.077163,79.54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,46,-0.378245,0.732925,-0.120154,0.185755,2.594269,3.797183,0.059088,0.976768,-0.412661,...,0.315572,-0.107582,-0.157140,-0.194659,1.013897,0.145503,-0.237620,0.411372,0.202788,11.45
106,70,-0.426072,-0.060304,2.220828,0.024742,-0.584964,0.460623,-0.322526,0.434776,1.252404,...,-0.200077,0.149485,0.769878,-0.092634,0.150536,-0.234230,0.504710,0.069158,0.041024,21.80
270,190,-0.549414,0.676861,2.151950,1.014523,-0.620012,0.076154,0.041578,0.342672,0.124723,...,0.104755,0.212024,0.850203,-0.185597,0.544990,-0.130609,-0.196374,0.422119,0.203313,20.70
435,313,-0.907420,1.103912,1.288489,1.243612,-0.068032,0.214040,0.324000,0.436037,-0.437409,...,0.028251,0.022520,0.399523,-0.049081,0.220258,-0.162924,-0.286994,0.015071,-0.104668,15.08


In [5]:
X_train.shape

(617, 30)

In [6]:
X_test

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
538,404,0.638806,1.772451,-1.748258,1.297700,1.785872,-1.050197,1.485730,-0.780631,0.416991,...,0.589654,-0.359613,-0.061540,-0.051428,-0.764738,-0.424181,-0.418629,-0.003258,-0.541237,0.99
213,140,1.007947,-1.289492,0.666741,-0.741321,-1.290523,0.302172,-1.026907,0.220562,-0.607465,...,0.379483,0.530549,1.113561,-0.260042,-0.253242,0.299379,-0.032755,0.017927,0.035667,169.00
361,265,0.073631,1.051207,-0.281223,0.853749,1.065966,1.219197,-1.225597,-2.262214,-0.584441,...,0.420519,-1.150128,0.870673,-0.266733,-1.048732,0.232705,-0.262463,0.187976,0.231428,1.00
417,302,-0.986171,1.732934,0.857587,0.178950,-0.794223,-0.088469,-1.266790,-4.922224,0.010309,...,-0.991289,4.332858,-2.331390,0.568103,0.638796,0.867788,0.181051,0.225638,0.179027,9.99
582,434,-0.679293,1.120837,1.319394,1.249827,1.147786,-0.086534,1.001436,-0.039752,-1.374497,...,-0.011141,0.067521,0.030112,-0.296954,-0.619850,0.282799,-0.059404,0.048695,0.109200,25.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
694,524,-0.292211,0.838605,1.360847,-0.001346,0.350836,-0.894645,1.382872,-0.431655,-0.400719,...,-0.000325,0.095844,0.368186,-0.150624,0.391048,0.186411,-0.571952,-0.135755,-0.176200,52.05
446,323,-0.850731,0.695703,2.071838,0.928201,-0.032108,0.648932,0.150909,0.440486,-0.564489,...,0.301436,0.211628,0.591596,-0.210962,-0.341889,0.281912,-0.067353,0.339467,0.138091,52.00
537,403,1.085214,-0.228464,0.774503,1.379282,-0.469900,0.765731,-0.589284,0.426976,0.865987,...,-0.262478,-0.069035,0.049995,-0.111814,-0.319642,0.574092,-0.250482,0.057517,0.006986,15.00
762,572,1.063195,-1.156247,-0.814094,-0.462132,1.137835,3.909057,-1.116745,0.920514,-0.600150,...,-0.112384,-0.205083,-0.434832,-0.220055,0.991996,0.631624,-0.223179,0.052542,0.053525,158.00


In [7]:
X_test.shape

(155, 30)

In [8]:
n1 = int(np.ceil((z**2 * 0.5 * 0.5) / (m**2)))

n2 = int(np.ceil((z**2 * 0.05 * (1-0.05)) / (m**2)))

n3 = int(np.ceil((z**2 * 0.05 * (1-0.05)) / (m**2)))

n4 = int(np.ceil((z**2 * 0.05 * (1-0.05)) / (m**2)))

n5 = int(np.ceil((z**2 * 0.05 * (1-0.05)) / (m**2)))


model1 = GaussianNB()
model2 = SVC(random_state=11)
model3 = RandomForestClassifier(random_state=11)
model4 = DecisionTreeClassifier(random_state=11)
model5 = XGBClassifier()


sampler1 = SMOTE(sampling_strategy='minority', random_state=11)
sampler2 = NearMiss(version=3, n_neighbors=3)
sampler3 = TomekLinks(sampling_strategy='majority')
sampler4 = RandomOverSampler(sampling_strategy='minority', random_state=11)
sampler5 = RandomUnderSampler(sampling_strategy='majority', random_state=11)


In [9]:
param_grid = { 
    'n_estimators': [50, 75, 100, 125, 150, 175, 200, 225, 250, 275, 300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8,9,10],
    'criterion' :['gini', 'entropy']
}

In [10]:
#from sklearn.model_selection import GridSearchCV

#clf_Random_Forest_Classifier = GridSearchCV(estimator=clf_Random_Forest_Classifier, param_grid=param_grid, cv= 5)
#clf_Random_Forest_Classifier.fit(X_train, Y_train)

In [11]:
#clf_Random_Forest_Classifier.best_params_

In [12]:
# Define a dictionary to hold the sampling techniques and models

samplers = {
    'SAMPLING-1': sampler1,
    'SAMPLING-2': sampler2,
    'SAMPLING-3': sampler3,
    'SAMPLING-4': sampler4,
    'SAMPLING-5': sampler5,
}

models = {
    'MODEL_1': model1,
    'MODEL_2': model2,
    'MODEL_3': model3,
    'MODEL_4': model4,
    'MODEL_5': model5,
}

# Evaluate each model on each sampling technique

results = {}

for sampler_name, sampler in samplers.items():
    if sampler_name == 'Sampling1':
        n = n1
    elif sampler_name == 'Sampling2':
        n = n2
    elif sampler_name == 'Sampling3':
        n = n3
    elif sampler_name == 'Sampling4':
        n = n4
    else:
        n = n5

    # Undersample or oversample the training data
    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
    
    # Limit the resampled data to the sample size
    if len(X_resampled) > n:
        X_resampled = X_resampled[:n]
        y_resampled = y_resampled[:n]
    
    for model_name, model in models.items():
        # Train the model on the resampled data
        model.fit(X_resampled, y_resampled)
        
        # Make predictions on the test data
        y_pred = model.predict(X_test)
        
        # Calculate the accuracy score
        accuracy = accuracy_score(y_test, y_pred)
        
        # Add the accuracy score to the results dictionary
        if model_name in results:
            results[model_name][sampler_name] = accuracy
        else:
            results[model_name] = {sampler_name: accuracy}

In [13]:
print('Results:')

print('         Sampling1     Sampling2     Sampling3     Sampling4     Sampling5')

for model_name, model_results in results.items():
    print(model_name, end='')
    for sampler_name in samplers.keys():
        if sampler_name in model_results:
            print(f'    {model_results[sampler_name]:.4f}    ', end='')
        else:
            print('              ', end='')
    print() 

Results:
         Sampling1     Sampling2     Sampling3     Sampling4     Sampling5
MODEL_1    0.9935        0.4839        0.9935        0.9935        0.7290    
MODEL_2    0.9935        0.5161        0.9935        0.9935        0.6000    
MODEL_3    0.9935        0.7419        0.9935        0.9935        0.5806    
MODEL_4    0.9613        0.8194        0.9613        0.9613        0.5032    
MODEL_5    0.9935        0.8452        0.9935        0.9935        0.4968    
