In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NeighbourhoodCleaningRule
from imblearn.over_sampling import RandomOverSampler, ADASYN
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
import numpy as np
import pandas as pd



In [2]:
url = 'https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv'
data = pd.read_csv(url)

X = data.drop('Class', axis=1)
y = data['Class']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

z = 2.57  
m = 0.05  


n1 = int(np.ceil((z**2 * 0.05 * (1-0.05)) / (m**2)))
n2 = int(np.ceil((z**2 * 0.05 * (1-0.05)) / (m**2)))
n3 = int(np.ceil((z**2 * 0.05 * (1-0.05)) / (m**2)))
n4 = int(np.ceil((z**2 * 0.05 * (1-0.05)) / (m**2)))
n5 = int(np.ceil((z**2 * 0.05 * (1-0.05)) / (m**2)))


##Using under/over sampling to make the dataset balanced

sm1 = RandomUnderSampler(sampling_strategy='majority', random_state=100)
sm2 = RandomOverSampler(sampling_strategy='minority', random_state=100)
sm3 = ADASYN(sampling_strategy='minority', random_state=100)
sm4 = TomekLinks(sampling_strategy='majority')
sm5 = NeighbourhoodCleaningRule(n_neighbors=3)

m1 = LogisticRegression(random_state=100,max_iter=500)
m2 = DecisionTreeClassifier(random_state=100)
m3 = RandomForestClassifier(random_state=100)
m4 = SVC(random_state=100)
m5 = ExtraTreesClassifier(random_state=100)


samples = {
    'S1': sm1,
    'S2': sm2,
    'S3': sm3,
    'S4': sm4,
    'S5': sm5,
}
models = {
    'M1': m1,
    'M2': m2,
    'M3': m3,
    'M4': m4,
    'M5': m5,
}


In [3]:
ans = {}
for sampler_name, sampler in samples.items():
    if sampler_name == 'S1':
        n = n1
    elif sampler_name == 'S2':
        n = n2
    elif sampler_name == 'S3':
        n = n3
    elif sampler_name == 'S4':
        n = n4
    else:
        n = n5

    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
    
    if len(X_resampled) > n:
        X_resampled = X_resampled[:n]
        y_resampled = y_resampled[:n]
    
    for model_name, model in models.items():
        model.fit(X_resampled, y_resampled)
        
        y_pred = model.predict(X_test)
        
        accuracy = accuracy_score(y_test, y_pred)
        
        if model_name in ans:
            ans[model_name][sampler_name] = accuracy
        else:
            ans[model_name] = {sampler_name: accuracy}


In [4]:
            
print('Results:')
print('\tSampling1   Sampling2   Sampling3   Sampling4   Sampling5')
for model_name, model_results in ans.items():
    print(model_name, end='')
    for sampler_name in samples.keys():
        if sampler_name in model_results:
            print(f'    {model_results[sampler_name]:.4f}   ', end='')
        else:
            print('\t\t', end='')
    print() 

Results:
	Sampling1   Sampling2   Sampling3   Sampling4   Sampling5
M1    0.4129       0.9806       0.9806       0.9806       0.9806   
M2    0.5032       0.9806       0.9806       0.9806       0.9677   
M3    0.6065       0.9806       0.9806       0.9806       0.9806   
M4    0.6387       0.9806       0.9806       0.9806       0.9806   
M5    0.7226       0.9806       0.9806       0.9806       0.9806   
