In [77]:
import pandas as pd
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import RandomOverSampler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

data = pd.read_csv('Creditcard_data.csv')
class_count_0, class_count_1 = data['Class'].value_counts()

class_0 = data[data['Class'] == 0]
class_1 = data[data['Class'] == 1]
print('class 0:', class_0.shape)
print('class 1:', class_1.shape)

class 0: (763, 31)
class 1: (9, 31)


In [46]:
X=data.drop('Class',axis=1)
y=data['Class']

In [80]:
# Random Under-Sampling With Imblearn
def random_undersampler(x, y):
  rus = RandomUnderSampler(random_state=42, replacement=True)
  x_rus, y_rus = rus.fit_resample(x, y)
  return x_rus, y_rus

In [81]:
# Random Over-Sampling With imblearn
def random_oversampler(x, y):
  ros = RandomOverSampler(random_state=42)
  x_ros, y_ros = ros.fit_resample(x, y)
  return x_ros, y_ros


In [82]:
# Under-Sampling: Tomek Links
def tomek_links_sampling(x, y):
  tl = TomekLinks()
  x_tl, y_tl = tl.fit_resample(x, y)
  return x_tl, y_tl

In [83]:
# Synthetic Minority Oversampling Technique (SMOTE)
def smote_sampling(x, y):
  smote = SMOTE()
  x_smote, y_smote = smote.fit_resample(x, y)
  return x_smote, y_smote

In [84]:
# NearMiss
def nearmiss_sampling(x, y):
  nm = NearMiss()
  x_nm, y_nm = nm.fit_resample(x, y)
  return x_nm, y_nm

In [85]:
def logistic_regression(X, y):
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
  model = LogisticRegression()
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  return accuracy_score(y_test, y_pred)

In [86]:
def decision_tree(X, y):
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
  model = DecisionTreeClassifier()
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  return accuracy_score(y_test, y_pred)

In [87]:
def random_forest(X, y):
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
  model = RandomForestClassifier()
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  return accuracy_score(y_test, y_pred)

In [88]:
def support_vector_machine(X, y):
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
  model = SVC()
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  return accuracy_score(y_test, y_pred)

In [89]:
def k_nearest_neighbors(X, y):
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
  model = KNeighborsClassifier(n_neighbors=3)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  return accuracy_score(y_test, y_pred)

In [95]:
results_matrix = []
rows=["undersampler","oversampler","tomek","smote","nearmiss"]
columns=["LR","DT","RF","SVM","KNN"]
for i, sampling_method in enumerate([random_undersampler,random_oversampler,tomek_links_sampling,smote_sampling,nearmiss_sampling]):
    row=[]
    for j, model in enumerate([logistic_regression, decision_tree, random_forest, support_vector_machine, k_nearest_neighbors]):

        sampled_X,sampled_y = sampling_method(X,y)

        accuracy = model(sampled_X, sampled_y)

        row.append(accuracy)
    results_matrix.append(row)

df = pd.DataFrame(results_matrix,columns=columns,index=list(rows))
print(df)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

                    LR        DT        RF       SVM       KNN
undersampler  0.833333  0.500000  0.500000  0.166667  0.833333
oversampler   0.908297  0.986900  1.000000  0.685590  0.991266
tomek         0.986957  0.973913  0.986957  0.986957  0.986957
smote         0.917031  0.965066  0.993450  0.689956  0.836245
nearmiss      0.500000  0.166667  0.166667  0.166667  0.833333
