# No Remedy for Class Imbalance

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score 
from sklearn.metrics import confusion_matrix

%matplotlib inline

In [None]:
# Load the credit-card transactions. The code below treats Class == 1 as fraud
# and Class == 0 as a valid transaction.
data = pd.read_csv('../input/creditcardfraud/creditcard.csv')

# Count each class once and index the result explicitly by label, so the
# 'Valid' / 'Fraud' bars do not depend on value_counts()' frequency ordering.
class_counts = data['Class'].value_counts().reindex([0, 1], fill_value=0)
plt.bar(['Valid', 'Fraud'], class_counts.tolist())

# Fraction of all rows that carry the fraud label.
frauds = class_counts[1] / class_counts.sum()
print(f"Fraudulent transactions: {round(frauds * 100, 2)}%")
plt.show()

In [None]:
# Draw the pairwise correlation matrix of all columns as a 12x12-inch heatmap.
fig = plt.figure(figsize=(12, 12))
correlations = data.corr()
sns.heatmap(correlations, cmap='Blues')
plt.show()

In [None]:
# Features are every column except the last one; the label is the last column.
X = data.iloc[:, :-1]
Y = data.iloc[:, -1]

# Hold out 20% for testing. Stratify on the label so the rare positive class
# appears in the same proportion in both the training and the test split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)

In [None]:
# Baseline model trained on the original (imbalanced) training split.
# The seed is fixed so repeated runs give the same forest; 31 matches the
# seed used by the other random forests in this notebook.
classifier = RandomForestClassifier(random_state=31)
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)

In [None]:
# Report accuracy, precision and recall of the baseline predictions,
# each rounded to four decimal places.
metric_table = (
    ("Model Accuracy:", accuracy_score),
    ("Model Precision:", precision_score),
    ("Model Recall:", recall_score),
)
for caption, metric in metric_table:
    print(caption, round(metric(Y_test, Y_pred), 4))

In [None]:
# Confusion matrix of the baseline predictions, drawn as an annotated heatmap
# with integer cell counts.
labels = ['Valid', 'Fraud']
conf_matrix = confusion_matrix(Y_test, Y_pred)

plt.figure(figsize=(6, 6))
matrix_axes = sns.heatmap(
    conf_matrix,
    xticklabels=labels,
    yticklabels=labels,
    annot=True,
    fmt="d",
)
matrix_axes.set_title("Random Forest Classifier - Confusion Matrix")
matrix_axes.set_ylabel('True Value')
matrix_axes.set_xlabel('Predicted Value')
plt.show()

In [None]:
from sklearn.metrics import classification_report
# Text summary of the per-class metrics for the baseline predictions.
baseline_report = classification_report(Y_test, Y_pred)
print(baseline_report)

# Undersampling

In [None]:
import imblearn
from collections import Counter

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Reload the dataset and separate the 'Class' label from the feature columns.
df2 = pd.read_csv('../input/creditcardfraud/creditcard.csv')
y2 = df2['Class']
x2 = df2.drop(columns="Class")

print("Original values in creditcard data: %s" % Counter(y2))

# Random undersampling with the sampler's default strategy and a fixed seed.
random_under_sample = RandomUnderSampler(random_state=42)
x_new, y_new = random_under_sample.fit_resample(x2, y2)
print("Values after applying undersampling: %s" % Counter(y_new))

# Class counts before (left) and after (right) undersampling.
fig, axarr = plt.subplots(1, 2, figsize=(8, 4))
panels = (
    (y2, "Before Undersampling"),
    (y_new, "After Undersampling"),
)
for axis, (series, heading) in zip(axarr, panels):
    series.value_counts().plot.bar(ax=axis)
    axis.set_title(heading)

In [None]:
print("Original values in creditcard data: %s" % Counter(y2))

# Undersample with an explicit sampling_strategy of 0.5 instead of the default.
# Per the imbalanced-learn docs, a float is the desired minority/majority
# ratio after resampling.
random_under_sample = RandomUnderSampler(sampling_strategy=0.5, random_state=42)
x_with_ratio, y_with_ratio = random_under_sample.fit_resample(x2, y2)
print("Values after applying undersampling with ratio 0.5: %s" % Counter(y_with_ratio))

# Class counts before (left) and after (right) undersampling.
fig, axarr = plt.subplots(1, 2, figsize=(8, 4))
df2.Class.value_counts().plot.bar(ax=axarr[0])
y_with_ratio.value_counts().plot.bar(ax=axarr[1])
axarr[0].set_title("Before Undersampling")
axarr[1].set_title("After Undersampling with ratio 0.5");

# Oversampling

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample with SMOTE. The seed is fixed so the generated samples are the
# same on every run, matching the seeded samplers used elsewhere in this notebook.
smote = SMOTE(random_state=42)
x_sm, y_sm = smote.fit_resample(x2, y2)

# Report and plot the same labels (y2) that were passed to the sampler.
print("Original values in creditcard data: %s" % Counter(y2))
print("Values after applying oversampling: %s" % Counter(y_sm))

# Class counts before (left) and after (right) oversampling.
fig, axarr = plt.subplots(1, 2, figsize=(8, 4))
y2.value_counts().plot.bar(ax=axarr[0])
y_sm.value_counts().plot.bar(ax=axarr[1])
axarr[0].set_title("Before Oversampling")
axarr[1].set_title("After Oversampling");

In [None]:
# Train a random forest on SMOTE-balanced data and score it on real data.
#
# Split the ORIGINAL data first (stratified on the label), then oversample the
# training portion only. Resampling before the split would put synthetic points
# in the test set and let information about test rows leak into training,
# which inflates the reported score.
x_train, x_test, y_train, y_test = train_test_split(
    x2, y2, test_size=0.25, random_state=7, stratify=y2
)
x_train, y_train = SMOTE(random_state=42).fit_resample(x_train, y_train)

clf = RandomForestClassifier(random_state=31)

# Fit on the balanced training data; evaluate on the untouched held-out rows.
clf.fit(x_train, y_train)
score = clf.score(x_test, y_test)

print("Model accuracy in data after applying SMOTE technique is: " + str(round((score * 100), 2)) + '%')

In [None]:
# Predict the held-out rows with the current classifier and print the
# per-class metric summary.
y_preds = clf.predict(x_test)
smote_report = classification_report(y_test, y_preds)
print(smote_report)

# Combining oversampling and undersampling

In [None]:
from imblearn.combine import SMOTEENN

# Resample the data with SMOTEENN, using a fixed seed.
sme = SMOTEENN(random_state=42)
x_res, y_res = sme.fit_resample(x2, y2)

# Report and plot the same labels (y2) that were passed to the sampler.
print("Original values in creditcard data: %s" % Counter(y2))
print("Values after applying combinesampling: %s" % Counter(y_res))

# Class counts before (left) and after (right) the combined resampling.
fig, axarr = plt.subplots(1, 2, figsize=(8, 4))
y2.value_counts().plot.bar(ax=axarr[0])
y_res.value_counts().plot.bar(ax=axarr[1])
axarr[0].set_title("Before combinesampling")
axarr[1].set_title("After combinesampling");

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Train a random forest on SMOTEENN-resampled data and score it on real data.
#
# Split the ORIGINAL data first (stratified on the label), then resample the
# training portion only. Resampling before the split would put synthetic points
# in the test set and let information about test rows leak into training,
# which inflates the reported score.
x_train, x_test, y_train, y_test = train_test_split(
    x2, y2, test_size=0.25, random_state=7, stratify=y2
)
x_train, y_train = SMOTEENN(random_state=42).fit_resample(x_train, y_train)

clf = RandomForestClassifier(random_state=31)

# Fit on the resampled training data; evaluate on the untouched held-out rows.
clf.fit(x_train, y_train)
score = clf.score(x_test, y_test)

print("Model accuracy after applying Combine-sampling technique is: " + str(round((score * 100), 2)) + '%')

In [None]:
# Predict the held-out rows with the current classifier and print the
# per-class metric summary.
y_preds = clf.predict(x_test)
combined_report = classification_report(y_test, y_preds)
print(combined_report)