In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [7]:
df = pd.read_csv("E:\projects\DMT\creditcard.csv")

In [8]:
class_val = df['Class'].value_counts()
print(f"Number of samples for each class :- \n {class_val}")
non_fraud = class_val[0]
fraud = class_val[1]
print(f"Non Fraudulent Numbers :- {non_fraud}")
print(f"Fraudulent Numbers :- {fraud}")

Number of samples for each class :- 
 0    284315
1       492
Name: Class, dtype: int64
Non Fraudulent Numbers :- 284315
Fraudulent Numbers :- 492


In [10]:
#equal both tsrget samples and take indexes
nonfraud_indexies = df[df.Class == 0].index
fraud_indices = np.array(df[df['Class'] == 1].index)
# take random samples from non fraudulent that are equal to fraudulent samples
random_normal_indexies = np.random.choice(nonfraud_indexies, fraud, replace=False)
random_normal_indexies = np.array(random_normal_indexies)

In [13]:
## Undersampling techniques
# concatenate both indices of fraud and non fraud
under_sample_indices = np.concatenate([fraud_indices, random_normal_indexies])

#extract all features from whole data for under sample indices only
under_sample_data = df.iloc[under_sample_indices, :]

# now we have to divide under sampling data to all features & target
x_undersample_data = under_sample_data.drop(['Class'], axis=1)
y_undersample_data = under_sample_data[['Class']]
# now split dataset to train and test datasets as before
X_train_sample, X_test_sample, y_train_sample, y_test_sample = train_test_split(
x_undersample_data, y_undersample_data, test_size=0.2, random_state=0)

In [14]:
## DecisionTreeClassifier after applying undersampling technique

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

def decision_tree_classification(X_train, y_train, X_test, y_test):
 # initialize object for DecisionTreeClassifier class
 dt_classifier = DecisionTreeClassifier()
 # train model by using fit method
 print("Model training start........")
 dt_classifier.fit(X_train, y_train.values.ravel())
 print("Model training completed")
 acc_score = dt_classifier.score(X_test, y_test)
 print(f'Accuracy of model on test dataset :- {acc_score}')
 # predict result using test dataset
 y_pred = dt_classifier.predict(X_test)
 # confusion matrix
 print(f"Confusion Matrix :- \n {confusion_matrix(y_test, y_pred)}")
 # classification report for f1-score
 print(f"Classification Report :- \n {classification_report(y_test, y_pred)}")
 print(f"AROC score :- \n {roc_auc_score(y_test, y_pred)}")

# calling decision tree classifier function 
decision_tree_classification(X_train_sample, y_train_sample, 
X_test_sample, y_test_sample)

Model training start........
Model training completed
Accuracy of model on test dataset :- 0.934010152284264
Confusion Matrix :- 
 [[99  7]
 [ 6 85]]
Classification Report :- 
               precision    recall  f1-score   support

           0       0.94      0.93      0.94       106
           1       0.92      0.93      0.93        91

    accuracy                           0.93       197
   macro avg       0.93      0.93      0.93       197
weighted avg       0.93      0.93      0.93       197

AROC score :- 
 0.9340140991084387


In [16]:
## RandomForestClassifier after apply the undersampling techniques

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

def random_forest_classifier(X_train, y_train, X_test, y_test):
 # initialize object for DecisionTreeClassifier class
 rf_classifier = RandomForestClassifier(n_estimators=50)
 # train model by using fit method
 print("Model training start........")
 rf_classifier.fit(X_train, y_train.values.ravel())
 acc_score = rf_classifier.score(X_test, y_test)
 print(f'Accuracy of model on test dataset :- {acc_score}')
 # predict result using test dataset
 y_pred = rf_classifier.predict(X_test)
 # confusion matrix
 print(f"Confusion Matrix :- \n {confusion_matrix(y_test, y_pred)}")
 # classification report for f1-score
 print(f"Classification Report :- \n {classification_report(y_test, y_pred)}")
 # area under roc curve
 print(f"AROC score :- \n {roc_auc_score(y_test, y_pred)}")

random_forest_classifier(X_train_sample, y_train_sample, X_test_sample, y_test_sample)

Model training start........
Accuracy of model on test dataset :- 0.9746192893401016
Confusion Matrix :- 
 [[105   1]
 [  4  87]]
Classification Report :- 
               precision    recall  f1-score   support

           0       0.96      0.99      0.98       106
           1       0.99      0.96      0.97        91

    accuracy                           0.97       197
   macro avg       0.98      0.97      0.97       197
weighted avg       0.98      0.97      0.97       197

AROC score :- 
 0.9733049968899026
