<a href="https://colab.research.google.com/github/arraakularavind/AspireNex/blob/main/credit_card_fraud_detection/creditcard.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
#!pip install tabulate
#!pip install ipywidgets voila
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,RandomizedSearchCV,GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report,roc_auc_score,precision_score,accuracy_score,recall_score,f1_score
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from collections import Counter
from google.colab import drive
from tabulate import tabulate
import ipywidgets as widgets
from IPython.display import display,clear_output


def preprocess_clean(data):

  valid_data=data[data.Class==0]
  fraud_data=data[data.Class==1]

  print(data["Class"].value_counts())
  print("\n Missing Value Handling")

  #check for missing values sum>1: present else no handling required
  total_miss=data.isnull().sum().sort_values(ascending=False)
  percent_miss=(data.isnull().sum()/data.isnull().count()).sort_values(ascending=False)

  missing_data=pd.concat([total_miss,percent_miss],axis=1,keys=["total_miss","percent_miss"])
  missing_data[missing_data["total_miss"]>0]

  #performing undersampling to balance the classes
  valid_sample=valid_data.sample(n=len(fraud_data),random_state=42)
  new_data=pd.concat([valid_sample,fraud_data],axis=0) #creating new dataset containing fraud and valid transaction

  x=new_data.drop("Class",axis=1) #axis=1 to drop column "class"
  y=new_data["Class"]

  scaler=RobustScaler()
  x_scaling=scaler.fit_transform(x)  #fit_transform-->using median, interquantile range(IQR) for scaling and transform them to [-1 to 1]

  x_train,x_test,y_train,y_test=train_test_split(x_scaling,y,test_size=0.2,random_state=42,stratify=y)  #stratify--> the train and test set have same proportion over class(Label)
                                                                                        #instead SMOTE can be used--> when dataset is overfitted.
  print("\nBalance Over Train [Class] Dataset Completed")

  return x_train,y_train,x_test,y_test

def RandomForest(data):

  x_train_smote,y_train_smote,x_test,y_test=preprocess_clean(data)

  rf=RandomForestClassifier(criterion="gini",n_estimators=200,random_state=42,max_depth=50,min_samples_split=2,min_samples_leaf=1,bootstrap=True,max_features="log2") #These parameter are found using hyperparameter combination using either GridSearchCV and RandomSearchCV
  rf.fit(x_train_smote,y_train_smote)
  print("\nPerforming Prediction\n")
  predict_rf=rf.predict(x_test)

  c_report=classification_report(y_test,predict_rf)
  roc_aoc=roc_auc_score(y_test,rf.predict_proba(x_test)[:,1]) # select all row from column 2
  # evaluating classification performance at threshold level ROC-->Receiver operating Classification area under curve useful during plotting.

  accuracy=accuracy_score(y_test,predict_rf)
  precision=precision_score(y_test,predict_rf)
  recall=recall_score(y_test,predict_rf)
  f1=f1_score(y_test,predict_rf)

  print("\nRondom Forest Classification Report\n")

  print(c_report)

  return roc_aoc,accuracy,classification_report(y_test,predict_rf,output_dict=True)

def Decision_Tree(data):

  x_train_smote,y_train_smote,x_test,y_test=preprocess_clean(data)

  dt=DecisionTreeClassifier(random_state=42,min_samples_leaf=1,min_samples_split=2,max_depth=50,criterion="gini",max_features="log2")
  dt.fit(x_train_smote,y_train_smote)
  #print(dt.get_params())
  predict_dt=dt.predict(x_test)

  c_report=classification_report(y_test,predict_dt)
  roc_aoc=roc_auc_score(y_test,dt.predict_proba(x_test)[:,1]) # select all row from column 2
  # evaluating classification performance at threshold level ROC-->Receiver operating Classification area under curve useful during plotting.

  accuracy=accuracy_score(y_test,predict_dt)
  precision=precision_score(y_test,predict_dt)
  recall=recall_score(y_test,predict_dt)
  f1=f1_score(y_test,predict_dt)

  print("\nDecision Tree Classification Report\n")

  print(c_report)

  return roc_aoc,accuracy,classification_report(y_test,predict_dt,output_dict=True)


def Logistic_Regression(data):

  x_train_smote,y_train_smote,x_test,y_test=preprocess_clean(data)
  lr=LogisticRegression(max_iter=1000)
  lr.fit(x_train_smote,y_train_smote)

  predict_lr=lr.predict(x_test)

  predict_lr_bin=(predict_lr>=0.5).astype(int) #convert the output to binary classification labels using 0.5 as threshold

  print("\nLogistic Regression Classification_report\n")

  print(classification_report(y_test,predict_lr_bin))

  roc_auc=roc_auc_score(y_test,predict_lr) # select all row from column 2

  accuracy=accuracy_score(y_test,predict_lr_bin)
  precision=precision_score(y_test,predict_lr_bin)
  recall=recall_score(y_test,predict_lr_bin)

  return roc_auc,accuracy,classification_report(y_test,predict_lr_bin,output_dict=True)




drive.mount("/content/drive",force_remount=True)
data=pd.read_csv("/content/drive/MyDrive/Colab Notebooks/creditcard.csv").sample(frac=1)


result={"Random Forest":[],"Decision Tree":[],"Logistic Regression":[]}

clear_output()

def display_metrics(model_name,prediction):
  table=[]

  table.append(["ROC-AUC",prediction[0]])
  table.append(["ACCURACY",prediction[1]])

  table.append(["PRECISION----->Valid_Transaction",prediction[2]["0"]["precision"]])
  table.append(["PRECISION----->Fraud_Transaction",prediction[2]["1"]["precision"]])
  table.append(["RECALL-------->Valid_Transaction", prediction[2]["0"]["recall"]])
  table.append(["RECALL-------->Fraud_Transaction",prediction[2]["1"]["recall"]])
  table.append(["F1_SCORE------>Valid_Transaction",prediction[2]["0"]["f1-score"]])
  table.append(["F1_SCORE------>Fraud_Transaction",prediction[2]["1"]["f1-score"]])

  #Formating the table in displaying

  table_html = "<div style='float: left; margin-right: 20px;'>"
  table_html += "<table style='border-collapse: collapse; text-align: left;'>"
  table_html += "<tr><th style='border: 1px solid black; padding: 8px;'>Parameter</th><th style='border: 1px solid black; padding: 8px;'>Value</th></tr>"
  for row in table:
      table_html += "<tr>"
      for col in row:
          table_html += f"<td style='border: 1px solid black; padding: 8px;'>{col}</td>"
      table_html += "</tr>"
  table_html += "</table>"
  table_html += "</div>"

  display(widgets.HTML(f"<h1>{model_name} Metrics</h1>"))
  display(widgets.HTML(table_html))

def on_button_clicked(model_name):
  if model_name=="Random Forest":
    prediction=RandomForest(data)
  elif model_name=="Decision Tree":
    prediction=Decision_Tree(data)
  elif model_name=="Logistic Regression":
    prediction=Logistic_Regression(data)
  else:
    return False
  display_metrics(model_name,prediction)
  print("\n")
  display(widgets.HBox([random_button,decision_button,logistic_button,thank]))

#Button info.
random_button=widgets.Button(description="Random Forest")
decision_button=widgets.Button(description="Decision Tree")
logistic_button=widgets.Button(description="Logistic Regression")
thank=widgets.Button(description="None")

#Button Display
display(widgets.HBox([random_button,decision_button,logistic_button,thank]))
random_button.on_click(lambda x: on_button_clicked("Random Forest"))
decision_button.on_click(lambda x: on_button_clicked("Decision Tree"))
logistic_button.on_click(lambda x: on_button_clicked("Logistic Regression"))
thank.on_click(lambda x: clear_output())



HBox(children=(Button(description='Random Forest', style=ButtonStyle()), Button(description='Decision Tree', s…

Class
0    284315
1       492
Name: count, dtype: int64

 Missing Value Handling

Balance Over Train [Class] Dataset Completed

Logistic Regression Classification_report

              precision    recall  f1-score   support

           0       0.90      0.97      0.93        99
           1       0.97      0.89      0.93        98

    accuracy                           0.93       197
   macro avg       0.93      0.93      0.93       197
weighted avg       0.93      0.93      0.93       197



HTML(value='<h1>Logistic Regression Metrics</h1>')

HTML(value="<div style='float: left; margin-right: 20px;'><table style='border-collapse: collapse; text-align:…





HBox(children=(Button(description='Random Forest', style=ButtonStyle()), Button(description='Decision Tree', s…

Class
0    284315
1       492
Name: count, dtype: int64

 Missing Value Handling

Balance Over Train [Class] Dataset Completed

Performing Prediction


Rondom Forest Classification Report

              precision    recall  f1-score   support

           0       0.89      0.98      0.93        99
           1       0.98      0.88      0.92        98

    accuracy                           0.93       197
   macro avg       0.93      0.93      0.93       197
weighted avg       0.93      0.93      0.93       197



HTML(value='<h1>Random Forest Metrics</h1>')

HTML(value="<div style='float: left; margin-right: 20px;'><table style='border-collapse: collapse; text-align:…





HBox(children=(Button(description='Random Forest', style=ButtonStyle()), Button(description='Decision Tree', s…

Class
0    284315
1       492
Name: count, dtype: int64

 Missing Value Handling

Balance Over Train [Class] Dataset Completed

Decision Tree Classification Report

              precision    recall  f1-score   support

           0       0.85      0.91      0.88        99
           1       0.90      0.84      0.87        98

    accuracy                           0.87       197
   macro avg       0.88      0.87      0.87       197
weighted avg       0.87      0.87      0.87       197



HTML(value='<h1>Decision Tree Metrics</h1>')

HTML(value="<div style='float: left; margin-right: 20px;'><table style='border-collapse: collapse; text-align:…





HBox(children=(Button(description='Random Forest', style=ButtonStyle()), Button(description='Decision Tree', s…

In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,precision_recall_fscore_support,roc_auc_score,precision_score,accuracy_score,recall_score
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from collections import Counter
from google.colab import drive
drive.mount("/content/drive",force_remount=True)

data=pd.read_csv("/content/drive/MyDrive/Colab Notebooks/creditcard.csv").sample(frac=1,random_state=42)

valid_data=data[data.Class==0]
fraud_data=data[data.Class==1]

print(data["Class"].value_counts())
print("\n Missing Value Handling")

#check for missing values sum>1: present else no handling required
#determine total missing values
total_miss=data.isnull().sum().sort_values(ascending=False)
#determine total percentage of missing  values
percent_miss=(data.isnull().sum()/data.isnull().count()).sort_values(ascending=False)

# combine them into single dataFrame
missing_data=pd.concat([total_miss,percent_miss],axis=1,keys=["total_miss","percent_miss"])
# determining missing value
missing_data[missing_data["total_miss"]>0]

#performing undersampling to balance the classes
valid_sample=valid_data.sample(n=len(fraud_data),random_state=42)
new_data=pd.concat([valid_sample,fraud_data],axis=0) #creating new dataset containing fraud and valid transaction

x=new_data.drop("Class",axis=1) #axis=1 to drop column "class"
y=new_data["Class"]

scaler=RobustScaler()
x_scaling=scaler.fit_transform(x)  #fit_transform-->using median, interquantile range(IQR) for scaling and transform them to [-1 to 1]

x_train,x_test,y_train,y_test=train_test_split(x_scaling,y,test_size=0.2,random_state=42,stratify=y)  #stratify--> the train and test set have same proportion over class(Label)

param_grid={"n_estimators":[50,100,200],"max_features":["auto","sqrt","log2"],"max_depth":[10,50,100],"min_samples_split":[2,5,10],"min_samples_leaf":[1,2,4],"bootstrap":[True,False]}

rf=RandomForestClassifier(criterion="gini",random_state=42)
grid_search=GridSearchCV(estimator=rf,param_grid=param_grid,cv=3,n_jobs=-1,verbose=2)
grid_search.fit(x_train,y_train)

print(grid_search.best_params_)

best_rf=grid_search.best_estimator_
predict_rf=best_rf.predict(x_test)

print("Random Forest Report")
print(classification_report(y_test,predict_rf))
print("ROC-AUC-->",roc_auc_score(y_test,best_rf.predict_proba(x_test)[:,1])) # select all row from column 2
# evaluating classification performance at threshold levl ROC-->Receiver operating Classification area under curve useful during plotting.


print("Refrence")
print("Accuracy:",accuracy_score(y_test,predict_rf))
print("Precision:",precision_score(y_test,predict_rf))
print("Recall:",recall_score(y_test,predict_rf))

Mounted at /content/drive
Class
0    284315
1       492
Name: count, dtype: int64

 Missing Value Handling
Fitting 3 folds for each of 486 candidates, totalling 1458 fits
{'bootstrap': True, 'max_depth': 50, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Random Forest Report
              precision    recall  f1-score   support

           0       0.92      0.97      0.95        99
           1       0.97      0.92      0.94        98

    accuracy                           0.94       197
   macro avg       0.95      0.94      0.94       197
weighted avg       0.95      0.94      0.94       197

ROC-AUC--> 0.98433312719027
Refrence
Accuracy: 0.9441624365482234
Precision: 0.967741935483871
Recall: 0.9183673469387755
