<a href="https://colab.research.google.com/github/arraakularavind/AspireNex/blob/main/credit_card_fraud_detection/credit_card_fraud_detection_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!pip install tabulate
#!pip install ipywidgets voila
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,RandomizedSearchCV,GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report,roc_auc_score,precision_score,accuracy_score,recall_score,f1_score
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from collections import Counter
from google.colab import drive
from tabulate import tabulate
import ipywidgets as widgets
from IPython.display import display,clear_output


def preprocess_clean(data):
  print(data["Class"].value_counts())
  print("\nMissing values in target variable = ",data["Class"].isna().sum())

  imputer=SimpleImputer(strategy="median")  #handling null values in class column
  data_imputed=imputer.fit_transform(data)
  data=pd.DataFrame(data_imputed,columns=data.columns)

  x=data.drop("Class",axis=1) #axis=1 to drop column "class"
  y=data["Class"]

  scaler=RobustScaler()
  x_scaling=scaler.fit_transform(x)  #fit_transform-->using median, interquantile range(IQR) for scaling and transform them to [-1 to 1]

  x_train,x_test,y_train,y_test=train_test_split(x_scaling,y,test_size=0.2,random_state=42,stratify=y)  #stratify--> the train and test set have same proportion over class(Label)
                                                                                        #instead SMOTE can be used--> when dataset is overfitted.

  smote=SMOTE(random_state=42,sampling_strategy="minority")      # random_state to ensure that split is reproduciblity  and control shuffling process
  x_train_smote,y_train_smote=smote.fit_resample(x_train,y_train)  # even distribution of class column by generating synethtic  sample

  print("\nBalance Over Train [Class] Dataset Completed")

  smote_test=SMOTE(random_state=42,sampling_strategy="minority") #balance over test_dataset too.
  x_test_smote,y_test_smote=smote_test.fit_resample(x_test,y_test)
  print("\nBalance over Test [Class] Dataset Completed\n")


  return x_train_smote,y_train_smote,x_test_smote,y_test_smote

def RandomForest(data):

  x_train_smote,y_train_smote,x_test_smote,y_test_smote=preprocess_clean(data)

  rf=RandomForestClassifier(criterion="gini",n_estimators=50,random_state=42,max_depth=10,min_samples_split=5,min_samples_leaf=1,bootstrap=True) #These parameter are found using hyperparameter combination using either GridSearchCV and RandomSearchCV
  rf.fit(x_train_smote,y_train_smote)
  print("\nPerforming Prediction\n")
  predict_rf=rf.predict(x_test_smote)

  c_report=classification_report(y_test_smote,predict_rf)
  roc_aoc=roc_auc_score(y_test_smote,rf.predict_proba(x_test_smote)[:,1]) # select all row from column 2
  # evaluating classification performance at threshold level ROC-->Receiver operating Classification area under curve useful during plotting.

  accuracy=accuracy_score(y_test_smote,predict_rf)
  precision=precision_score(y_test_smote,predict_rf)
  recall=recall_score(y_test_smote,predict_rf)
  f1=f1_score(y_test_smote,predict_rf)

  print("\nRondom Forest Classification Report\n")
  print("Verify support for balance of Dataset over Class\n")

  print(c_report)

  return roc_aoc,accuracy,classification_report(y_test_smote,predict_rf,output_dict=True)

def Decision_Tree(data):

  x_train_smote,y_train_smote,x_test_smote,y_test_smote=preprocess_clean(data)

  dt=DecisionTreeClassifier(random_state=42,min_samples_leaf=1,min_samples_split=2,max_depth=100,criterion="gini",max_features="sqrt")
  dt.fit(x_train_smote,y_train_smote)
  #print(dt.get_params())
  predict_dt=dt.predict(x_test_smote)

  c_report=classification_report(y_test_smote,predict_dt)
  roc_aoc=roc_auc_score(y_test_smote,dt.predict_proba(x_test_smote)[:,1]) # select all row from column 2
  # evaluating classification performance at threshold level ROC-->Receiver operating Classification area under curve useful during plotting.

  accuracy=accuracy_score(y_test_smote,predict_dt)
  precision=precision_score(y_test_smote,predict_dt)
  recall=recall_score(y_test_smote,predict_dt)
  f1=f1_score(y_test_smote,predict_dt)

  print("\nDecision Tree Classification Report\n")
  print("Verify support for balance of Dataset over Class\n")

  print(c_report)

  return roc_aoc,accuracy,classification_report(y_test_smote,predict_dt,output_dict=True)


def Logistic_Regression(data):

  x_train_smote,y_train_smote,x_test_smote,y_test_smote=preprocess_clean(data)
  lr=LinearRegression()
  lr.fit(x_train_smote,y_train_smote)

  predict_lr=lr.predict(x_test_smote)

  predict_lr_bin=(predict_lr>=0.5).astype(int) #convert the output to binary classification labels using 0.5 as threshold

  print("\nLogistic Regression Classification_report\n")
  print("Verify support for balance of Dataset over Class\n")

  print(classification_report(y_test_smote,predict_lr_bin))

  roc_auc=roc_auc_score(y_test_smote,predict_lr) # select all row from column 2

  accuracy=accuracy_score(y_test_smote,predict_lr_bin)
  precision=precision_score(y_test_smote,predict_lr_bin)
  recall=recall_score(y_test_smote,predict_lr_bin)

  return roc_auc,accuracy,classification_report(y_test_smote,predict_lr_bin,output_dict=True)




drive.mount("/content/drive",force_remount=True)
data=pd.read_csv("/content/drive/MyDrive/Colab Notebooks/creditcard.csv").sample(frac=1)


result={"Random Forest":[],"Decision Tree":[],"Logistic Regression":[]}

clear_output()

def display_metrics(model_name,prediction):
  table=[]

  table.append(["ROC-AUC",prediction[0]])
  table.append(["ACCURACY",prediction[1]])

  table.append(["PRECISION----->Valid_Transaction",prediction[2]["0.0"]["precision"]])
  table.append(["PRECISION----->Fraud_Transaction",prediction[2]["1.0"]["precision"]])
  table.append(["RECALL-------->Valid_Transaction", prediction[2]["0.0"]["recall"]])
  table.append(["RECALL-------->Fraud_Transaction",prediction[2]["1.0"]["recall"]])
  table.append(["F1_SCORE------>Valid_Transaction",prediction[2]["0.0"]["f1-score"]])
  table.append(["F1_SCORE------>Fraud_Transaction",prediction[2]["1.0"]["f1-score"]])

  #Formating the table in displaying

  table_html = "<div style='float: left; margin-right: 20px;'>"
  table_html += "<table style='border-collapse: collapse; text-align: left;'>"
  table_html += "<tr><th style='border: 1px solid black; padding: 8px;'>Parameter</th><th style='border: 1px solid black; padding: 8px;'>Value</th></tr>"
  for row in table:
      table_html += "<tr>"
      for col in row:
          table_html += f"<td style='border: 1px solid black; padding: 8px;'>{col}</td>"
      table_html += "</tr>"
  table_html += "</table>"
  table_html += "</div>"

  display(widgets.HTML(f"<h1>{model_name} Metrics</h1>"))
  display(widgets.HTML(table_html))

def on_button_clicked(model_name):
  if model_name=="Random Forest":
    prediction=RandomForest(data)
  elif model_name=="Decision Tree":
    prediction=Decision_Tree(data)
  elif model_name=="Logistic Regression":
    prediction=Logistic_Regression(data)
  else:
    return False
  display_metrics(model_name,prediction)
  print("\n")
  display(widgets.HBox([random_button,decision_button,logistic_button,thank]))

#Button info.
random_button=widgets.Button(description="Random Forest")
decision_button=widgets.Button(description="Decision Tree")
logistic_button=widgets.Button(description="Logistic Regression")
thank=widgets.Button(description="None")

#Button Display
display(widgets.HBox([random_button,decision_button,logistic_button,thank]))
random_button.on_click(lambda x: on_button_clicked("Random Forest"))
decision_button.on_click(lambda x: on_button_clicked("Decision Tree"))
logistic_button.on_click(lambda x: on_button_clicked("Logistic Regression"))
thank.on_click(lambda x: clear_output())



HBox(children=(Button(description='Random Forest', style=ButtonStyle()), Button(description='Decision Tree', s…

Class
0    284315
1       492
Name: count, dtype: int64

Missing values in target variable =  0

Balance Over Train [Class] Dataset Completed

Balance over Test [Class] Dataset Completed


Performing Prediction


Rondom Forest Classification Report

Verify support for balance of Dataset over Class

              precision    recall  f1-score   support

         0.0       0.87      1.00      0.93     56864
         1.0       1.00      0.85      0.92     56864

    accuracy                           0.92    113728
   macro avg       0.93      0.92      0.92    113728
weighted avg       0.93      0.92      0.92    113728



HTML(value='<h1>Random Forest Metrics</h1>')

HTML(value="<div style='float: left; margin-right: 20px;'><table style='border-collapse: collapse; text-align:…





HBox(children=(Button(description='Random Forest', style=ButtonStyle()), Button(description='Decision Tree', s…

Class
0    284315
1       492
Name: count, dtype: int64

Missing values in target variable =  0

Balance Over Train [Class] Dataset Completed

Balance over Test [Class] Dataset Completed


Decision Tree Classification Report

Verify support for balance of Dataset over Class

              precision    recall  f1-score   support

         0.0       0.84      1.00      0.91     56864
         1.0       1.00      0.80      0.89     56864

    accuracy                           0.90    113728
   macro avg       0.92      0.90      0.90    113728
weighted avg       0.92      0.90      0.90    113728



HTML(value='<h1>Decision Tree Metrics</h1>')

HTML(value="<div style='float: left; margin-right: 20px;'><table style='border-collapse: collapse; text-align:…





HBox(children=(Button(description='Random Forest', style=ButtonStyle()), Button(description='Decision Tree', s…

Class
0    284315
1       492
Name: count, dtype: int64

Missing values in target variable =  0

Balance Over Train [Class] Dataset Completed

Balance over Test [Class] Dataset Completed


Logistic Regression Classification_report

Verify support for balance of Dataset over Class

              precision    recall  f1-score   support

         0.0       0.86      0.99      0.92     56864
         1.0       0.98      0.84      0.90     56864

    accuracy                           0.91    113728
   macro avg       0.92      0.91      0.91    113728
weighted avg       0.92      0.91      0.91    113728



HTML(value='<h1>Logistic Regression Metrics</h1>')

HTML(value="<div style='float: left; margin-right: 20px;'><table style='border-collapse: collapse; text-align:…





HBox(children=(Button(description='Random Forest', style=ButtonStyle()), Button(description='Decision Tree', s…

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,precision_recall_fscore_support,roc_auc_score,precision_score,accuracy_score,recall_score
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from collections import Counter
from google.colab import drive
drive.mount("/content/drive",force_remount=True)

data=pd.read_csv("/content/drive/MyDrive/Colab Notebooks/creditcard.csv",nrows=10000).sample(frac=1,random_state=42)

print(data["Class"].value_counts())
print("Missing values in target variable",data["Class"].isna().sum())

imputer=SimpleImputer(strategy="median")
data_imputed=imputer.fit_transform(data)
data=pd.DataFrame(data_imputed,columns=data.columns)


x=data.drop("Class",axis=1) #axis=1 to drop column "class"
y=data["Class"]

scaler=RobustScaler()
x_scaling=scaler.fit_transform(x)  #fit_transform-->includes both picking mini. and maxi value from dataset and transform them to [0-1]

x_train,x_test,y_train,y_test=train_test_split(x_scaling,y,test_size=0.2,random_state=42,stratify=y)  #stratify--> the train and test set have same proportion over class(Label)
                                                                                        #instead SMOTE can be used--> when dataset is overfitted.

smote=SMOTE(random_state=42)      # random_state to ensure that split is reproduciblity  and control shuffling process
x_train_smote,y_train_smote=smote.fit_resample(x_train,y_train)  # even distribution of class column by generating synethtic  sample

param_grid={"n_estimators":[50,100,200],"max_features":["auto","sqrt","log2"],"max_depth":[10,50,100],"min_samples_split":[2,5,10],"min_samples_leaf":[1,2,4],"bootstrap":[True,False]}

rf=RandomForestClassifier(criterion="gini",random_state=42)
grid_search=GridSearchCV(estimator=rf,param_grid=param_grid,cv=3,n_jobs=-1,verbose=2)
grid_search.fit(x_train_smote,y_train_smote)

smote_test=SMOTE(random_state=42,sampling_strategy="minority")
x_test_smote,y_test_smote=smote_test.fit_resample(x_test,y_test)


print(grid_search.best_params_)

best_rf=grid_search.best_estimator_
predict_rf=best_rf.predict(x_test_smote)

print("Random Forest Report")
print(classification_report(y_test_smote,predict_rf))
print("ROC-AUC-->",roc_auc_score(y_test_smote,best_rf.predict_proba(x_test_smote)[:,1])) # select all row from column 2
# evaluating classification performance at threshold levl ROC-->Receiver operating Classification area under curve useful during plotting.


print("Refrence")
print("Accuracy:",accuracy_score(y_test_smote,predict_rf))
print("Precision:",precision_score(y_test_smote,predict_rf))
print("Recall:",recall_score(y_test_smote,predict_rf))

Mounted at /content/drive
Class
0    9962
1      38
Name: count, dtype: int64
Missing values in target variable 0
Fitting 3 folds for each of 486 candidates, totalling 1458 fits


  warn(


{'bootstrap': True, 'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
Random Forest Report
              precision    recall  f1-score   support

         0.0       0.96      1.00      0.98      1992
         1.0       1.00      0.96      0.98      1992

    accuracy                           0.98      3984
   macro avg       0.98      0.98      0.98      3984
weighted avg       0.98      0.98      0.98      3984

ROC-AUC--> 0.9999769408961147
Refrence
Accuracy: 0.9796686746987951
Precision: 1.0
Recall: 0.9593373493975904
