In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# Load the payment-fraud dataset from the Kaggle input directory.
data = pd.read_csv('/kaggle/input/pragyanaiprojectthon-payment-fraud-data/payment_fraud.csv')
# A bare last expression uses the notebook's rich table rendering instead of plain text.
data.head()



In [None]:
# Summarise column dtypes and non-null counts of the freshly loaded frame.
data.info()

In [None]:
# Summary statistics of the frame (pandas defaults).
data.describe()

In [None]:
# List the column names of the frame.
data.columns

# categorical value

In [None]:
# Distinct values present in the paymentMethod column.
data.paymentMethod.unique()

In [None]:
# Number of rows per paymentMethod value.
data.paymentMethod.value_counts()

# converting categorical to numerical

In [None]:
# Encode the payment method as a binary flag: 0 = online, 1 = card based.
# The result is assigned back to the column instead of calling an in-place
# replace on `data.paymentMethod`: that form is chained assignment, which pandas
# warns about and which leaves the frame unchanged under copy-on-write.
PAYMENT_METHOD_CODES = {
    'paypal': 0,       # Online
    'storecredit': 1,  # Card Based
    'creditcard': 1,   # Card Based
}
# Values without a code (including already-encoded 0/1 when the cell is re-run)
# pass through unchanged, matching what replace() did.
data['paymentMethod'] = data['paymentMethod'].map(
    lambda method: PAYMENT_METHOD_CODES.get(method, method)
)

In [None]:
# Re-check the column dtypes after the paymentMethod encoding above.
data.info()

# EDA

In [None]:
# Pairwise relationship grid of the columns, coloured by the target label.
pair_grid = sns.pairplot(data=data, hue='label')
plt.show()



# Check whether the classes are balanced

In [None]:
# Bar chart of the class counts, annotated with the exact number of rows per class.
# One subplots() call creates and sizes the figure; the former extra plt.figure()
# left an empty figure behind and its figsize never reached the plotted axes.
fig, ax = plt.subplots(figsize=(8,6))
ax.set_title('Distribution of Target', size=18)
sns.countplot(x=data['label'], ax=ax)
target_count = data.label.value_counts()
# Write each count slightly above its bar (x positions 0 and 1).
ax.annotate(target_count[0], xy=(-0.04,10+target_count[0]), size=14)
ax.annotate(target_count[1], xy=(0.96,10+target_count[1]), size=14)
ax.set_ylim(0,40000)
plt.show()

In [None]:
# Balance the classes with imbalanced-learn's random over-sampler.
from imblearn.over_sampling import RandomOverSampler
y = data['label']
x = data.drop('label', axis=1)
target_count = data.label.value_counts()
ros = RandomOverSampler(random_state=42)
x_rus, y_rus = ros.fit_resample(x, y)
modified_count = y_rus.value_counts()
# Report the row counts of label 0 and label 1 before and after resampling.
print('original dataset shape:', target_count[0], target_count[1])
print('Resample dataset shape', modified_count[0], modified_count[1])

In [None]:
# Split the ORIGINAL data first, then oversample only the training portion.
# Oversampling before the split copies minority rows into both sides, so the
# test set would contain rows the models were trained on and every score
# reported below would be inflated.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42, stratify=y)
x_train, y_train = ros.fit_resample(x_train, y_train)

In [None]:
# Fit the scaler on the training split only, then apply the same transform
# to both splits.
scalar = StandardScaler().fit(x_train)
x_train, x_test = scalar.transform(x_train), scalar.transform(x_test)

In [None]:
# Baseline model: logistic regression with scikit-learn's default settings.
clf = LogisticRegression().fit(x_train, y_train)
clf

In [None]:
# Evaluate the current `clf` on the test split.
y_pred = np.array(clf.predict(x_test))
cm = confusion_matrix(y_test, y_pred)
conf_mat = pd.DataFrame(cm,
                        index=['Act.Negative', "Act.Positive"],
                        columns=["Pred.Negative", "Pred.Positive"])
# Flatten the 2x2 matrix row by row into its four cells.
tn, fp, fn, tp = cm.ravel()
accuracy = round((tn+tp)/(tn+fp+fn+tp), 4)
print(conf_mat)
print(f'\n Accuracy = {round(100*accuracy, 2)}%')

In [None]:
# scikit-plot supplies the learning-curve, feature-importance and metric plots used below.
import scikitplot as skplt 
print("Scikit Plot Version : ", skplt.__version__)

In [None]:
# Learning curve of the current `clf` under 7-fold cross-validation on the training split.
# The title used to read "Digits ...", which does not describe this payment-fraud data.
skplt.estimators.plot_learning_curve(clf, x_train, y_train,
                                     cv=7, shuffle=True, scoring="accuracy",
                                     n_jobs=-1, figsize=(6,4),
                                     title_fontsize="large", text_fontsize="large",
                                     title="Payment Fraud Classification Learning Curve");

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
# Random forest with default hyper-parameters. The fixed seed makes the score
# (and the feature importances plotted from this model) reproducible across runs.
rf_reg = RandomForestClassifier(random_state=42)
rf_reg.fit(x_train, y_train)
rf_reg.score(x_test, y_test)



In [None]:
# Gradient boosting with default hyper-parameters. The fixed seed makes the
# score (and the feature importances plotted from this model) reproducible.
gb_classif = GradientBoostingClassifier(random_state=42)
gb_classif.fit(x_train, y_train)
gb_classif.score(x_test, y_test)

In [None]:
# Side-by-side feature importances of the random forest and the gradient boosting model.
fig = plt.figure(figsize=(15,6))
# Take the names from the feature frame so they always match the column order
# the models were trained on (scaling turned the frame into a bare array).
features = list(x.columns)
ax1 = fig.add_subplot(121)
# Titles corrected: the forest is a classifier, and "Classsifier" was a typo.
skplt.estimators.plot_feature_importances(rf_reg, feature_names=features,
                                          title="Random Forest Classifier Feature Importance",
                                          x_tick_rotation=90, order="ascending",
                                          ax=ax1);
ax2 = fig.add_subplot(122)
skplt.estimators.plot_feature_importances(gb_classif, feature_names=features,
                                          title="Gradient Boosting Classifier Feature Importance",
                                          x_tick_rotation=90,
                                          ax=ax2);
plt.tight_layout()

In [None]:
# Raw-count (left) and normalised (right) confusion matrices for the current `clf`.
Y_pred = clf.predict(x_test)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,6))
skplt.metrics.plot_confusion_matrix(y_test, Y_pred, title="Confusion Matrix",
                                    cmap="Oranges", ax=ax1)
skplt.metrics.plot_confusion_matrix(y_test, Y_pred, normalize=True,
                                    title="Confusion Matrix", cmap="Purples", ax=ax2)

In [None]:
# Class probabilities from the current `clf`; also reused by the curve plots in the next cells.
Y_test_probs = clf.predict_proba(x_test)
# Title corrected: it used to read "Digits ROC Curve", which does not describe this data.
skplt.metrics.plot_roc_curve(y_test, Y_test_probs, title="Payment Fraud ROC Curve", figsize=(12,6));

In [None]:
# Precision-recall curve for the probabilities in Y_test_probs.
# Title corrected: it used to read "Digits ...", which does not describe this data.
skplt.metrics.plot_precision_recall_curve(y_test, Y_test_probs, title="Payment Fraud Precision-Recall Curve", figsize=(12,6));

In [None]:
# Cumulative gains chart for the class probabilities in Y_test_probs.
skplt.metrics.plot_cumulative_gain(y_test, Y_test_probs, figsize=(10,6));

In [None]:
# Accumulators for the model comparison: one display name and one test score per model.
method_names, method_scores = [], []

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier with k = 3.
knn = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)
knn_score = knn.score(x_test, y_test)
print("Score for Number of Neighbors = 3: {}".format(knn_score))
method_names.append("KNN")
method_scores.append(knn_score)
# Confusion matrix on the test split, drawn as an annotated heatmap.
y_pred = knn.predict(x_test)
conf_mat = confusion_matrix(y_test, y_pred)
f, ax = plt.subplots(figsize=(5,5))
sns.heatmap(conf_mat, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
ax.set_xlabel("Predicted Values")
ax.set_ylabel("True Values")
plt.show()

In [None]:
from sklearn.svm import SVC
# Support vector classifier with default settings and a fixed seed.
svm = SVC(random_state=42).fit(x_train, y_train)
svm_score = svm.score(x_test, y_test)
print("SVM Classification Score is: {}".format(svm_score))
method_names.append("SVM")
method_scores.append(svm_score)
# Confusion matrix on the test split, drawn as an annotated heatmap.
y_pred = svm.predict(x_test)
conf_mat = confusion_matrix(y_test, y_pred)
f, ax = plt.subplots(figsize=(5,5))
sns.heatmap(conf_mat, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
ax.set_xlabel("Predicted Values")
ax.set_ylabel("True Values")
plt.show()


In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes. It is fitted on the TRAINING split: the earlier version
# fitted on the test split and then scored on those same rows, so its score was
# not comparable with the other models in the comparison.
naive_bayes = GaussianNB()
naive_bayes.fit(x_train, y_train)
nb_score = naive_bayes.score(x_test, y_test)
print("Naive Bayes Classification Score: {}".format(nb_score))
method_names.append("Naive Bayes")
method_scores.append(nb_score)
# Confusion matrix on the test split, drawn as an annotated heatmap.
y_pred = naive_bayes.predict(x_test)
conf_mat = confusion_matrix(y_test, y_pred)
f, ax = plt.subplots(figsize=(5,5))
sns.heatmap(conf_mat, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
ax.set_xlabel("Predicted Values")
ax.set_ylabel("True Values")
plt.show()

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Single decision tree with default settings.
dec_tree = DecisionTreeClassifier().fit(x_train, y_train)
dec_tree_score = dec_tree.score(x_test, y_test)
print("Decision Tree Classification Score: ", dec_tree_score)
method_names.append("Decision Tree")
method_scores.append(dec_tree_score)
# Confusion matrix on the test split, drawn as an annotated heatmap.
y_pred = dec_tree.predict(x_test)
conf_mat = confusion_matrix(y_test, y_pred)
f, ax = plt.subplots(figsize=(5,5))
sns.heatmap(conf_mat, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
ax.set_xlabel("Predicted Values")
ax.set_ylabel("True Values")
plt.show()


In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 100 trees with a fixed seed.
rand_forest = RandomForestClassifier(n_estimators=100, random_state=42).fit(x_train, y_train)
rand_forest_score = rand_forest.score(x_test, y_test)
print("Random Forest Classification Score: ", rand_forest_score)
method_names.append("Random Forest")
method_scores.append(rand_forest_score)
# Confusion matrix on the test split, drawn as an annotated heatmap.
y_pred = rand_forest.predict(x_test)
conf_mat = confusion_matrix(y_test, y_pred)
f, ax = plt.subplots(figsize=(5,5))
sns.heatmap(conf_mat, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
ax.set_xlabel("Predicted Values")
ax.set_ylabel("True Values")
plt.show()

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting classifier with explicit hyper-parameters.
gbc = GradientBoostingClassifier(
    n_estimators=500,
    learning_rate=0.05,
    random_state=100,
    max_features=5,
)
gbc.fit(x_train, y_train)
gbc_score = gbc.score(x_test, y_test)
print("Gradient Forest Classification Score: ", gbc_score)
method_names.append("Gradient Forest")
method_scores.append(gbc_score)
# Confusion matrix on the test split, drawn as an annotated heatmap.
y_pred = gbc.predict(x_test)
conf_mat = confusion_matrix(y_test, y_pred)
f, ax = plt.subplots(figsize=(5,5))
sns.heatmap(conf_mat, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
ax.set_xlabel("Predicted Values")
ax.set_ylabel("True Values")
plt.show()


In [None]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with 100 estimators; note that this rebinds the shared name `clf`.
clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf.fit(x_train, y_train)
ada_score = clf.score(x_test, y_test)
print("Ada Boost Classification Score: ", ada_score)
method_names.append("Ada Boost")
method_scores.append(ada_score)
# Confusion matrix on the test split, drawn as an annotated heatmap.
y_pred = clf.predict(x_test)
conf_mat = confusion_matrix(y_test, y_pred)
f, ax = plt.subplots(figsize=(5,5))
sns.heatmap(conf_mat, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
ax.set_xlabel("Predicted Values")
ax.set_ylabel("True Values")
plt.show()

In [None]:
import xgboost as xgb
# XGBoost classifier; `lr` is passed as the eta parameter.
lr = 0.01
xgb_classifier = xgb.XGBClassifier(eta=lr)
xgb_classifier.fit(x_train, y_train)
xgb_score = xgb_classifier.score(x_test, y_test)
print("XGB Boost Classification Score: ", xgb_score)
method_names.append("XGB Boost")
method_scores.append(xgb_score)
# Confusion matrix on the test split, drawn as an annotated heatmap.
y_pred = xgb_classifier.predict(x_test)
conf_mat = confusion_matrix(y_test, y_pred)
f, ax = plt.subplots(figsize=(5,5))
sns.heatmap(conf_mat, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
ax.set_xlabel("Predicted Values")
ax.set_ylabel("True Values")
plt.show()


In [None]:
from sklearn.ensemble import ExtraTreesClassifier
# Extra-trees ensemble with 100 estimators; note that this rebinds the shared name `clf`.
clf = ExtraTreesClassifier(n_estimators=100, random_state=0)
clf.fit(x_train, y_train)
extra_trees_score = clf.score(x_test, y_test)
print("ExtraTrees Classification Score: ", extra_trees_score)
method_names.append("ExtraTrees Classifier")
method_scores.append(extra_trees_score)
# Confusion matrix on the test split, drawn as an annotated heatmap.
y_pred = clf.predict(x_test)
conf_mat = confusion_matrix(y_test, y_pred)
f, ax = plt.subplots(figsize=(5,5))
sns.heatmap(conf_mat, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
ax.set_xlabel("Predicted Values")
ax.set_ylabel("True Values")
plt.show()


In [None]:
import lightgbm as lgb
# LightGBM classifier with default settings; note that this rebinds the shared name `clf`.
clf = lgb.LGBMClassifier()
clf.fit(x_train, y_train)
lgbm_score = clf.score(x_test, y_test)
print("LightGBM Classification Score: ", lgbm_score)
method_names.append("LightGBM Classifier")
method_scores.append(lgbm_score)
# Confusion matrix on the test split, drawn as an annotated heatmap.
y_pred = clf.predict(x_test)
conf_mat = confusion_matrix(y_test, y_pred)
f, ax = plt.subplots(figsize=(5,5))
sns.heatmap(conf_mat, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
ax.set_xlabel("Predicted Values")
ax.set_ylabel("True Values")
plt.show()

In [None]:
from catboost import CatBoostClassifier
# CatBoost classifier limited to 5 iterations; note that this rebinds the shared name `clf`.
clf = CatBoostClassifier(iterations=5, learning_rate=0.1, loss_function='CrossEntropy')
clf.fit(x_train, y_train)
catboost_score = clf.score(x_test, y_test)
print("CatBoost Classification Score: ", catboost_score)
method_names.append("CatBoost Classifier")
method_scores.append(catboost_score)
# Confusion matrix on the test split, drawn as an annotated heatmap.
y_pred = clf.predict(x_test)
conf_mat = confusion_matrix(y_test, y_pred)
f, ax = plt.subplots(figsize=(5,5))
sns.heatmap(conf_mat, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
ax.set_xlabel("Predicted Values")
ax.set_ylabel("True Values")
plt.show()

In [None]:
# Bar chart comparing the test score of every model recorded so far.
fig, ax = plt.subplots(figsize=(15,10))
ax.set_ylim([0.75,1])
ax.bar(method_names, method_scores, width=0.75)
ax.set_xlabel('Method Name')
ax.set_ylabel('Method Score')

In [None]:
# Display the collected scores (same order as method_names).
method_scores

In [None]:
# Rank the (score, name) pairs from highest to lowest score, then split them
# back into two parallel tuples: scores first, names second.
combined = list(zip(method_scores, method_names))
sorted_combined = sorted(combined, key=lambda pair: pair[0], reverse=True)
print(sorted_combined)
sorted_list1, sorted_list2 = zip(*sorted_combined)
print(sorted_list1)
print(sorted_list2)

In [None]:
# Keep the first six entries (positions 0-5) of the ranked scores and names.
max_scores = list(sorted_list1[:6])
max_methods = list(sorted_list2[:6])

In [None]:
# Bar chart of the top-ranked models only.
fig, ax = plt.subplots(figsize=(15,10))
ax.set_ylim([0.75,1])
ax.bar(max_methods, max_scores, width=0.5)
ax.set_xlabel('Method Name')
ax.set_ylabel('Method Score')

In [None]:
# Print each top-ranked model next to its score.
for method, score in zip(max_methods, max_scores):
    print(method, score)

# Stacking classifier

In [None]:
from sklearn.ensemble import StackingClassifier
# Base learners of the stack; an AdaBoost model is the final estimator on top of them.
estimators = [
    ('lgb', lgb.LGBMClassifier()),
    ('catb', CatBoostClassifier(iterations=5, learning_rate=0.1, loss_function='CrossEntropy')),
    ('xgb', xgb.XGBClassifier(eta = 0.01)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
]
meta_learner = AdaBoostClassifier(n_estimators=100, random_state=0)
# Note that this rebinds the shared name `clf`, which the following cells read.
clf = StackingClassifier(estimators=estimators, final_estimator=meta_learner)
clf.fit(x_train, y_train)
stacking_score = clf.score(x_test, y_test)
print("Stacking Classification Score: ", stacking_score)
method_names.append("Stacking Classifier")
method_scores.append(stacking_score)
# Confusion matrix on the test split, drawn as an annotated heatmap.
y_pred = clf.predict(x_test)
conf_mat = confusion_matrix(y_test, y_pred)
f, ax = plt.subplots(figsize=(5,5))
sns.heatmap(conf_mat, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
ax.set_xlabel("Predicted Values")
ax.set_ylabel("True Values")
plt.show()

In [None]:
from sklearn.metrics import classification_report
# Text report of the per-class metrics for the predictions currently held in `y_pred`.
target_names = ['NotFraud', 'Fraud']
report = classification_report(y_test, y_pred, target_names=target_names)
print(report)

In [None]:
# Class probabilities from the current `clf`; also reused by the precision-recall plot in the next cell.
Y_test_probs = clf.predict_proba(x_test)
# Title corrected: it used to read "Digits ROC Curve", which does not describe this data.
skplt.metrics.plot_roc_curve(y_test, Y_test_probs,
                             title="Payment Fraud ROC Curve", figsize=(12,6));

In [None]:
# Precision-recall curve for the probabilities in Y_test_probs.
# Title corrected: it used to read "Digits ...", which does not describe this data.
skplt.metrics.plot_precision_recall_curve(y_test, Y_test_probs,
                                          title="Payment Fraud Precision-Recall Curve", figsize=(12,6));

# another approach and explainable AI

# Yellowbrick

In [None]:
!pip install yellowbrick

In [None]:
# Re-read the CSV so this section starts again from the file contents.
data = pd.read_csv("/kaggle/input/pragyanaiprojectthon-payment-fraud-data/payment_fraud.csv")



In [None]:
# Encode the payment method as a binary flag: paypal -> 0, storecredit / creditcard -> 1.
# The result is assigned back to the column instead of calling an in-place
# replace on `data.paymentMethod`: that form is chained assignment, which pandas
# warns about and which leaves the frame unchanged under copy-on-write.
PAYMENT_METHOD_CODES = {'paypal': 0, 'storecredit': 1, 'creditcard': 1}
# Values without a code (including already-encoded 0/1 when the cell is re-run)
# pass through unchanged, matching what replace() did.
data['paymentMethod'] = data['paymentMethod'].map(
    lambda method: PAYMENT_METHOD_CODES.get(method, method)
)

In [None]:
# Balance the classes with imbalanced-learn's random over-sampler.
import imblearn
from imblearn.over_sampling import RandomOverSampler
y = data['label']
x = data.drop('label', axis=1)
target_count = data.label.value_counts()
ros = RandomOverSampler(random_state=42)
x_rus, y_rus = ros.fit_resample(x, y)
modified_count = y_rus.value_counts()
# Report the row counts of label 0 and label 1 before and after resampling.
print('original dataset shape:', target_count[0], target_count[1])
print('Resample dataset shape', modified_count[0], modified_count[1])

In [None]:
# Split the ORIGINAL data first, then oversample only the training portion.
# Oversampling before the split copies minority rows into both sides, so the
# test set would contain rows the models were trained on and the scores and
# explanations below would rest on leaked data.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42, stratify=y)
x_train, y_train = ros.fit_resample(x_train, y_train)

In [None]:
# Fit the scaler on the training split only, then apply the same transform
# to both splits.
scalar = StandardScaler().fit(x_train)
x_train, x_test = scalar.transform(x_train), scalar.transform(x_test)



In [None]:
from yellowbrick.classifier import ConfusionMatrix
from sklearn.ensemble import RandomForestClassifier
# Yellowbrick confusion-matrix visualizer wrapped around a seeded random forest:
# fit on the training split, score on the test split, then render.
forest_for_matrix = RandomForestClassifier(random_state=1)
visualizer = ConfusionMatrix(forest_for_matrix, classes=['NoFraud','Fraud'])
visualizer.fit(x_train, y_train)
visualizer.score(x_test, y_test)
visualizer.show();

In [None]:
from yellowbrick.classifier import ClassificationReport
from sklearn.tree import DecisionTreeClassifier
# Yellowbrick classification-report visualizer wrapped around a seeded decision tree,
# drawn on its own 8x6 figure with the support column enabled.
report_fig = plt.figure(figsize=(8,6))
tree_for_report = DecisionTreeClassifier(random_state=123)
viz = ClassificationReport(tree_for_report,
                           classes=['NoFraud','Fraud'],
                           support=True,
                           fig=report_fig)
viz.fit(x_train, y_train)
viz.score(x_test, y_test)
viz.show();

# SHAP (Shapley values)

In [None]:
!pip install shap

In [None]:
import shap
import matplotlib.pyplot as plt
# Load the JS visualisation code into the notebook.
shap.initjs()
# Seeded 100-tree random forest; the explanation cells below work on this model.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(x_train, y_train)
# Tree explainer for that forest, evaluated on the whole test split.
explainer = shap.TreeExplainer(rf_clf)
shap_values = explainer.shap_values(x_test)

In [None]:
# List the column names; the plots below pass feature names by hand.
data.columns

# Variable importance with summary plot

In [None]:
# Show the documentation of shap.summary_plot, the function used in the cells
# below. The earlier query `shap.summary.plot?` named an object that is not
# the function being called.
help(shap.summary_plot)

In [None]:
# Global view: summary plot over the SHAP values of all classes.
print("Variable Importance Plot - Global Interpretation")
figure = plt.figure()
summary_feature_names = ['accountAgeDays', 'numItems', 'localTime', 'paymentMethod','paymentMethodAgeDays']
shap.summary_plot(shap_values, features=x_test, feature_names=summary_feature_names)



In [None]:
# Class 1 - Fraud
# Depending on the installed shap version, shap_values is either a list with one
# (n_samples, n_features) array per class or a single array with the class on the
# last axis. Handle both, so that the class-1 values are selected rather than
# the values belonging to test row 1.
if isinstance(shap_values, list):
    fraud_shap_values = shap_values[1]
else:
    fraud_shap_values = np.asarray(shap_values)[:, :, 1]
shap.summary_plot(fraud_shap_values,feature_names =['accountAgeDays', 'numItems', 'localTime', 'paymentMethod','paymentMethodAgeDays'], features=x_test)

In [None]:
# Class 0 - No Fraud
# Depending on the installed shap version, shap_values is either a list with one
# (n_samples, n_features) array per class or a single array with the class on the
# last axis. Handle both, so that the class-0 values are selected rather than
# the values belonging to test row 0.
if isinstance(shap_values, list):
    no_fraud_shap_values = shap_values[0]
else:
    no_fraud_shap_values = np.asarray(shap_values)[:, :, 0]
shap.summary_plot(no_fraud_shap_values,feature_names =['accountAgeDays', 'numItems', 'localTime', 'paymentMethod','paymentMethodAgeDays'], features=x_test)

# dependence plot

In [None]:
# NOTE(review): this rebuilds the explainer and recomputes the SHAP values with the
# same model (rf_clf) and the same data (x_test) as the cell that trained rf_clf.
explainer = shap.TreeExplainer(rf_clf) 
shap_values = explainer.shap_values(x_test)

In [None]:
# Dependence plot of the first feature for class 0.
feature_index = 0
# shap_values is either a per-class list or a single array with the class on the
# last axis, depending on the installed shap version; select class 0 in both cases
# (plain shap_values[0] on an array would pick test row 0 instead).
if isinstance(shap_values, list):
    class0_shap_values = shap_values[0]
else:
    class0_shap_values = np.asarray(shap_values)[:, :, 0]
shap.dependence_plot(feature_index, class0_shap_values, features=x_test, feature_names=['accountAgeDays', 'numItems', 'localTime', 'paymentMethod','paymentMethodAgeDays'])

In [None]:
# Dependence plot of the same feature for class 1.
# shap_values is either a per-class list or a single array with the class on the
# last axis, depending on the installed shap version; select class 1 in both cases
# (plain shap_values[1] on an array would pick test row 1 instead).
if isinstance(shap_values, list):
    class1_shap_values = shap_values[1]
else:
    class1_shap_values = np.asarray(shap_values)[:, :, 1]
shap.dependence_plot(feature_index, class1_shap_values, features=x_test, feature_names=['accountAgeDays', 'numItems', 'localTime', 'paymentMethod','paymentMethodAgeDays'])

# LIME

In [None]:
!pip install lime

In [None]:
from lime.lime_tabular import LimeTabularExplainer
# Display labels for the two classes and for the five feature columns.
class_names = ['No Fraud', 'Fraud']
feature_names = ['accountAgeDays', 'numItems', 'localTime', 'paymentMethod','paymentMethodAgeDays']
# Build the tabular explainer from the training split.
# Note that this rebinds the name `explainer` used earlier for the SHAP explainer.
explainer = LimeTabularExplainer(
    x_train,
    feature_names=feature_names,
    class_names=class_names,
    mode='classification',
)

In [None]:
# Choose a specific instance from the test set
instance_idx = 0 # Replace with the index of the instance you want to explain
# Use the chosen index; it was previously ignored in favour of a hard-coded 0,
# so changing instance_idx had no effect.
instance = x_test[instance_idx]

# Explain the prediction for the chosen instance, using every feature
explanation = explainer.explain_instance(instance, rf_clf.predict_proba, num_features=len(feature_names))

In [None]:
#visualize the explanation
explanation.as_pyplot_figure()

In [None]:
# The same explanation in list form.
explanation.as_list()

In [None]:
# The same explanation in map (dict) form.
explanation.as_map()

# AutoML-pycaret

In [None]:
!pip install pycaret

In [None]:
# NOTE(review): the star import pulls every public name of pycaret.classification into
# the notebook namespace; the cells visible here only use setup() and compare_models().
from pycaret.classification import *

In [None]:
# Re-read the CSV so PyCaret receives the file contents without the manual encoding
# applied earlier (paymentMethod is declared as a categorical feature in setup()).
data = pd.read_csv("/kaggle/input/pragyanaiprojectthon-payment-fraud-data/payment_fraud.csv")

In [None]:
# Initialise the PyCaret classification experiment on the reloaded frame.
# session_id fixes the experiment's random seed so that re-running the notebook
# reproduces the same comparison; without it each run may differ.
clf1 = setup(data = data,
             target = 'label',
             numeric_imputation = 'mean',
             categorical_features = ['paymentMethod'],
             session_id = 42)

In [None]:
# Run PyCaret's model comparison on the experiment configured by setup() above.
compare_models()