# Fraud Detection Notebook using Machine Learning

In [None]:
!pip install pandas_profiling

## Install LightGBM (Light Gradient Boosting Machine), a gradient-boosting framework based on decision-tree algorithms that is used for ranking, classification, and other machine learning tasks.

In [None]:
!pip install lightgbm

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import ipaddress
import pandas_profiling as pp
%matplotlib inline
from sklearn import preprocessing
plt.rc("font", size=14)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import warnings
warnings.filterwarnings("ignore")
import time
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from lightgbm import LGBMClassifier
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

In [None]:
import types
import pandas as pd

# Raw CSV location of the fraud dataset; pandas downloads and parses it directly.
url = 'https://raw.githubusercontent.com/IBM/predict-fraud-using-auto-ai/master/data/fraud_dataset.csv'
df = pd.read_csv(url)

# Print the first rows followed by the (rows, columns) shape as plain text.
for preview in (df.head(), df.shape):
    print(preview)

In [None]:
# Class balance of the target column.
# Fraud_Risk is treated as a binary label in which 1 flags a fraud-risk record
# and 0 a non-fraud record (assumed encoding - confirm against the dataset
# description). Each counter must match the label its name and its printed
# message refer to, otherwise the two percentages are reported swapped.
count_fraud = len(df[df['Fraud_Risk']==1])
count_non_fraud = len(df[df['Fraud_Risk']==0])
pct_of_non_fraud = count_non_fraud/(count_non_fraud +count_fraud)
print("percentage of non Fraud Risk is", round(pct_of_non_fraud*100,2))
pct_of_fraud = count_fraud/(count_non_fraud +count_fraud)
print("percentage of Fraud Risk", round(pct_of_fraud*100,2))

In [None]:
# Bar chart with one bar per Fraud_Risk value, showing how many rows carry it.
sns.countplot(data=df, x='Fraud_Risk', palette='hls')
plt.show()

In [None]:
# Per-class mean of every column, grouped by the Fraud_Risk label.
df.groupby(by='Fraud_Risk').mean()

In [None]:
# Pairwise Pearson correlation between the columns of the dataset.
df.corr(method='pearson')

In [None]:
## Select input and target variables

In [None]:
# Positional split: the first 12 columns are the model inputs, every column
# from position 12 onward forms the target frame.
feature_columns = df.columns[0:12]
target_columns = df.columns[12:]

X = df[feature_columns]
y = df[target_columns]

In [None]:
# Show the dtype pandas assigned to each column of the loaded dataset.
df.dtypes

In [None]:
# Number of missing values in each column. A per-column total answers
# "is anything missing?" at a glance, whereas the raw boolean mask from
# df.isna() has as many cells as the dataset itself.
df.isna().sum()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

## Splitting the data with 70% as training set and 30% as test set

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Report the shape of each piece produced by the train/test split.
for label, part in (
    ("Train_x Shape :: ", X_train),
    ("Train_y Shape :: ", y_train),
    ("Test_x Shape :: ", X_test),
    ("Test_y Shape :: ", y_test),
):
    print(label, part.shape)

In [None]:
# Wrap the training split in LightGBM's native Dataset container.
# NOTE(review): no other cell visible in this notebook reads d_train.
d_train = lgb.Dataset(data=X_train, label=y_train)

In [None]:
def LGBM_classifier(features, target):
    """
    Fit a LightGBM binary classifier on the supplied training data.

    The estimator is configured with the 'binary' objective and the
    'binary_logloss' metric; every other setting is left at its default.

    :param features: training inputs handed to ``fit``
    :param target: training labels handed to ``fit``
    :return: the fitted LGBMClassifier instance
    """
    classifier = LGBMClassifier(objective='binary', metric='binary_logloss')
    classifier.fit(features, target)
    return classifier

# Train on the training split and measure how long fitting takes.
start = time.time()
# ravel() flattens the single-column target frame into a 1-D label array.
trained_model = LGBM_classifier(X_train, y_train.values.ravel())
elapsed = time.time() - start
print("> Completion Time : ", elapsed)
print("Trained LGBM model :: ", trained_model)

# Predicted labels for the held-out test split; later cells score these.
predictions = trained_model.predict(X_test)

## Evaluate model accuracy on the training and test sets

In [None]:
# Accuracy on the data the model was fitted on, then on the held-out split.
train_predictions = trained_model.predict(X_train)
print("Train Accuracy :: ", accuracy_score(y_train, train_predictions))
print("LGBM Model Test Accuracy is :: ", accuracy_score(y_test, predictions))

In [None]:
# Confusion matrix of true test labels against the model's predictions.
test_confusion = confusion_matrix(y_test, predictions)
print(" Confusion matrix ", test_confusion)

In [None]:
# Pair each importance score reported by the trained model with its feature name.
importances = trained_model.feature_importances_
feat_imp = pd.Series(importances, index=X.columns)

# Horizontal bar chart of the 12 highest-scoring features.
top_features = feat_imp.nlargest(12)
top_features.plot(kind='barh', figsize=(8, 10))

## SHAP (SHapley Additive exPlanations) is a unified approach to explain the output of any machine learning model.

In [None]:
!pip install shap

In [None]:
import shap
shap.initjs()

In [None]:
# Build a tree explainer around the fitted model's underlying booster, then
# compute SHAP values for every row of the training split.
explainer = shap.TreeExplainer(trained_model.booster_)
shap_values = explainer.shap_values(X_train)

In [None]:
# Summary plot of the SHAP values alongside the training features they were
# computed from.
shap.summary_plot(shap_values, features=X_train)

### In this notebook, we performed exploratory data analysis, built a model, examined feature importance, and interpreted the model's attributes to help improve its accuracy. This demonstrates some of the essential steps involved in building predictive models by hand, which can take days to complete. All of these steps are automated by the AutoAI feature in Watson Studio, where machine learning models can be created and deployed in minutes.