# Fraud Detection

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
import warnings
warnings.filterwarnings('ignore')
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300

In [None]:
# Load the training transactions and discard the exported row-number column.
fraud_data = pd.read_csv('datasets/fraudTrain.csv').drop(columns='Unnamed: 0')

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts).
fraud_data.info()

# tag::Fin_ML_08_02[]

In [None]:
# Count each class once and reuse the result for the chart and the printout.
class_counts = fraud_data['is_fraud'].value_counts()
# value_counts() orders by frequency, so the slice labels are taken from its
# index instead of assuming that class 0 always comes first.
plt.pie(class_counts, labels=class_counts.index)
plt.title('Pie Chart for Dependent Variable')
print(class_counts)
plt.show()

In [None]:
import missingno as msno

# Bar chart of per-column completeness, used here to check for missing values.
msno.bar(fraud_data)


In [None]:
# Replace the raw timestamp string column with a parsed datetime column 'time'.
fraud_data['time'] = pd.to_datetime(fraud_data.pop('trans_date_trans_time'))

In [None]:
# Derive weekday-name and hour-of-day features from the parsed timestamp.
fraud_data['days'] = fraud_data['time'].dt.day_name()
fraud_data['hour'] = fraud_data['time'].dt.hour

In [None]:
def fraud_cat(cols):
    """Plot, for each column, its ten most frequent values among fraud cases.

    Reads the module-level ``fraud_data`` frame, keeps the rows where
    ``is_fraud == 1`` and draws one bar chart per entry of ``cols`` on a
    single 20x40 inch figure.

    Parameters
    ----------
    cols : sequence of str
        Column names of ``fraud_data`` to summarise.

    Returns
    -------
    tuple
        ``(categ, bar_plot)`` for the last column processed: the frame holding
        its top-ten value counts and the container returned by ``plt.bar``.
        Both are ``None`` when ``cols`` is empty.
    """
    categ, bar_plot = None, None
    # plt.subplot needs integer grid sizes. Keep the intended square
    # (len/2 x len/2) layout, but add rows whenever that grid would be too
    # small to hold every column (ceil division via the -(-a // b) idiom).
    grid_cols = max(1, len(cols) // 2)
    grid_rows = max(grid_cols, -(-len(cols) // grid_cols))
    plt.figure(figsize=(20, 40))
    for position, col in enumerate(cols, start=1):
        categ = (fraud_data.loc[fraud_data['is_fraud'] == 1, col]
                 .value_counts().sort_values(ascending=False)
                 .reset_index().head(10))
        plt.subplot(grid_rows, grid_cols, position)
        # After reset_index() the first column holds the category values and
        # the second the counts; the column *names* differ between pandas
        # versions, so both are addressed by position.
        bar_plot = plt.bar(categ.iloc[:, 0], categ.iloc[:, 1])
        plt.title(f'Cases per {col} Categories')
        plt.xticks(rotation=45)
    return categ, bar_plot

In [None]:
# Columns whose fraud-case frequencies are plotted.
cols = ['job', 'state', 'gender', 'category', 'days', 'hour']
# fraud_cat returns (counts frame, bar container) for the last column; only
# the bar container is kept here.
_, bar_plot = fraud_cat(cols)
bar_plot

In [None]:
# Restrict the data to the modelling columns: the target 'is_fraud' plus the
# selected features. NOTE: this rebinds `cols`, which earlier held the list
# of columns passed to fraud_cat.
cols=['amt','gender','state','category',
      'city_pop','job','is_fraud','days','hour']
fraud_data_df=fraud_data[cols]

In [None]:
# Names of the object-dtype columns, i.e. the ones to be one-hot encoded.
cat_cols=fraud_data[cols].select_dtypes(include='object').columns

In [None]:
def one_hot_encoded_cat(data, cat_cols):
    """Return a copy of ``data`` with ``cat_cols`` replaced by dummy columns.

    Each categorical column is removed and its one-hot indicator columns
    (prefixed with the column name, first level dropped) are appended on the
    right, in the order given by ``cat_cols``. The input frame is never
    modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns named in ``cat_cols``.
    cat_cols : iterable
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        The encoded frame; ``data`` itself when ``cat_cols`` is empty.
    """
    encoded = data
    for col in cat_cols:
        dummies = pd.get_dummies(encoded[col], prefix=col, drop_first=True)
        # drop() without inplace returns a new frame, so the caller's
        # DataFrame keeps all of its original columns.
        encoded = pd.concat([encoded.drop(columns=col), dummies], axis=1)
    return encoded

In [None]:
# One-hot encode the categorical columns; fraud_df is the frame used for modelling.
fraud_df = one_hot_encoded_cat(fraud_data_df, cat_cols)

In [None]:
# Keep only the non-object columns, minus the target, for the correlation plot.
num_col = fraud_data_df.select_dtypes(exclude='object').columns
fraud_data_df = fraud_data_df.loc[:, num_col].drop(columns='is_fraud')

In [None]:
# Annotated heat map of the pairwise correlations between the numerical features.
plt.figure(figsize=(10,6))
corrmat = fraud_data_df.corr()
top_corr_features = corrmat.index  # not referenced again in the visible code
heat_map = sns.heatmap(corrmat, annot=True, cmap="viridis")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import (classification_report,
                            confusion_matrix, f1_score)

In [None]:
# Partition the encoded frame by target value (1 = fraud, 0 = non-fraud).
fraud_class = fraud_df.loc[fraud_df['is_fraud'] == 1]
non_fraud_class = fraud_df.loc[fraud_df['is_fraud'] == 0]

In [None]:
# value_counts() sorts by frequency, so unpacking it positionally only works
# while class 0 is the majority. Look each size up by its label instead.
class_sizes = fraud_df['is_fraud'].value_counts()
non_fraud_count, fraud_count = class_sizes.loc[0], class_sizes.loc[1]
print('The number of observations in non_fraud_class:', non_fraud_count)
print('The number of observations in fraud_class:', fraud_count)

In [None]:
# Random under-sampling: draw as many non-fraud rows as there are fraud rows.
# The draw is seeded (like the train/test split) so that the balanced sample,
# and every metric computed from it, is reproducible across runs.
non_fraud_under = non_fraud_class.sample(n=fraud_count, random_state=0)
under_sampled = pd.concat([non_fraud_under, fraud_class], axis=0)
# Features and target of the balanced sample.
X_under = under_sampled.drop('is_fraud', axis=1)
y_under = under_sampled['is_fraud']

In [None]:
# Split the balanced sample into train and test parts with a fixed seed.
X_train_under, X_test_under, y_train_under, y_test_under =\
        train_test_split(X_under, y_under, random_state=0)

In [None]:
# Grid over regularisation strength and penalty type for logistic regression.
param_log = {'C': np.logspace(-4, 4, 4), 'penalty': ['l1', 'l2']}
# The grid contains the 'l1' penalty, which the default solver does not
# support; 'liblinear' handles both 'l1' and 'l2', so every candidate in the
# grid can actually be fitted and compared.
log_grid = GridSearchCV(LogisticRegression(solver='liblinear'),
                        param_grid=param_log, n_jobs=-1)
log_grid.fit(X_train_under, y_train_under)
# Predictions of the best estimator on the held-out balanced test set.
prediction_log = log_grid.predict(X_test_under)

In [None]:
# Evaluate the tuned logistic regression on the under-sampled test set.
conf_mat_log = confusion_matrix(y_true=y_test_under,
                                y_pred=prediction_log)
print('Confusion matrix:\n', conf_mat_log)
print('--' * 25)
print('Classification report:\n',
      classification_report(y_test_under, prediction_log))

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Exhaustive grid search over depth, split size and impurity criterion for a
# decision tree, fitted on the under-sampled training set.
param_dt = {'max_depth': [3, 5, 10],
            'min_samples_split': [2, 4, 6],
            'criterion': ['gini', 'entropy']}
dt_grid = GridSearchCV(DecisionTreeClassifier(),
                       param_grid=param_dt, n_jobs=-1)
dt_grid.fit(X_train_under, y_train_under)
# Predictions on the held-out balanced test set.
prediction_dt = dt_grid.predict(X_test_under)

In [None]:
# Evaluate the tuned decision tree on the under-sampled test set.
conf_mat_dt = confusion_matrix(y_true=y_test_under,
                               y_pred=prediction_dt)
print('Confusion matrix:\n', conf_mat_dt)
print('--' * 25)
print('Classification report:\n',
      classification_report(y_test_under, prediction_dt))

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Grid for the random forest. 'auto' was dropped from max_features: for
# classifiers it was an alias of 'sqrt' (so it only duplicated candidates)
# and newer scikit-learn releases reject it.
param_rf = {'n_estimators': [20, 50, 100],
            'max_depth': [3, 5, 10],
            'min_samples_split': [2, 4, 6],
            'max_features': ['sqrt', 'log2']}
rf_grid = GridSearchCV(RandomForestClassifier(),
                       param_grid=param_rf, n_jobs=-1)
rf_grid.fit(X_train_under, y_train_under)
# Predictions on the held-out balanced test set.
prediction_rf = rf_grid.predict(X_test_under)

In [None]:
# Evaluate the tuned random forest on the under-sampled test set.
conf_mat_rf = confusion_matrix(y_true=y_test_under,
                               y_pred=prediction_rf)
print('Confusion matrix:\n', conf_mat_rf)
print('--' * 25)
print('Classification report:\n', 
      classification_report(y_test_under, prediction_rf))

In [None]:
from xgboost import XGBClassifier

In [None]:
# Randomised search over the XGBoost hyper-parameters below. No random_state
# is passed, so the sampled candidates can differ between runs.
param_boost = {'learning_rate': [0.01, 0.1],
               'max_depth': [3, 5, 7],
               'subsample': [0.5, 0.7],
               'colsample_bytree': [0.5, 0.7],
               'n_estimators': [10, 20, 30]}
boost_grid = RandomizedSearchCV(XGBClassifier(),
                                param_boost, n_jobs=-1)
boost_grid.fit(X_train_under, y_train_under)
# Predictions on the held-out balanced test set.
prediction_boost = boost_grid.predict(X_test_under)


In [None]:
# Evaluate the tuned XGBoost model on the under-sampled test set.
conf_mat_boost = confusion_matrix(y_true=y_test_under,
                                  y_pred=prediction_boost)
print('Confusion matrix:\n', conf_mat_boost)
print('--' * 25)
print('Classification report:\n', 
      classification_report(y_test_under, prediction_boost))

## Cost-Based Fraud Examination

In [None]:
# Work on a random 20% of the rows for the cost-based analysis. The draw is
# seeded so that the cost matrix and the scores derived from it are
# reproducible, in line with the seeded train/test splits.
fraud_df_sampled = fraud_df.sample(n=int(len(fraud_df) * 0.2), random_state=0)

In [None]:
# Unit costs per outcome: a false positive and a true positive each cost 2,
# a true negative costs nothing, and a false negative costs the transaction
# amount of that row.
cost_fp, cost_tp, cost_tn = 2, 2, 0
cost_fn = fraud_df_sampled['amt']
# One row per sampled transaction, columns ordered [fp, fn, tp, tn].
cost_mat = np.array([
    np.full(len(fraud_df_sampled), float(cost_fp)),
    cost_fn,
    np.full(len(fraud_df_sampled), float(cost_tp)),
    np.full(len(fraud_df_sampled), float(cost_tn)),
]).T


In [None]:
def _confusion_cost(conf_mat, fp_cost, fn_cost, tp_cost):
    """Total cost implied by a 2x2 confusion matrix (rows = true, cols = predicted)."""
    false_pos, false_neg, true_pos = conf_mat[0][1], conf_mat[1][0], conf_mat[1][1]
    return false_pos * fp_cost + false_neg * fn_cost + true_pos * tp_cost


# Each model is priced with its OWN false-negative count; a false negative is
# valued at the mean transaction amount of the sampled data.
cost_log = _confusion_cost(conf_mat_log, cost_fp, cost_fn.mean(), cost_tp)
cost_dt = _confusion_cost(conf_mat_dt, cost_fp, cost_fn.mean(), cost_tp)
cost_rf = _confusion_cost(conf_mat_rf, cost_fp, cost_fn.mean(), cost_tp)
cost_boost = _confusion_cost(conf_mat_boost, cost_fp, cost_fn.mean(), cost_tp)

## Saving Scores for Different ML Algorithms

In [None]:
#!pip install scikit-learn==0.22 

In [None]:
import joblib
import sys
sys.modules['sklearn.externals.joblib'] = joblib
from costcla.metrics import cost_loss, savings_score
from costcla.models import BayesMinimumRiskClassifier

In [None]:
# Split features, target and the per-row cost matrix together so that the
# rows of cost_mat_train / cost_mat_test stay aligned with X and y.
X_train, X_test, y_train, y_test, cost_mat_train, cost_mat_test = \
train_test_split(fraud_df_sampled.drop('is_fraud', axis=1),
                           fraud_df_sampled.is_fraud, cost_mat,
                           test_size=0.2, random_state=0)

In [None]:
# (display name, unfitted estimator) pairs for the cost-insensitive baselines.
saving_models = [
    ('Log. Reg.', LogisticRegression()),
    ('Dec. Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
]


In [None]:
# Savings score of each baseline model on the test split, in the order of
# saving_models.
saving_score_base_all = []

for name, save_model in saving_models:
    sv_model = save_model
    sv_model.fit(X_train, y_train)
    y_pred = sv_model.predict(X_test)
    # Savings are computed from the per-row costs in cost_mat_test.
    saving_score_base = savings_score(y_test, y_pred, cost_mat_test)
    saving_score_base_all.append(saving_score_base)
    print('The saving score for {} is {:.4f}'. 
          format(name, saving_score_base))
    print('--' * 20)

In [None]:
# F1 score of each baseline model on the test split, in the order of
# saving_models.
f1_score_base_all = []

for name, save_model in saving_models:
    sv_model = save_model
    sv_model.fit(X_train, y_train)
    y_pred = sv_model.predict(X_test)
    # f1_score takes no cost matrix: a third positional argument would be
    # interpreted as `labels` (and is rejected outright by newer scikit-learn),
    # so only the true and predicted labels are passed.
    f1_score_base = f1_score(y_test, y_pred)
    f1_score_base_all.append(f1_score_base)
    print('The F1 score for {} is {:.4f}'.
          format(name, f1_score_base))
    print('--' * 20)

## Cost-Sensitive Comparison

In [None]:
from costcla.models import CostSensitiveLogisticRegression
from costcla.models import CostSensitiveDecisionTreeClassifier
from costcla.models import CostSensitiveRandomForestClassifier

In [None]:
# (display name, unfitted estimator) pairs for the cost-sensitive models.
cost_sen_models = [
    ('Log. Reg. CS', CostSensitiveLogisticRegression()),
    ('Dec. Tree CS', CostSensitiveDecisionTreeClassifier()),
    ('Random Forest CS', CostSensitiveRandomForestClassifier()),
]

In [None]:
# Savings score of each cost-sensitive model on the test split, in the order
# of cost_sen_models.
saving_cost_all = []

for name, cost_model in cost_sen_models:
    cs_model = cost_model
    # These estimators are fitted with the training cost matrix in addition
    # to X and y; inputs are passed as plain numpy arrays.
    cs_model.fit(np.array(X_train), np.array(y_train),
                 cost_mat_train)
    y_pred = cs_model.predict(np.array(X_test))
    saving_score_cost = savings_score(np.array(y_test),
                                      np.array(y_pred), cost_mat_test)
    saving_cost_all.append(saving_score_cost)
    print('The saving score for {} is {:.4f}'.
          format(name, saving_score_cost))
    print('--'*20)

In [None]:
# F1 score of each cost-sensitive model on the test split, in the order of
# cost_sen_models.
f1_score_cost_all = []

for name, cost_model in cost_sen_models:
    cs_model = cost_model
    # The cost matrix is needed for fitting only.
    cs_model.fit(np.array(X_train), np.array(y_train),
                 cost_mat_train)
    y_pred = cs_model.predict(np.array(X_test))
    # f1_score takes no cost matrix: a third positional argument would be
    # interpreted as `labels` (and is rejected outright by newer scikit-learn).
    f1_score_cost = f1_score(np.array(y_test), np.array(y_pred))
    f1_score_cost_all.append(f1_score_cost)
    print('The F1 score for {} is {:.4f}'. format(name,
                                                  f1_score_cost))
    print('--'*20)

## Bayesian Minimum Risk

In [None]:
# Savings score after applying Bayes minimum risk on top of each baseline
# model's predicted probabilities, in the order of saving_models.
saving_score_bmr_all = []

for name, bmr_model in saving_models:
    f = bmr_model.fit(X_train, y_train)
    y_prob_test = f.predict_proba(np.array(X_test))
    f_bmr = BayesMinimumRiskClassifier()
    # NOTE(review): the BMR step is fitted on y_test and the test-set
    # probabilities, i.e. on the same data that is scored below — confirm
    # this is intended rather than using a separate calibration set.
    f_bmr.fit(np.array(y_test), y_prob_test)
    y_pred_test = f_bmr.predict(np.array(y_prob_test),
                                cost_mat_test)
    saving_score_bmr = savings_score(y_test, y_pred_test,
                                     cost_mat_test)
    saving_score_bmr_all.append(saving_score_bmr)
    print('The saving score for {} is {:.4f}'.\
          format(name, saving_score_bmr))
    print('--' * 20)

In [None]:
# F1 score after applying Bayes minimum risk on top of each baseline model's
# predicted probabilities, in the order of saving_models.
f1_score_bmr_all = []

for name, bmr_model in saving_models:
    f = bmr_model.fit(X_train, y_train)
    y_prob_test = f.predict_proba(np.array(X_test))
    f_bmr = BayesMinimumRiskClassifier()
    # NOTE(review): fitted on the test labels/probabilities that are scored
    # below — confirm this is intended.
    f_bmr.fit(np.array(y_test), y_prob_test)
    y_pred_test = f_bmr.predict(np.array(y_prob_test),
                                cost_mat_test)
    f1_score_bmr = f1_score(y_test, y_pred_test)
    f1_score_bmr_all.append(f1_score_bmr)
    print('The F1 score for {} is {:.4f}'.\
          format(name, f1_score_bmr))
    print('--'*20)

In [None]:
# Stack the per-family score lists (base, cost-sensitive, Bayes minimum risk)
# into one table with a row per model.
savings = [saving_score_base_all, saving_cost_all, saving_score_bmr_all]
f1 = [f1_score_base_all, f1_score_cost_all, f1_score_bmr_all]
# ignore_index gives each row a unique label 0..n-1; without it every family
# restarts at 0 and the column-wise concat below has to align on duplicated
# index labels.
saving_scores = pd.concat([pd.Series(x) for x in savings], ignore_index=True)
f1_scores = pd.concat([pd.Series(x) for x in f1], ignore_index=True)
scores = pd.concat([saving_scores, f1_scores], axis=1)
scores.columns = ['saving_scores', 'F1_scores']

In [None]:
# Tick labels for the rows of `scores`: baseline models first, then the
# cost-sensitive ones, then the Bayes-minimum-risk variants.
model_names = ['Log. Reg_base', 'Dec. Tree_base', 'Random Forest_base',
               'Log. Reg_cs', 'Dec. Tree_cs', 'Random Forest_cs',
              'Log. Reg_bayes', 'Dec. Tree_bayes',
               'Random Forest_bayes']

In [None]:
# Savings as bars with the F1 score overlaid as a dashed line, one position
# per model.
plt.figure(figsize=(10, 6))
plt.plot(range(scores.shape[0]), scores["F1_scores"],
         "--", label='F1Score')
plt.bar(np.arange(scores.shape[0]), scores['saving_scores'],
        0.6, label='Savings')
# Use a named variable for the tick positions: assigning to `_` would shadow
# the interactive shell's "last result" variable. Labels and rotation are set
# in a single call.
tick_positions = np.arange(len(model_names))
plt.xticks(tick_positions, model_names, rotation='vertical')
plt.legend(loc='best')
plt.show()

# Unsupervised Learning

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the under-sampled feature matrix before clustering.
standard = StandardScaler()
scaled_fraud = standard.fit_transform(X_under)

In [None]:
from sklearn_som.som import SOM
# Self-organising map on a 2x1 grid (two nodes — presumably one per class;
# verify), with one input dimension per scaled feature.
som = SOM(m=2, n=1, dim=scaled_fraud.shape[1])
som.fit(scaled_fraud)
# Node assignment for every row of the same data the map was fitted on.
predictions_som = som.predict(np.array(scaled_fraud))

In [None]:
# Swap the two cluster ids (1 becomes 0, anything else becomes 1) —
# presumably to line them up with the is_fraud labels; verify the mapping.
# Re-running this line flips the ids back.
predictions_som = np.where(predictions_som == 1, 0, 1)

In [None]:
# Compare the (relabelled) SOM clusters against the true under-sampled labels.
print('Classification report:\n', 
      classification_report(y_under, predictions_som))

In [None]:
# Side-by-side scatter of the first two feature columns, coloured by the true
# class (left) and by the SOM prediction (right).
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(8, 6))
x = X_under.iloc[:,0]
y = X_under.iloc[:,1]

ax[0].scatter(x, y, alpha=0.1, cmap='Greys', c=y_under)
ax[0].title.set_text('Actual Classes')
ax[1].scatter(x, y, alpha=0.1, cmap='Greys', c=predictions_som) 
ax[1].title.set_text('SOM Predictions')

## Autoencoder

In [None]:
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from tensorflow.keras.layers import Dense,Dropout
from keras import regularizers

In [None]:
# Standardise the three numerical features in place on fraud_df.
# NOTE(review): the scaler is fitted on the full frame, before the train/test
# split in the next cell — confirm that this is acceptable here.
fraud_df[['amt','city_pop','hour']] = StandardScaler().\
fit_transform(fraud_df[['amt','city_pop','hour']])

In [None]:
X_train, X_test = train_test_split(fraud_df,
                                   test_size=0.2, random_state=123)
# Train the autoencoder on non-fraud transactions only. The filter result has
# to be assigned back; as a bare expression it was computed and discarded,
# leaving the fraud rows in the training data.
X_train = X_train[X_train['is_fraud'] == 0]
# Convert to a float array explicitly: with boolean dummy columns a plain
# `.values` would produce an object-dtype array.
X_train = X_train.drop(['is_fraud'], axis=1).to_numpy(dtype=float)
# The test part keeps both classes; its labels are used to evaluate the
# reconstruction error.
y_test = X_test['is_fraud']
X_test = X_test.drop(['is_fraud'], axis=1).to_numpy(dtype=float)

In [None]:
# Size the input and output layers from the matrix the network is actually
# trained on (X_train) rather than from the unrelated under-sampled split.
n_features = X_train.shape[1]

autoencoder = keras.Sequential()
# Input-sized layer with an L1 activity penalty (10e-5 == 1e-4).
autoencoder.add(Dense(n_features, activation='tanh',
                      activity_regularizer=regularizers.l1(10e-5),
                      input_dim=n_features))
# encoder
autoencoder.add(Dense(64, activation='tanh'))
autoencoder.add(Dense(32, activation='relu'))
# decoder
autoencoder.add(Dense(32, activation='elu'))
autoencoder.add(Dense(64, activation='tanh'))
# The output layer reconstructs every input feature.
autoencoder.add(Dense(n_features, activation='elu'))
autoencoder.compile(loss='mse',
                    optimizer='adam')
autoencoder.summary()

In [None]:
# Training hyper-parameters passed to autoencoder.fit.
batch_size = 200
epochs = 100

In [None]:
# Fit the autoencoder to reproduce its own input; the test matrix serves as
# validation data. Only the per-epoch history dict is kept.
history = autoencoder.fit(X_train, X_train,
                          shuffle=True,
                          epochs=epochs,
                          batch_size=batch_size,
                          validation_data=(X_test, X_test),
                          verbose=0).history

In [None]:
# Reconstruct the test rows and measure how far each reconstruction is from
# its input (mean squared error across the features of a row).
autoencoder_pred = autoencoder.predict(X_test)
mse = np.mean(np.power(X_test - autoencoder_pred, 2), axis=1)
# Pair every reconstruction error with the true label of that row. The
# computation is done once; it was previously repeated verbatim.
error_df = pd.DataFrame({'reconstruction_error': mse,
                         'true_class': y_test})
error_df.describe()

In [None]:
# Training vs. validation loss per epoch, taken from the fit history.
plt.figure(figsize=(10, 6))
plt.plot(history['loss'], linewidth=2, label='Train')
plt.plot(history['val_loss'], linewidth=2, label='Test')
plt.legend(loc='upper right')
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.show()