In [None]:
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sns
import cufflinks as cf
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import f_regression
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import SVC
from sklearn.model_selection import learning_curve, GridSearchCV

# Lift every pandas display limit so frames are shown in full.
for display_option in ('display.max_rows',
                       'display.max_columns',
                       'display.width',
                       'display.max_colwidth'):
    pd.set_option(display_option, None)

# Plotting setup: cufflinks in offline mode, seaborn default theme.
cf.go_offline()
sns.set()

In [None]:
# Read the raw transactions file into a DataFrame and preview the first rows.
DATA_PATH = '../input/is-it-fraud/isfraud.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Percentage of missing values per column. A column that is missing a large
# share of its values (say 80%) is probably not worth using.
df.isnull().sum().mul(100).div(len(df))

# 1- Defining and balancing the label

In [None]:
# Number of rows for each value of the raw label column.
df['isFraud'].value_counts()

In [None]:
# Give the label column the generic name "target": copy it in as a new column
# and remove the original 'isFraud' column.
df = df.assign(target=df['isFraud']).drop(columns='isFraud')
df.head()

In [None]:
# Is the label balanced? Ideally the 0 and 1 values of target occur with
# almost equal frequency.
sns.countplot(data=df, x='target')

In [None]:
# Balance the label by undersampling the larger class. Only the two label
# values named in val1 and val2 are considered.

# First shuffle the rows so the surviving rows of the larger class are a random
# subset; the fixed seed makes the result reproducible on Restart & Run All.
df = df.sample(frac=1, random_state=101).reset_index(drop=True)

# then set val1 and val2 below:
val1 = 0
val2 = 1

# find the number of each label
is_val1 = df['target'] == val1
is_val2 = df['target'] == val2
num_val1 = int(is_val1.sum())
num_val2 = int(is_val2.sum())

# cumsum() numbers the rows of each class 1, 2, 3, ... in frame order. A row is
# surplus once its running number exceeds the size of the other class; for the
# smaller class this never happens, so only the larger class is trimmed.
surplus_val1 = is_val1 & (is_val1.cumsum() > num_val2)
surplus_val2 = is_val2 & (is_val2.cumsum() > num_val1)
indices_to_remove = df.index[surplus_val1 | surplus_val2].tolist()

df = df.drop(index=indices_to_remove).reset_index(drop=True)

In [None]:
# Re-check the label distribution after undersampling.
sns.countplot(data=df, x='target')

# no, but really we cannot do that much about it

# 2- Dropping clearly useless features

In [None]:
# Number of rows currently in df.
len(df)

In [None]:
# List the categorical (object-dtype) columns, one per line.
for name in df.select_dtypes(['object']).columns:
    print(name)

In [None]:
# Compare the number of distinct values in each categorical (object) column to
# the dataset length. The idea is that a column with too many distinct values
# is not useful for modeling.
#   Column : column name
#   Factor : distinct values as a percentage of the number of rows
#   nu     : number of distinct values
distinct_counts = df.select_dtypes(['object']).nunique()

factor_df = pd.DataFrame({
    'Column': list(distinct_counts.index),
    'Factor': 100 * distinct_counts.to_numpy() / len(df),
    'nu': distinct_counts.to_numpy(),
})
factor_df

In [None]:
# here, "nameOrig" and "nameDest" are not helpful in building any model

In [None]:
# The two name columns are removed from the frame.
df = df.drop(columns=['nameDest', 'nameOrig'])

# 3- Which features are important for this target?

## 3-a) For numeric features, we examine the correlation and the p-value with the label. If corr < some % and p-value > 0.05, we will drop the feature

In [None]:
# Absolute correlation of every numeric feature with the label. The frame
# still contains an object column at this point, so the correlation is
# restricted to numeric columns (DataFrame.corr no longer skips non-numeric
# columns silently in recent pandas).
target_corr = df.select_dtypes(['number']).corr()['target'].abs()
# Sorted ascending, the last entry is the label's correlation with itself; skip it.
target_corr.sort_values(ascending=True)[:-1].plot.bar(figsize=(16, 8))

In [None]:
# let's drop newbalanceDest and isFlaggedFraud; they do not look that much linearly correlated with the target

In [None]:
# p-value of a simple linear regression of the label on each numeric column.
col_list = list(df.select_dtypes(['number']).columns)
p_list = [stats.linregress(df[name], df['target']).pvalue for name in col_list]

pval_table = pd.DataFrame({'col': col_list, 'p_values': p_list})
pval_table.sort_values(by='p_values', ascending=False)

# same as the corr results, so two columns need to be dropped.

In [None]:
# Remove the two columns singled out by the correlation / p-value checks.
df = df.drop(columns=['newbalanceDest', 'isFlaggedFraud'])

In [None]:
# Preview the frame after the column drops.
df.head()

## 3-b) Now let's look at target dependence on categorical features

In [None]:
# Remaining categorical (object-dtype) columns, one per line.
for name in df.select_dtypes(['object']).columns:
    print(name)

In [None]:
# (rows, columns) of the current frame.
df.shape

In [None]:
# Count plot of each categorical column, split by the value of target.
# Columns are passed by keyword (x=, hue=, data=), as in the first countplot
# cell; newer seaborn releases do not treat a positional Series as `x`.
for col in df.select_dtypes(['object']):
    plt.figure(figsize=(16, 6))
    sns.countplot(x=col, hue='target', data=df)
    plt.show()

In [None]:
# based on the above, we can see that, type is correlated with our target

# 4- looking into each categorical column

In [None]:
# Frequency of every distinct value, for each object column in turn.
for name, values in df.select_dtypes(['object']).items():
    print()
    print('for the feature:     ', name)
    print(values.value_counts())

In [None]:
# no need to drop anything here, the distinct values are almost properly distributed, no outlier ... . Probably one can drop DEBIT values 
# as we dont have a lot of them. but for now, lets keep it

# 5- Dealing with missing values

In [None]:
# Percentage of missing values per column, largest first.
missing_pct = df.isnull().sum().mul(100).div(len(df))
missing_pct.sort_values(ascending=False)
# Here there is no missing value, but feel free to look at the cells below for such a scenario.
# If a column has many missing values, one might drop it.
# One possible scenario is to use fillna with, say, the average of the other values in the column.
# The other option is to find which column is correlated with the column that has missing values, and use
# that reference column to predict what value we can fairly safely assign to the missing entries.

In [None]:
# Let's assume the "amount" column has missing values: find the numeric column
# most correlated with it. The correlation is restricted to numeric columns
# because the frame still holds an object column here.
amount_corr = df.select_dtypes(['number']).corr()['amount'].abs()
# Sorted ascending, the last entry is amount's correlation with itself; skip it.
amount_corr.sort_values()[:-1].plot.bar(figsize=(16, 8))
# looks more correlated with oldbalanceOrg

In [None]:
# Distribution of the reference column; this graph is used to choose the bins
# for the next cell.
plt.figure(figsize=(16, 8))
reference_values = df['oldbalanceOrg'].dropna()
sns.distplot(reference_values, bins=50)

In [None]:
# the column with missing value is called "my_missing" and the reference column used to fill in the missing data is "my_ref"
# here, based on the reference column value distribution, we better define some bins. these bins of the reference are used to calculate
# the mean for the missing column. then for a missing value, we look into the value of its associated reference column bins. There is one mean
# value associated with each bin in the reference. Choosing bin needs to be smart and cannot be conveniently automated.
# one should keep a balance between number of bins and length of dataset.
# again, there is no missing value so we dont run this cell.

# Reference column: in the commented-out code below it is the column that is
# cut into bins.
my_ref= 'oldbalanceOrg'
# Column whose missing entries would be filled with the per-bin mean.
my_missing = 'amount'

# Bin edges applied to my_ref by the commented-out pd.cut call below.
# NOTE(review): these edges span 0 to 7.0, while the commented-out outlier cell
# uses up_temp = 20000000 for 'oldbalanceOrg' - the edges may belong to a
# differently scaled column; confirm before enabling the code below.
bins = [0, 0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1.0, 1.5, 2.5, 7.0]
#my_min_val = bins[:-1]
#my_max_val = bins[1:]

#df_x = pd.DataFrame(df.dropna())
#df_x = df_x.groupby(pd.cut(df_x[my_ref], bins))[my_missing].mean().to_frame()
#df_x.reset_index(inplace = True)
#df_x['my_min'] = my_min_val
#df_x['my_max'] = my_max_val
#df_x.drop(my_ref, axis = 1, inplace = True)

#def fill_corr(ref, missing):
#    if np.isnan(missing):
#        for i in range(len(df_x)):
#            if ref >= df_x.iloc[i,1] and ref < df_x.iloc[i,2]:
#                return df_x.iloc[i, 0]
#    else:
#        return missing
#
#df[my_missing] = df.apply(lambda x: fill_corr(x[my_ref], x[my_missing]), axis = 1)

# 6- Getting rid of outliers

In [None]:
# Numeric columns, one per line.
for name in df.select_dtypes(['number']).columns:
    print(name)

In [None]:
# One 100-bin histogram (no KDE curve) per numeric column.
for name in df.select_dtypes(['number']).columns:
    plt.figure(figsize=(16, 6))
    sns.distplot(df[name], kde=False, bins=100)
    plt.show()

In [None]:
# in this case, I don't see any outliers being that much of a problem. One can try to cut a column at some value and see the results

In [None]:
# get rid of outliers. I use below code for doing so. I dont use quantile and prefer to look at data before ignoring it.
# here we set up and down_temp values for the specified column

#col = 'oldbalanceOrg'
#up_temp = 20000000
#down_temp = 0
#plt.figure(figsize = (16,6))
#sns.distplot(df[col], kde = False)
#plt.show()
#df = df[(df[col] < up_temp) & (df[col] > down_temp)]
#print(df.shape[0])
#plt.figure(figsize = (10,6))
#sns.distplot(df[col], bins = 100)
#plt.show()

# 7- Look at it

In [None]:
# just looking into how each column is relating to the target column

In [None]:
# Scatter each numeric column against the label, colored by the label.
for name in df.select_dtypes(['number']).columns:
    plt.figure(figsize=(16, 6))
    sns.scatterplot(x=name, y='target', hue='target', data=df)
    plt.show()

In [None]:
# Count plot of each categorical column, split by the value of target.
# Columns are passed by keyword (x=, hue=, data=), as in the first countplot
# cell; newer seaborn releases do not treat a positional Series as `x`.
for col in df.select_dtypes(['object']):
    plt.figure(figsize=(16, 6))
    sns.countplot(x=col, hue='target', data=df)
    plt.show()

# 8- get dummies

In [None]:
# Object-dtype columns that are about to be one-hot encoded.
for name in df.select_dtypes(['object']).columns:
    print(name)

In [None]:
# Replace every object column by its dummy columns (first level dropped,
# column name used as prefix); the new columns are appended at the end.
object_columns = list(df.select_dtypes(['object']).columns)
for name in object_columns:
    encoded = pd.get_dummies(df[name], prefix=name, drop_first=True)
    df = pd.concat([df.drop(columns=name), encoded], axis=1)

In [None]:
# Preview the frame after one-hot encoding.
df.head()

# 9- Train-Test split

In [None]:
# Label is the 'target' column; features are every other column.
# Both stay pandas objects here.
y = df['target']
x = df.drop(columns='target')

In [None]:
# Hold out 10% of the rows as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=101, test_size=0.1)

# Then carve the validation set out of the remaining rows:
# 0.1111 x 0.9 = 0.09999, i.e. about 10% of the full data.
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, random_state=101, test_size=0.11111111)

In [None]:
# Shapes of the train / validation / test feature sets.
x_train.shape, x_val.shape, x_test.shape

# 10- Scaling

In [None]:
## Scaling happens after the train/test split to avoid data leakage: the
## scaler is fitted on the training set only and merely applied to the
## validation and test sets.
scalar = StandardScaler()
x_train = scalar.fit_transform(x_train)
x_val, x_test = scalar.transform(x_val), scalar.transform(x_test)

# 11- Modeling, we use different methods

# <center>SVC

In [None]:
# Grid search over C and gamma to find the best SVC parameters.
param_grid = {
    'C': [0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000],
    'gamma': [1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001],
}
grid = GridSearchCV(SVC(), param_grid, verbose=2)
# np.ravel gives a flat label array whether y_train is a Series or an ndarray
# (Series.ravel is deprecated in recent pandas).
grid.fit(x_train, np.ravel(y_train))

In [None]:
# Parameter combination selected by the grid search.
grid.best_params_

In [None]:
# Estimator built with the selected parameters.
grid.best_estimator_

In [None]:
# Cross-validated score of the selected parameter combination.
grid.best_score_

In [None]:
# Predict the test set with the grid-searched model and put the predicted and
# real labels side by side.
predictions = pd.DataFrame({
    'Predicted Values': grid.predict(x_test),
    'Real Values': y_test.to_numpy(),
})
predictions.head(10)

In [None]:
# Residual = real label minus predicted label (0 means a correct prediction).
predictions['Residual'] = predictions['Real Values'] - predictions['Predicted Values']
# Column passed by keyword, as in the first countplot cell; newer seaborn
# releases do not treat a positional Series as `x`.
sns.countplot(x='Residual', data=predictions)

In [None]:
# Classification report of the SVC predictions against the test labels.
print(classification_report(y_test, predictions['Predicted Values']))

In [None]:
# Confusion matrix as a labeled frame: rows are actual classes, columns are
# predicted classes.
my_cm = confusion_matrix(y_test, predictions['Predicted Values'])
my_cmdf = pd.DataFrame(
    my_cm[:2, :2],
    index=['Actual NO', 'Actual YES'],
    columns=['Predicted NO', 'Predicted YES'],
)
my_cmdf

In [None]:
#d = classification_report(y_test, predictions['Predicted Values'], output_dict=True)
#correct_percentage_svc = d['accuracy'] * 100

In [None]:
# Accuracy in percent: diagonal of the confusion matrix over the number of
# test predictions.
n_correct = my_cm[0, 0] + my_cm[1, 1]
correct_percentage_svc = 100 * n_correct / len(predictions)
wrong_percentage = 100 - correct_percentage_svc
print(f'Model Accuracy is: { correct_percentage_svc:.4}%' )

# <center> Logistic

In [None]:
# Fit a logistic regression and print its coefficients and intercept.
lm = LogisticRegression(max_iter=1000)
# np.ravel gives a flat label array whether y_train is a Series or an ndarray
# (Series.ravel is deprecated in recent pandas).
lm.fit(x_train, np.ravel(y_train))

# one coefficient per feature column
my_coef = lm.coef_

# intercept of the decision function
my_intercept = lm.intercept_

print(my_coef)
print('')
print(my_intercept)

In [None]:
# Predicted vs. real labels on the test set, side by side.
predictions = pd.DataFrame({
    'Predicted Values': lm.predict(x_test),
    'Real Values': y_test.to_numpy(),
})
predictions.head(10)

In [None]:
# Residual = real label minus predicted label (0 means a correct prediction).
predictions['Residuals'] = predictions['Real Values'] - predictions['Predicted Values']
# Column passed by keyword, as in the first countplot cell; newer seaborn
# releases do not treat a positional Series as `x`.
sns.countplot(x='Residuals', data=predictions)

In [None]:
# Classification report of the logistic-regression predictions against the test labels.
print(classification_report(y_test, predictions['Predicted Values']))

In [None]:
# Confusion matrix as a labeled frame: rows are actual classes, columns are
# predicted classes.
my_cm = confusion_matrix(y_test, predictions['Predicted Values'])
my_cmdf = pd.DataFrame(
    my_cm[:2, :2],
    index=['Actual NO', 'Actual YES'],
    columns=['Predicted NO', 'Predicted YES'],
)
my_cmdf

In [None]:
# Accuracy in percent: diagonal of the confusion matrix over the number of
# test predictions.
n_correct = my_cm[0, 0] + my_cm[1, 1]
correct_percentage_log = 100 * n_correct / len(predictions)
wrong_percentage = 100 - correct_percentage_log
print(f'Model Accuracy is: {correct_percentage_log:.4}%' )

# <center> Decision Trees

In [None]:
# Fit a decision tree on the scaled training set. The seed (the same value the
# train/test split uses) makes the fitted tree reproducible across runs.
dtree = DecisionTreeClassifier(random_state=101)
dtree.fit(x_train, y_train)

In [None]:
# Predicted vs. real labels on the test set, side by side.
predictions = pd.DataFrame({
    'Predicted Values': dtree.predict(x_test),
    'Real Values': y_test.to_numpy(),
})
predictions.head(10)

In [None]:
# Residual = real label minus predicted label (0 means a correct prediction).
predictions['Residual'] = predictions['Real Values'] - predictions['Predicted Values']
# Column passed by keyword, as in the first countplot cell; newer seaborn
# releases do not treat a positional Series as `x`.
sns.countplot(x='Residual', data=predictions)

In [None]:
# Classification report of the decision-tree predictions against the test labels.
print(classification_report(y_test, predictions['Predicted Values']))

In [None]:
# Confusion matrix as a labeled frame: rows are actual classes, columns are
# predicted classes.
my_cm = confusion_matrix(y_test, predictions['Predicted Values'])
my_cmdf = pd.DataFrame(
    my_cm[:2, :2],
    index=['Actual NO', 'Actual YES'],
    columns=['Predicted NO', 'Predicted YES'],
)
my_cmdf

In [None]:
# Accuracy in percent: diagonal of the confusion matrix over the number of
# test predictions.
n_correct = my_cm[0, 0] + my_cm[1, 1]
correct_percentage_dt = 100 * n_correct / len(predictions)
wrong_percentage = 100 - correct_percentage_dt
print(f'Model Accuracy is: {correct_percentage_dt:.4}%' )

# <center> Random Forests

In [None]:
# Fit a 1000-tree random forest. The seed (the same value the train/test split
# uses) makes the fitted forest reproducible across runs.
rf = RandomForestClassifier(n_estimators=1000, random_state=101)
# np.ravel gives a flat label array whether y_train is a Series or an ndarray
# (Series.ravel is deprecated in recent pandas).
rf.fit(x_train, np.ravel(y_train))

In [None]:
# Predicted vs. real labels on the test set, side by side.
predictionsrf = pd.DataFrame({
    'Predicted Values': rf.predict(x_test),
    'Real Values': y_test.to_numpy(),
})
predictionsrf.head(10)

In [None]:
# Residual = real label minus predicted label (0 means a correct prediction).
# The random-forest results live in `predictionsrf`; `predictions` still holds
# the decision-tree output, so plotting it here would show the wrong model.
predictionsrf['Residual'] = predictionsrf['Real Values'] - predictionsrf['Predicted Values']
sns.countplot(x='Residual', data=predictionsrf)

In [None]:
# Classification report of the random-forest predictions against the test labels.
print(classification_report(y_test, predictionsrf['Predicted Values']))

In [None]:
# Confusion matrix as a labeled frame: rows are actual classes, columns are
# predicted classes.
my_cm = confusion_matrix(y_test, predictionsrf['Predicted Values'])
my_cmdf = pd.DataFrame(
    my_cm[:2, :2],
    index=['Actual NO', 'Actual YES'],
    columns=['Predicted NO', 'Predicted YES'],
)
my_cmdf

In [None]:
# Accuracy in percent: diagonal of the confusion matrix over the number of
# test predictions.
n_correct = my_cm[0, 0] + my_cm[1, 1]
correct_percentage_rf = 100 * n_correct / len(predictionsrf)
wrong_percentage = 100 - correct_percentage_rf
print(f'Model Accuracy is: {correct_percentage_rf:.4}%' )

# <center> ANN

In [None]:
# Rebuild features and label as numpy arrays for the network.
y = df['target'].values
x = df.drop(columns='target').values

# Hold out 10% of the rows as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=101, test_size=0.1)

# Then carve the validation set out of the remaining rows.
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, random_state=101, test_size=0.11111111)

## Scaling happens after the train/test split to avoid data leakage: the
## scaler is fitted on the training set only.
scalar = StandardScaler()
x_train = scalar.fit_transform(x_train)
x_val, x_test = scalar.transform(x_val), scalar.transform(x_test)

In [None]:
# Build the network: a stack of identical fully connected hidden layers
# followed by a single sigmoid output unit.
model = Sequential()
nnodes = 250
act_func = 'relu'
n_hidden_layers = 17

for _ in range(n_hidden_layers):
    model.add(Dense(nnodes, activation=act_func))

model.add(Dense(1, activation='sigmoid'))

In [None]:
# Compile the model with the Adam optimizer and binary cross-entropy loss
# (the network ends in a single sigmoid unit).

model.compile(optimizer = 'adam', loss = 'binary_crossentropy')

In [None]:
# Train the network; stop once the validation loss has not improved for
# 25 consecutive epochs.
early_stop = EarlyStopping(monitor='val_loss', mode='min', patience=25, verbose=1)

fit_options = dict(
    validation_data=(x_val, y_val),
    callbacks=[early_stop],
    batch_size=256,
    epochs=500,
    verbose=2,
)
model.fit(x_train, y_train, **fit_options)

In [None]:
# Per-epoch training and validation loss recorded during fitting.
training_history = model.history.history
loss_func = pd.DataFrame({
    'Model Loss': training_history['loss'],
    'Validation Loss': training_history['val_loss'],
})
loss_func

In [None]:
# Training vs. validation loss per epoch.
loss_func.plot(figsize = (15,6))

In [None]:
# Loss on the test, train and validation sets, in that order.
for features, labels in ((x_test, y_test), (x_train, y_train), (x_val, y_val)):
    print(model.evaluate(features, labels, verbose=0))

In [None]:
# Sequential.predict_classes no longer exists in current Keras. For a single
# sigmoid output the equivalent is to threshold the predicted probability at 0.5.
predicted_proba = model.predict(x_test)
predictions = pd.DataFrame(data=(predicted_proba > 0.5).astype('int32'),
                           columns=['Predicted Values'])
predictions['Real Values'] = y_test
predictions.head()

In [None]:
# Residual = real label minus predicted label (0 means a correct prediction).
predictions['Residual'] = predictions['Real Values'] - predictions['Predicted Values']
# Column passed by keyword, as in the first countplot cell; newer seaborn
# releases do not treat a positional Series as `x`.
sns.countplot(x='Residual', data=predictions)

In [None]:
# Classification report of the network's predictions against the real test labels.
print(classification_report(predictions['Real Values'], predictions['Predicted Values']))

In [None]:
# Confusion matrix as a labeled frame: rows are actual classes, columns are
# predicted classes.
Class_0 = '0'
Class_1 = '1'

my_cm = confusion_matrix(y_test, predictions['Predicted Values'])
my_cmdf = pd.DataFrame(
    my_cm[:2, :2],
    index=[f'Actual {Class_0}', f'Actual {Class_1}'],
    columns=[f'Predicted {Class_0}', f'Predicted {Class_1}'],
)
my_cmdf

In [None]:
# Accuracy in percent: diagonal of the confusion matrix over the number of
# test predictions.
n_correct = my_cm[0, 0] + my_cm[1, 1]
correct_percentage_ann = 100 * n_correct / len(predictions)
wrong_percentage_ann = 100 - correct_percentage_ann
print(f'Model Accuracy is: {correct_percentage_ann:.4}%' )

# 12- Models' Comparison

In [None]:
# Accuracy of every model in one table, best first, rounded to 2 decimals.
# Note: this rebinds `df`, which held the dataset until now.
ind = ['SVC', 'Logistics', 'Dicision Tree', 'Random Forests', 'ANN']
values = [
    correct_percentage_svc,
    correct_percentage_log,
    correct_percentage_dt,
    correct_percentage_rf,
    correct_percentage_ann,
]

accuracy_column = pd.Series(values, index=ind, name='%Accuracy')
df = accuracy_column.to_frame().sort_values(by='%Accuracy', ascending=False).round(2)
df