In [1]:
# importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning models libs
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score

In [2]:
# loading dataset
dataframe = pd.read_csv("Fraud.csv")
df =dataframe.copy()

In [3]:
# To begin with, we have step col in dataset, which elicits the time at which transaction is done. 
# Therefore, it is curtial to convert it into convinient format.
# here, i am going to convert each steps into timeDelta format, and soon i will transform it into other format.

df["step"] = pd.to_timedelta(df["step"], unit='h')
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,0 days 01:00:00,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,0 days 01:00:00,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,0 days 01:00:00,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,0 days 01:00:00,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,0 days 01:00:00,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [5]:


# if there exists any null values, i am removing those values as we have large dataset which is more then enogh to train ML model.
df.dropna()

# converting categorical variable into numertical with the help of one hot encoding. 
ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(), [1])
    ],
    remainder='passthrough'
)

df_encoded = ct.fit_transform(df)
df_encoded = pd.DataFrame(df_encoded, columns=ct.get_feature_names_out())

In [6]:
df_encoded.head()

Unnamed: 0,encoder__type_CASH_IN,encoder__type_CASH_OUT,encoder__type_DEBIT,encoder__type_PAYMENT,encoder__type_TRANSFER,remainder__step,remainder__amount,remainder__nameOrig,remainder__oldbalanceOrg,remainder__newbalanceOrig,remainder__nameDest,remainder__oldbalanceDest,remainder__newbalanceDest,remainder__isFraud,remainder__isFlaggedFraud
0,0.0,0.0,0.0,1.0,0.0,0 days 01:00:00,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,0.0,0.0,0.0,1.0,0.0,0 days 01:00:00,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,0.0,0.0,0.0,0.0,1.0,0 days 01:00:00,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,0.0,1.0,0.0,0.0,0.0,0 days 01:00:00,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,0.0,0.0,0.0,1.0,0.0,0 days 01:00:00,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [7]:

df_encoded['customer_start_freq'] = df_encoded.groupby('remainder__nameOrig')['remainder__nameOrig'].transform('count')
df_encoded['customer_recipient_freq'] = df_encoded.groupby('remainder__nameDest')['remainder__nameDest'].transform('count')
df_encoded = df_encoded.drop('remainder__nameDest', axis=1)
df_encoded = df_encoded.drop('remainder__nameOrig', axis=1)

# we had a step column in formant of time delta, 
# but hear i have converted it into the three different columns, 
# since these coulumn helps the machine learnig model to get better idea of fraudant transcation. 

df_encoded['days'] = df_encoded['remainder__step'].dt.days
df_encoded['hours'] = df_encoded['remainder__step'].dt.components.hours
df_encoded['minutes'] = df_encoded['remainder__step'].dt.components.minutes
df_encoded  = df_encoded.drop("remainder__step", axis=1)
col_to_move = df_encoded.pop('remainder__isFraud')
df_encoded.insert(len(df_encoded.columns), 'remainder__isFraud', col_to_move)


In [25]:
df_encoded.head()

Unnamed: 0,encoder__type_CASH_IN,encoder__type_CASH_OUT,encoder__type_DEBIT,encoder__type_PAYMENT,encoder__type_TRANSFER,remainder__amount,remainder__oldbalanceOrg,remainder__newbalanceOrig,remainder__oldbalanceDest,remainder__newbalanceDest,remainder__isFlaggedFraud,customer_start_freq,customer_recipient_freq,days,hours,minutes,remainder__isFraud
0,0.0,0.0,0.0,1.0,0.0,9839.64,170136.0,160296.36,0.0,0.0,0,1,1,0,1,0,0
1,0.0,0.0,0.0,1.0,0.0,1864.28,21249.0,19384.72,0.0,0.0,0,1,1,0,1,0,0
2,0.0,0.0,0.0,0.0,1.0,181.0,181.0,0.0,0.0,0.0,0,1,44,0,1,0,1
3,0.0,1.0,0.0,0.0,0.0,181.0,181.0,0.0,21182.0,0.0,0,1,41,0,1,0,1
4,0.0,0.0,0.0,1.0,0.0,11668.14,41554.0,29885.86,0.0,0.0,0,1,1,0,1,0,0


In [8]:
#separating labels and non labled data
X = df_encoded.iloc[:, :-1].values
Y= df_encoded.iloc[:, -1].values

# spliting data set into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

In [9]:
# Scaling the data
scaler = StandardScaler()
X_train[:, 5:10] = scaler.fit_transform(X_train[:, 5:10])
X_test[:, 5:10] = scaler.transform(X_test[:, 5:10])


# Furthermore, ANN and other machine learning model accepts the either float32 ot float64 datatype
# and data type of our data is "float"
X_train = X_train.astype(np.float64)
X_test = X_test.astype(np.float64)
y_train = y_train.astype(np.float64)
y_test = y_test.astype(np.float64)

## Random Forest

In [10]:

classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

In [20]:
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score


y_pred = classifier.predict(X_test)

# AUC-ROC
auc_roc = roc_auc_score(y_test, y_pred)
print(f'AUC-ROC: {auc_roc:.2f}')
print()

# accuracy and confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)
print()

# f-1 score
f1 = f1_score(y_test, y_pred)
print(f'F1 score: {f1:.2f}')
print()

# k fold cross validation
accuracies =  cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
print("Accuracy {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation {:.2f} %".format(accuracies.std()*100))
print()



AUC-ROC: 0.91

[[1270870      13]
 [    305    1336]]
F1 score: 0.89

Accuracy 99.97 %
Standard Deviation 0.00 %



#### Relevances of each features

In [27]:
importances = classifier.feature_importances_

# Print the feature importances
for feature, importance in zip(pd.DataFrame(X).columns, importances):
    print(f'{feature}: {importance:.2f}')

print(df_encoded.columns)

0: 0.01
1: 0.05
2: 0.00
3: 0.02
4: 0.07
5: 0.14
6: 0.23
7: 0.06
8: 0.05
9: 0.19
10: 0.00
11: 0.00
12: 0.02
13: 0.08
14: 0.08
15: 0.00
Index(['encoder__type_CASH_IN', 'encoder__type_CASH_OUT',
       'encoder__type_DEBIT', 'encoder__type_PAYMENT',
       'encoder__type_TRANSFER', 'remainder__amount',
       'remainder__oldbalanceOrg', 'remainder__newbalanceOrig',
       'remainder__oldbalanceDest', 'remainder__newbalanceDest',
       'remainder__isFlaggedFraud', 'customer_start_freq',
       'customer_recipient_freq', 'days', 'hours', 'minutes',
       'remainder__isFraud'],
      dtype='object')


 As per the observation, we can conclude that Oldbalanced has played major role in training. Followed by this, newbalanceDest has the second hightest relevance.
However, dose this really make sence ?
First 5 features, representing type of trasancation is done, do plays a part in predicting the fraudant transcation. 
Beside this, feature 6 to 10 feature are the most important one as they show money in amount, and if we were to choose some of the most important features, we would definitely choose the columns which demonstrats the amount of funds trasnfered or debited or credited. SO, in conclusion, machine learning model has came to genaralisation very well as i thought so.



In [36]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

xg_reg.fit(X_train,y_train)

preds = xg_reg.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 0.177169


In [50]:
new_preds  = (preds>0.5)
new_preds   = new_preds.astype(int)


67

In [60]:
accuracy = accuracy_score(y_test, new_preds)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print()


f1 = f1_score(y_test, new_preds)
print("f1: ", f1)
print()


# Calculate AUC ROC
auc_roc = roc_auc_score(y_test, new_preds)
print("AUC ROC: ",auc_roc)

Accuracy: 99.88%

f1:  0.07845433255269321

AUC ROC:  0.5204143814747105


## Gradient Boosting

In [62]:
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                 max_depth=1, random_state=0, loss='squared_error')
gbr.fit(X_train, y_train)

y_pred = gbr.predict(X_test)
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))

Mean squared error: 0.00


In [67]:
y_pred= (y_pred >0.5)
y_pred = y_pred.astype(int)

In [70]:

from sklearn.metrics import precision_score, recall_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print()


f1 = f1_score(y_test, y_pred)
print("f1: ", f1)
print()


# Calculate AUC ROC
auc_roc = roc_auc_score(y_test, y_pred)
print("AUC ROC: ",auc_roc)
print()


precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f"Precision = {precision}")
print(f"Recall = {recall}")

Accuracy: 99.88%

f1:  0.07845433255269321

AUC ROC:  0.5204143814747105

Precision = 1.0
Recall = 0.04082876294942109
