In [None]:
import networkx as nx
import matplotlib.pyplot as plt # graph visualisation
import pandas as pd
import numpy as np

# Credit-Card-2 Network Fraud

In [None]:
# Load the credit-card dataset (first dataset of the notebook) into `edges`.
edges = pd.read_csv(filepath_or_buffer='./credit-card-2/creditcard.csv')

In [None]:
# Print the column names, dtypes and non-null counts of the table just loaded.
edges.info()

# Synthetic Financial Dataset

In [None]:
# Load the node table of the synthetic dataset; its `id` column is used later
# as the node set of the graph and as the index of the metrics frame.
nodes = pd.read_csv(filepath_or_buffer='./credit-card/Nodes.csv')

In [None]:
# Print the column names, dtypes and non-null counts of the node table.
nodes.info()

In [None]:
# Load the synthetic transactions table. This rebinds `edges`, so the
# credit-card table loaded earlier is no longer reachable under that name.
edges = pd.read_csv(filepath_or_buffer='./credit-card/transactions.csv')

In [None]:
# Print the column names, dtypes and non-null counts of the transactions table.
edges.info()

In [None]:
# Show summary statistics of the numeric transaction columns.
edges.describe()

In [None]:
# Keep the `isFraud` column of the transactions table as its own Series and
# display its summary statistics.
fraud = edges.loc[:, 'isFraud']
fraud.describe()

In [None]:
# Edge list for the graph: one (nameOrig, nameDest) record per transaction row.
endpoint_columns = ['nameOrig', 'nameDest']
nec_edge_inf = list(edges[endpoint_columns].to_records(index=False))

In [None]:
# Sanity check: show the first (origin, destination) record of the edge list.
print(nec_edge_inf[0])

### Calculating the percentage of frauds

In [None]:
# Number of rows with a positive fraud flag, the total row count, and the
# fraud share (a fraction rounded to 6 decimals, not a percentage).
total_number = len(edges)
fraud_number = int((fraud > 0).sum())
fraud_share = round(fraud_number / total_number, 6)
print(total_number, fraud_number, fraud_share)


### Creating a dataframe to hold all the metrics

In [None]:
# Each row of this frame denotes a node of the graph; each column holds one
# graph metric, initialised to 0.0 until the metric cells below fill it in.
# The labels are given as a list: the previous dict literal only worked
# because iterating a dict yields its keys.
metric_columns = ['Closeness Centrality', 'EigenVector Centrality', 'PageRank']
data = pd.DataFrame(np.zeros((len(nodes.id), len(metric_columns))), columns=metric_columns)
# Index by node id so metrics can later be joined to transactions by account.
data.index = nodes.id
data.info()

## Creating graph G

In [None]:
%%time

G = nx.Graph()
G.add_nodes_from(nodes['id'])
G.add_edges_from(nec_edge_inf)


In [None]:
# Draw the whole graph with networkx's default layout.
# NOTE(review): presumably slow and unreadable on the full transaction graph — confirm.
nx.draw(G)

## Calculating the appropriate metrics of the graph
### Closeness centrality

In [None]:
%%time
tmp = {}
tmp = nx.closeness_centrality(G)

data['Closeness Centrality'] = tmp.values()

### Eigenvector Centrality

In [None]:
%%time

tmp = {}
tmp = nx.eigenvector_centrality(G)

data['EigenVector Centrality'] = tmp.values()

### PageRank

In [None]:
%%time
tmp = {}
tmp = nx.pagerank(G)

data['PageRank'] = tmp.values()

## Or read them from CSV

In [None]:
# Alternative to recomputing: rebuild `data` from cached metric files.
# Each file is read with its `id` column as the index and stored as one column.
# NOTE(review): presumably every file holds exactly one value column besides `id` — confirm.
metric_files = (
    ('PageRank', './metrics/credit_card/pagerank.csv'),
    ('Closeness Centrality', './metrics/credit_card/closeness.csv'),
    ('EigenVector Centrality', './metrics/credit_card/eigen.csv'),
)
data = pd.DataFrame()
for metric_name, metric_path in metric_files:
    data[metric_name] = pd.read_csv(metric_path, index_col='id')

## Adding metrics to edges information


In [None]:
# Feature table: the transactions without the `isFraud` label column.
edges_com = edges.drop(columns=['isFraud'])

## Loading Graph Analysis metrics of origin and destination nodes in each edge

In [None]:
# Attach the graph metrics of the origin account, then of the destination
# account, renaming the metric columns after each join so the two sets do not
# collide. Left joins keep every transaction even when an account has no metrics.
# NOTE(review): `right_on='id'` presumably resolves to the index of `data` named `id` — confirm.
orig_names = {'PageRank': 'origPagerank', 'Closeness Centrality': 'origCloseness', 'EigenVector Centrality': 'origEigen'}
dest_names = {'PageRank': 'destPagerank', 'Closeness Centrality': 'destCloseness', 'EigenVector Centrality': 'destEigen'}

edges_com = edges_com.merge(data, how='left', left_on='nameOrig', right_on='id')
edges_com = edges_com.rename(columns=orig_names)
edges_com = edges_com.merge(data, how='left', left_on='nameDest', right_on='id')
edges_com = edges_com.rename(columns=dest_names)

## Converting Strings into Numbers

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

# Account identifier columns, listed so the column transformer can drop them.
to_be_removed = ['nameDest', 'nameOrig']

# Categorical column: fill gaps with the most frequent value, then one-hot encode.
categorical_features = ['type']
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder()),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Graph-metric columns: fill gaps with the median, then rescale with MinMaxScaler.
numeric_features = [
    'origPagerank', 'origCloseness', 'origEigen',
    'destPagerank', 'destCloseness', 'destEigen',
]
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)


In [None]:
from sklearn.compose import ColumnTransformer

# Route each column group to its transformer. Columns not named in any group
# are passed through unchanged; the identifier columns are dropped.
column_routes = [
    ('numeric', numeric_transformer, numeric_features),
    ('categorical', categorical_transformer, categorical_features),
    ('remove', 'drop', to_be_removed),
]
preprocessor = ColumnTransformer(transformers=column_routes, remainder='passthrough')

# Replace the feature table with its transformed matrix.
edges_com = preprocessor.fit_transform(edges_com)


# Splitting Dataset into Train & Test Sets

### Calculate new train and test sets (Credit Card)

In [None]:
from sklearn.model_selection import train_test_split

# `edges` is rebound to the synthetic transactions table earlier in the
# notebook, so it no longer has a `Class` column when cells run top to bottom.
# Read the credit-card table under its own name to keep this cell self-contained.
creditcard = pd.read_csv('./credit-card-2/creditcard.csv')

# 75/25 split with a fixed seed; `Class` is the label column.
X_train, X_test, Y_train, Y_test = train_test_split(
    creditcard.drop(['Class'], axis=1),
    creditcard['Class'],
    test_size=0.25,
    random_state=42,
)


### Calculate new train and test sets (Synthetic Financial Datasets)

In [None]:
from sklearn.model_selection import train_test_split

# 75/25 split of the preprocessed feature matrix against the `isFraud` label,
# with a fixed seed so the split is reproducible.
synthetic_split = train_test_split(
    edges_com,
    edges['isFraud'],
    test_size=0.25,
    random_state=42,
)
X_train, X_test, Y_train, Y_test = synthetic_split


# Supervised Machine Learning


## Logistic Regression

In [None]:
%%time
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state=0).fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)

In [None]:
# NOTE: plot_confusion_matrix was removed in scikit-learn 1.2; on newer
# versions use ConfusionMatrixDisplay.from_estimator with the same arguments.
from sklearn.metrics import plot_confusion_matrix

# Confusion matrix of the logistic regression on the held-out set. The labels
# are bound as `Y_test` by the split cell; lowercase `y_test` was undefined.
plot_confusion_matrix(lr, X_test, Y_test, values_format='d')

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the logistic regression on the held-out set, with arguments in
# the documented (y_true, y_pred) order. `Y_test` is the name bound by the
# split cell; lowercase `y_test` was undefined.
print(accuracy_score(Y_test, y_pred_lr))

## Random Forest (Bagging)

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(max_features='auto', bootstrap=True)

rfc.fit(X_train,y_train) # build the tree of X_train

In [None]:
# Hard class predictions of the random forest on the held-out set.
y_pred_rf = rfc.predict(X_test)
# Accuracy in the documented (y_true, y_pred) order; `Y_test` is the name
# bound by the split cell (lowercase `y_test` was undefined).
print(accuracy_score(Y_test, y_pred_rf))

### Plot the confusion matrix

In [None]:
# NOTE: plot_confusion_matrix was removed in scikit-learn 1.2; on newer
# versions use ConfusionMatrixDisplay.from_estimator with the same arguments.
from sklearn.metrics import plot_confusion_matrix

# Confusion matrix of the random forest on the held-out set. The labels are
# bound as `Y_test` by the split cell; lowercase `y_test` was undefined.
plot_confusion_matrix(rfc, X_test, Y_test, values_format='d')


### Evaluate the Random Forest model on our test set

In [None]:
# Predictions of the random forest on the held-out set.
y_test_pred = rfc.predict(X_test)
# Mean accuracy on the held-out set; `Y_test` is the name bound by the split
# cell (lowercase `y_test` was undefined).
rfc.score(X_test, Y_test)

### Perform AdaBoost

In [None]:
%%time
from sklearn.ensemble import AdaBoostClassifier


ada = AdaBoostClassifier(n_estimators=50, random_state=0, algorithm='SAMME.R')
ada.fit(X_train, y_train)

In [None]:
# Predict on the held-out set, as the other models do; predicting on X_train
# would only measure how well the training data is fitted.
y_pred_ada = ada.predict(X_test)

In [None]:
# NOTE: plot_confusion_matrix was removed in scikit-learn 1.2; on newer
# versions use ConfusionMatrixDisplay.from_estimator with the same arguments.
from sklearn.metrics import plot_confusion_matrix

# Confusion matrix of AdaBoost on the held-out set. The labels are bound as
# `Y_test` by the split cell; lowercase `y_test` was undefined.
plot_confusion_matrix(ada, X_test, Y_test, values_format='d')

### Calculating probabilities of each model

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Probability of the positive class for each model. predict_proba returns one
# column per class, and column 1 is the second class.
# NOTE(review): assumes the label is binary with fraud as the second class — confirm.
rf_probs = rfc.predict_proba(X_test)[:, 1]
lr_probs = lr.predict_proba(X_test)[:, 1]
ada_probs = ada.predict_proba(X_test)[:, 1]

### Calculating ROC & AUC

In [None]:
# Area under the ROC curve per model, from positive-class probabilities.
# `Y_test` is the name bound by the split cell (lowercase `y_test` was undefined).
rf_auc = roc_auc_score(Y_test, rf_probs)
lr_auc = roc_auc_score(Y_test, lr_probs)
ada_auc = roc_auc_score(Y_test, ada_probs)

# ROC curve points (false/true positive rates); the thresholds are discarded.
rf_fpr, rf_tpr, _ = roc_curve(Y_test, rf_probs)
lr_fpr, lr_tpr, _ = roc_curve(Y_test, lr_probs)
ada_fpr, ada_tpr, _ = roc_curve(Y_test, ada_probs)


### Plotting ROC Curve

In [None]:


# One ROC curve per supervised model. The legend names the model that produced
# each curve: `lr_*` comes from LogisticRegression and `ada_*` from
# AdaBoostClassifier (previously labelled "Linear Regression" / "Naive Bayes").
plt.plot(lr_fpr, lr_tpr, linestyle='--', label='Logistic Regression')
plt.plot(rf_fpr, rf_tpr, marker='.', label='Random Forest')
plt.plot(ada_fpr, ada_tpr, marker='.', label='AdaBoost')

# Title
plt.title('ROC Plot')
# Axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# Show legend
plt.legend()
# Show plot
plt.show()



# Unsupervised Learning

## Synthetic Financial Datasets

### Isolation Forest

In [None]:
from sklearn.ensemble import IsolationForest

In [None]:
# Isolation Forest for the synthetic dataset: 100 trees, 5% of the samples
# treated as outliers, 16 features drawn per tree, all CPU cores, fixed seed.
# NOTE(review): max_features=16 presumably matches the width of X_train — confirm.
clf = IsolationForest(
    n_estimators=100,
    max_samples='auto',
    contamination=0.05,
    max_features=16,
    bootstrap=False,
    n_jobs=-1,
    random_state=42,
    verbose=0,
)

In [None]:
# Fit the Isolation Forest on the training features only (no labels are used).
# About 8-10 minutes
clf.fit(X_train)

In [None]:
# Label the held-out samples; IsolationForest.predict returns 1 for inliers
# and -1 for outliers.
clf_pred = clf.predict(X_test)

#### Create Anomaly Histogram

In [None]:
# Anomaly score and predicted label of every held-out sample, side by side.
df = pd.DataFrame({
    'scores': clf.decision_function(X_test),
    'anomaly_label': clf_pred,
})

In [None]:
import matplotlib.pyplot as plt
import plotly.express as px
# Readable name for each predicted label (-1 is the outlier label), then a
# histogram of the anomaly scores coloured by that name.
df['anomaly'] = ['outlier' if label == -1 else 'inlier' for label in df['anomaly_label']]
fig = px.histogram(df, x='scores', color='anomaly')
fig.show()

#### Confusion Matrix

In [None]:
# Re-encode the Isolation Forest labels to the fraud convention of the ground
# truth: outlier (-1) becomes 1, inlier (1) becomes 0; any other value is kept.
flagged_outlier = clf_pred == -1
flagged_inlier = clf_pred == 1
res = np.where(flagged_outlier, 1, np.where(flagged_inlier, 0, clf_pred))
clf_pred = res

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sn
import matplotlib.pyplot as plt # graph visualisation
# Confusion matrix of the true labels against the re-encoded Isolation Forest
# labels, drawn as an annotated heatmap.
cm = confusion_matrix(Y_test, clf_pred)

fig, ax = plt.subplots(figsize=(10, 7))
sn.heatmap(cm, annot=True, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Truth')

#### After applying PCA on columns

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise the training features without centring (with_mean=False), then
# project them onto 3 principal components.
scaler = StandardScaler(with_mean=False)
X = scaler.fit_transform(X_train)

pca = PCA(n_components=3)  # Reduce to k=3 dimensions
X_train_reduce = pca.fit_transform(X)

In [None]:
# Project the held-out features with the scaler and PCA already fitted on the
# training data. Re-fitting them on X_test, as before, learns a different
# scaling and different components, so test points would not live in the space
# the model is trained on, and test-set statistics would leak into preprocessing.
X = scaler.transform(X_test)
X_test_reduce = pca.transform(X)

In [None]:
# Isolation Forest for the PCA-reduced data: same settings as before, but
# max_features=3 to match the 3 principal components.
clf = IsolationForest(
    n_estimators=100,
    max_samples='auto',
    contamination=0.05,
    max_features=3,
    bootstrap=False,
    n_jobs=-1,
    random_state=42,
    verbose=0,
)

In [None]:
# Fit the Isolation Forest on the PCA-reduced training features.
clf.fit(X_train_reduce)

#### Create Anomaly Histogram

In [None]:
# Label the PCA-reduced held-out samples; IsolationForest.predict returns 1
# for inliers and -1 for outliers.
clf_pred_pca = clf.predict(X_test_reduce)

In [None]:
# Anomaly score and predicted label of every PCA-reduced held-out sample.
df = pd.DataFrame({
    'scores': clf.decision_function(X_test_reduce),
    'anomaly_label': clf_pred_pca,
})

In [None]:
import matplotlib.pyplot as plt
import plotly.express as px
# Readable name for each predicted label (-1 is the outlier label), then a
# histogram of the anomaly scores coloured by that name.
df['anomaly'] = ['outlier' if label == -1 else 'inlier' for label in df['anomaly_label']]
fig = px.histogram(df, x='scores', color='anomaly')
fig.show()

#### Confusion Matrix

In [None]:
# Re-encode the PCA-model labels to the fraud convention of the ground truth:
# outlier (-1) becomes 1, inlier (1) becomes 0; any other value is kept.
pca_outlier = clf_pred_pca == -1
pca_inlier = clf_pred_pca == 1
res_pca = np.where(pca_outlier, 1, np.where(pca_inlier, 0, clf_pred_pca))

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sn
import matplotlib.pyplot as plt # graph visualisation
# Confusion matrix of the true labels against the re-encoded labels of the
# PCA-based Isolation Forest, drawn as an annotated heatmap.
cm = confusion_matrix(Y_test, res_pca)

fig, ax = plt.subplots(figsize=(10, 7))
sn.heatmap(cm, annot=True, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Truth')

### K-Means Clustering

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Two clusters, intended to separate regular from fraudulent transactions.
# K-Means starts from random centroids, so a fixed seed is set to make the
# clustering reproducible, as the other K-Means cells of this notebook do.
clustering = KMeans(n_clusters=2, random_state=42)

In [None]:
# Fit the two-cluster K-Means model on the training features (no labels used).
clustering.fit(X_train)

In [None]:
# Assign each held-out sample to its nearest cluster (cluster ids 0 or 1).
# NOTE(review): cluster ids are arbitrary; id 1 is not guaranteed to be the fraud cluster — confirm before scoring.
km_pred = clustering.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sn
import matplotlib.pyplot as plt # graph visualisation
# Confusion matrix of the true labels against the K-Means cluster ids, drawn
# as an annotated heatmap.
cm = confusion_matrix(Y_test, km_pred)

fig, ax = plt.subplots(figsize=(10, 7))
sn.heatmap(cm, annot=True, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Truth')

#### After applying PCA on columns 

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise the training features, then project them onto 3 principal
# components for the K-Means run.
scaler = StandardScaler()
X = scaler.fit_transform(X_train)

pca = PCA(n_components=3)  # Reduce to k=3 dimensions
X_train_reduce = pca.fit_transform(X)

In [None]:
# Project the held-out features with the scaler and PCA already fitted on the
# training data. Re-fitting them on X_test, as before, learns a different
# scaling and different components, so test points would not live in the space
# the clusters are fitted in, and test-set statistics would leak into preprocessing.
X = scaler.transform(X_test)
X_test_reduce = pca.transform(X)

In [None]:
from sklearn.cluster import KMeans

# Two-cluster K-Means on the PCA-reduced data, seeded, using the "elkan"
# variant with a generous iteration cap.
kmeans_options = dict(n_clusters=2, random_state=0, algorithm="elkan", max_iter=10000)
clustering = KMeans(**kmeans_options)
clustering.fit(X_train_reduce)
# Cluster id (0 or 1) of each PCA-reduced held-out sample.
km_pca_pred = clustering.predict(X_test_reduce)

#### Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sn
import matplotlib.pyplot as plt # graph visualisation
# Confusion matrix of the true labels against the cluster ids of the PCA-based
# K-Means, drawn as an annotated heatmap.
cm = confusion_matrix(Y_test, km_pca_pred)

fig, ax = plt.subplots(figsize=(10, 7))
sn.heatmap(cm, annot=True, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Truth')

### ROC Curve & AUROC Score

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

In [None]:
# AUROC of each unsupervised model from its hard 0/1 predictions.
# `clf_pred` was already re-encoded to 1 = outlier. The PCA model must be
# scored with its re-encoded labels `res_pca` too: the raw `clf_pred_pca`
# uses -1 for outliers, which ranks fraud below normal and inverts the score.
clf_auc = roc_auc_score(Y_test, clf_pred)
clf_pca_auc = roc_auc_score(Y_test, res_pca)
# NOTE(review): K-Means cluster ids are arbitrary; confirm that id 1 is the fraud cluster.
km_auc = roc_auc_score(Y_test, km_pred)
km_pca_auc = roc_auc_score(Y_test, km_pca_pred)

In [None]:
# Report the AUROC of each unsupervised model, rounded to 3 decimals.
auc_report = (
    ('Isolation Forest: AUROC = %.3f', clf_auc),
    ('Isolation Forest PCA: AUROC = %.3f', clf_pca_auc),
    ('K-Means: AUROC = %.3f', km_auc),
    ('K-Means PCA: AUROC = %.3f', km_pca_auc),
)
for report_template, auc_value in auc_report:
    print(report_template % (auc_value))

In [None]:
# ROC curve points per unsupervised model; the thresholds are discarded.
# The PCA Isolation Forest is scored with its re-encoded labels `res_pca`
# (1 = outlier), since the raw `clf_pred_pca` uses -1 for outliers and would
# produce an inverted curve.
clf_fpr, clf_tpr, _ = roc_curve(Y_test, clf_pred)
clf_pca_fpr, clf_pca_tpr, _ = roc_curve(Y_test, res_pca)
km_fpr, km_tpr, _ = roc_curve(Y_test, km_pred)
km_pca_fpr, km_pca_tpr, _ = roc_curve(Y_test, km_pca_pred)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# One ROC curve per unsupervised model, each labelled with its AUROC.
roc_series = (
    (clf_fpr, clf_tpr, {'linestyle': '--'}, 'Isolation Forrest (AUROC = %0.3f)' % clf_auc),
    (clf_pca_fpr, clf_pca_tpr, {'marker': '.'}, 'Isolation Forrest PCA (AUROC = %0.3f)' % clf_pca_auc),
    (km_fpr, km_tpr, {'marker': '.'}, 'K-Means (AUROC = %0.3f)' % km_auc),
    (km_pca_fpr, km_pca_tpr, {'marker': '.'}, 'K-Means PCA (AUROC = %0.3f)' % km_pca_auc),
)
for fpr, tpr, line_style, legend_label in roc_series:
    plt.plot(fpr, tpr, label=legend_label, **line_style)

# Title, axis labels and legend, then render the figure.
plt.title('ROC Plot')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

## Credit Card Fraud System with 28 PCA Columns

In [None]:
from sklearn.metrics import classification_report,accuracy_score

### Isolation Forest

In [None]:
# Isolation Forest for the credit-card dataset: 100 trees, 10% of the samples
# treated as outliers, 28 features drawn per tree, all CPU cores, fixed seed.
clf = IsolationForest(
    n_estimators=100,
    max_samples='auto',
    contamination=0.1,
    max_features=28,
    bootstrap=False,
    n_jobs=-1,
    random_state=42,
    verbose=0,
)

In [None]:
# Fit on the training features, then label the held-out samples
# (IsolationForest.predict returns 1 for inliers and -1 for outliers).
# NOTE(review): X_train/X_test must come from the credit-card split, not the synthetic one — confirm which split cell ran last.
clf.fit(X_train)
clf_pred = clf.predict(X_test)

In [None]:
# Anomaly score and predicted label of every held-out sample, side by side.
df = pd.DataFrame({
    'scores': clf.decision_function(X_test),
    'anomaly_label': clf_pred,
})

In [None]:
import matplotlib.pyplot as plt
import plotly.express as px
# Readable name for each predicted label (-1 is the outlier label), then a
# histogram of the anomaly scores coloured by that name.
df['anomaly'] = ['outlier' if label == -1 else 'inlier' for label in df['anomaly_label']]
fig = px.histogram(df, x='scores', color='anomaly')
fig.show()

In [None]:
# Re-encode the Isolation Forest labels to the fraud convention of the ground
# truth: outlier (-1) becomes 1, inlier (1) becomes 0; any other value is kept.
flagged_outlier = clf_pred == -1
flagged_inlier = clf_pred == 1
res = np.where(flagged_outlier, 1, np.where(flagged_inlier, 0, clf_pred))
clf_pred = res

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sn
import matplotlib.pyplot as plt # graph visualisation
# Confusion matrix of the true labels against the re-encoded Isolation Forest
# labels, drawn as an annotated heatmap.
cm = confusion_matrix(Y_test, clf_pred)

fig, ax = plt.subplots(figsize=(10, 7))
sn.heatmap(cm, annot=True, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Truth')

### KMeans

In [None]:
# Seeded two-cluster K-Means: fit on the training features, then assign each
# held-out sample to its nearest cluster (ids 0 or 1).
clustering = KMeans(random_state=42, n_clusters=2)
km_pred = clustering.fit(X_train).predict(X_test)

In [None]:
# Number of held-out samples whose K-Means cluster id differs from the label.
n_errors = (km_pred != Y_test).sum()
# Run Classification Metrics
# The count is for the K-Means predictions, so it is reported under that name
# (it was previously printed as 'isolation forrest').
print("{}: {}".format('K-Means', n_errors))
print("Accuracy Score :")
print(accuracy_score(Y_test, km_pred))
print("Classification Report :")
print(classification_report(Y_test, km_pred))

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sn
import matplotlib.pyplot as plt # graph visualisation
# Confusion matrix of the true labels against the K-Means cluster ids, drawn
# as an annotated heatmap.
cm = confusion_matrix(Y_test, km_pred)

fig, ax = plt.subplots(figsize=(10, 7))
sn.heatmap(cm, annot=True, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Truth')

### ROC Curve & AUROC Score

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

In [None]:
# AUROC of the Isolation Forest (re-encoded 0/1 labels) and of K-Means
# (cluster ids), both computed from hard predictions.
clf_auc, km_auc = (
    roc_auc_score(Y_test, hard_labels) for hard_labels in (clf_pred, km_pred)
)

In [None]:
# Report the AUROC of both models, rounded to 3 decimals.
auc_report = (
    ('Isolation Forrest: AUROC = %.3f', clf_auc),
    ('K-Means: AUROC = %.3f', km_auc),
)
for report_template, auc_value in auc_report:
    print(report_template % (auc_value))

In [None]:
# ROC curve points (false/true positive rates) for both models; the third
# return value (thresholds) is discarded into `_`.
clf_fpr, clf_tpr, _ = roc_curve(Y_test, clf_pred)
km_fpr, km_tpr, _ = roc_curve(Y_test, km_pred)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# One ROC curve per model, each labelled with its AUROC.
roc_series = (
    (clf_fpr, clf_tpr, {'linestyle': '--'}, 'Isolation Forest (AUROC = %0.3f)' % clf_auc),
    (km_fpr, km_tpr, {'marker': '.'}, 'K-Means (AUROC = %0.3f)' % km_auc),
)
for fpr, tpr, line_style, legend_label in roc_series:
    plt.plot(fpr, tpr, label=legend_label, **line_style)

# Title, axis labels and legend, then render the figure.
plt.title('ROC Plot')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()