## Sampling

In [1]:
import numpy as np
import pandas as pd
df_sample=pd.read_csv(r"C:\Users\23059\OneDrive\Desktop\Amiira\Y3S1\fyp\sample.csv")

In [2]:
df_sample.dtypes

step                int64
type               object
amount            float64
nameOrig           object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest           object
oldbalanceDest    float64
newbalanceDest    float64
isFraud             int64
isFlaggedFraud      int64
dtype: object

In [3]:
df_sample['type'] = df_sample['type'].replace({'CASH_OUT': 0, 'CASH_IN': 1, 'PAYMENT': 2, 'TRANSFER': 3, 'DEBIT': 4})

In [4]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
label = le.fit_transform(df_sample['nameDest'])
label
df_sample.drop("nameDest", axis=1, inplace=True)
df_sample["nameDest"] = label

le = LabelEncoder()
label = le.fit_transform(df_sample['nameOrig'])
label
df_sample.drop("nameOrig", axis=1, inplace=True)
df_sample["nameOrig"] = label

df_sample['type'] = df_sample['type'].replace({'CASH_OUT': 0, 'CASH_IN': 1, 'PAYMENT': 2, 'TRANSFER': 3, 'DEBIT': 4})

In [5]:
X = df_sample.drop('isFraud', axis=1)
# Separate the target variable
y = df_sample['isFraud']

In [6]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# #Upsampling via SMOTE
# smote = SMOTE(sampling_strategy=0.4, random_state=0)

# #Downsample via RandomUnderSampler
# rus = RandomUnderSampler(sampling_strategy=0.9, random_state=0)

#Upsampling via SMOTE
smote = SMOTE(sampling_strategy=0.55, random_state=0)

#Application of the resampling methods
X_resampled, y_resampled = smote.fit_resample(X, y)
# X_resampled, y_resampled = rus.fit_resample(X_resampled, y_resampled)
# from imblearn.over_sampling import SMOTE

# #Upsampling via SMOTE
# smote = SMOTE(sampling_strategy=0.55, random_state=0)

# # Fit and apply the resampler to the entire dataset
# X_resampled, y_resampled = smote.fit_resample(X, y)

In [7]:
from imblearn.under_sampling import TomekLinks

#Resample using TomekLinks first
tomek_links = TomekLinks(sampling_strategy='majority')
X_train_resampled, y_train_resampled = tomek_links.fit_resample(X_resampled, y_resampled)

In [8]:
from imblearn.under_sampling import TomekLinks, EditedNearestNeighbours,OneSidedSelection
# resample the output of TomekLinks using EditedNearestNeighbours
enn = EditedNearestNeighbours(sampling_strategy='majority')
X_train_resampled_new, y_train_resampled_new = enn.fit_resample(X_train_resampled, y_train_resampled)

In [9]:
from imblearn.under_sampling import OneSidedSelection
# resample the output of EditedNearestNeighbours using One-Sided Selection
oss = OneSidedSelection(sampling_strategy='majority')
X_train_resampled_final, y_train_resampled_final = oss.fit_resample(X_train_resampled_new, y_train_resampled_new)

In [10]:
# reverse_map = {0: 'CASH_OUT', 1: 'CASH_IN', 2: 'PAYMENT', 3: 'TRANSFER', 4: 'DEBIT'}
# df_sample['type'] = df_sample['type'].replace(reverse_map)

In [11]:
combined_data = pd.concat([X_train_resampled_final, y_train_resampled_final], axis=1)

In [12]:
reverse_map = {0: 'CASH_OUT', 1: 'CASH_IN', 2: 'PAYMENT', 3: 'TRANSFER', 4: 'DEBIT'}
combined_data ['type'] = combined_data ['type'].replace(reverse_map)

In [13]:
from sklearn.cluster import KMeans

# This code transforms the continuous variables (step and amount) taking into account ‘isFraud Class’ which is the objective  
# of this research. A k-means cluster model with 3 features, ‘isFraud’, ‘step’ and ‘amount’ is used.
# The variables ‘step_cluster’ and ‘amount_cluster’ are created for every single transaction.

continuous_vars = ['step','amount','isFraud']

# Normalizing the data
normalized_df = (combined_data[continuous_vars] - combined_data[continuous_vars].mean()) / combined_data[continuous_vars].std()

# Determining the optimal number of clusters
kmeans_kwargs = {"init": "random", "n_init": 15, "max_iter": 400, "random_state": 14}
sse = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs,algorithm='full')
    kmeans.fit(normalized_df)
    sse.append(kmeans.inertia_)
    
# # Application of th elbow method 
# import matplotlib.pyplot as plt
# plt.style.use("fivethirtyeight")
# plt.plot(range(1, 11), sse)
# plt.xticks(range(1, 11))
# plt.xlabel("Number of Clusters")
# plt.ylabel("SSE")
# plt.show()

# Application of K-Means clustering
kmeans = KMeans(n_clusters=5, **kmeans_kwargs,algorithm='full',tol=0.0010)
kmeans.fit(normalized_df)
clusters = kmeans.predict(normalized_df)

# Using K-Means clustering to create clusters for the continuous variables "step", "amount" while taking "isFraud" into consideration
cluster_df = pd.DataFrame()
for var in continuous_vars:
    cluster_centers = kmeans.cluster_centers_[:, continuous_vars.index(var)]
    col_name = var + '_cluster'
    cluster_df[col_name] = kmeans.predict(normalized_df)
    
    # Replace cluster labels with desired values
    
    if var == 'isFraud':
        cluster_df[col_name] = cluster_df[col_name].apply(lambda x: 'Fraud' if x == 1 else 'No_Fraud')
  
    elif var=='amount':
        bin_edges = [0, 50000, 100000, 500000, 1000000, 5000000, 10000000, 60000000, 70000000]
        cluster_labels = ['0-50k', '50k-100k', '100k-500k', '500k-1M', '1M-5M', '5M-10M', '10M-60M', '60M-70M']
        cluster_df[col_name] = pd.cut(combined_data['amount'], bins=bin_edges, labels=cluster_labels)
   
    elif var == 'step':
        bin_edges = [1, 186, 372, 558, 743] 
        cluster_labels = ['1-186', '187-372', '373-558', '559-743']
        cluster_df[col_name] = pd.cut(combined_data['step'], bins=bin_edges, labels=cluster_labels)

# Create new dataframe to store the converted values
K_means_df = pd.DataFrame()
K_means_df = pd.concat([K_means_df, cluster_df], axis=1)

In [14]:
# from sklearn.metrics import silhouette_score

# silhouette_score = silhouette_score(normalized_df, clusters)
# print("Silhouette Score:", silhouette_score)

In [15]:
inertia = kmeans.inertia_
print("Inertia:", inertia)

Inertia: 490617.0471558473


In [16]:
from sklearn.metrics import davies_bouldin_score

davies_bouldin_score = davies_bouldin_score(normalized_df, clusters)
print("Davies-Bouldin Index:", davies_bouldin_score)

Davies-Bouldin Index: 0.660712106013662


In [17]:
K_means_df

Unnamed: 0,step_cluster,amount_cluster,isFraud_cluster
0,187-372,0-50k,Fraud
1,187-372,100k-500k,Fraud
2,1-186,100k-500k,No_Fraud
3,187-372,0-50k,Fraud
4,187-372,0-50k,Fraud
...,...,...,...
1006127,1-186,0-50k,No_Fraud
1006128,373-558,1M-5M,No_Fraud
1006129,373-558,0-50k,No_Fraud
1006130,187-372,1M-5M,No_Fraud


In [18]:
from tabulate import tabulate
from termcolor import colored

# Get the counts for each cluster label
amount_cluster_counts = cluster_df['amount_cluster'].value_counts()
step_cluster_counts = cluster_df['step_cluster'].value_counts()
isFraud_cluster_counts = cluster_df['isFraud_cluster'].value_counts()

# Print the counts with labels
print(colored("Amount cluster counts:", 'blue'))
print(tabulate(amount_cluster_counts.to_frame().T, headers='keys', tablefmt='fancy_grid', numalign='center', stralign='center', colalign=("center",), showindex=False, disable_numparse=True))

print(colored("\nStep cluster counts:", 'blue'))
print(tabulate(step_cluster_counts.to_frame().T, headers='keys', tablefmt='fancy_grid', numalign='center', stralign='center', colalign=("center",), showindex=False, disable_numparse=True))

print(colored("\nIsFraud cluster counts:", 'blue'))
print(tabulate(isFraud_cluster_counts.to_frame().T, headers='keys', tablefmt='fancy_grid', numalign='center', stralign='center', colalign=("center",), showindex=False, disable_numparse=True))

Amount cluster counts:
╒═════════════╤═════════╤════════════╤═════════╤═══════════╤══════════╤═══════════╤═══════════╕
│  100k-500k  │  0-50k  │  50k-100k  │  1M-5M  │  500k-1M  │  5M-10M  │  10M-60M  │  60M-70M  │
╞═════════════╪═════════╪════════════╪═════════╪═══════════╪══════════╪═══════════╪═══════════╡
│   391038    │ 283620  │   109947   │  95700  │   83302   │  42251   │    270    │     3     │
╘═════════════╧═════════╧════════════╧═════════╧═══════════╧══════════╧═══════════╧═══════════╛

Step cluster counts:
╒═══════════╤═════════╤═══════════╤═══════════╕
│  187-372  │  1-186  │  373-558  │  559-743  │
╞═══════════╪═════════╪═══════════╪═══════════╡
│  420208   │ 285301  │  211682   │   88699   │
╘═══════════╧═════════╧═══════════╧═══════════╛

IsFraud cluster counts:
╒════════════╤═════════╕
│  No_Fraud  │  Fraud  │
╞════════════╪═════════╡
│   697962   │ 308170  │
╘════════════╧═════════╛


In [19]:
# Check the data types of columns after conversion
print(K_means_df.dtypes)

step_cluster       category
amount_cluster     category
isFraud_cluster      object
dtype: object


In [20]:
K_means_df= pd.concat([combined_data['type'].reset_index(drop=True), K_means_df], axis=1)

In [21]:
# Change data types to object
K_means_df = K_means_df .astype('object')

# Check the data types of columns after conversion
print(K_means_df .dtypes)

type               object
step_cluster       object
amount_cluster     object
isFraud_cluster    object
dtype: object


### Split

In [22]:
X = K_means_df.drop('isFraud_cluster', axis=1)
# Separate the target variable
y = K_means_df['isFraud_cluster']

from sklearn.model_selection import train_test_split

# Print class distribution before split
print(y.value_counts(normalize=True))

# Assuming X contains your features and y contains your target variable
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.1, stratify=y, random_state=2)

# Print class distribution after split
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))

No_Fraud    0.693708
Fraud       0.306292
Name: isFraud_cluster, dtype: float64
No_Fraud    0.693708
Fraud       0.306292
Name: isFraud_cluster, dtype: float64
No_Fraud    0.693711
Fraud       0.306289
Name: isFraud_cluster, dtype: float64


In [23]:
analysis_data = pd.concat([X_train, y_train], axis=1)

In [24]:
# test_data = pd.concat([X_test, y_test], axis=1)
test_data = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)
test_data

Unnamed: 0,type,step_cluster,amount_cluster,isFraud_cluster
0,PAYMENT,1-186,0-50k,No_Fraud
1,TRANSFER,373-558,100k-500k,No_Fraud
2,CASH_OUT,187-372,100k-500k,Fraud
3,TRANSFER,1-186,100k-500k,No_Fraud
4,CASH_OUT,559-743,0-50k,Fraud
...,...,...,...,...
100609,CASH_IN,1-186,50k-100k,No_Fraud
100610,TRANSFER,559-743,500k-1M,No_Fraud
100611,CASH_IN,373-558,0-50k,Fraud
100612,CASH_OUT,187-372,100k-500k,No_Fraud


In [25]:
# K_means_df= pd.concat([test_data['type'].reset_index(drop=True), K_means_df], axis=1)

In [26]:
# Change data types to object
K_means_df= K_means_df.astype('object')

# Check the data types of columns after conversion
print(K_means_df.dtypes)

type               object
step_cluster       object
amount_cluster     object
isFraud_cluster    object
dtype: object


In [28]:
# Predict for all rows in the test data
predictions = []
TP = 0  # True Positives
FP = 0  # False Positives
TN = 0  # True Negatives
FN = 0  # False Negatives

for i in range(len(test_data)):
    test_instance = test_data.iloc[i]
    prediction = chef.predict(model, test_instance)
    predictions.append(prediction)

   # Compare the prediction with the actual target value
    if prediction == "Fraud" and y_test.iloc[i] == "Fraud":
        TP += 1
    elif prediction == "Fraud" and y_test.iloc[i] == "No_Fraud":
        FP += 1
    elif prediction == "No_Fraud" and y_test.iloc[i] == "No_Fraud":
        TN += 1
    elif prediction == "No_Fraud" and y_test.iloc[i] == "Fraud":
        FN += 1

# Print the predictions
print(predictions)

# Print the counts for TP, FP, TN, and FN
print("True Positives (TP):", TP)
print("False Positives (FP):", FP)
print("True Negatives (TN):", TN)
print("False Negatives (FN):", FN)

['No_Fraud', 'No_Fraud', 'Fraud', 'No_Fraud', 'No_Fraud', 'Fraud', 'Fraud', 'No_Fraud', 'No_Fraud', 'No_Fraud', 'No_Fraud', 'Fraud', 'Fraud', 'Fraud', 'No_Fraud', 'No_Fraud', 'No_Fraud', 'No_Fraud', 'Fraud', 'Fraud', 'No_Fraud', 'No_Fraud', 'No_Fraud', 'No_Fraud', 'No_Fraud', 'Fraud', 'Fraud', 'No_Fraud', 'Fraud', 'Fraud', 'Fraud', 'No_Fraud', 'No_Fraud', 'No_Fraud', 'Fraud', 'No_Fraud', 'No_Fraud', 'No_Fraud', 'No_Fraud', 'Fraud', 'Fraud', 'Fraud', 'No_Fraud', 'No_Fraud', 'No_Fraud', 'Fraud', 'No_Fraud', 'Fraud', 'Fraud', 'No_Fraud', 'Fraud', 'No_Fraud', 'No_Fraud', 'No_Fraud', 'Fraud', 'No_Fraud', 'No_Fraud', 'Fraud', 'No_Fraud', 'Fraud', 'No_Fraud', 'No_Fraud', 'No_Fraud', 'No_Fraud', 'No_Fraud', 'Fraud', 'No_Fraud', 'Fraud', 'No_Fraud', 'No_Fraud', 'No_Fraud', 'No_Fraud', 'No_Fraud', 'No_Fraud', 'No_Fraud', 'Fraud', 'No_Fraud', 'No_Fraud', 'No_Fraud', 'No_Fraud', 'Fraud', 'No_Fraud', 'No_Fraud', 'Fraud', 'No_Fraud', 'Fraud', 'Fraud', 'Fraud', 'Fraud', 'Fraud', 'Fraud', 'No_Fraud', 

In [29]:
# Calculate precision
precision = TP / (TP + FP)

# Calculate recall
recall = TP / (TP + FN)

# Calculate F1 score
f1_score = 2 * (precision * recall) / (precision + recall)

# Print the results
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)

Precision: 0.6580245212839234
Recall: 0.7749943213161566
F1 Score: 0.7117356061509119
