In [1]:

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import time

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset



In [2]:
# Check if GPU is available and set the device accordingly
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [3]:
non_fraud_df = pd.read_csv('data/non_fraud_df.csv')
non_fraud_df.head()

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use_Chip,Merchant_Name,Merchant_City,...,Zip,MCC,Errors,Is_Fraud,Hour,Minute,DateTime,DayOfWeek,City_Transactions,State_Transactions
0,0,0,2002,9,1,06:21,$134.09,Swipe Transaction,3527213246127876953,La Verne,...,91750.0,5300,,No,6,21,2002-09-01 06:21:00,6,10912,2591830.0
1,0,0,2002,9,1,06:42,$38.48,Swipe Transaction,-727612092139916043,Monterey Park,...,91754.0,5411,,No,6,42,2002-09-01 06:42:00,6,7319,2591830.0
2,0,0,2002,9,2,06:22,$120.34,Swipe Transaction,-727612092139916043,Monterey Park,...,91754.0,5411,,No,6,22,2002-09-02 06:22:00,0,7319,2591830.0
3,0,0,2002,9,2,17:45,$128.95,Swipe Transaction,3414527459579106770,Monterey Park,...,91754.0,5651,,No,17,45,2002-09-02 17:45:00,0,7319,2591830.0
4,0,0,2002,9,3,06:23,$104.71,Swipe Transaction,5817218446178736267,La Verne,...,91750.0,5912,,No,6,23,2002-09-03 06:23:00,1,10912,2591830.0


In [4]:
non_fraud_df.drop(non_fraud_df.columns[0], axis=1,inplace=True)
non_fraud_df.head()

Unnamed: 0,Card,Year,Month,Day,Time,Amount,Use_Chip,Merchant_Name,Merchant_City,Merchant_State,Zip,MCC,Errors,Is_Fraud,Hour,Minute,DateTime,DayOfWeek,City_Transactions,State_Transactions
0,0,2002,9,1,06:21,$134.09,Swipe Transaction,3527213246127876953,La Verne,CA,91750.0,5300,,No,6,21,2002-09-01 06:21:00,6,10912,2591830.0
1,0,2002,9,1,06:42,$38.48,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No,6,42,2002-09-01 06:42:00,6,7319,2591830.0
2,0,2002,9,2,06:22,$120.34,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,5411,,No,6,22,2002-09-02 06:22:00,0,7319,2591830.0
3,0,2002,9,2,17:45,$128.95,Swipe Transaction,3414527459579106770,Monterey Park,CA,91754.0,5651,,No,17,45,2002-09-02 17:45:00,0,7319,2591830.0
4,0,2002,9,3,06:23,$104.71,Swipe Transaction,5817218446178736267,La Verne,CA,91750.0,5912,,No,6,23,2002-09-03 06:23:00,1,10912,2591830.0


In [5]:
## Let's remove the white space in the columns and replace them with underscore
non_fraud_df.columns = [x.replace(" ", "_").replace("?", "") for x in non_fraud_df.columns]
non_fraud_df.columns

Index(['Card', 'Year', 'Month', 'Day', 'Time', 'Amount', 'Use_Chip',
       'Merchant_Name', 'Merchant_City', 'Merchant_State', 'Zip', 'MCC',
       'Errors', 'Is_Fraud', 'Hour', 'Minute', 'DateTime', 'DayOfWeek',
       'City_Transactions', 'State_Transactions'],
      dtype='object')

In [6]:
non_fraud_df.shape

(24357143, 20)

## Undersampling

In [7]:
# Set the desired size of the subsample
subsample_size = 500000

# Create a random subsample of the DataFrame
subsample_df = non_fraud_df.sample(n=subsample_size, random_state=42)

# Print the shape of the subsample DataFrame
print(subsample_df.shape)

(500000, 20)


In [8]:
def feature_eng(df):
    """ Function to fill nulls and label encode the following columns: 'Errors', 'Use_Chip', 'Merchant_City', 'Merchant_State' """
    cols_to_encode = ['Errors', 'Use_Chip', 'Merchant_City', 'Merchant_State']
    trans = df.copy()
    labelencoder = LabelEncoder()

    # label encoding categrical column values
    for col in cols_to_encode:
        trans[col].fillna("0", inplace = True)
        trans[col + '_Encoded'] = labelencoder.fit_transform(trans[col]) ## Label encoding all stated columns so that later our algorithm can understand the numerical data

    # Remove non-numeric characters from the 'Amount' column
    trans['Amount'] = trans['Amount'].replace('[^\d.]', '', regex=True).astype(float)

    trans.drop(cols_to_encode, axis=1, inplace=True) ## Dropping the original columns, since we have now created new encoded columns

    trans['Zip'].fillna(0, inplace = True) ## Filling all null values in the zip column with 0
    trans['Time'] = trans['Time'].str[:2].astype('int')

    return trans

non_fraud_new = feature_eng(subsample_df)
non_fraud_new.head()

Unnamed: 0,Card,Year,Month,Day,Time,Amount,Merchant_Name,Zip,MCC,Is_Fraud,Hour,Minute,DateTime,DayOfWeek,City_Transactions,State_Transactions,Errors_Encoded,Use_Chip_Encoded,Merchant_City_Encoded,Merchant_State_Encoded
17709708,1,2012,11,24,14,137.46,-5920216026753360007,75043.0,4111,No,14,19,2012-11-24 14:19:00,5,64726,1793298.0,0,2,2605,133
4379891,0,2016,11,21,6,162.71,3939208886321873731,21144.0,5912,No,6,24,2016-11-21 06:24:00,0,19705,377134.0,0,0,6523,76
18126888,0,2019,3,23,17,26.55,6577593968895566254,92805.0,5813,No,17,42,2019-03-23 17:42:00,5,28095,2591830.0,0,0,145,17
19933384,0,2011,12,5,4,17.1,1913477460590765860,8012.0,5300,No,4,40,2011-12-05 04:40:00,0,5065,630317.0,0,2,605,95
14183458,1,2016,5,1,11,77.0,-1288082279022882052,80013.0,5499,No,11,57,2016-05-01 11:57:00,6,49607,273730.0,0,0,292,18


In [9]:
non_fraud_new.shape

(500000, 20)

In [10]:
fraud_df = pd.read_csv('data/fraud.csv')
fraud_df.head(3)

Unnamed: 0,User,Card,Year,Month,Day,Time,Amount,Use_Chip,Merchant_Name,Merchant_City,...,Zip,MCC,Errors,Is_Fraud,Hour,Minute,DateTime,DayOfWeek,City_Transactions,State_Transactions
0,0,0,2015,11,15,12:55,$287.13,Online Transaction,-8194607650924472520,ONLINE,...,,3001,,Yes,12,55,2015-11-15 12:55:00,6,2720821,
1,0,0,2015,11,15,13:19,$2.41,Online Transaction,-7759074308363763111,ONLINE,...,,5651,,Yes,13,19,2015-11-15 13:19:00,6,2720821,
2,0,0,2015,11,16,09:41,$50.81,Online Transaction,-551332107213382088,ONLINE,...,,4411,,Yes,9,41,2015-11-16 09:41:00,0,2720821,


In [11]:
fraud_df = feature_eng(fraud_df.copy())

In [31]:
combined_df = pd.concat([fraud_df, non_fraud_new], axis=0)

# Reset the index of the combined dataframe
combined_df = combined_df.reset_index(drop=True)
combined_df.shape

(529757, 21)

## **Gaussian Mixture Model**

In [32]:
from sklearn.mixture import GaussianMixture


def build_model(df):

    
    # Create feature matrix X
    X = df.values

    # Specify the number of components for GMM
    n_components = 3  # Adjust as per your requirement

    # Fit a GMM to the data
    gmm = GaussianMixture(n_components=n_components, random_state=42)
    gmm.fit(X)

    return gmm


In [35]:
feature_list = ['User', 'Card', 'Amount']

train_data = combined_df[feature_list]
train_data = train_data.dropna()

model = build_model(train_data)

In [36]:
train_data.shape

(29757, 3)

In [38]:

# Predict the cluster labels for the data
labels = model.predict(train_data)

# Add the cluster labels as a new column in the combined dataframe
train_data['Cluster'] = labels



## **Evaluation**

In [41]:
from sklearn.metrics import silhouette_score,adjusted_rand_score

# Evaluate the clustering performance using silhouette score
silhouette_avg = silhouette_score(train_data, labels)
print(f"Silhouette Score: {silhouette_avg}")


Silhouette Score: -0.0057121806798297444


In [46]:
import plotly.express as px

# Assuming you have a dataframe named 'combined_df' with columns 'User', 'Card', 'Amount', and 'Cluster'

# Create a 3D scatter plot
fig = px.scatter_3d(train_data, x='User', y='Card', z='Amount', color='Cluster')

# Set labels and title
fig.update_layout(
    scene=dict(
        xaxis_title='User',
        yaxis_title='Card',
        zaxis_title='Amount'
    ),
    title='3D Scatter Plot of Clusters'
)

# Show the plot
fig.show()
