In this Notebook, My aim is to implement an unsupervised machine learning approach using a one- class of Support Vector Machine (SVM) and Autoencoder algorithms. The objective is to identify fraudulent transactions within the dataset

### Feature Selection

In [1]:
import pandas as pd
data = pd.read_csv('palpay data new.csv')

# List of columns will be included in the modeling 
selected_columns = ['Transaction Date ', 'Card Type', 'Channel ', 'Transaction Type',
                    'Transaction Type Group', 'Entry Mode', 'Transaction Status ',
                    'Merchant Country', 'Merchant Activity ', 'Amount USD']

# Create a new DataFrame with the selected columns
selected_data = data[selected_columns]
selected_data['Merchant Activity '] = selected_data['Merchant Activity '].str.replace('other', 'Other', case=False)
selected_data['Transaction Date '] = pd.to_datetime(selected_data['Transaction Date '])
# Print the new DataFrame
print("Selected Data:")
print(selected_data)

Selected Data:
      Transaction Date            Card Type Channel  Transaction Type  \
0            2023-01-01  Visa Classic Debit      ATM       Withdrawal   
1            2023-01-01  Visa Classic Debit      ATM       Withdrawal   
2            2023-01-01  Visa Classic Debit      POS         Purchase   
3            2023-01-01  Visa Classic Debit      ATM       Withdrawal   
4            2023-01-01  Visa Classic Debit      ATM       Withdrawal   
...                 ...                 ...      ...              ...   
82629        2023-06-30  Visa Classic Debit      POS         Purchase   
82630        2023-06-30  Visa Classic Debit      ATM       Withdrawal   
82631        2023-06-30  Visa Classic Debit      ATM       Withdrawal   
82632        2023-06-30  Visa Classic Debit      POS         Purchase   
82633        2023-06-30  Visa Classic Debit      POS         Purchase   

      Transaction Type Group  Entry Mode Transaction Status  Merchant Country  \
0           ATM Transaction

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_data['Merchant Activity '] = selected_data['Merchant Activity '].str.replace('other', 'Other', case=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_data['Transaction Date '] = pd.to_datetime(selected_data['Transaction Date '])


### Data Normalization 

In [2]:
amount_usd = selected_data['Amount USD']
min_amount_usd = amount_usd.min()
max_amount_usd = amount_usd.max()
selected_data['Amount USD'] = (amount_usd - min_amount_usd) / (max_amount_usd - min_amount_usd)

print("\nAfter normalization:")
print(selected_data['Amount USD'])


After normalization:
0        0.536271
1        0.545326
2        0.547590
3        0.500053
4        0.543062
           ...   
82629    0.549036
82630    0.455593
82631    0.392753
82632    0.549466
82633    0.545722
Name: Amount USD, Length: 82634, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_data['Amount USD'] = (amount_usd - min_amount_usd) / (max_amount_usd - min_amount_usd)


### One-Hot Encoding For the categorical variables

In [3]:
import pandas as pd

# List of categorical columns to encode
categorical_columns = ['Card Type', 'Channel ', 'Transaction Type Group', 'Entry Mode', 'Transaction Status ','Transaction Type','Merchant Activity ']

# Apply One-Hot Encoding to the categorical columns
encoded_data = pd.get_dummies(selected_data, columns=categorical_columns)

# Clean 'Merchant Country' column by mapping to regions
country_to_region = {
    'egypt': 'Middle East/North Africa',
    'netherlands': 'Europe',
    'visa_ie': 'Europe',
    'united states': 'North America',
    'germany,federal repu': 'Europe',
    'luxembourg': 'Europe',
    'united kingdom': 'Europe',
    'israel': 'Middle East',
    'singapore': 'Asia',
    'jordan': 'Middle East/North Africa',
    'cyprus': 'Europe',
    'france': 'Europe',
    'czech republic': 'Europe',
    'hong kong': 'Asia',
    'qatar': 'Middle East',
    'austria': 'Europe',
    'lebanon': 'Middle East/North Africa',
    'turkey': 'Middle East',
    'united arab emirates': 'Middle East',
    'sweden': 'Europe',
    'switzerland': 'Europe',
    'spain': 'Europe',
    'australia': 'Oceania',
    'malta': 'Europe',
    'pakistan': 'Asia',
    'romania': 'Europe',
    'iraq': 'Middle East',
    'poland': 'Europe',
    'kazakhstan': 'Asia',
    'bulgaria': 'Europe',
    'thailand': 'Asia',
    'india': 'Asia',
    'lithuania': 'Europe',
    'finland': 'Europe',
    'canada': 'North America',
    'saudi arabia': 'Middle East',
    'japan': 'Asia',
    'morocco': 'Middle East/North Africa',
    'hungary': 'Europe',
    'kenya': 'Africa',
    'estonia': 'Europe',
    'rwanda': 'Africa',
    'malaysia': 'Asia',
    'denmark': 'Europe',
    'italy': 'Europe',
    'brazil': 'South America',
    'nigeria': 'Africa',
    'kuwait': 'Middle East',
    'cambodia': 'Asia',
    'moldova, rep. of': 'Europe',
    'gibraltar': 'Europe',
    'slovenia': 'Europe',
    'ukrainian ssr': 'Europe',
}
encoded_data['Merchant Country'] = encoded_data['Merchant Country'].str.strip().str.lower()
encoded_data['Merchant Country'] = encoded_data['Merchant Country'].map(country_to_region)

# Apply One-Hot Encoding to the 'Merchant Country' column to create region-based columns
encoded_data = pd.get_dummies(encoded_data, columns=['Merchant Country'], prefix='Region')

# Print the encoded data
print("Encoded Data:")
print(encoded_data.head())

Encoded Data:
  Transaction Date   Amount USD  Card Type_Visa Classic Debit  \
0        2023-01-01    0.536271                             1   
1        2023-01-01    0.545326                             1   
2        2023-01-01    0.547590                             1   
3        2023-01-01    0.500053                             1   
4        2023-01-01    0.543062                             1   

   Card Type_Visa Platinum Debit  Card Type_Visa Virtual Debit  Channel _ATM  \
0                              0                             0             1   
1                              0                             0             1   
2                              0                             0             0   
3                              0                             0             1   
4                              0                             0             1   

   Channel _E-Commerce  Channel _POS  Transaction Type Group_ATM Transactions  \
0                    0           

In [4]:
print(encoded_data.columns)

Index(['Transaction Date ', 'Amount USD', 'Card Type_Visa Classic Debit',
       'Card Type_Visa Platinum Debit', 'Card Type_Visa Virtual Debit',
       'Channel _ATM', 'Channel _E-Commerce', 'Channel _POS',
       'Transaction Type Group_ATM Transactions',
       'Transaction Type Group_Purchases', 'Entry Mode_Chip & Pin',
       'Entry Mode_E-commerce', 'Entry Mode_Magnetic Stripe',
       'Entry Mode_Manual', 'Entry Mode_Other', 'Entry Mode_Paywave',
       'Transaction Status _PROCESSED', 'Transaction Status _REVERSED',
       'Transaction Type_Deposit', 'Transaction Type_Purchase',
       'Transaction Type_Withdrawal',
       'Merchant Activity _Advertising services',
       'Merchant Activity _Cash withdrawal from the ATM',
       'Merchant Activity _Clothing & Jewelry',
       'Merchant Activity _Electronics',
       'Merchant Activity _Entertainment & Restaurants',
       'Merchant Activity _Fuel Stations',
       'Merchant Activity _Government and institutional services',
    

In [5]:
print(encoded_data.dtypes)

Transaction Date                                            datetime64[ns]
Amount USD                                                         float64
Card Type_Visa Classic Debit                                         uint8
Card Type_Visa Platinum Debit                                        uint8
Card Type_Visa Virtual Debit                                         uint8
Channel _ATM                                                         uint8
Channel _E-Commerce                                                  uint8
Channel _POS                                                         uint8
Transaction Type Group_ATM Transactions                              uint8
Transaction Type Group_Purchases                                     uint8
Entry Mode_Chip & Pin                                                uint8
Entry Mode_E-commerce                                                uint8
Entry Mode_Magnetic Stripe                                           uint8
Entry Mode_Manual        

### One-Class SVM Algorithm

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM

# Drop 'Transaction Date' column as it may not directly contribute to anomaly detection
encoded_data = encoded_data.drop(['Transaction Date '], axis=1)

# Splitting the dataset into training and testing sets
X_train, X_test = train_test_split(encoded_data, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply One-Class SVM
ocsvm = OneClassSVM(nu=0.05, kernel='rbf', gamma='scale')  
ocsvm.fit(X_train_scaled)

# Predict on the test set
predictions = ocsvm.predict(X_test_scaled)

# Evaluate the model (based on percentage of predicted anomalies)
anomaly_percentage = (sum(predictions == -1) / len(predictions)) * 100

print("Percentage of anomalies:", anomaly_percentage)

Percentage of anomalies: 5.094693531796454


### Apply  Autoencoders (for neural network-based anomaly detection))

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from keras.models import Model, Sequential
from keras.layers import Input, Dense


# Drop 'Transaction Date' column as it may not directly contribute to anomaly detection
# encoded_data = encoded_data.drop(['Transaction Date '], axis=1)

# Splitting the dataset into training and testing sets
X_train, X_test = train_test_split(encoded_data, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build and train the Autoencoder
input_dim = X_train_scaled.shape[1]

autoencoder = Sequential()
autoencoder.add(Dense(128, activation='relu', input_dim=input_dim))
autoencoder.add(Dense(64, activation='relu'))
autoencoder.add(Dense(128, activation='relu'))
autoencoder.add(Dense(input_dim, activation='linear'))

autoencoder.compile(optimizer='adam', loss='mean_squared_error')

autoencoder.fit(X_train_scaled, X_train_scaled, epochs=50, batch_size=32, shuffle=True, validation_split=0.1)

# Evaluate the Autoencoder on the test set
X_test_pred = autoencoder.predict(X_test_scaled)
mse = mean_squared_error(X_test_scaled, X_test_pred)

print("Mean Squared Error:", mse)

# Set a threshold for anomaly detection based on reconstruction error
threshold = mse * 2

# Predict anomalies based on the threshold
anomalies = np.where(mse > threshold)[0]

print("Indices of detected anomalies:", anomalies)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Mean Squared Error: 0.0025344921726770673
Indices of detected anomalies: []


In [18]:
# Save the trained model and scaler
autoencoder.save("autoencoder_model.h5")
joblib.dump(scaler, "scaler.pkl")

# Load the model and scaler
loaded_model = load_model("autoencoder_model.h5")
loaded_scaler = joblib.load("scaler.pkl")

# Define a function for fraud detection using the loaded model and scaler
def detect_fraud(model, scaler, transactions):
    transactions_scaled = scaler.transform(transactions)
    transactions_pred = model.predict(transactions_scaled)
    mse = np.mean(np.power(transactions_scaled - transactions_pred, 2), axis=1)
    
    threshold = mse * 2  # You can adjust the threshold as needed
    fraud_indices = np.where(mse > threshold)[0]
    
    fraud_predictions = np.zeros(len(transactions))
    fraud_predictions[fraud_indices] = 1
    
    return fraud_predictions

  saving_api.save_model(
