In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow import keras

In [2]:
amlm = pd.read_csv("HI-Small_Trans.csv")
amlm

Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
0,2022/09/01 00:20,10,8000EBD30,10,8000EBD30,3697.340000,US Dollar,3697.340000,US Dollar,Reinvestment,0
1,2022/09/01 00:20,3208,8000F4580,1,8000F5340,0.010000,US Dollar,0.010000,US Dollar,Cheque,0
2,2022/09/01 00:00,3209,8000F4670,3209,8000F4670,14675.570000,US Dollar,14675.570000,US Dollar,Reinvestment,0
3,2022/09/01 00:02,12,8000F5030,12,8000F5030,2806.970000,US Dollar,2806.970000,US Dollar,Reinvestment,0
4,2022/09/01 00:06,10,8000F5200,10,8000F5200,36682.970000,US Dollar,36682.970000,US Dollar,Reinvestment,0
...,...,...,...,...,...,...,...,...,...,...,...
5078340,2022/09/10 23:57,54219,8148A6631,256398,8148A8711,0.154978,Bitcoin,0.154978,Bitcoin,Bitcoin,0
5078341,2022/09/10 23:35,15,8148A8671,256398,8148A8711,0.108128,Bitcoin,0.108128,Bitcoin,Bitcoin,0
5078342,2022/09/10 23:52,154365,8148A6771,256398,8148A8711,0.004988,Bitcoin,0.004988,Bitcoin,Bitcoin,0
5078343,2022/09/10 23:46,256398,8148A6311,256398,8148A8711,0.038417,Bitcoin,0.038417,Bitcoin,Bitcoin,0


In [3]:
#randomly selecting 1000000 records
#amlms = amlm.sample(n=1000000, random_state=42)
amlms = amlm

In [4]:
# prompt: remove Timestamp column

amlms = amlms.drop('Timestamp', axis=1)

In [5]:
#Count the number of samples in each class
class_counts = amlms['Is Laundering'].value_counts()

#Print the class counts
print(class_counts)

Is Laundering
0    5073168
1       5177
Name: count, dtype: int64


In [6]:
from sklearn.preprocessing import LabelEncoder
# Define the columns to be label encoded
categorical_columns = ['Account', 'Account.1', 'Receiving Currency', 'Payment Currency', 'Payment Format']

# Create a LabelEncoder object
label_encoder = LabelEncoder()

# Encode the categorical columns
for column in categorical_columns:
  amlms[column] = label_encoder.fit_transform(amlms[column])

In [7]:
amlms

Unnamed: 0,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
0,10,942,10,900,3697.340000,12,3697.340000,12,5,0
1,3208,990,1,949,0.010000,12,0.010000,12,3,0
2,3209,991,3209,940,14675.570000,12,14675.570000,12,5,0
3,12,997,12,945,2806.970000,12,2806.970000,12,5,0
4,10,999,10,947,36682.970000,12,36682.970000,12,5,0
...,...,...,...,...,...,...,...,...,...,...
5078340,54219,495979,256398,419726,0.154978,1,0.154978,1,1,0
5078341,15,495988,256398,419726,0.108128,1,0.108128,1,1,0
5078342,154365,495980,256398,419726,0.004988,1,0.004988,1,1,0
5078343,256398,495978,256398,419726,0.038417,1,0.038417,1,1,0


In [8]:
pip install imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.12.2-py3-none-any.whl (257 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.0/258.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.12.2


In [9]:
from imblearn.over_sampling import SMOTE
from tensorflow import keras
from sklearn.model_selection import train_test_split

In [10]:
#Split data into training and validation sets - target class is "Is Laundering"

X = amlms.drop('Is Laundering', axis=1)
y = amlms['Is Laundering']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=22, stratify=y)

smote = SMOTE(random_state=42)
X_train_over, y_train_over = smote.fit_resample(X_train, y_train)

print(f"Number of training samples: {len(X_train)}")
print(f"Number of validation samples: {len(X_val)}")
print(f"Number of new training samples: {len(X_train_over)}")
print(f"Number of new training samples: {len(y_train_over)}")

Number of training samples: 4062676
Number of validation samples: 1015669
Number of new training samples: 8117068
Number of new training samples: 8117068


In [11]:
print(X_train_over.shape)

(8117068, 9)


In [12]:
print(y_train_over.shape)

(8117068,)


In [13]:
num_classes = len(class_counts)
num_classes

2

In [14]:
def calculate_reshape_value(data_shape, lstm_units):
  """
  Calculates the new number of samples required for reshaping to fit LSTM layers.

  Args:
      original_samples (int): The original number of samples in the data.
      original_features (int): The original number of features in the data (excluding padding).
      lstm_units (int): The number of units in the first LSTM layer.
      target_features (int): The number of features per timestep (embedding size).

  Returns:
      int: The new number of samples for the reshaped data.
  """
  samples, features = data_shape
  # Ensure enough samples after reshape to accommodate LSTM units
  new_samples = samples // lstm_units
  return new_samples



In [15]:
calculate_reshape_value(X_train_over.shape, 64)

126829

In [18]:
# prompt: based on the above cells write me a LSTM Model and then compile the model an fit as well to check for accuracy

# Define the LSTM model
model = keras.Sequential([
    keras.layers.LSTM(64, return_sequences=True, input_shape=(X_train_over.shape[1], 1)),
    keras.layers.LSTM(32),
    keras.layers.Dense(64, activation="relu"),
    keras.layers.Dense(num_classes, activation="softmax"),
])

# Compile the model
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Fit the model
history = model.fit(X_train_over, y_train_over, epochs=5, validation_data=(X_val, y_val))

# Evaluate the model on the validation data
loss, accuracy = model.evaluate(X_val, y_val)
print("Validation loss:", loss)
print("Validation accuracy:", accuracy)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Validation loss: 0.1944616436958313
Validation accuracy: 0.918109118938446


In [None]:
print(X_train_over.shape)

(1598278, 9)


In [None]:
print(y_train_over.shape)

(1598278,)


In [19]:
from sklearn.metrics import f1_score, recall_score, precision_score

y_pred = model.predict(X_val)
y_pred = np.argmax(y_pred, axis=1)

f1 = f1_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)

print(f"F1 Score: {f1}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")

F1 Score: 0.019521395732641753
Recall: 0.8
Precision: 0.009881257831612865


In [20]:
print(X_train_over.shape)

(8117068, 9)


In [21]:
print(y_train_over.shape)

(8117068,)
