<a href="https://colab.research.google.com/github/adc257/AmEx-Project/blob/Ye_branch/LSTM_implementation_3_20.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LSTM Implementation

In [52]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from transformers import TextClassificationPipeline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertForSequenceClassification, BertTokenizer, AdamW, Trainer, TrainingArguments
import torch
import torch.nn as nn
from torch.optim import SGD
from torch.optim.lr_scheduler import MultiStepLR
from tqdm import tqdm
from torch.autograd import Variable
import math

In [53]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load Data

In [54]:
# Import train and test data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Separate train labels and text
train_labels = train['category']
train_text = train['text']
train_labels_list = train_labels.tolist()

# Separate test labels and text
test_labels = test['category']
test_text = test['text']
test_labels_list = test_labels.tolist()

### Set 70% of the train dataset to be the train set, stratifying by the train labels

In [55]:
# Create a stratified 70% sample of the train data
train_text, test_text, train_labels, test_labels = train_test_split(train_text, train_labels, test_size=0.30, stratify=train_labels, random_state=42)

# Reset the indices of the train_labels and test_labels Series as well as the train_text and test_text
train_labels = train_labels.reset_index(drop=True)
test_labels = test_labels.reset_index(drop=True)
train_text = train_text.reset_index(drop=True)
test_text = test_text.reset_index(drop=True)

In [56]:
test_labels.value_counts()

card_payment_fee_charged                            56
direct_debit_payment_not_recognised                 55
wrong_amount_of_cash_received                       54
balance_not_updated_after_cheque_or_cash_deposit    54
cash_withdrawal_charge                              53
                                                    ..
lost_or_stolen_card                                 25
card_acceptance                                     18
card_swallowed                                      18
virtual_card_not_working                            12
contactless_not_working                             10
Name: category, Length: 77, dtype: int64

### Shuffle 3% of Labels in Dn to Create Noise

In [57]:
def add_noise(labels, noise_level=0.03, seed=42):
    # Set the seed for the random number generator
    np.random.seed(seed)

    # Get unique labels and their counts
    unique_labels, counts = np.unique(labels, return_counts=True)

    # Calculate the number of labels to change for each unique label
    n_noise_per_label = (noise_level * counts).astype(int)

    # Create a copy of labels to work with
    noisy_labels = labels.copy()

    # Initialize an empty list to store noise indices
    noise_indices = []

    for lbl, n_noise in zip(unique_labels, n_noise_per_label):
      if n_noise == 0:
        n_noise = 1

      # Get indices of current label
      label_indices = np.where(labels == lbl)[0]

      # Randomly select labels of current label to change
      noise_indices_lbl = np.random.choice(label_indices, size=n_noise, replace=False)

      # Add the noise indices for this label to the main list
      noise_indices.extend(noise_indices_lbl)

      # Get a list of labels excluding the current one
      other_labels = [other_lbl for other_lbl in unique_labels if other_lbl != lbl]

      for idx in noise_indices_lbl:
          # Randomly select a new label
          new_label = np.random.choice(other_labels)

          # Replace the label at the current index with the new label
          noisy_labels[idx] = new_label

    return noisy_labels, np.array(noise_indices)

In [58]:
# Add noise to train and test labels and get the indices of the noised labels
full_noised_train_labels, train_noise_indices = add_noise(train_labels)
full_noised_test_labels, test_noise_indices = add_noise(test_labels)

In [59]:
full_noised_test_labels.value_counts()

wrong_amount_of_cash_received                       56
direct_debit_payment_not_recognised                 56
card_payment_fee_charged                            56
balance_not_updated_after_cheque_or_cash_deposit    55
transaction_charged_twice                           54
                                                    ..
compromised_card                                    25
card_acceptance                                     19
card_swallowed                                      19
virtual_card_not_working                            11
contactless_not_working                             10
Name: category, Length: 77, dtype: int64

In [60]:
# Create a DataFrame from the train_text and train_labels
df_train = pd.DataFrame({
    'text': train_text,
    'category': full_noised_train_labels,
    'k': 0,
    'l': train_labels,
    'l_prime': full_noised_train_labels
})

# Mark the noised samples in the 'k' column
df_train.loc[train_noise_indices, 'k'] = 1

# Repeat the same process for the test set
df_test = pd.DataFrame({
    'text': test_text,
    'category': full_noised_test_labels,
    'k': 0,
    'l': test_labels,
    'l_prime': full_noised_test_labels
})

df_test.loc[test_noise_indices, 'k'] = 1

# Combine the train and test DataFrames
train_test_combined_df = pd.concat([df_train, df_test])

# Display the DataFrame to verify the changes
train_test_combined_df.head(25)

Unnamed: 0,text,category,k,l,l_prime
0,What do I do if I detect fraudulent use on my ...,compromised_card,0,compromised_card,compromised_card
1,Ive been trying to use my card for two weeks n...,reverted_card_payment?,0,reverted_card_payment?,reverted_card_payment?
2,Why did the person I transferred money to not ...,transfer_fee_charged,0,transfer_fee_charged,transfer_fee_charged
3,Is there a maximum for top-ups?,top_up_limits,0,top_up_limits,top_up_limits
4,"I desperately need to top-up my card, so why i...",top_up_failed,0,top_up_failed,top_up_failed
5,Can I order a card please?,order_physical_card,0,order_physical_card,order_physical_card
6,The machine only paid me a hundred when I want...,wrong_amount_of_cash_received,0,wrong_amount_of_cash_received,wrong_amount_of_cash_received
7,How do I add the card I just received to show ...,card_linking,0,card_linking,card_linking
8,How long does it take for money to transfer? ...,transfer_not_received_by_recipient,0,transfer_not_received_by_recipient,transfer_not_received_by_recipient
9,I tried to make a transfer but it failed. will...,failed_transfer,0,failed_transfer,failed_transfer


## Step 1: Define and Train Deep Model

In [61]:
device_name = 'cuda'
model_name = 'bert-base-uncased'

from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset, random_split, DataLoader

# Tokenize input text
tokenizer = BertTokenizer.from_pretrained(model_name)

# Tokenize and encode the training text
train_input_text = np.array(train_text)
train_input_ids = tokenizer.batch_encode_plus(train_input_text, padding=True, truncation=True, return_tensors='pt')['input_ids']
train_attention_mask = tokenizer.batch_encode_plus(train_input_text, padding=True, truncation=True, return_tensors='pt')['attention_mask']

# Tokenize and encode the testing text
test_input_text = np.array(test_text)
test_input_ids = tokenizer.batch_encode_plus(test_input_text, padding=True, truncation=True, return_tensors='pt')['input_ids']
test_attention_mask = tokenizer.batch_encode_plus(test_input_text, padding=True, truncation=True, return_tensors='pt')['attention_mask']

# Encode labels
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(full_noised_train_labels)
test_labels_encoded = label_encoder.transform(full_noised_test_labels)  # Use transform for test labels

# Convert encoded labels to tensors
train_labels = torch.tensor(train_labels_encoded)
test_labels = torch.tensor(test_labels_encoded)

# Get the indices of the noised data
noise_indices = test_noise_indices

# Convert the indices to a tensor
noise_indices = torch.tensor(noise_indices)

# Create boolean masks for the noised and non-noised data
mask_noised = torch.zeros(test_labels.size(0), dtype=torch.bool)
mask_noised[noise_indices] = True
mask_non_noised = ~mask_noised

# Create TensorDatasets for the noised and non-noised parts of the test set
test_dataset_noised = TensorDataset(test_input_ids[mask_noised], test_attention_mask[mask_noised], test_labels[mask_noised])
test_dataset_non_noised = TensorDataset(test_input_ids[mask_non_noised], test_attention_mask[mask_non_noised], test_labels[mask_non_noised])

# Create TensorDatasets
train_dataset = TensorDataset(train_input_ids, train_attention_mask, train_labels)
test_dataset = TensorDataset(test_input_ids, test_attention_mask, test_labels)

In [62]:
# from safetensors.torch import load_file
# from transformers import BertModel
# file_path = "/content/drive/MyDrive/Zeta Test/model.safetensors"
# loaded = load_file(file_path)

# model = BertModel.from_pretrained(loaded)

In [63]:
# Evaluation function
def evaluate(model, dataloader):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []

    for batch in tqdm(dataloader):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device_name), attention_mask.to(device_name), labels.to(device_name)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = criterion(outputs.logits, labels)  # Calculate cross-entropy loss
            total_loss += loss.item()

            logits = outputs.logits
            probs = torch.softmax(logits, dim=1)  # Calculate probabilities from logits
            preds = torch.argmax(probs, dim=1)  # Get predictions from probabilities

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return total_loss / len(dataloader), all_preds, all_labels

# LSTM Model Training and Evaluation for Text Classification with PyTorch

In [64]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, f1_score

# Define the LSTM model
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)
        out, _ = self.lstm(embedded)
        out = self.fc(out[:, -1, :])
        return out

# Hyperparameters
input_size = len(tokenizer.get_vocab())
hidden_size = 128
num_layers = 2
label_encoder = LabelEncoder()
num_classes = len(label_encoder.fit(full_noised_train_labels).classes_)

# Initialize the LSTM model
lstm_model = LSTMModel(input_size, hidden_size, num_layers, num_classes).to(device_name)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(lstm_model.parameters(), lr=0.001)

# Create DataLoader for training and testing
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Training the LSTM model
epochs = 5
for epoch in range(epochs):
    lstm_model.train()
    total_loss = 0.0
    for inputs, masks, labels in train_loader:
        inputs, masks, labels = inputs.to(device_name), masks.to(device_name), labels.to(device_name)
        optimizer.zero_grad()
        outputs = lstm_model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(train_loader)}")

# Evaluation on the test set
lstm_model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for inputs, masks, labels in test_loader:
        inputs, masks, labels = inputs.to(device_name), masks.to(device_name), labels.to(device_name)
        outputs = lstm_model(inputs)
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate accuracy and F1 score
accuracy = accuracy_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds, average='macro')
print(f"Test Accuracy: {accuracy}, F1 Score: {f1}")


Epoch 1/5, Loss: 4.330410556706119
Epoch 2/5, Loss: 4.320622322221869
Epoch 3/5, Loss: 4.319946746303611
Epoch 4/5, Loss: 4.3187909344015605
Epoch 5/5, Loss: 4.317195134620144
Test Accuracy: 0.01866044651782739, F1 Score: 0.0004759638267491671


# Evaluate Model Performance on Different Data Subsets

In [65]:
from sklearn.metrics import accuracy_score, f1_score

# Define a function to evaluate model performance on a given subset of the test set
def evaluate_model(model, dataloader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, masks, labels in dataloader:
            inputs, masks, labels = inputs.to(device_name), masks.to(device_name), labels.to(device_name)
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='macro')
    return accuracy, f1

# Evaluate the LSTM model on different subsets of the test set
# 1. Entire test set
accuracy_entire_test, f1_entire_test = evaluate_model(lstm_model, test_loader)

# 2. Noised part of the test set
noised_test_loader = DataLoader(test_dataset_noised, batch_size=batch_size)
accuracy_noised_test, f1_noised_test = evaluate_model(lstm_model, noised_test_loader)

# 3. Non-noised part of the test set
non_noised_test_loader = DataLoader(test_dataset_non_noised, batch_size=batch_size)
accuracy_non_noised_test, f1_non_noised_test = evaluate_model(lstm_model, non_noised_test_loader)

# Print evaluation metrics for each subset
print("Performance on Entire Test Set:")
print(f"Accuracy: {accuracy_entire_test}, F1 Score: {f1_entire_test}")
print("Performance on Noised Part of Test Set:")
print(f"Accuracy: {accuracy_noised_test}, F1 Score: {f1_noised_test}")
print("Performance on Non-noised Part of Test Set:")
print(f"Accuracy: {accuracy_non_noised_test}, F1 Score: {f1_non_noised_test}")


Performance on Entire Test Set:
Accuracy: 0.01866044651782739, F1 Score: 0.0004759638267491671
Performance on Noised Part of Test Set:
Accuracy: 0.012987012987012988, F1 Score: 0.000523286237571952
Performance on Non-noised Part of Test Set:
Accuracy: 0.01880984952120383, F1 Score: 0.00047970833733090287


# Set Batch Size

In [71]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

# Define the batch size
batch_size = 16

# Assuming you have defined your LSTM model and DataLoader train_loader
# Create DataLoader for training data with specified batch size
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move model to the appropriate device
model.to(device)

# Modify the training loop to handle SequenceClassifierOutput
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        inputs, masks, labels = data
        inputs, masks, labels = inputs.to(device), masks.to(device), labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        logits = outputs.logits  # Extract logits from SequenceClassifierOutput

        loss = criterion(logits, labels)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        # Print statistics
        running_loss += loss.item()
        if i % 100 == 99:    # Print every 100 mini-batches
            print(f'Epoch {epoch+1}, Batch {i+1}, Loss: {running_loss / 100}')
            running_loss = 0.0

print('Finished Training')



Epoch 1, Batch 100, Loss: 4.380464692115783
Epoch 1, Batch 200, Loss: 4.362850432395935
Epoch 1, Batch 300, Loss: 4.3615529298782345
Epoch 1, Batch 400, Loss: 4.353832459449768
Epoch 2, Batch 100, Loss: 4.345209832191467
Epoch 2, Batch 200, Loss: 4.348524522781372
Epoch 2, Batch 300, Loss: 4.346203355789185
Epoch 2, Batch 400, Loss: 4.358211913108826
Epoch 3, Batch 100, Loss: 4.341966633796692
Epoch 3, Batch 200, Loss: 4.33227958202362
Epoch 3, Batch 300, Loss: 4.349987154006958
Epoch 3, Batch 400, Loss: 4.349074077606201
Epoch 4, Batch 100, Loss: 4.3483943319320675
Epoch 4, Batch 200, Loss: 4.3441070938110355
Epoch 4, Batch 300, Loss: 4.332904725074768
Epoch 4, Batch 400, Loss: 4.330227727890015
Epoch 5, Batch 100, Loss: 4.344835395812988
Epoch 5, Batch 200, Loss: 4.331434617042541
Epoch 5, Batch 300, Loss: 4.333095369338989
Epoch 5, Batch 400, Loss: 4.345867395401001
Epoch 6, Batch 100, Loss: 4.338492779731751
Epoch 6, Batch 200, Loss: 4.333756799697876
Epoch 6, Batch 300, Loss: 4.33