<a href="https://colab.research.google.com/github/adc257/AmEx-Project/blob/Ye_branch/LSTM_implementation_Noise_De.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LSTM Implementation

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from transformers import TextClassificationPipeline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertForSequenceClassification, BertTokenizer, AdamW, Trainer, TrainingArguments
import torch
import torch.nn as nn
from torch.optim import SGD
from torch.optim.lr_scheduler import MultiStepLR
from tqdm import tqdm

In [3]:
# Mount Google Drive at /content/drive (Colab-only API). The fine-tuned model
# is saved under this mount point further down in the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Load Data

In [4]:
# Read the train and test splits from CSV files in the working directory
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Training split: utterance text, its label column, and the labels as a plain list
train_text = train['text']
train_labels = train['category']
train_labels_list = train_labels.tolist()

# Test split: the same three views
test_text = test['text']
test_labels = test['category']
test_labels_list = test_labels.tolist()

### Randomly select 15% of the dataset to be noised

In [5]:
# Seed NumPy's global RNG before drawing the subset
np.random.seed(42)

# Subset size: 15% of the training rows, rounded down
sample_size = int(0.15 * len(train))

# Draw that many distinct rows (sampling without replacement is the pandas default)
sampled_data = train.sample(n=sample_size)

sampled_data

Unnamed: 0,text,category
6883,Is it possible for me to change my PIN number?,change_pin
5836,I'm not sure why my card didn't work,declined_card_payment
8601,I don't think my top up worked,top_up_failed
2545,Can you explain why my payment was charged a fee?,card_payment_fee_charged
8697,How long does a transfer from a UK account tak...,balance_not_updated_after_bank_transfer
...,...,...
9001,Is there a charge for exchanging foreign curre...,exchange_charge
6840,Please tell me why the purchase I made online ...,reverted_card_payment?
967,Are you able to make exchanges to EUR?,fiat_currency_support
6463,How do a reverse a duplicated charge?,transaction_charged_twice


### Shuffle 10% of the Labels in Dn (the 15% Sample Drawn Above) to Create Noise

In [6]:
# Inject label noise into Dn (sampled_data): the labels of a random 10% of its
# rows are permuted among themselves.
#
# Columns written:
#   k   - 1 for rows selected for shuffling, 0 otherwise
#   l   - the original (clean) label of every row
#   l'  - the shuffled label, set only on the selected rows (NaN elsewhere)
#
# NOTE(review): a permutation can hand a row back its own label, so a few rows
# with k == 1 may still carry their original category.

# If this cell has already been run, restore the clean labels first so that a
# re-run neither shuffles already-noisy labels nor overwrites 'l' with them.
if 'l' in sampled_data.columns:
    sampled_data['category'] = sampled_data['l']

# Randomly sample 10% of the rows from Dn (fixed seed -> same rows every run)
sampled_rows = sampled_data.sample(frac=0.1, random_state=42)
noisy_index = sampled_rows.index

# Flag column: 0 everywhere, 1 on the selected rows
sampled_data['k'] = 0
sampled_data.loc[noisy_index, 'k'] = 1

# Keep the original labels before 'category' is modified
sampled_data['l'] = sampled_data['category']

# Permute the selected labels with a dedicated, seeded generator so the result
# does not depend on how much of NumPy's global random state earlier cells used
shuffle_rng = np.random.RandomState(42)
sampled_category_values = shuffle_rng.permutation(sampled_rows['category'].to_numpy())

# Write the shuffled labels back, both as the training label and as l'
sampled_data.loc[noisy_index, 'category'] = sampled_category_values
sampled_data.loc[noisy_index, 'l\''] = sampled_category_values

# Display the DataFrame to verify the changes
sampled_data.head(25)


Unnamed: 0,text,category,k,l,l'
6883,Is it possible for me to change my PIN number?,change_pin,0,change_pin,
5836,I'm not sure why my card didn't work,declined_card_payment,0,declined_card_payment,
8601,I don't think my top up worked,top_up_failed,0,top_up_failed,
2545,Can you explain why my payment was charged a fee?,card_payment_fee_charged,0,card_payment_fee_charged,
8697,How long does a transfer from a UK account tak...,balance_not_updated_after_bank_transfer,0,balance_not_updated_after_bank_transfer,
5573,Why am I getting declines when trying to make ...,declined_transfer,0,declined_transfer,
576,What is the $1 transaction on my account?,extra_charge_on_statement,0,extra_charge_on_statement,
6832,It looks like my card payment was sent back.,reverted_card_payment?,0,reverted_card_payment?,
7111,Why am I unable to transfer money when I was a...,beneficiary_not_allowed,0,beneficiary_not_allowed,
439,What if there is an error on the exchange rate?,card_payment_wrong_exchange_rate,0,card_payment_wrong_exchange_rate,


## Step 1: Define and Train Deep Model

In [7]:
# Tokenize the noisy sample and package it as a TensorDataset of
# (input_ids, attention_mask, encoded label) in the row order of sampled_data.

# Use the GPU when one is available instead of assuming it exists
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
model_name = 'bert-base-uncased'

from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset, random_split, DataLoader

# Tokenize input text once; both tensors come from the same encoding
tokenizer = BertTokenizer.from_pretrained(model_name)
input_text = np.array(sampled_data['text'])
encoded_text = tokenizer.batch_encode_plus(
    input_text.tolist(),  # the tokenizer expects a list of Python strings
    padding=True,
    truncation=True,
    return_tensors='pt',
)
input_ids = encoded_text['input_ids']
attention_mask = encoded_text['attention_mask']

# Encode the (noisy) category strings as integer class ids
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(sampled_data['category'])
labels_tensor = torch.tensor(labels_encoded)

# Create TensorDataset
train_dataset = TensorDataset(input_ids, attention_mask, labels_tensor)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Fine-tune BERT on the noisy sample and record, for every epoch, the softmax
# distribution the model assigned to each training sample during that epoch.
#
# Results left for later cells:
#   probabilities_per_epoch - list (epochs) of lists (batch-sized chunks) of arrays
#   probabilities_array     - the same data as one array
#                             (num_epochs, num_chunks, batch_size, num_labels)
#   all_labels / all_preds  - labels and last-epoch predictions
# All of these are stored in dataset order (row order of train_dataset), not in
# the shuffled order in which batches were drawn, so position i refers to the
# same sample in every epoch.

# Use the GPU when available; fall back to the CPU otherwise
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load pre-trained model
num_labels = 77
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device_name)

# Define optimizer with Adam
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=1e-4)

# Define loss function (cross-entropy loss for multi-class classification)
criterion = nn.CrossEntropyLoss()

# Define training parameters
num_epochs = 40
batch_size = 15

# DataLoader for training set (kept unchanged in case other cells rely on it)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Same data with each sample's dataset position attached, so shuffled batches
# can be written back to a fixed slot
num_samples = len(train_dataset)
indexed_dataset = TensorDataset(torch.arange(num_samples), *train_dataset.tensors)
indexed_loader = DataLoader(indexed_dataset, batch_size=batch_size, shuffle=True)

# Labels in dataset order (last tensor of train_dataset)
dataset_labels = train_dataset.tensors[-1].tolist()

# List to store probabilities per epoch
probabilities_per_epoch = []

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    # Per-sample buffers for this epoch, indexed by dataset position
    epoch_probs_by_sample = np.zeros((num_samples, num_labels), dtype=np.float32)
    epoch_preds_by_sample = np.zeros(num_samples, dtype=np.int64)

    for batch in tqdm(indexed_loader, desc=f"Epoch {epoch+1}"):
        sample_idx, batch_input_ids, batch_attention_mask, batch_labels = batch
        positions = sample_idx.numpy()
        batch_input_ids = batch_input_ids.to(device_name)
        batch_attention_mask = batch_attention_mask.to(device_name)
        batch_labels = batch_labels.to(device_name)

        optimizer.zero_grad()
        # Labels are not passed to the model: the loss is computed once, below
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
        logits = outputs.logits
        loss = criterion(logits, batch_labels)  # Calculate cross-entropy loss
        total_loss += loss.item()

        probs = torch.softmax(logits, dim=1)  # Calculate probabilities from logits
        epoch_probs_by_sample[positions] = probs.detach().cpu().numpy()

        preds = torch.argmax(logits, dim=1)
        epoch_preds_by_sample[positions] = preds.cpu().numpy()

        loss.backward()
        optimizer.step()

    # Labels and predictions for this epoch, both in dataset order
    all_labels = list(dataset_labels)
    all_preds = epoch_preds_by_sample.tolist()

    # Calculate F1 score
    f1 = f1_score(all_labels, all_preds, average='macro')

    print(f"Epoch {epoch+1}, Loss: {total_loss}, F1 Score: {f1}")

    # Cut the ordered buffer into batch-sized chunks so probabilities_array
    # keeps its four-dimensional layout
    epoch_probs = [
        epoch_probs_by_sample[start:start + batch_size]
        for start in range(0, num_samples, batch_size)
    ]

    # Append probabilities for this epoch to the list
    probabilities_per_epoch.append(epoch_probs)

# Convert probabilities_per_epoch to a NumPy array for easier manipulation
probabilities_array = np.array(probabilities_per_epoch)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 100/100 [15:44<00:00,  9.45s/it]


Epoch 1, Loss: 434.9764475822449, F1 Score: 0.008134866629462462


Epoch 2: 100%|██████████| 100/100 [15:17<00:00,  9.17s/it]


Epoch 2, Loss: 417.0246248245239, F1 Score: 0.02698240065698928


Epoch 3: 100%|██████████| 100/100 [15:08<00:00,  9.08s/it]


Epoch 3, Loss: 389.60510635375977, F1 Score: 0.10084345564971323


Epoch 4: 100%|██████████| 100/100 [15:08<00:00,  9.09s/it]


Epoch 4, Loss: 361.9026737213135, F1 Score: 0.22203781651029875


Epoch 5:   4%|▍         | 4/100 [00:36<14:19,  8.95s/it]

In [None]:
# Inspect the layout of the recorded probabilities: one entry per epoch, each
# holding one array of class probabilities per batch.
probabilities_array.shape

In [None]:
# Attach each sample's per-epoch class probabilities to sampled_data.
# probabilities_array is laid out epoch-major:
#   (num_epochs, num_batches, batch_size, num_classes)
n_epochs = probabilities_array.shape[0]
n_classes = probabilities_array.shape[-1]

# Merge the two batch axes into one sample axis, then move that axis to the
# front to get (n_samples, n_epochs, n_classes). A plain reshape(-1, 40, 77)
# keeps the epoch-major order and mixes values from different samples and
# epochs in a single row, so the transpose is required.
reshaped_probabilities = probabilities_array.reshape(n_epochs, -1, n_classes).transpose(1, 0, 2)

# One column per epoch; each cell holds that sample's list of class probabilities
probabilities_df = pd.DataFrame(
    reshaped_probabilities.tolist(),
    columns=[f'Epoch_{i+1}' for i in range(n_epochs)],
)

# Reset the index of the existing DataFrame so the join below is positional
df_reset = sampled_data.reset_index(drop=True)

# NOTE(review): the positional join is only meaningful if every epoch's
# probabilities are stored in the same row order as sampled_data - confirm
# against how the training loop records them.
df = df_reset.join(probabilities_df)
df

# Other code that might not work below


---



In [None]:
# Print the array's dimensions, then show the largest class probability of the
# first sample in batch index 40 of the first epoch
print(probabilities_array.shape)
max(probabilities_array[0, 40, 0])

In [None]:
# Highest class probability of every recorded sample, walking epochs, then
# batches, then samples
max_probs = [
    max(sample)
    for epoch in probabilities_array
    for batch in epoch
    for sample in batch
]

len(max_probs)

In [None]:
# Plot, for each epoch, the single highest class probability over all samples.
#
# probabilities_array is laid out as
#   (num_epochs, num_batches, samples_per_batch, num_classes).
# The axes are unpacked into cell-specific names so that the notebook-level
# `batch_size` used by the training cells is not overwritten with the batch count.
n_epochs, n_batches, n_per_batch, num_classes = probabilities_array.shape

# Merge the batch axes: (n_epochs, n_samples, num_classes)
consolidated_probabilities = probabilities_array.reshape(n_epochs, -1, num_classes)

# Calculate the maximum probability for each epoch (over samples and classes)
max_probs_per_epoch = np.max(consolidated_probabilities, axis=(1, 2))

# Plotting
plt.figure(figsize=(10, 6))
plt.plot(max_probs_per_epoch, color='blue', label='Max Probability per Epoch')
plt.title('Max Probabilities over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Max Probability')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Shape of the first batch in `epoch_probs` as last bound by the training loop.
# This value is not displayed: only the cell's final expression is echoed.
epoch_probs[0].shape
# Number of batches recorded for the first epoch
len(probabilities_per_epoch[0])

In [None]:
# Mean and standard deviation, per epoch, of the probability the model assigned
# to each sample's label.
#
# NOTE(review): `all_labels` is taken to be in the same sample order as the
# recorded probabilities of every epoch - confirm against the training loop.
label_index = np.asarray(all_labels)
sample_index = np.arange(len(label_index))

# List to store probabilities of true label per epoch
true_label_probs_per_epoch = []

for epoch_batches in probabilities_per_epoch:
    # Stack the per-batch arrays into one (n_samples, n_classes) array
    flat_probs = np.concatenate(epoch_batches, axis=0)
    # Pick, for sample i, the probability in column label_index[i]
    true_label_probs_batch = flat_probs[sample_index, label_index].tolist()
    true_label_probs_per_epoch.append(true_label_probs_batch)

# Convert true_label_probs_per_epoch to a NumPy array for easier manipulation
true_label_probs_array = np.array(true_label_probs_per_epoch)

# Calculate mean probability of true label per epoch
mean_true_label_probs_per_epoch = np.mean(true_label_probs_array, axis=1)

# Calculate standard deviation of probability of true label per epoch
std_true_label_probs_per_epoch = np.std(true_label_probs_array, axis=1)

print("Mean probability assigned to the actual true label per epoch:")
print(mean_true_label_probs_per_epoch)

print("Standard deviation of probability assigned to the actual true label per epoch:")
print(std_true_label_probs_per_epoch)


In [None]:
# Loop-based variant: per epoch, the probability the model assigned to each
# sample's label, followed by its mean and standard deviation.
#
# NOTE(review): `all_labels` is taken to be in the same sample order as the
# recorded probabilities of every epoch - confirm against the training loop.

# List to store probabilities of true label per epoch
true_label_probs_per_epoch = []

for epoch_batches in probabilities_per_epoch:
    true_label_probs = []
    # Each epoch is stored as a list of per-batch arrays; concatenating them
    # makes `prob` one sample's class distribution rather than a whole batch
    for i, prob in enumerate(np.concatenate(epoch_batches, axis=0)):
        true_label_index = all_labels[i]  # Label id of sample i
        true_label_prob = prob[true_label_index].item()  # Probability corresponding to the true label
        true_label_probs.append(true_label_prob)
    true_label_probs_per_epoch.append(true_label_probs)

# Convert true_label_probs_per_epoch to a NumPy array for easier manipulation
true_label_probs_array = np.array(true_label_probs_per_epoch)

# Calculate mean probability of true label per epoch
mean_true_label_probs_per_epoch = np.mean(true_label_probs_array, axis=1)

# Calculate standard deviation of probability of true label per epoch
std_true_label_probs_per_epoch = np.std(true_label_probs_array, axis=1)

print("Mean probability assigned to the actual true label per epoch:")
print(mean_true_label_probs_per_epoch)

print("Standard deviation of probability assigned to the actual true label per epoch:")
print(std_true_label_probs_per_epoch)


In [None]:
# Save the trained model to the mounted Google Drive folder.
# NOTE(review): `model` is rebound by later cells (a second BERT run and the
# noise detector), so this saves whichever object `model` refers to when the
# cell is executed.
model.save_pretrained("/content/drive/MyDrive/Zeta Test")

In [None]:
# Old model with validation set
#
# Fine-tunes a fresh BERT classifier on 80% of train_dataset and reports the
# macro F1 on the remaining 20% after every epoch. Softmax probabilities of the
# training batches are recorded per epoch.
# NOTE: the training loader shuffles, so the recorded probabilities are in a
# different sample order in every epoch.

# Load pre-trained model
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=77).to(device_name)

# Define optimizer with Adam
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=1e-4)

# Define loss function (cross-entropy loss for multi-class classification)
criterion = nn.CrossEntropyLoss()

# Define training parameters
num_epochs = 100
batch_size = 16

# Split dataset into training and validation sets.
# The split goes into new names so train_dataset stays the full dataset:
# rebinding it would make every re-run of this cell split an already reduced
# subset. The seeded generator makes the split repeatable.
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
split_generator = torch.Generator().manual_seed(42)
train_subset, val_dataset = random_split(train_dataset, [train_size, val_size], generator=split_generator)

# DataLoader for training and validation sets
train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# List to store probabilities per epoch
probabilities_per_epoch = []

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []

    epoch_probs = []  # List to store probabilities for this epoch

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        # Batch-local names, so the notebook-level input_ids / attention_mask
        # tensors are not rebound to the last mini-batch
        batch_input_ids, batch_attention_mask, batch_labels = batch
        batch_input_ids = batch_input_ids.to(device_name)
        batch_attention_mask = batch_attention_mask.to(device_name)
        batch_labels = batch_labels.to(device_name)

        optimizer.zero_grad()
        # Labels are not passed to the model: the loss is computed once, below
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
        logits = outputs.logits
        loss = criterion(logits, batch_labels)  # Calculate cross-entropy loss
        total_loss += loss.item()

        probs = torch.softmax(logits, dim=1)  # Calculate probabilities from logits
        epoch_probs.append(probs.detach().cpu().numpy())  # Append probabilities to the list for this epoch

        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(batch_labels.cpu().tolist())

        loss.backward()
        optimizer.step()

    # Validation loop
    model.eval()
    all_preds_val = []
    all_labels_val = []

    with torch.no_grad():
        for batch in val_loader:
            batch_input_ids, batch_attention_mask, batch_labels = batch
            batch_input_ids = batch_input_ids.to(device_name)
            batch_attention_mask = batch_attention_mask.to(device_name)
            batch_labels = batch_labels.to(device_name)

            outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)

            all_preds_val.extend(preds.cpu().tolist())
            all_labels_val.extend(batch_labels.cpu().tolist())

    # Calculate F1 score on the validation set
    f1 = f1_score(all_labels_val, all_preds_val, average='macro')

    print(f"Epoch {epoch+1}, Loss: {total_loss}, F1 Score: {f1}")

    # Append probabilities for this epoch to the list
    probabilities_per_epoch.append(epoch_probs)

# Convert probabilities_per_epoch to a NumPy array for easier manipulation
probabilities_array = np.array(probabilities_per_epoch)

## Define the Noise Detector Model

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

class NoiseDetector(nn.Module):
    """Small feed-forward binary classifier.

    Architecture: Linear(input_dim, 128) -> ReLU -> Linear(128, 1) -> Sigmoid.
    The output is one value in (0, 1) per input row.
    """

    def __init__(self, input_dim):
        """Build the layers; `input_dim` is the size of each input row."""
        super().__init__()
        self.layer1 = nn.Linear(input_dim, 128)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(128, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a (rows, input_dim) tensor to a (rows, 1) tensor of sigmoid outputs."""
        hidden = self.relu(self.layer1(x))
        return self.sigmoid(self.layer2(hidden))


## Prepare the Dataset

In [None]:
# Turn the detector's inputs and its binary targets into float32 tensors.
# NOTE(review): `softmax_probabilities` and `noise_labels` are not defined in
# any cell visible in this notebook - they must exist before this cell runs.
softmax_probabilities_tensor = torch.tensor(softmax_probabilities, dtype=torch.float32)
noise_labels_tensor = torch.tensor(noise_labels, dtype=torch.float32)

# Targets become a column vector with one value per row
noise_targets = noise_labels_tensor.view(-1, 1)

# Pair inputs with targets and serve them in shuffled mini-batches of 32
dataset = TensorDataset(softmax_probabilities_tensor, noise_targets)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)


## Train the Noise Detector

In [None]:
# Build the noise detector on the GPU when one is available, otherwise on the CPU.
# Note that `model`, `criterion`, `optimizer` and `num_epochs` are rebound here,
# replacing the objects of the same name created by the BERT training cells.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = NoiseDetector(input_dim=softmax_probabilities.shape[1]).to(device)

# Binary cross-entropy on the sigmoid outputs, optimized with Adam
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Run ten passes over the data, reporting the mean batch loss of each pass
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    batch_losses = []
    for inputs, labels in dataloader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Clear old gradients, then forward, backward and update
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    total_loss = sum(batch_losses)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(dataloader):.4f}')
