# Glove Embedding

In [1]:
from gensim.models import KeyedVectors
import json
import numpy as np
import torch
from sklearn.metrics import f1_score
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim

In [2]:
label_map = {'B', 'I', 'O'}

In [3]:
glove_model_path = '../glove.42B.300d.txt'

# Load GloVe word embeddings
def load_glove_model(glove_model_path):
    print("Loading GloVe word embeddings...")
    with open(glove_model_path, 'r', encoding='utf-8') as f:
        embeddings_index = {}
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print('Found %s word vectors.' % len(embeddings_index))
    return embeddings_index

word2vec_model = load_glove_model(glove_model_path)

with open('ATE_train.json', 'r') as file:
    dataset = json.load(file)

print(len(dataset))

max_seq_len = 85
word_embeddings = []
labels = []

for key, value in dataset.items():
    text = value['text']
    label_seq = value['labels']

    label_list = list(label_map)  
    label_indices = [label_list.index(label) for label in label_seq]
    label_indices_padded = label_indices[:max_seq_len] + [0] * (max_seq_len - len(label_indices))
    labels.append(label_indices_padded)

    vectors = [word2vec_model[word] if word in word2vec_model else np.zeros(300) for word in text] 
    vectors_padded = [np.pad(vec, (0, 300), constant_values=0)[:300] for vec in vectors]
    vectors_padded += [np.zeros(300)] * (max_seq_len - len(vectors_padded))
    word_embeddings.append(vectors_padded)

word_embeddings_array = np.array(word_embeddings)
print(word_embeddings_array.shape)
texts_tensor = torch.tensor(word_embeddings_array, dtype=torch.float32)
labels_tensor = torch.tensor(labels)

Loading GloVe word embeddings...
Found 1917494 word vectors.
906
(906, 85, 300)


In [4]:
with open('ATE_val.json', 'r') as file:
    valset = json.load(file)

print(len(valset))
max_seq_len = 85

word_embeddings = []
labels = []

for key, value in valset.items():
    text = value['text']
    label_seq = value['labels']

    label_list = list(label_map)  # Store index of each 27 classes created
    label_indices = [label_list.index(label) for label in label_seq]
    label_indices_padded = label_indices[:max_seq_len] + [0] * (max_seq_len - len(label_indices))
    labels.append(label_indices_padded)

    vectors = [word2vec_model[word] if word in word2vec_model else np.zeros(300) for word in text]
    vectors_padded = [np.pad(vec, (0, 300), constant_values=0)[:300] for vec in vectors]
    vectors_padded += [np.zeros(300)] * (max_seq_len - len(vectors_padded))
    word_embeddings.append(vectors_padded)

word_embeddings_array = np.array(word_embeddings)

test_texts_tensor = torch.tensor(word_embeddings_array, dtype=torch.float32)
test_labels_tensor = torch.tensor(labels)

219


In [None]:
# import wandb
# wandb.login()

In [None]:
# wandb.init(
#     project="nlp_ass2B", 
#     name=f"RNN_Glove"
# )

In [5]:
class RNNTagger(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNNTagger, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.rnn(x)
        out = self.fc(out)
        return out

input_size = 300
hidden_size = 256
output_size = 100

model = RNNTagger(input_size, hidden_size, output_size)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 45
batch_size = 32 

texts_tensor = texts_tensor.to(device)
labels_tensor = labels_tensor.to(device)
val_texts_tensor = test_texts_tensor
val_labels_tensor =  test_labels_tensor
val_texts_tensor = val_texts_tensor.to(device)
val_labels_tensor = val_labels_tensor.to(device)

save_loss = 0

for epoch in range(num_epochs):
    
    model.train()  
    total_loss=0
    train_predictions = []
    train_labels = []
    for i in range(0, len(texts_tensor), batch_size):
        optimizer.zero_grad()
        batch_texts = texts_tensor[i:i+batch_size]
        batch_labels = labels_tensor[i:i+batch_size].view(-1)
        outputs = model(batch_texts)
        loss = criterion(outputs.view(-1, output_size), batch_labels)
        loss.backward()
        optimizer.step()
        total_loss+=loss.item()
        train_predictions.extend(torch.argmax(outputs, dim=2).flatten().cpu().tolist())
        train_labels.extend(batch_labels.cpu().tolist())
        
    train_f1 = f1_score(train_labels, train_predictions, average='macro')
    print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {total_loss/len(texts_tensor):.4f}, Training F1 Score: {train_f1:.4f}')
    
    # Validation
    model.eval()  
    total_val_loss = 0
    val_predictions = []
    val_labels = []
    
    with torch.no_grad():
        for i in range(0, len(val_texts_tensor), batch_size):
            batch_texts_val = val_texts_tensor[i:i+batch_size]
            batch_labels_val = val_labels_tensor[i:i+batch_size].view(-1)
            outputs_val = model(batch_texts_val)
            val_loss = criterion(outputs_val.view(-1, output_size), batch_labels_val)
            total_val_loss += val_loss.item()
            val_predictions.extend(torch.argmax(outputs_val, dim=2).flatten().cpu().tolist())
            val_labels.extend(batch_labels_val.cpu().tolist())
    
    val_f1 = f1_score(val_labels, val_predictions, average='macro')
    print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {total_val_loss/len(val_texts_tensor):.4f}, Validation F1 Score: {val_f1:.4f}')
    if(val_f1>save_loss):
        save_loss = val_f1
        torch.save(model.state_dict(),'t2_rnn_glove.pt')
    log_metric = {"Epoch": epoch+1, "Training Loss": total_loss/len(texts_tensor), "Training F1 Score": train_f1, "Validation Loss": total_val_loss/len(val_texts_tensor), "Validation F1 Score": val_f1}
    # wandb.log(log_metric)
print("Finished Training")
    

  from .autonotebook import tqdm as notebook_tqdm


Epoch [1/45], Training Loss: 0.0374, Training F1 Score: 0.0360
Epoch [1/45], Validation Loss: 0.0050, Validation F1 Score: 0.6693
Epoch [2/45], Training Loss: 0.0045, Training F1 Score: 0.6624
Epoch [2/45], Validation Loss: 0.0040, Validation F1 Score: 0.6889
Epoch [3/45], Training Loss: 0.0038, Training F1 Score: 0.7012
Epoch [3/45], Validation Loss: 0.0035, Validation F1 Score: 0.7610
Epoch [4/45], Training Loss: 0.0035, Training F1 Score: 0.7798
Epoch [4/45], Validation Loss: 0.0033, Validation F1 Score: 0.8026
Epoch [5/45], Training Loss: 0.0033, Training F1 Score: 0.8109
Epoch [5/45], Validation Loss: 0.0031, Validation F1 Score: 0.8261
Epoch [6/45], Training Loss: 0.0031, Training F1 Score: 0.8269
Epoch [6/45], Validation Loss: 0.0030, Validation F1 Score: 0.8357
Epoch [7/45], Training Loss: 0.0029, Training F1 Score: 0.8410
Epoch [7/45], Validation Loss: 0.0028, Validation F1 Score: 0.8431
Epoch [8/45], Training Loss: 0.0028, Training F1 Score: 0.8500
Epoch [8/45], Validation Lo

In [6]:
with open('ATE_test.json', 'r') as file:
    valset = json.load(file)

print(len(valset))
max_seq_len = 85

word_embeddings = []
labels = []

for key, value in valset.items():
    text = value['text']
    label_seq = value['labels']

    label_list = list(label_map)  # Store index of each 27 classes created
    label_indices = [label_list.index(label) for label in label_seq]
    label_indices_padded = label_indices[:max_seq_len] + [0] * (max_seq_len - len(label_indices))
    labels.append(label_indices_padded)

    vectors = [word2vec_model[word] if word in word2vec_model else np.zeros(300) for word in text]
    vectors_padded = [np.pad(vec, (0, 300), constant_values=0)[:300] for vec in vectors]
    vectors_padded += [np.zeros(300)] * (max_seq_len - len(vectors_padded))
    word_embeddings.append(vectors_padded)

word_embeddings_array = np.array(word_embeddings)

test_texts_tensor = torch.tensor(word_embeddings_array, dtype=torch.float32)
test_labels_tensor = torch.tensor(labels)

with torch.no_grad():
    model = RNNTagger(input_size, hidden_size, output_size)
    model = model.to(device)
    model.load_state_dict(torch.load('t2_rnn_glove.pt'))
    test_outputs = model(test_texts_tensor.to(device)) 
    argmax_indices = torch.argmax(test_outputs, dim=2)
    reshaped_tensor = argmax_indices.view(328, 85)
    fl_out = reshaped_tensor.flatten()
    fl_label = test_labels_tensor.flatten().to(device) 
    f1 = f1_score(fl_out.cpu(), fl_label.cpu(), average='macro') 
    print("F1 Score on Test Set:", f1)

328
F1 Score on Test Set: 0.8667638555194283


In [None]:
# wandb.init(
#     project="nlp_ass2B", 
#     name=f"LSTM_Glove"
# )

In [7]:
class LSTMTagger(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMTagger, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out)
        return out

input_size = 300
hidden_size = 256
output_size = 100

model = LSTMTagger(input_size, hidden_size, output_size) # Move model to GPU if available

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 45
batch_size = 32 

texts_tensor = texts_tensor.to(device)
labels_tensor = labels_tensor.to(device)
val_texts_tensor = test_texts_tensor
val_labels_tensor =  test_labels_tensor
val_texts_tensor = val_texts_tensor.to(device)
val_labels_tensor = val_labels_tensor.to(device)

save_loss = 0

for epoch in range(num_epochs):
    
    model.train()  
    total_loss=0
    train_predictions = []
    train_labels = []
    for i in range(0, len(texts_tensor), batch_size):
        optimizer.zero_grad()
        batch_texts = texts_tensor[i:i+batch_size]
        batch_labels = labels_tensor[i:i+batch_size].view(-1)
        outputs = model(batch_texts)
        loss = criterion(outputs.view(-1, output_size), batch_labels)
        loss.backward()
        optimizer.step()
        total_loss+=loss.item()
        train_predictions.extend(torch.argmax(outputs, dim=2).flatten().cpu().tolist())
        train_labels.extend(batch_labels.cpu().tolist())
        
    train_f1 = f1_score(train_labels, train_predictions, average='macro')
    print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {total_loss/len(texts_tensor):.4f}, Training F1 Score: {train_f1:.4f}')
    
    # Validation
    model.eval()  
    total_val_loss = 0
    val_predictions = []
    val_labels = []
    
    with torch.no_grad():
        for i in range(0, len(val_texts_tensor), batch_size):
            batch_texts_val = val_texts_tensor[i:i+batch_size]
            batch_labels_val = val_labels_tensor[i:i+batch_size].view(-1)
            outputs_val = model(batch_texts_val)
            val_loss = criterion(outputs_val.view(-1, output_size), batch_labels_val)
            total_val_loss += val_loss.item()
            val_predictions.extend(torch.argmax(outputs_val, dim=2).flatten().cpu().tolist())
            val_labels.extend(batch_labels_val.cpu().tolist())
    
    val_f1 = f1_score(val_labels, val_predictions, average='macro')
    print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {total_val_loss/len(val_texts_tensor):.4f}, Validation F1 Score: {val_f1:.4f}')
    if(val_f1>save_loss):
        save_loss = val_f1
        torch.save(model.state_dict(),'t2_lstm_glove.pt')
    log_metric = {"Epoch": epoch+1, "Training Loss": total_loss/len(texts_tensor), "Training F1 Score": train_f1, "Validation Loss": total_val_loss/len(val_texts_tensor), "Validation F1 Score": val_f1}
    # wandb.log(log_metric)
print("Finished Training")
    

Epoch [1/45], Training Loss: 0.0568, Training F1 Score: 0.0572
Epoch [1/45], Validation Loss: 0.0065, Validation F1 Score: 0.6154
Epoch [2/45], Training Loss: 0.0056, Training F1 Score: 0.6347
Epoch [2/45], Validation Loss: 0.0049, Validation F1 Score: 0.6488
Epoch [3/45], Training Loss: 0.0046, Training F1 Score: 0.6670
Epoch [3/45], Validation Loss: 0.0043, Validation F1 Score: 0.6670
Epoch [4/45], Training Loss: 0.0041, Training F1 Score: 0.7245
Epoch [4/45], Validation Loss: 0.0039, Validation F1 Score: 0.7395
Epoch [5/45], Training Loss: 0.0038, Training F1 Score: 0.7780
Epoch [5/45], Validation Loss: 0.0037, Validation F1 Score: 0.7748
Epoch [6/45], Training Loss: 0.0035, Training F1 Score: 0.8055
Epoch [6/45], Validation Loss: 0.0036, Validation F1 Score: 0.8080
Epoch [7/45], Training Loss: 0.0034, Training F1 Score: 0.8308
Epoch [7/45], Validation Loss: 0.0034, Validation F1 Score: 0.8287
Epoch [8/45], Training Loss: 0.0033, Training F1 Score: 0.8453
Epoch [8/45], Validation Lo

In [8]:
with open('ATE_test.json', 'r') as file:
    valset = json.load(file)

print(len(valset))
max_seq_len = 85

word_embeddings = []
labels = []

for key, value in valset.items():
    text = value['text']
    label_seq = value['labels']

    label_list = list(label_map)  # Store index of each 27 classes created
    label_indices = [label_list.index(label) for label in label_seq]
    label_indices_padded = label_indices[:max_seq_len] + [0] * (max_seq_len - len(label_indices))
    labels.append(label_indices_padded)

    vectors = [word2vec_model[word] if word in word2vec_model else np.zeros(300) for word in text]
    vectors_padded = [np.pad(vec, (0, 300), constant_values=0)[:300] for vec in vectors]
    vectors_padded += [np.zeros(300)] * (max_seq_len - len(vectors_padded))
    word_embeddings.append(vectors_padded)

word_embeddings_array = np.array(word_embeddings)

test_texts_tensor = torch.tensor(word_embeddings_array, dtype=torch.float32)
test_labels_tensor = torch.tensor(labels)

with torch.no_grad():
    model = LSTMTagger(input_size, hidden_size, output_size)
    model = model.to(device)
    model.load_state_dict(torch.load('t2_lstm_glove.pt'))
    test_outputs = model(test_texts_tensor.to(device)) 
    argmax_indices = torch.argmax(test_outputs, dim=2)
    reshaped_tensor = argmax_indices.view(328, 85)
    fl_out = reshaped_tensor.flatten()
    fl_label = test_labels_tensor.flatten().to(device) 
    f1 = f1_score(fl_out.cpu(), fl_label.cpu(), average='macro') 
    print("F1 Score on Test Set:", f1)

328
F1 Score on Test Set: 0.8730141134195999


In [None]:
# wandb.init(
#     project="nlp_ass2B", 
#     name=f"GRU_Glove"
# )

In [9]:
class GRUTagger(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(GRUTagger, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        self.gru1 = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(0.5) 

    def forward(self, x):
        out, _ = self.gru(x)
        out = torch.relu(out)
        out, _ = self.gru1(out)
        out = torch.relu(out)
        out = self.dropout(out)  
        out = self.fc1(out)
        return out
    
input_size = 300
hidden_size = 256
output_size = 27

model = GRUTagger(input_size, hidden_size, output_size)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 45
batch_size = 32 

texts_tensor = texts_tensor.to(device)
labels_tensor = labels_tensor.to(device)
val_texts_tensor = test_texts_tensor
val_labels_tensor =  test_labels_tensor
val_texts_tensor = val_texts_tensor.to(device)
val_labels_tensor = val_labels_tensor.to(device)

save_loss = 0

for epoch in range(num_epochs):
    
    model.train()  
    total_loss=0
    train_predictions = []
    train_labels = []
    for i in range(0, len(texts_tensor), batch_size):
        optimizer.zero_grad()
        batch_texts = texts_tensor[i:i+batch_size]
        batch_labels = labels_tensor[i:i+batch_size].view(-1)
        outputs = model(batch_texts)
        loss = criterion(outputs.view(-1, output_size), batch_labels)
        loss.backward()
        optimizer.step()
        total_loss+=loss.item()
        train_predictions.extend(torch.argmax(outputs, dim=2).flatten().cpu().tolist())
        train_labels.extend(batch_labels.cpu().tolist())
        
    train_f1 = f1_score(train_labels, train_predictions, average='macro')
    print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {total_loss/len(texts_tensor):.4f}, Training F1 Score: {train_f1:.4f}')
    
    # Validation
    model.eval()  
    total_val_loss = 0
    val_predictions = []
    val_labels = []
    
    with torch.no_grad():
        for i in range(0, len(val_texts_tensor), batch_size):
            batch_texts_val = val_texts_tensor[i:i+batch_size]
            batch_labels_val = val_labels_tensor[i:i+batch_size].view(-1)
            outputs_val = model(batch_texts_val)
            val_loss = criterion(outputs_val.view(-1, output_size), batch_labels_val)
            total_val_loss += val_loss.item()
            val_predictions.extend(torch.argmax(outputs_val, dim=2).flatten().cpu().tolist())
            val_labels.extend(batch_labels_val.cpu().tolist())
    
    val_f1 = f1_score(val_labels, val_predictions, average='macro')
    print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {total_val_loss/len(val_texts_tensor):.4f}, Validation F1 Score: {val_f1:.4f}')
    if(val_f1>save_loss):
        save_loss = val_f1
        torch.save(model.state_dict(),'t2_gru_glove.pt')
    log_metric = {"Epoch": epoch+1, "Training Loss": total_loss/len(texts_tensor), "Training F1 Score": train_f1, "Validation Loss": total_val_loss/len(val_texts_tensor), "Validation F1 Score": val_f1}
    # wandb.log(log_metric)
print("Finished Training")
    

Epoch [1/45], Training Loss: 0.0359, Training F1 Score: 0.1262
Epoch [1/45], Validation Loss: 0.0060, Validation F1 Score: 0.6207
Epoch [2/45], Training Loss: 0.0052, Training F1 Score: 0.6315
Epoch [2/45], Validation Loss: 0.0045, Validation F1 Score: 0.6262
Epoch [3/45], Training Loss: 0.0044, Training F1 Score: 0.6331
Epoch [3/45], Validation Loss: 0.0040, Validation F1 Score: 0.6269
Epoch [4/45], Training Loss: 0.0039, Training F1 Score: 0.6339
Epoch [4/45], Validation Loss: 0.0036, Validation F1 Score: 0.6287
Epoch [5/45], Training Loss: 0.0035, Training F1 Score: 0.6353
Epoch [5/45], Validation Loss: 0.0033, Validation F1 Score: 0.6294
Epoch [6/45], Training Loss: 0.0031, Training F1 Score: 0.6556
Epoch [6/45], Validation Loss: 0.0030, Validation F1 Score: 0.6413
Epoch [7/45], Training Loss: 0.0028, Training F1 Score: 0.7176
Epoch [7/45], Validation Loss: 0.0028, Validation F1 Score: 0.7553
Epoch [8/45], Training Loss: 0.0024, Training F1 Score: 0.8082
Epoch [8/45], Validation Lo

In [10]:
with open('ATE_test.json', 'r') as file:
    valset = json.load(file)

print(len(valset))
max_seq_len = 85

word_embeddings = []
labels = []

for key, value in valset.items():
    text = value['text']
    label_seq = value['labels']

    label_list = list(label_map)  # Store index of each 27 classes created
    label_indices = [label_list.index(label) for label in label_seq]
    label_indices_padded = label_indices[:max_seq_len] + [0] * (max_seq_len - len(label_indices))
    labels.append(label_indices_padded)

    vectors = [word2vec_model[word] if word in word2vec_model else np.zeros(300) for word in text]
    vectors_padded = [np.pad(vec, (0, 300), constant_values=0)[:300] for vec in vectors]
    vectors_padded += [np.zeros(300)] * (max_seq_len - len(vectors_padded))
    word_embeddings.append(vectors_padded)

word_embeddings_array = np.array(word_embeddings)

test_texts_tensor = torch.tensor(word_embeddings_array, dtype=torch.float32)
test_labels_tensor = torch.tensor(labels)

with torch.no_grad():
    model = GRUTagger(input_size, hidden_size, output_size)
    model = model.to(device)
    model.load_state_dict(torch.load('t2_gru_glove.pt'))
    test_outputs = model(test_texts_tensor.to(device)) 
    argmax_indices = torch.argmax(test_outputs, dim=2)
    reshaped_tensor = argmax_indices.view(328, 85)
    fl_out = reshaped_tensor.flatten()
    fl_label = test_labels_tensor.flatten().to(device) 
    f1 = f1_score(fl_out.cpu(), fl_label.cpu(), average='macro') 
    print("F1 Score on Test Set:", f1)

328
F1 Score on Test Set: 0.8656965417858741
