# Fasttext Embedding

In [1]:
from gensim.models import KeyedVectors
import json
import numpy as np
import torch
from sklearn.metrics import f1_score
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim

In [2]:
label_map = {'B', 'I', 'O'}

In [3]:
word2vec_model_path = '../cc.en.300.bin'
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_model_path, binary=True)


with open('ATE_train.json', 'r') as file:
    dataset = json.load(file)

print(len(dataset))

max_seq_len = 85
word_embeddings = []
labels = []

for key, value in dataset.items():
    text = value['text']
    label_seq = value['labels']

    label_list = list(label_map)  
    label_indices = [label_list.index(label) for label in label_seq]
    label_indices_padded = label_indices[:max_seq_len] + [0] * (max_seq_len - len(label_indices))
    labels.append(label_indices_padded)

    vectors = [word2vec_model[word] if word in word2vec_model else np.zeros(300) for word in text] 
    vectors_padded = [np.pad(vec, (0, 300), constant_values=0)[:300] for vec in vectors]
    vectors_padded += [np.zeros(300)] * (max_seq_len - len(vectors_padded))
    word_embeddings.append(vectors_padded)

word_embeddings_array = np.array(word_embeddings)
print(word_embeddings_array.shape)
texts_tensor = torch.tensor(word_embeddings_array, dtype=torch.float32)
labels_tensor = torch.tensor(labels)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xba in position 0: invalid start byte

In [5]:
with open('ATE_val.json', 'r') as file:
    valset = json.load(file)

print(len(valset))
max_seq_len = 85

word_embeddings = []
labels = []

for key, value in valset.items():
    text = value['text']
    label_seq = value['labels']

    label_list = list(label_map)  # Store index of each 27 classes created
    label_indices = [label_list.index(label) for label in label_seq]
    label_indices_padded = label_indices[:max_seq_len] + [0] * (max_seq_len - len(label_indices))
    labels.append(label_indices_padded)

    vectors = [word2vec_model[word] if word in word2vec_model else np.zeros(300) for word in text]
    vectors_padded = [np.pad(vec, (0, 300), constant_values=0)[:300] for vec in vectors]
    vectors_padded += [np.zeros(300)] * (max_seq_len - len(vectors_padded))
    word_embeddings.append(vectors_padded)

word_embeddings_array = np.array(word_embeddings)

test_texts_tensor = torch.tensor(word_embeddings_array, dtype=torch.float32)
test_labels_tensor = torch.tensor(labels)

219


In [None]:
# import wandb
# wandb.login()

In [None]:
# wandb.init(
#     project="nlp_ass2B", 
#     name=f"RNN_Fasttext"
# )

In [6]:
class RNNTagger(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNNTagger, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.rnn(x)
        out = self.fc(out)
        return out

input_size = 300
hidden_size = 256
output_size = 100

model = RNNTagger(input_size, hidden_size, output_size)

device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 45
batch_size = 32 

texts_tensor = texts_tensor.to(device)
labels_tensor = labels_tensor.to(device)
val_texts_tensor = test_texts_tensor
val_labels_tensor =  test_labels_tensor
val_texts_tensor = val_texts_tensor.to(device)
val_labels_tensor = val_labels_tensor.to(device)

save_loss = 0

for epoch in range(num_epochs):
    
    model.train()  
    total_loss=0
    train_predictions = []
    train_labels = []
    for i in range(0, len(texts_tensor), batch_size):
        optimizer.zero_grad()
        batch_texts = texts_tensor[i:i+batch_size]
        batch_labels = labels_tensor[i:i+batch_size].view(-1)
        outputs = model(batch_texts)
        loss = criterion(outputs.view(-1, output_size), batch_labels)
        loss.backward()
        optimizer.step()
        total_loss+=loss.item()
        train_predictions.extend(torch.argmax(outputs, dim=2).flatten().cpu().tolist())
        train_labels.extend(batch_labels.cpu().tolist())
        
    train_f1 = f1_score(train_labels, train_predictions, average='macro')
    print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {total_loss/len(texts_tensor):.4f}, Training F1 Score: {train_f1:.4f}')
    
    # Validation
    model.eval()  
    total_val_loss = 0
    val_predictions = []
    val_labels = []
    
    with torch.no_grad():
        for i in range(0, len(val_texts_tensor), batch_size):
            batch_texts_val = val_texts_tensor[i:i+batch_size]
            batch_labels_val = val_labels_tensor[i:i+batch_size].view(-1)
            outputs_val = model(batch_texts_val)
            val_loss = criterion(outputs_val.view(-1, output_size), batch_labels_val)
            total_val_loss += val_loss.item()
            val_predictions.extend(torch.argmax(outputs_val, dim=2).flatten().cpu().tolist())
            val_labels.extend(batch_labels_val.cpu().tolist())
    
    val_f1 = f1_score(val_labels, val_predictions, average='macro')
    print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {total_val_loss/len(val_texts_tensor):.4f}, Validation F1 Score: {val_f1:.4f}')
    if(val_f1>save_loss):
        save_loss = val_f1
        torch.save(model.state_dict(),'t2_rnn_fasttext.pt')
    log_metric = {"Epoch": epoch+1, "Training Loss": total_loss/len(texts_tensor), "Training F1 Score": train_f1, "Validation Loss": total_val_loss/len(val_texts_tensor), "Validation F1 Score": val_f1}
    # wandb.log(log_metric)
print("Finished Training")
    

  from .autonotebook import tqdm as notebook_tqdm


Epoch [1/45], Training Loss: 0.0325, Training F1 Score: 0.0172
Epoch [1/45], Validation Loss: 0.0044, Validation F1 Score: 0.3489
Epoch [2/45], Training Loss: 0.0042, Training F1 Score: 0.3544
Epoch [2/45], Validation Loss: 0.0035, Validation F1 Score: 0.3573
Epoch [3/45], Training Loss: 0.0032, Training F1 Score: 0.3669
Epoch [3/45], Validation Loss: 0.0025, Validation F1 Score: 0.3979
Epoch [4/45], Training Loss: 0.0023, Training F1 Score: 0.4053
Epoch [4/45], Validation Loss: 0.0020, Validation F1 Score: 0.4354
Epoch [5/45], Training Loss: 0.0020, Training F1 Score: 0.4561
Epoch [5/45], Validation Loss: 0.0018, Validation F1 Score: 0.4804
Epoch [6/45], Training Loss: 0.0018, Training F1 Score: 0.4931
Epoch [6/45], Validation Loss: 0.0016, Validation F1 Score: 0.5189
Epoch [7/45], Training Loss: 0.0016, Training F1 Score: 0.6061
Epoch [7/45], Validation Loss: 0.0014, Validation F1 Score: 0.6967
Epoch [8/45], Training Loss: 0.0015, Training F1 Score: 0.7114
Epoch [8/45], Validation Lo

In [7]:
with open('ATE_test.json', 'r') as file:
    valset = json.load(file)

print(len(valset))
max_seq_len = 85

word_embeddings = []
labels = []

for key, value in valset.items():
    text = value['text']
    label_seq = value['labels']

    label_list = list(label_map)  # Store index of each 27 classes created
    label_indices = [label_list.index(label) for label in label_seq]
    label_indices_padded = label_indices[:max_seq_len] + [0] * (max_seq_len - len(label_indices))
    labels.append(label_indices_padded)

    vectors = [word2vec_model[word] if word in word2vec_model else np.zeros(300) for word in text]
    vectors_padded = [np.pad(vec, (0, 300), constant_values=0)[:300] for vec in vectors]
    vectors_padded += [np.zeros(300)] * (max_seq_len - len(vectors_padded))
    word_embeddings.append(vectors_padded)

word_embeddings_array = np.array(word_embeddings)

test_texts_tensor = torch.tensor(word_embeddings_array, dtype=torch.float32)
test_labels_tensor = torch.tensor(labels)

with torch.no_grad():
    model = RNNTagger(input_size, hidden_size, output_size)
    model = model.to(device)
    model.load_state_dict(torch.load('t2_rnn_fasttext.pt'))
    test_outputs = model(test_texts_tensor.to(device)) 
    argmax_indices = torch.argmax(test_outputs, dim=2)
    reshaped_tensor = argmax_indices.view(328, 85)
    fl_out = reshaped_tensor.flatten()
    fl_label = test_labels_tensor.flatten().to(device) 
    f1 = f1_score(fl_out.cpu(), fl_label.cpu(), average='macro') 
    print("F1 Score on Test Set:", f1)

328
F1 Score on Test Set: 0.869531570009971


In [None]:
# wandb.init(
#     project="nlp_ass2B", 
#     name=f"LSTM_Fasttext"
# )

In [8]:
class LSTMTagger(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMTagger, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out)
        return out

input_size = 300
hidden_size = 256
output_size = 100

model = LSTMTagger(input_size, hidden_size, output_size) # Move model to GPU if available

device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 45
batch_size = 32 

texts_tensor = texts_tensor.to(device)
labels_tensor = labels_tensor.to(device)
val_texts_tensor = test_texts_tensor
val_labels_tensor =  test_labels_tensor
val_texts_tensor = val_texts_tensor.to(device)
val_labels_tensor = val_labels_tensor.to(device)

save_loss = 0

for epoch in range(num_epochs):
    
    model.train()  
    total_loss=0
    train_predictions = []
    train_labels = []
    for i in range(0, len(texts_tensor), batch_size):
        optimizer.zero_grad()
        batch_texts = texts_tensor[i:i+batch_size]
        batch_labels = labels_tensor[i:i+batch_size].view(-1)
        outputs = model(batch_texts)
        loss = criterion(outputs.view(-1, output_size), batch_labels)
        loss.backward()
        optimizer.step()
        total_loss+=loss.item()
        train_predictions.extend(torch.argmax(outputs, dim=2).flatten().cpu().tolist())
        train_labels.extend(batch_labels.cpu().tolist())
        
    train_f1 = f1_score(train_labels, train_predictions, average='macro')
    print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {total_loss/len(texts_tensor):.4f}, Training F1 Score: {train_f1:.4f}')
    
    # Validation
    model.eval()  
    total_val_loss = 0
    val_predictions = []
    val_labels = []
    
    with torch.no_grad():
        for i in range(0, len(val_texts_tensor), batch_size):
            batch_texts_val = val_texts_tensor[i:i+batch_size]
            batch_labels_val = val_labels_tensor[i:i+batch_size].view(-1)
            outputs_val = model(batch_texts_val)
            val_loss = criterion(outputs_val.view(-1, output_size), batch_labels_val)
            total_val_loss += val_loss.item()
            val_predictions.extend(torch.argmax(outputs_val, dim=2).flatten().cpu().tolist())
            val_labels.extend(batch_labels_val.cpu().tolist())
    
    val_f1 = f1_score(val_labels, val_predictions, average='macro')
    print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {total_val_loss/len(val_texts_tensor):.4f}, Validation F1 Score: {val_f1:.4f}')
    if(val_f1>save_loss):
        save_loss = val_f1
        torch.save(model.state_dict(),'t2_lstm_fasttext.pt')
    log_metric = {"Epoch": epoch+1, "Training Loss": total_loss/len(texts_tensor), "Training F1 Score": train_f1, "Validation Loss": total_val_loss/len(val_texts_tensor), "Validation F1 Score": val_f1}
    # wandb.log(log_metric)
print("Finished Training")
    

Epoch [1/45], Training Loss: 0.0586, Training F1 Score: 0.0568
Epoch [1/45], Validation Loss: 0.0124, Validation F1 Score: 0.4645
Epoch [2/45], Training Loss: 0.0085, Training F1 Score: 0.5858
Epoch [2/45], Validation Loss: 0.0056, Validation F1 Score: 0.6248
Epoch [3/45], Training Loss: 0.0047, Training F1 Score: 0.6364
Epoch [3/45], Validation Loss: 0.0042, Validation F1 Score: 0.6344
Epoch [4/45], Training Loss: 0.0036, Training F1 Score: 0.6423
Epoch [4/45], Validation Loss: 0.0034, Validation F1 Score: 0.6374
Epoch [5/45], Training Loss: 0.0030, Training F1 Score: 0.6444
Epoch [5/45], Validation Loss: 0.0029, Validation F1 Score: 0.6397
Epoch [6/45], Training Loss: 0.0025, Training F1 Score: 0.6462
Epoch [6/45], Validation Loss: 0.0025, Validation F1 Score: 0.6434
Epoch [7/45], Training Loss: 0.0021, Training F1 Score: 0.6478
Epoch [7/45], Validation Loss: 0.0022, Validation F1 Score: 0.6482
Epoch [8/45], Training Loss: 0.0019, Training F1 Score: 0.6972
Epoch [8/45], Validation Lo

In [9]:
with open('ATE_test.json', 'r') as file:
    valset = json.load(file)

print(len(valset))
max_seq_len = 85

word_embeddings = []
labels = []

for key, value in valset.items():
    text = value['text']
    label_seq = value['labels']

    label_list = list(label_map)  # Store index of each 27 classes created
    label_indices = [label_list.index(label) for label in label_seq]
    label_indices_padded = label_indices[:max_seq_len] + [0] * (max_seq_len - len(label_indices))
    labels.append(label_indices_padded)

    vectors = [word2vec_model[word] if word in word2vec_model else np.zeros(300) for word in text]
    vectors_padded = [np.pad(vec, (0, 300), constant_values=0)[:300] for vec in vectors]
    vectors_padded += [np.zeros(300)] * (max_seq_len - len(vectors_padded))
    word_embeddings.append(vectors_padded)

word_embeddings_array = np.array(word_embeddings)

test_texts_tensor = torch.tensor(word_embeddings_array, dtype=torch.float32)
test_labels_tensor = torch.tensor(labels)

with torch.no_grad():
    model = LSTMTagger(input_size, hidden_size, output_size)
    model = model.to(device)
    model.load_state_dict(torch.load('t2_lstm_fasttext.pt'))
    test_outputs = model(test_texts_tensor.to(device)) 
    argmax_indices = torch.argmax(test_outputs, dim=2)
    reshaped_tensor = argmax_indices.view(328, 85)
    fl_out = reshaped_tensor.flatten()
    fl_label = test_labels_tensor.flatten().to(device) 
    f1 = f1_score(fl_out.cpu(), fl_label.cpu(), average='macro') 
    print("F1 Score on Test Set:", f1)

328
F1 Score on Test Set: 0.8833741016717943


In [None]:
# wandb.init(
#     project="nlp_ass2B", 
#     name=f"GRU_Fasttext"
# )

In [10]:
class GRUTagger(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(GRUTagger, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        self.gru1 = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(0.5) 

    def forward(self, x):
        out, _ = self.gru(x)
        out = torch.relu(out)
        out, _ = self.gru1(out)
        out = torch.relu(out)
        out = self.dropout(out)  
        out = self.fc1(out)
        return out
    
input_size = 300
hidden_size = 256
output_size = 27

model = GRUTagger(input_size, hidden_size, output_size)

device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 45
batch_size = 32 

texts_tensor = texts_tensor.to(device)
labels_tensor = labels_tensor.to(device)
val_texts_tensor = test_texts_tensor
val_labels_tensor =  test_labels_tensor
val_texts_tensor = val_texts_tensor.to(device)
val_labels_tensor = val_labels_tensor.to(device)

save_loss = 0

for epoch in range(num_epochs):
    
    model.train()  
    total_loss=0
    train_predictions = []
    train_labels = []
    for i in range(0, len(texts_tensor), batch_size):
        optimizer.zero_grad()
        batch_texts = texts_tensor[i:i+batch_size]
        batch_labels = labels_tensor[i:i+batch_size].view(-1)
        outputs = model(batch_texts)
        loss = criterion(outputs.view(-1, output_size), batch_labels)
        loss.backward()
        optimizer.step()
        total_loss+=loss.item()
        train_predictions.extend(torch.argmax(outputs, dim=2).flatten().cpu().tolist())
        train_labels.extend(batch_labels.cpu().tolist())
        
    train_f1 = f1_score(train_labels, train_predictions, average='macro')
    print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {total_loss/len(texts_tensor):.4f}, Training F1 Score: {train_f1:.4f}')
    
    # Validation
    model.eval()  
    total_val_loss = 0
    val_predictions = []
    val_labels = []
    
    with torch.no_grad():
        for i in range(0, len(val_texts_tensor), batch_size):
            batch_texts_val = val_texts_tensor[i:i+batch_size]
            batch_labels_val = val_labels_tensor[i:i+batch_size].view(-1)
            outputs_val = model(batch_texts_val)
            val_loss = criterion(outputs_val.view(-1, output_size), batch_labels_val)
            total_val_loss += val_loss.item()
            val_predictions.extend(torch.argmax(outputs_val, dim=2).flatten().cpu().tolist())
            val_labels.extend(batch_labels_val.cpu().tolist())
    
    val_f1 = f1_score(val_labels, val_predictions, average='macro')
    print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {total_val_loss/len(val_texts_tensor):.4f}, Validation F1 Score: {val_f1:.4f}')
    if(val_f1>save_loss):
        save_loss = val_f1
        torch.save(model.state_dict(),'t2_gru_fasttext.pt')
    log_metric = {"Epoch": epoch+1, "Training Loss": total_loss/len(texts_tensor), "Training F1 Score": train_f1, "Validation Loss": total_val_loss/len(val_texts_tensor), "Validation F1 Score": val_f1}
    # wandb.log(log_metric)
print("Finished Training")
    

Epoch [1/45], Training Loss: 0.0417, Training F1 Score: 0.0939
Epoch [1/45], Validation Loss: 0.0087, Validation F1 Score: 0.5974
Epoch [2/45], Training Loss: 0.0060, Training F1 Score: 0.6228
Epoch [2/45], Validation Loss: 0.0041, Validation F1 Score: 0.6317
Epoch [3/45], Training Loss: 0.0037, Training F1 Score: 0.6423
Epoch [3/45], Validation Loss: 0.0034, Validation F1 Score: 0.6380
Epoch [4/45], Training Loss: 0.0032, Training F1 Score: 0.6437
Epoch [4/45], Validation Loss: 0.0029, Validation F1 Score: 0.6391
Epoch [5/45], Training Loss: 0.0026, Training F1 Score: 0.6456
Epoch [5/45], Validation Loss: 0.0023, Validation F1 Score: 0.6440
Epoch [6/45], Training Loss: 0.0021, Training F1 Score: 0.6486
Epoch [6/45], Validation Loss: 0.0020, Validation F1 Score: 0.6472
Epoch [7/45], Training Loss: 0.0018, Training F1 Score: 0.6536
Epoch [7/45], Validation Loss: 0.0018, Validation F1 Score: 0.6476
Epoch [8/45], Training Loss: 0.0016, Training F1 Score: 0.6712
Epoch [8/45], Validation Lo

In [11]:
with open('ATE_test.json', 'r') as file:
    valset = json.load(file)

print(len(valset))
max_seq_len = 85

word_embeddings = []
labels = []

for key, value in valset.items():
    text = value['text']
    label_seq = value['labels']

    label_list = list(label_map)  # Store index of each 27 classes created
    label_indices = [label_list.index(label) for label in label_seq]
    label_indices_padded = label_indices[:max_seq_len] + [0] * (max_seq_len - len(label_indices))
    labels.append(label_indices_padded)

    vectors = [word2vec_model[word] if word in word2vec_model else np.zeros(300) for word in text]
    vectors_padded = [np.pad(vec, (0, 300), constant_values=0)[:300] for vec in vectors]
    vectors_padded += [np.zeros(300)] * (max_seq_len - len(vectors_padded))
    word_embeddings.append(vectors_padded)

word_embeddings_array = np.array(word_embeddings)

test_texts_tensor = torch.tensor(word_embeddings_array, dtype=torch.float32)
test_labels_tensor = torch.tensor(labels)

with torch.no_grad():
    model = GRUTagger(input_size, hidden_size, output_size)
    model = model.to(device)
    model.load_state_dict(torch.load('t2_gru_fasttext.pt'))
    test_outputs = model(test_texts_tensor.to(device)) 
    argmax_indices = torch.argmax(test_outputs, dim=2)
    reshaped_tensor = argmax_indices.view(328, 85)
    fl_out = reshaped_tensor.flatten()
    fl_label = test_labels_tensor.flatten().to(device) 
    f1 = f1_score(fl_out.cpu(), fl_label.cpu(), average='macro') 
    print("F1 Score on Test Set:", f1)

328
F1 Score on Test Set: 0.8798521574783909
