In [15]:
from gensim.models import KeyedVectors
import json
import numpy as np
import torch
from sklearn.metrics import f1_score
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim

# Seed PyTorch's global RNG so randomly initialised tensors are repeatable across runs.
torch.manual_seed(1)

<torch._C.Generator at 0x7bad1ab8c810>

In [16]:
# Load the training and validation splits. Each JSON file maps an entry id to a
# record with a 'text' entry and a 'labels' entry.
# The encoding is given explicitly so decoding does not depend on the platform locale.
with open('ATE_train.json', encoding='utf-8') as file:
    data_train_bio = json.load(file)
with open('ATE_val.json', encoding='utf-8') as file:
    data_val_bio = json.load(file)

# Parallel lists: train_texts[i] is paired with train_tag_seqs[i] (same for val).
train_texts = [record['text'] for record in data_train_bio.values()]
train_tag_seqs = [record['labels'] for record in data_train_bio.values()]

val_texts = [record['text'] for record in data_val_bio.values()]
val_tag_seqs = [record['labels'] for record in data_val_bio.values()]

In [17]:
# Special symbols: CRF boundary tags and the placeholder used for unknown words.
START_TAG = "<START>"
STOP_TAG = "<STOP>"
OUT_OF_VOCAB = '<OOV>'
word2vec = KeyedVectors.load_word2vec_format('../GoogleNews-vectors-negative300.bin', binary=True)

# Index 0 is reserved for out-of-vocabulary tokens.
word_map = {OUT_OF_VOCAB: 0}
label_map = {"B": 0, "I": 1, "O": 2, START_TAG: 3, STOP_TAG: 4}

# Vocabulary = training tokens that also have a pretrained word2vec vector.
for sentence in train_texts:
    for token in sentence:
        if token not in word_map and token in word2vec:
            word_map[token] = len(word_map)

vocab_size = len(word_map)
input_size = 300
hidden_size = 32
# Derived from the label set rather than hard-coded, so the two cannot drift apart.
output_size = len(label_map)
num_epochs = 15

# Embedding table: row i holds the word2vec vector of the token with index i.
word_embeddings = np.zeros((vocab_size, input_size))
for token, index in word_map.items():
    if token != OUT_OF_VOCAB:
        word_embeddings[index] = word2vec[token]

# The OOV row is an all-zero vector; its width follows input_size instead of a literal 300.
word_embeddings[word_map[OUT_OF_VOCAB]] = np.zeros(input_size)
word_embeddings = torch.tensor(word_embeddings.astype("float32"))

In [18]:
def log_sum_exp(vec):
    """Compute log(sum(exp(vec))) over all entries of ``vec`` in a numerically stable way.

    Args:
        vec: tensor of scores, e.g. a ``(1, num_tags)`` row.

    Returns:
        A 0-dimensional tensor holding the log-sum-exp of every entry of ``vec``.
    """
    # torch.logsumexp performs the max-shift internally, so no separate
    # argmax(...).item() lookup on the Python side is needed.
    return torch.logsumexp(vec.reshape(-1), dim=0)

def argmax(vec):
    """Return, as a Python int, the column index of the largest entry of a one-row 2-D tensor."""
    column_of_max = torch.max(vec, 1)[1]
    return column_of_max.item()

def prepare_sentence(tag_to_ix, sentence, tags):
    """Turn a tokenised sentence and its tag strings into two index tensors.

    Tokens missing from the module-level ``word_map`` are mapped to the
    ``OUT_OF_VOCAB`` index; tags are looked up in ``tag_to_ix``.

    Returns:
        ``(token_index_tensor, tag_index_tensor)``.
    """
    token_ids = []
    for token in sentence:
        lookup_key = token if token in word_map else OUT_OF_VOCAB
        token_ids.append(word_map[lookup_key])

    tag_ids = []
    for tag in tags:
        tag_ids.append(tag_to_ix[tag])

    return torch.tensor(token_ids), torch.tensor(tag_ids)

In [19]:
# Source: https://pytorch.org/tutorials/beginner/nlp/advanced_tutorial.html
class BiLSTM_CRF(nn.Module):
    """Bidirectional LSTM tagger with a CRF-style transition layer.

    Token indices are embedded with the pretrained table ``word_embeddings``
    (module-level), encoded by a one-layer bidirectional LSTM and projected to
    one emission score per tag. ``transitions[i, j]`` is the learned score of
    moving *from* tag ``j`` *to* tag ``i``.

    The model processes one sentence at a time (batch size 1).
    """

    def __init__(self, embedding_dim, hidden_dim, tag_to_ix):
        """Build the network.

        Args:
            embedding_dim: width of the word vectors fed to the LSTM.
            hidden_dim: total BiLSTM output width (``hidden_dim // 2`` per direction).
            tag_to_ix: mapping from tag string to index; must contain
                ``START_TAG`` and ``STOP_TAG``.
        """
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim

        self.tag_to_ix = tag_to_ix
        self.word_map = word_map
        self.tagset_size = len(tag_to_ix)
        # Cache the boundary-tag indices once instead of re-reading the dict in every method.
        self.start_ix = tag_to_ix[START_TAG]
        self.stop_ix = tag_to_ix[STOP_TAG]

        # NOTE: layer creation order is kept as-is so seeded initialisation is unchanged.
        self.embedding = nn.Embedding.from_pretrained(word_embeddings)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, num_layers=1, bidirectional=True)
        self.fc = nn.Linear(hidden_dim, self.tagset_size)
        self.transitions = nn.Parameter(torch.randn(self.tagset_size, self.tagset_size))

        # Forbid transitions *into* START (row) and *out of* STOP (column).
        self.transitions.data[self.start_ix, :] = -1000000
        self.transitions.data[:, self.stop_ix] = -1000000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a zeroed ``(h_0, c_0)`` pair shaped ``(2, 1, hidden_dim // 2)`` each."""
        shape = (2, 1, self.hidden_dim // 2)
        device = self.transitions.device
        return (torch.zeros(shape, device=device), torch.zeros(shape, device=device))

    def _forward_alg(self, feats):
        """Log partition function: log-sum-exp of the scores of all tag sequences.

        Args:
            feats: ``(seq_len, tagset_size)`` emission scores.

        Returns:
            A 0-dimensional tensor.
        """
        forward_var = feats.new_full((1, self.tagset_size), -1000000.)
        forward_var[0][self.start_ix] = 0.

        for feat in feats:
            # scores[next, prev] = alpha[prev] + transition(prev -> next) + emission(next);
            # all "next" tags are handled in one broadcasted operation.
            scores = forward_var + self.transitions + feat.unsqueeze(1)
            forward_var = torch.logsumexp(scores, dim=1).view(1, -1)

        terminal_var = forward_var + self.transitions[self.stop_ix]
        return torch.logsumexp(terminal_var.reshape(-1), dim=0)

    def _get_lstm_features(self, sentence):
        """Run embedding -> BiLSTM -> linear and return ``(seq_len, tagset_size)`` emission scores.

        Args:
            sentence: 1-D tensor of token indices.
        """
        # The hidden state is reset for every sentence.
        self.hidden = self.init_hidden()
        x = self.embedding(sentence).view(len(sentence), 1, -1)
        x, self.hidden = self.lstm(x, self.hidden)
        x = x.view(len(sentence), self.hidden_dim)
        x = self.fc(x)
        return x

    def _score_sentence(self, feats, tags):
        """Score one given tag sequence (transitions + emissions, including START/STOP).

        Args:
            feats: ``(seq_len, tagset_size)`` emission scores.
            tags: 1-D long tensor of gold tag indices, one per row of ``feats``.

        Returns:
            A tensor of shape ``(1,)``.
        """
        score = feats.new_zeros(1)
        start = torch.tensor([self.start_ix], dtype=torch.long, device=tags.device)
        tags = torch.cat([start, tags])

        for i, feat in enumerate(feats):
            score = score + self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.stop_ix, tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        """Find the highest-scoring tag sequence for the given emission scores.

        Args:
            feats: ``(seq_len, tagset_size)`` emission scores.

        Returns:
            ``(path_score, best_path)`` where ``path_score`` is a 0-dimensional
            tensor and ``best_path`` is a list of tag indices (START/STOP excluded).
        """
        backpointers = []
        forward_var = feats.new_full((1, self.tagset_size), -1000000.)
        forward_var[0][self.start_ix] = 0

        for feat in feats:
            # candidates[next, prev] = best score ending in `prev` + transition(prev -> next).
            # Emissions do not affect the argmax over `prev`, so they are added afterwards.
            candidates = forward_var + self.transitions
            best_scores, best_prev = torch.max(candidates, dim=1)
            forward_var = (best_scores + feat).view(1, -1)
            backpointers.append(best_prev.tolist())

        terminal_var = forward_var + self.transitions[self.stop_ix]
        best_tag_id = torch.max(terminal_var, 1)[1].item()
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers from the last position to the first.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # The walk must end on START; drop it from the returned path.
        start = best_path.pop()
        assert start == self.start_ix
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        """Training loss: log partition function minus the score of the gold tag sequence."""
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):
        """Decode ``sentence`` and return ``(best_path_score, best_tag_index_list)``."""
        lstm_feats = self._get_lstm_features(sentence)
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq


In [20]:
# import wandb

# wandb.login()
# wandb.init(
#     project="nlp_ass2B", 
#     name=f"BiLSTM_Word2Vec"
# )

In [21]:
def _score_split(model, sentences, tag_seqs):
    """Return ``(average negative log-likelihood, macro F1)`` of ``model`` on one data split.

    Runs in eval mode with gradient tracking disabled; no weights are updated.
    """
    total_loss = 0.0
    gold_tags = []
    predicted_tags = []

    model.eval()
    with torch.no_grad():
        for raw_sentence, raw_tag_seq in zip(sentences, tag_seqs):
            vectorized_sentence, vectorized_tag_seq = prepare_sentence(label_map, raw_sentence, raw_tag_seq)

            _, tag_seq_pred = model(vectorized_sentence)

            total_loss += model.neg_log_likelihood(vectorized_sentence, vectorized_tag_seq)
            gold_tags.extend(vectorized_tag_seq.tolist())
            predicted_tags.extend(tag_seq_pred)

    avg_loss = total_loss / len(sentences)
    f1 = f1_score(gold_tags, predicted_tags, average="macro")
    return avg_loss, f1


def evaluate(model, train_sentence, train_tags, val_sentence, val_tags, optimizer, epoch=None):
    """Train ``model`` for one pass over the training data, then report train/val metrics.

    Args:
        model: tagger exposing ``neg_log_likelihood(sentence, tags)`` and a forward
            call returning ``(score, predicted_tag_indices)``.
        train_sentence, train_tags: parallel lists of token lists and tag-string lists.
        val_sentence, val_tags: same, for the validation split.
        optimizer: optimizer stepped once per training sentence.
        epoch: optional zero-based epoch index; when given it is recorded
            (one-based) in the returned metrics. Previously this was read from a
            global variable, which raised NameError outside the training loop.

    Returns:
        Dict of the metrics that are also printed, suitable for a logger
        such as ``wandb.log``.
    """
    # --- one training pass, one optimizer step per sentence ---
    model.train()
    for raw_sentence, raw_tag_seq in zip(train_sentence, train_tags):
        vectorized_sentence, vectorized_tag_seq = prepare_sentence(label_map, raw_sentence, raw_tag_seq)

        optimizer.zero_grad()
        loss = model.neg_log_likelihood(vectorized_sentence, vectorized_tag_seq)
        loss.backward()
        optimizer.step()

    # --- metrics on both splits with the updated weights ---
    train_loss, train_f1 = _score_split(model, train_sentence, train_tags)
    print("Training Loss", train_loss, "F1 Score", train_f1)

    val_loss, val_f1 = _score_split(model, val_sentence, val_tags)
    print("Validation Loss", val_loss, "F1 Score", val_f1)

    log_metric = {
        "Training Loss": train_loss,
        "Training F1 Score": train_f1,
        "Validation Loss": val_loss,
        "Validation F1 Score": val_f1,
    }
    if epoch is not None:
        log_metric["Epoch"] = epoch + 1
    # wandb.log(log_metric)
    return log_metric


In [22]:
# Build the tagger and its Adam optimiser, then run one train + evaluate pass per epoch.
model = BiLSTM_CRF(embedding_dim=input_size, hidden_dim=hidden_size, tag_to_ix=label_map)
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(num_epochs):
    evaluate(
        model=model,
        train_sentence=train_texts,
        train_tags=train_tag_seqs,
        val_sentence=val_texts,
        val_tags=val_tag_seqs,
        optimizer=optimizer,
    )

Training Loss tensor([5.4380]) F1 Score 0.48851928714228715
Validation Loss tensor([5.3672]) F1 Score 0.4871601572773689
Training Loss tensor([3.7752]) F1 Score 0.5944151765626381
Validation Loss tensor([4.0697]) F1 Score 0.5644564394830209
Training Loss tensor([2.7728]) F1 Score 0.7579654060698889
Validation Loss tensor([3.3201]) F1 Score 0.6856774676369577
Training Loss tensor([2.1940]) F1 Score 0.8166953176552066
Validation Loss tensor([2.9444]) F1 Score 0.721681197864282
Training Loss tensor([1.8170]) F1 Score 0.8520801354639347
Validation Loss tensor([2.7441]) F1 Score 0.758889761905233
Training Loss tensor([1.5385]) F1 Score 0.8732942380881412
Validation Loss tensor([2.6337]) F1 Score 0.7705653705323647
Training Loss tensor([1.3125]) F1 Score 0.8933565938985261
Validation Loss tensor([2.5829]) F1 Score 0.7699595902291652
Training Loss tensor([1.1197]) F1 Score 0.9138341885270383
Validation Loss tensor([2.5708]) F1 Score 0.7718569400842211
Training Loss tensor([0.9510]) F1 Score 0

In [23]:
# Evaluate the trained model on the held-out test split.
# The test data gets its own variable so the validation data loaded earlier is not overwritten.
with open('ATE_test.json', encoding='utf-8') as file:
    data_test_bio = json.load(file)

test_sentence = [record['text'] for record in data_test_bio.values()]
test_tags = [record['labels'] for record in data_test_bio.values()]

loss = 0.0
tag_seqs = []
tag_seqs_pred = []

model.eval()
with torch.no_grad():
    for raw_sentence, raw_tag_seq in zip(test_sentence, test_tags):
        vectorized_sentence, vectorized_tag_seq = prepare_sentence(label_map, raw_sentence, raw_tag_seq)

        score, tag_seq_pred = model(vectorized_sentence)

        loss += model.neg_log_likelihood(vectorized_sentence, vectorized_tag_seq)
        tag_seqs.extend(vectorized_tag_seq.tolist())
        tag_seqs_pred.extend(tag_seq_pred)

avg_loss = loss/len(test_sentence)
f1 = f1_score(tag_seqs, tag_seqs_pred, average="macro")
# These are test-set metrics; the label previously said "Validation Loss".
print("Test Loss", avg_loss, "F1 Score", f1)


Validation Loss tensor([3.8585]) F1 Score 0.7583996808462352


In [24]:
# Save only the model's state_dict (not the module object itself) to disk.
torch.save(model.state_dict(),'t2_bilstm_word2vec.pt')