In [166]:
import numpy as np
import pandas as pd
import re
import nltk
import sklearn
import warnings
from platform import python_version
print(python_version())

3.9.5


In [167]:
### Read in data:
tr_headers = ["index", "word", "ner_tag"]
train_df = pd.read_csv("./data/train", sep=' ', header=None, quoting=3)
train_df.columns = tr_headers

dev_df = pd.read_csv("./data/dev", sep=' ', header=None, quoting=3)
dev_df.columns = tr_headers

test_headers = ["index", "word"]
test_df = pd.read_csv("./data/test", sep=' ', header=None, engine='python', error_bad_lines=False, quoting=3)
test_df.columns = test_headers

In [168]:
train_df.iloc[70:76]

Unnamed: 0,index,word,ner_tag
70,28,advice,O
71,29,was,O
72,30,clearer,O
73,31,.,O
74,1,"""",O
75,2,We,O


In [169]:
train_df.head(15)

Unnamed: 0,index,word,ner_tag
0,1,EU,B-ORG
1,2,rejects,O
2,3,German,B-MISC
3,4,call,O
4,5,to,O
5,6,boycott,O
6,7,British,B-MISC
7,8,lamb,O
8,9,.,O
9,1,Peter,B-PER


In [170]:
# Slight cleaning on num:
train_df["word_formatted"] = train_df["word"].str.replace(r'^\d+|.\d+$', "<num>", regex=True)
dev_df["word_formatted"] = dev_df["word"].str.replace(r'^\d+|.\d+$', "<num>", regex=True)
test_df["word_formatted"] = test_df["word"].str.replace(r'^\d+|.\d+$', "<num>", regex=True)

In [171]:
# Get the count of each word:
#word-type = word
cnt_d = {}
for row in train_df.iterrows():
    if row[1]["word"] in cnt_d:
        cnt_d[row[1]["word"]] += 1
    else:
        cnt_d[row[1]["word"]] = 1

In [172]:
threshold = 2 #No threshold = 1
#unknown_cnt = 0
unknown_word_lst = []   #We want to keep track of unknown words but group together
for k, v in cnt_d.items():
    if v < threshold:
        #unknown_cnt += v
        unknown_word_lst.append(k)
    else:
        continue

In [173]:
def replace_unk_train(word):
    if word in unknown_word_lst:
        return "<unk>"
    else:
        return word

In [174]:
def replace_unk_dev(word):
    if word in unknown_word_lst:
        return "<unk>"
    elif word not in train_words:
        return "<unk>"
    else:
        return word

In [175]:
#Replace with <unk> Train:
train_df["word_formatted"] = train_df["word"].apply(replace_unk_train)

In [176]:
train_df["word_formatted"] = train_df["word_formatted"].astype(str)
train_vocab_size = len(np.unique(train_df["word_formatted"]))
train_words = np.unique(train_df["word_formatted"])

In [177]:
#Replace with <unk> Dev:
dev_df["word_formatted"] = dev_df["word"].apply(replace_unk_dev)
dev_df["word_formatted"] = dev_df["word_formatted"].astype(str)

In [178]:
dev_df

Unnamed: 0,index,word,ner_tag,word_formatted
0,1,CRICKET,O,CRICKET
1,2,-,O,-
2,3,LEICESTERSHIRE,B-ORG,<unk>
3,4,TAKE,O,TAKE
4,5,OVER,O,OVER
...,...,...,...,...
51573,1,--,O,--
51574,2,Dhaka,B-ORG,Dhaka
51575,3,Newsroom,I-ORG,Newsroom
51576,4,880-2-506363,O,<unk>


In [179]:
#Format the data by sentences TRAIN:
def format_data(df):
    train_formatted = []
    #init beginning:
    first_word = df.iloc[0]
    sentence_x = [first_word["word_formatted"]]
    sentence_y = [first_word["ner_tag"]]
    
    for row in df.iloc[1:].iterrows():
        #print(row)
        if row[1]["index"] == 1:
            #print(row[1]["word"])
            train_formatted.append([sentence_x, sentence_y])

            sentence_x, sentence_y = [], []
            sentence_x.append(row[1]["word_formatted"])
            sentence_y.append(row[1]["ner_tag"])
            if row[0] == (df.shape[0]-1):
                train_formatted.append([sentence_x, sentence_y])
        else:
            sentence_x.append(row[1]["word_formatted"])
            sentence_y.append(row[1]["ner_tag"])
    return train_formatted

In [180]:
#Format the data by sentences TEST:
def format_data_test(df):
    test_formatted = []
    #init beginning:
    first_word = df.iloc[0]
    sentence_x = [first_word["word_formatted"]]
    
    for row in df.iloc[1:].iterrows():
        if row[1]["index"] == 1:
            test_formatted.append(sentence_x)

            sentence_x = []
            sentence_x.append(row[1]["word_formatted"])
            if row[0] == (df.shape[0]-1):
                test_formatted.append(sentence_x)
        else:
            sentence_x.append(row[1]["word_formatted"])
    
    return test_formatted

In [181]:
train_formatted = format_data(train_df)
dev_formatted = format_data(dev_df)

In [182]:
#Create Word Map for vocab:
word_map = {"<pad>":0}
for i, word in enumerate(set(train_df["word_formatted"])):
    word_map[word] = i+1

In [183]:
#Create Word Map for ner_tag:
ner_map = {"<pad>":-1}
for i, word in enumerate(set(train_df["ner_tag"])):
    ner_map[word] = i

In [184]:
ner_map_without_pad = {}
for i, word in enumerate(set(train_df["ner_tag"])):
    ner_map_without_pad[word] = i

In [185]:
longest_train_sent = 0
for sentence in train_formatted:
    sentence_len = len(sentence[0])
    if sentence_len > longest_train_sent:
        longest_train_sent = sentence_len

In [186]:
longest_train_sent

113

In [187]:
#Map words in sentences to corresponding values:
def pad_sentences(sentences_formatted):
    train_padded = []
    cnt = 0
    for sentence in sentences_formatted:
        word_lst = sentence[0]
        ner_lst = sentence[1]
        mapped_word_lst, mapped_ner_lst = [], []
        cnt += len(word_lst)
        for word in word_lst:
            mapped_word_lst.append(word_map[word])
        for ner in ner_lst:
            mapped_ner_lst.append(ner_map[ner])

        word_cnt = len(mapped_word_lst)
        diff_ = longest_train_sent - word_cnt
        mapped_word_lst = mapped_word_lst + [0] * diff_
        mapped_ner_lst = mapped_ner_lst + [-1] * diff_

        train_padded.append([mapped_word_lst, mapped_ner_lst])
    print(cnt)
    return train_padded

In [188]:
train_padded = pad_sentences(train_formatted)
dev_padded = pad_sentences(dev_formatted)

204567
51578


In [189]:
#Map words in sentences to corresponding values:
def pad_test_sentences(sentences_formatted):
    test_padded = []
    cnt = 0
    for sentence in sentences_formatted:
        mapped_word_lst = []
        cnt += len(sentence)
        for word in sentence:
            mapped_word_lst.append(word_map[word])

        word_cnt = len(mapped_word_lst)
        diff_ = longest_train_sent - word_cnt
        mapped_word_lst = mapped_word_lst + [0] * diff_

        test_padded.append(mapped_word_lst)
    print(cnt)
    return test_padded

### Task 1: Simple Bidirectional LSTM Model:

In [190]:
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from torch import nn

In [210]:
class BLSTM(nn.Module):
    
    def __init__(self, vocab_size):
        super().__init__()
        
        lstm_hidden_dim = 256
        lstm_num_layers = 1
        linear_output_dim =128
        output_dim = 10
        
        self.embeddings = nn.Embedding(vocab_size, 100)
        self.lstm = nn.LSTM(input_size=100, hidden_size=256,
                          num_layers=1, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(0.33)
        self.linear1 = nn.Linear(512, 128)
        self.linear2 = nn.Linear(128, 9)
        self.elu = nn.ELU()
        
    def forward(self, inputs):
        #print(inputs.shape)
        embeds = self.embeddings(inputs)
        #print(embeds.shape)
        lstm_out, self.hidden = self.lstm(embeds.view(len(inputs), 1, -1))
        lstm_out_dropped = self.dropout(lstm_out)
        out = self.linear1(lstm_out_dropped.view(len(inputs), -1))
        #linear_out_dropped = self.dropout(out)
        #l2_out = self.linear2(linear_out_dropped)
        elu_out = self.elu(out)
        l2_out = self.linear2(elu_out)
        log_probs = F.log_softmax(l2_out, dim=1)
        return log_probs


In [192]:
EMBEDDING_DIM = 100
VOCAB_SIZE = train_vocab_size+1 #added <pad> word
n_epochs = 20
trainloader = torch.utils.data.DataLoader(train_padded, batch_size=16, num_workers=1)
devloader = torch.utils.data.DataLoader(dev_padded, batch_size=16, num_workers=1)
blstm = BLSTM(VOCAB_SIZE)
criterion = nn.CrossEntropyLoss(ignore_index=-1, size_average=True) #therefore no need for softmax
#criterion = nn.NLLLoss()
# optimizer = torch.optim.Adam(mlp.parameters(), lr=1e-4)
optimizer = torch.optim.SGD(blstm.parameters(), lr=0.25, momentum=0.9)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

test_loss_min = 10000

for epoch in range(n_epochs):
    #scheduler.step()
    print('Epoch-{0} lr: {1}'.format(epoch, optimizer.param_groups[0]['lr']))
    train_loss = 0
    test_loss = 0
    
    blstm.train()
    for data, target in trainloader:
        # Zero the gradients
        optimizer.zero_grad()

        # Perform forward pass
#         print(data)
#         print(target)
        #print(torch.cat(data,dim=0).reshape(1,400,316))
        #print(torch.cat(data,dim=0).size(0)) I think the problem is here.
        output = blstm(torch.cat(data,dim=0))

        # Compute loss
        loss = criterion(output, torch.cat(target,dim=0))

        # Perform backward pass
        loss.backward()

        # Perform optimization
        optimizer.step()

                # Print statistics
        #train_loss += loss.item()*torch.cat(data,dim=0).size(0)
        train_loss += loss
        
    with torch.no_grad():
        for data, target in devloader:
            output = blstm(torch.cat(data,dim=0))
            loss = criterion(output, torch.cat(target,dim=0))
#             test_loss += loss.item()*torch.cat(data,dim=0).size(0)
            test_loss += loss
    train_loss = train_loss/len(trainloader.dataset)
    test_loss = test_loss/len(devloader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tTest Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        test_loss
        ))
    
    if test_loss <= test_loss_min:
        torch.save(blstm.state_dict(), 'blstm1.pt')
        test_loss_min = test_loss

  # Process is complete.
print('All done.')

Epoch-0 lr: 0.25




Epoch: 1 	Training Loss: 0.048592 	Test Loss: 0.038024
Epoch-1 lr: 0.25


KeyboardInterrupt: 

In [193]:
#Format the test_data:
test_df["word_formatted"] = test_df["word"].apply(replace_unk_dev)
test_df["word_formatted"] = test_df["word_formatted"].astype(str)

test_formatted = format_data_test(test_df)
test_padded = pad_test_sentences(test_formatted)

46666


In [194]:
#Load in the best model from above:
blstm.load_state_dict(torch.load('blstm1.pt'))

<All keys matched successfully>

In [195]:
# Calculate Accuracy from trained model:
def predict_test(model, dataloader):
    prediction_list = []
    with torch.no_grad():
        for data in dataloader:
            output = model(torch.cat(data,dim=0))
            _, predicted = torch.max(output.data, 1) 
            prediction_list.append(predicted)
    return prediction_list

In [196]:
# Calculate Accuracy from trained model:
def predict(model, dataloader):
    prediction_list = []
    with torch.no_grad():
        for data, target in dataloader:
            output = model(torch.cat(data,dim=0))
            _, predicted = torch.max(output.data, 1) 
            prediction_list.append(predicted)
    return prediction_list

In [197]:
def unravel_predictions_test(data, pred):
    overall_pred = []
    for i, sentence in enumerate(data):
        non_padded_pred = len(np.nonzero(sentence)[0])
        pred_i = pred[i].tolist()[0:non_padded_pred]
        overall_pred.append(pred_i)
    return overall_pred

In [198]:
def unravel_predictions(data, pred):
    overall_pred = []
    for i, sentence in enumerate(data):
        actual_sentence = sentence[0]
        non_padded_pred = len(np.nonzero(actual_sentence)[0])
        pred_i = pred[i].tolist()[0:non_padded_pred]
        overall_pred.append(pred_i)
    return overall_pred

In [199]:
def convert_predictions(pred):
    overall_pred = []
    for sentence in pred:
        for idx in sentence:
            overall_pred.append(list(ner_map_without_pad.keys())[idx])
    return overall_pred

In [200]:
def accuracy(y_true, y_pred):
    score = sum(y_true == y_pred)/len(y_pred)
    return score

In [201]:
#predict on dev:
devloader = torch.utils.data.DataLoader(dev_padded, batch_size=1, num_workers=1) #need to do 1 at a time:
predictions_dev = predict(blstm, devloader)
predictions_dev = unravel_predictions(dev_padded, predictions_dev)
predictions_dev = convert_predictions(predictions_dev)

In [202]:
y_true = np.array(dev_df["ner_tag"])
print(len(y_true))
print(len(predictions_dev))
print("Dev Accuracy:", accuracy(y_true, predictions_dev))

51578
51578
Dev Accuracy: 0.8532513862499516


In [None]:
###Best score: lr=.25, gamm=.5, step=5, momentum=.9, epoch=20, batch_size-16 - 92.45% acc, 59.55 F1, ~2hrs

In [205]:
def write_results(name, y_true, y_pred, df):
    with open(name, 'w') as f:
        for row in df.iloc[0:].iterrows():
            f.write(str(row[1]["index"]))
            f.write(" ")
            f.write(str(row[1]["word"]))
            f.write(" ")
            f.write(y_true[row[0]])
            f.write(" ")
            f.write(y_pred[row[0]])
            f.write("\n")

In [206]:
write_results("dev1.out", y_true, predictions_dev, dev_df)

In [207]:
#predict on test:
testloader = torch.utils.data.DataLoader(test_padded, batch_size=1, num_workers=1)
predictions_test = predict_test(blstm, testloader)
predictions_test = unravel_predictions_test(test_padded, predictions_test)
predictions_test = convert_predictions(predictions_test)

In [208]:
def write_results_test(name, y_pred, df):
    with open(name, 'w') as f:
        for row in df.iloc[0:].iterrows():
            f.write(str(row[1]["index"]))
            f.write(" ")
            f.write(str(row[1]["word"]))
            f.write(" ")
            f.write(y_pred[row[0]])
            f.write("\n")

In [209]:
write_results_test("test1.out", predictions_test, test_df)

### Task 2: Using GloVe Word Embeddings:

In [104]:
with open("./glove.6B.100d","r",encoding="UTF-8") as f:
    word2vec={}
    for word_embedding in f:
        word_split = word_embedding.split()
        word = word_split[0]
        word2vec[word] = np.array(word_split[1:], dtype=np.float64)

In [None]:
### Logic: Scratch previous <unk> and <num> tokens as Glove might handle it, vs. would set to 0

In [266]:
#Create Word Map for vocab:
word_map_2 = {}
for i, word in enumerate(set(train_df["word"]).union(set(dev_df["word"]))):
    word_map_2[word] = i+1
word_map_2["<unk>"] = i+1 #leave last row to represent <unk>
#Expand vocab to cover dev:

In [269]:
EMBEDDING_DIM = 101
VOCAB_SIZE = len(word_map_2)

In [270]:
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
for word, idx in word_map.items():
    if word in word2vec:
        word_embedding = word2vec[word]
        embedding_matrix[idx,:] = np.concatenate((word_embedding, [0])) #final character 0 means lowercase
    elif word.lower() in word2vec: #Attempt to solve case insensitive
        word_embedding = word2vec[word.lower()]
        embedding_matrix[idx,:] = np.concatenate((word_embedding, [1])) #final character 1 means uppercase
    else:
        continue

embedding_blstm2 = nn.Embedding(VOCAB_SIZE, EMBEDDING_DIM) 
embedding_blstm2.load_state_dict({"weight": torch.tensor(embedding_matrix)})

<All keys matched successfully>

In [239]:
### Have to fix padding from before:
#Format the data by sentences TRAIN:
def format_data_glove(df):
    train_formatted = []
    #init beginning:
    first_word = df.iloc[0]
    sentence_x = [first_word["word"]]
    sentence_y = [first_word["ner_tag"]]
    
    for row in df.iloc[1:].iterrows():
        #print(row)
        if row[1]["index"] == 1:
            #print(row[1]["word"])
            train_formatted.append([sentence_x, sentence_y])

            sentence_x, sentence_y = [], []
            sentence_x.append(row[1]["word"])
            sentence_y.append(row[1]["ner_tag"])
            if row[0] == (df.shape[0]-1):
                train_formatted.append([sentence_x, sentence_y])
        else:
            sentence_x.append(row[1]["word"])
            sentence_y.append(row[1]["ner_tag"])
    return train_formatted

In [287]:
#Format the data by sentences TEST:
def format_data_test_glove(df):
    test_formatted = []
    #init beginning:
    first_word = df.iloc[0]
    sentence_x = [first_word["word_formatted"]]
    
    for row in df.iloc[1:].iterrows():
        if row[1]["index"] == 1:
            test_formatted.append(sentence_x)

            sentence_x = []
            sentence_x.append(row[1]["word_formatted"])
            if row[0] == (df.shape[0]-1):
                test_formatted.append(sentence_x)
        else:
            sentence_x.append(row[1]["word_formatted"])
    
    return test_formatted

In [244]:
train_formatted_glove = format_data_glove(train_df)
dev_formatted_glove = format_data_glove(dev_df)

In [281]:
def replace_unk_test_glove(word):
    if word in word_map_2:
        return word
    else:
        return "<unk>"

In [288]:
#Format the test_data Glove:
test_df["word_formatted"] = test_df["word"].apply(replace_unk_test_glove)
test_df["word_formatted"] = test_df["word_formatted"].astype(str)

test_formatted_glove = format_data_test_glove(test_df)

In [279]:
#Map words in sentences to corresponding values:
def pad_sentences_glove(sentences_formatted):
    train_padded = []
    cnt = 0
    for sentence in sentences_formatted:
        word_lst = sentence[0]
        ner_lst = sentence[1]
        mapped_word_lst, mapped_ner_lst = [], []
        cnt += len(word_lst)
        for word in word_lst:
            mapped_word_lst.append(word_map_2[word])
        for ner in ner_lst:
            mapped_ner_lst.append(ner_map[ner])

        word_cnt = len(mapped_word_lst)
        diff_ = longest_train_sent - word_cnt
        mapped_word_lst = mapped_word_lst + [0] * diff_
        mapped_ner_lst = mapped_ner_lst + [-1] * diff_

        train_padded.append([mapped_word_lst, mapped_ner_lst])
    print(cnt)
    return train_padded

In [280]:
train_padded_glove = pad_sentences_glove(train_formatted_glove)
dev_padded_glove = pad_sentences_glove(dev_formatted_glove)

204567
51578


In [289]:
#Map words in sentences to corresponding values:
def pad_test_sentences_glove(sentences_formatted):
    test_padded = []
    cnt = 0
    for sentence in sentences_formatted:
        mapped_word_lst = []
        cnt += len(sentence)
        for word in sentence:
            mapped_word_lst.append(word_map_2[word])

        word_cnt = len(mapped_word_lst)
        diff_ = longest_train_sent - word_cnt
        mapped_word_lst = mapped_word_lst + [0] * diff_

        test_padded.append(mapped_word_lst)
    print(cnt)
    return test_padded

In [290]:
test_padded_glove = pad_test_sentences_glove(test_formatted_glove)

46666


In [291]:
class BLSTM_2(nn.Module):
    
    def __init__(self, embeddings):
        super().__init__()
        
        lstm_hidden_dim = 256
        lstm_num_layers = 1
        linear_output_dim =128
        output_dim = 10
        
        self.embeddings = embeddings
        self.lstm = nn.LSTM(input_size=101, hidden_size=256,
                          num_layers=1, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(0.33)
        self.linear1 = nn.Linear(512, 128)
        self.linear2 = nn.Linear(128, 9)
        self.elu = nn.ELU()
        
    def forward(self, inputs):
        #print(inputs.shape)
        embeds = self.embeddings(inputs)
        #print(embeds.shape)
        lstm_out, self.hidden = self.lstm(embeds.view(len(inputs), 1, -1))
        lstm_out_dropped = self.dropout(lstm_out)
        out = self.linear1(lstm_out_dropped.view(len(inputs), -1))
        #linear_out_dropped = self.dropout(out)
        #l2_out = self.linear2(linear_out_dropped)
        elu_out = self.elu(out)
        l2_out = self.linear2(elu_out)
        log_probs = F.log_softmax(l2_out, dim=1)
        return log_probs


In [307]:
EMBEDDING_DIM = 101
VOCAB_SIZE = len(word_map_2)
n_epochs = 10
trainloader = torch.utils.data.DataLoader(train_padded_glove, batch_size=12, num_workers=1)
devloader = torch.utils.data.DataLoader(dev_padded_glove, batch_size=12, num_workers=1)
blstm2 = BLSTM_2(embedding_blstm2)
criterion = nn.CrossEntropyLoss(ignore_index=-1, size_average=True) #therefore no need for softmax
#criterion = nn.NLLLoss()
# optimizer = torch.optim.Adam(mlp.parameters(), lr=1e-4)
optimizer = torch.optim.SGD(blstm2.parameters(), lr=0.01, momentum=0.7)
#scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

test_loss_min = 10000

for epoch in range(n_epochs):
    #scheduler.step()
    print('Epoch-{0} lr: {1}'.format(epoch, optimizer.param_groups[0]['lr']))
    train_loss = 0
    test_loss = 0
    
    blstm2.train()
    for data, target in trainloader:
        # Zero the gradients
        optimizer.zero_grad()

        # Perform forward pass
#         print(data)
#         print(target)
        #print(torch.cat(data,dim=0).reshape(1,400,316))
        #print(torch.cat(data,dim=0).size(0)) I think the problem is here.
        output = blstm2(torch.cat(data,dim=0))

        # Compute loss
        loss = criterion(output, torch.cat(target,dim=0))

        # Perform backward pass
        loss.backward()

        # Perform optimization
        optimizer.step()

                # Print statistics
        #train_loss += loss.item()*torch.cat(data,dim=0).size(0)
        train_loss += loss
        
    with torch.no_grad():
        for data, target in devloader:
            output = blstm2(torch.cat(data,dim=0))
            loss = criterion(output, torch.cat(target,dim=0))
#             test_loss += loss.item()*torch.cat(data,dim=0).size(0)
            test_loss += loss
    train_loss = train_loss/len(trainloader.dataset)
    test_loss = test_loss/len(devloader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tTest Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        test_loss
        ))
    
    if test_loss <= test_loss_min:
        torch.save(blstm2.state_dict(), 'blstm2.pt')
        test_loss_min = test_loss

  # Process is complete.
print('All done.')

Epoch-0 lr: 0.001
Epoch: 1 	Training Loss: 0.099626 	Test Loss: 0.071618
Epoch-1 lr: 0.001
Epoch: 2 	Training Loss: 0.068658 	Test Loss: 0.066757
Epoch-2 lr: 0.001
Epoch: 3 	Training Loss: 0.062385 	Test Loss: 0.060342
Epoch-3 lr: 0.001
Epoch: 4 	Training Loss: 0.054688 	Test Loss: 0.053673
Epoch-4 lr: 0.001
Epoch: 5 	Training Loss: 0.047851 	Test Loss: 0.049047
Epoch-5 lr: 0.001
Epoch: 6 	Training Loss: 0.043159 	Test Loss: 0.046144
Epoch-6 lr: 0.001
Epoch: 7 	Training Loss: 0.040122 	Test Loss: 0.044400
Epoch-7 lr: 0.001
Epoch: 8 	Training Loss: 0.038091 	Test Loss: 0.043286
Epoch-8 lr: 0.001


KeyboardInterrupt: 

In [308]:
# Load in the best model from the given run:
blstm2.load_state_dict(torch.load('blstm2.pt'))

<All keys matched successfully>

In [309]:
#predict on dev:
devloader = torch.utils.data.DataLoader(dev_padded_glove, batch_size=1, num_workers=1) #need to do 1 at a time:
predictions_dev = predict(blstm2, devloader)
predictions_dev = unravel_predictions(dev_padded, predictions_dev)
predictions_dev = convert_predictions(predictions_dev)

In [310]:
#Dev accuracy:
y_true = np.array(dev_df["ner_tag"])
print(len(y_true))
print(len(predictions_dev))
print("Dev Accuracy:", accuracy(y_true, predictions_dev))

51578
51578
Dev Accuracy: 0.8637597425258832


In [301]:
write_results("dev2.out", y_true, predictions_dev, dev_df)

In [302]:
#predict on dev:
testloader = torch.utils.data.DataLoader(test_padded_glove, batch_size=1, num_workers=1) #need to do 1 at a time:
predictions_test = predict_test(blstm2, testloader)
predictions_test = unravel_predictions_test(test_padded, predictions_test)
predictions_test = convert_predictions(predictions_test)

In [303]:
write_results_test("test2.out", predictions_test, test_df)

In [None]:
#RUN Command line: