In [376]:
import numpy as np
import pandas as pd
import re
import nltk
import sklearn
import warnings
from platform import python_version
print(python_version())

3.9.5


In [377]:
### Read in data:
tr_headers = ["index", "word", "ner_tag"]
train_df = pd.read_csv("./data/train", sep=' ', header=None, quoting=3)
train_df.columns = tr_headers

dev_df = pd.read_csv("./data/dev", sep=' ', header=None, quoting=3)
dev_df.columns = tr_headers

test_headers = ["index", "word"]
test_df = pd.read_csv("./data/test", sep=' ', header=None, engine='python', error_bad_lines=False, quoting=3)
test_df.columns = test_headers

In [378]:
train_df.iloc[70:76]

Unnamed: 0,index,word,ner_tag
70,28,advice,O
71,29,was,O
72,30,clearer,O
73,31,.,O
74,1,"""",O
75,2,We,O


In [379]:
train_df.head(15)

Unnamed: 0,index,word,ner_tag
0,1,EU,B-ORG
1,2,rejects,O
2,3,German,B-MISC
3,4,call,O
4,5,to,O
...,...,...,...
10,2,Blackburn,I-PER
11,1,BRUSSELS,B-LOC
12,2,1996-08-22,O
13,1,The,O


In [380]:
# Slight cleaning on num:
train_df["word"] = train_df["word"].str.replace(r'^\d+|.\d+$', "<num>", regex=True)
dev_df["word"] = dev_df["word"].str.replace(r'^\d+|.\d+$', "<num>", regex=True)
test_df["word"] = test_df["word"].str.replace(r'^\d+|.\d+$', "<num>", regex=True)

In [381]:
# Get the count of each word:
#word-type = word
cnt_d = {}
for row in train_df.iterrows():
    if row[1]["word"] in cnt_d:
        cnt_d[row[1]["word"]] += 1
    else:
        cnt_d[row[1]["word"]] = 1

In [382]:
threshold = 2 #No threshold = 1
#unknown_cnt = 0
unknown_word_lst = []   #We want to keep track of unknown words but group together
for k, v in cnt_d.items():
    if v < threshold:
        #unknown_cnt += v
        unknown_word_lst.append(k)
    else:
        continue

In [383]:
def replace_unk_train(word):
    if word in unknown_word_lst:
        return "<unk>"
    else:
        return word

In [384]:
def replace_unk_dev(word):
    if word in unknown_word_lst:
        return "<unk>"
    elif word not in train_words:
        return "<unk>"
    else:
        return word

In [385]:
#Replace with <unk> Train:
train_df["word"] = train_df["word"].apply(replace_unk_train)

In [386]:
train_df["word"] = train_df["word"].astype(str)
train_vocab_size = len(np.unique(train_df["word"]))
train_words = np.unique(train_df["word"])

In [387]:
#Replace with <unk> Dev:
dev_df["word"] = dev_df["word"].apply(replace_unk_dev)
dev_df["word"] = dev_df["word"].astype(str)

In [546]:
dev_df

Unnamed: 0,index,word,ner_tag
0,1,CRICKET,O
1,2,-,O
2,3,<unk>,B-ORG
3,4,TAKE,O
4,5,OVER,O
...,...,...,...
51573,1,--,O
51574,2,Dhaka,B-ORG
51575,3,Newsroom,I-ORG
51576,4,<num>-2<num>,O


In [567]:
#Format the data by sentences TRAIN:
def format_data(df):
    train_formatted = []
    #init beginning:
    first_word = df.iloc[0]
    sentence_x = [first_word["word"]]
    sentence_y = [first_word["ner_tag"]]
    
    for row in df.iloc[1:].iterrows():
        #print(row)
        if row[1]["index"] == 1:
            #print(row[1]["word"])
            train_formatted.append([sentence_x, sentence_y])

            sentence_x, sentence_y = [], []
            sentence_x.append(row[1]["word"])
            sentence_y.append(row[1]["ner_tag"])
            if row[0] == (df.shape[0]-1):
                train_formatted.append([sentence_x, sentence_y])
        else:
            sentence_x.append(row[1]["word"])
            sentence_y.append(row[1]["ner_tag"])
    return train_formatted

In [571]:
#Format the data by sentences TEST:
def format_data_test(df):
    test_formatted = []
    #init beginning:
    first_word = df.iloc[0]
    sentence_x = [first_word["word_formatted"]]
    
    for row in df.iloc[1:].iterrows():
        if row[1]["index"] == 1:
            test_formatted.append(sentence_x)

            sentence_x = []
            sentence_x.append(row[1]["word_formatted"])
            if row[0] == (df.shape[0]-1):
                train_formatted.append([sentence_x, sentence_y])
        else:
            sentence_x.append(row[1]["word_formatted"])
    
    return test_formatted

In [568]:
train_formatted = format_data(train_df)
dev_formatted = format_data(dev_df)

In [624]:
#Create Word Map for vocab:
word_map = {"<pad>":0}
for i, word in enumerate(set(train_df["word"])):
    word_map[word] = i+1
word_map

{'<pad>': 0,
 'WED': 1,
 'Piotti': 2,
 'Obilic': 3,
 'Leander': 4,
 'Feldhoff': 5,
 '+1<num>': 6,
 'researchers': 7,
 'alliance': 8,
 '<num>-0-14<num>': 9,
 'nomination': 10,
 'traffic': 11,
 'companies': 12,
 'soldier': 13,
 'remain': 14,
 'monitoring': 15,
 'bike': 16,
 'SHEFFIELD': 17,
 'Drobnjak': 18,
 'brains': 19,
 'Parliament': 20,
 'Chemical': 21,
 'tracks': 22,
 'Hlasek': 23,
 'safety': 24,
 'population': 25,
 'Ryutaro': 26,
 'Berasategui': 27,
 'Federal': 28,
 'Estates': 29,
 'Granic': 30,
 'examined': 31,
 'Hintsa': 32,
 'separatist': 33,
 'Borussia': 34,
 'understand': 35,
 'Glyn': 36,
 'regulation': 37,
 'COLOMBO': 38,
 'idea': 39,
 'recovered': 40,
 'registering': 41,
 'flooding': 42,
 'Srinath': 43,
 'did': 44,
 'Andy': 45,
 'Rwanda': 46,
 'brokers': 47,
 'guns': 48,
 'sold': 49,
 'conviction': 50,
 'Witschge': 51,
 'semifinal': 52,
 'Dean': 53,
 'Wednesday': 54,
 'holes': 55,
 'Sint': 56,
 'following': 57,
 'sex': 58,
 'Food': 59,
 'REASONS': 60,
 'Gloucestershire': 61,

In [626]:
#Create Word Map for ner_tag:
ner_map = {"<pad>":-1}
for i, word in enumerate(set(train_df["ner_tag"])):
    ner_map[word] = i
ner_map

{'<pad>': -1,
 'B-PER': 0,
 'I-ORG': 1,
 'I-PER': 2,
 'I-LOC': 3,
 'O': 4,
 'B-MISC': 5,
 'I-MISC': 6,
 'B-ORG': 7,
 'B-LOC': 8}

In [636]:
ner_map_without_pad = {}
for i, word in enumerate(set(train_df["ner_tag"])):
    ner_map_without_pad[word] = i
ner_map_without_pad

{'B-PER': 0,
 'I-ORG': 1,
 'I-PER': 2,
 'I-LOC': 3,
 'O': 4,
 'B-MISC': 5,
 'I-MISC': 6,
 'B-ORG': 7,
 'B-LOC': 8}

In [393]:
longest_train_sent = 0
for sentence in train_formatted:
    sentence_len = len(sentence[0])
    if sentence_len > longest_train_sent:
        longest_train_sent = sentence_len

In [524]:
longest_train_sent

113

In [627]:
#Map words in sentences to corresponding values:
def pad_sentences(sentences_formatted):
    train_padded = []
    cnt = 0
    for sentence in sentences_formatted:
        word_lst = sentence[0]
        ner_lst = sentence[1]
        mapped_word_lst, mapped_ner_lst = [], []
        cnt += len(word_lst)
        for word in word_lst:
            mapped_word_lst.append(word_map[word])
        for ner in ner_lst:
            mapped_ner_lst.append(ner_map[ner])

        word_cnt = len(mapped_word_lst)
        diff_ = longest_train_sent - word_cnt
        mapped_word_lst = mapped_word_lst + [0] * diff_
        mapped_ner_lst = mapped_ner_lst + [-1] * diff_

        train_padded.append([mapped_word_lst, mapped_ner_lst])
    print(cnt)
    return train_padded

In [628]:
train_padded = pad_sentences(train_formatted)
dev_padded = pad_sentences(dev_formatted)

204567
51578


In [605]:
#Map words in sentences to corresponding values:
def pad_test_sentences(sentences_formatted):
    test_padded = []
    for sentence in sentences_formatted:
        mapped_word_lst = []
        for word in sentence:
            mapped_word_lst.append(word_map[word])

        word_cnt = len(mapped_word_lst)
        diff_ = longest_train_sent - word_cnt
        mapped_word_lst = mapped_word_lst + [0] * diff_

        test_padded.append(mapped_word_lst)
    return test_padded

### Task 1: Simple Bidirectional LSTM Model:

In [406]:
len(dev_padded)

3465

In [400]:
len(train_padded)

14986

In [287]:
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from torch import nn

In [673]:
class BLSTM(nn.Module):
    
    def __init__(self, vocab_size):
        super().__init__()
        
        lstm_hidden_dim = 256
        lstm_num_layers = 1
        linear_output_dim =128
        output_dim = 10
        
        self.embeddings = nn.Embedding(vocab_size, 100)
        self.lstm = nn.LSTM(input_size=100, hidden_size=256,
                          num_layers=1, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(0.33)
        self.linear1 = nn.Linear(512, 128)
        self.linear2 = nn.Linear(128, 9)
        self.elu = nn.ELU()
        
    def forward(self, inputs):
        #print(inputs.shape)
        embeds = self.embeddings(inputs)
        #print(embeds.shape)
        lstm_out, self.hidden = self.lstm(embeds.view(len(inputs), 1, -1))
        lstm_out_dropped = self.dropout(lstm_out)
        out = self.linear1(lstm_out_dropped.view(len(inputs), -1))
        linear_out_dropped = self.dropout(out)
        #l2_out = self.linear2(linear_out_dropped)
        elu_out = self.elu(linear_out_dropped)
        l2_out = self.linear2(elu_out)
        log_probs = F.log_softmax(l2_out, dim=1)
        return log_probs


In [674]:
EMBEDDING_DIM = 100
VOCAB_SIZE = train_vocab_size+1 #added <pad> word
n_epochs = 5
trainloader = torch.utils.data.DataLoader(train_padded, batch_size=64, num_workers=1)
devloader = torch.utils.data.DataLoader(dev_padded, batch_size=64, num_workers=1)
blstm = BLSTM(VOCAB_SIZE)
criterion = nn.CrossEntropyLoss(ignore_index=-1, size_average=True) #therefore no need for softmax
#criterion = nn.NLLLoss()
# optimizer = torch.optim.Adam(mlp.parameters(), lr=1e-4)
optimizer = torch.optim.SGD(blstm.parameters(), lr=0.1, momentum=0.9)

test_loss_min = 10000

for epoch in range(n_epochs):
    train_loss = 0
    test_loss = 0
    
    blstm.train()
    for data, target in trainloader:
        # Zero the gradients
        optimizer.zero_grad()

        # Perform forward pass
#         print(data)
#         print(target)
        #print(torch.cat(data,dim=0).reshape(1,400,316))
        #print(torch.cat(data,dim=0).size(0)) I think the problem is here.
        output = blstm(torch.cat(data,dim=0))

        # Compute loss
        loss = criterion(output, torch.cat(target,dim=0))

        # Perform backward pass
        loss.backward()

        # Perform optimization
        optimizer.step()

                # Print statistics
        #train_loss += loss.item()*torch.cat(data,dim=0).size(0)
        train_loss += loss
        
    with torch.no_grad():
        for data, target in devloader:
            output = blstm(torch.cat(data,dim=0))
            loss = criterion(output, torch.cat(target,dim=0))
#             test_loss += loss.item()*torch.cat(data,dim=0).size(0)
            test_loss += loss
    train_loss = train_loss/len(trainloader.dataset)
    test_loss = test_loss/len(devloader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tTest Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        test_loss
        ))
    
    if test_loss <= test_loss_min:
        torch.save(blstm.state_dict(), 'blstm1.pt')
        test_loss_min = test_loss

  # Process is complete.
print('All done.')

Epoch: 1 	Training Loss: 0.013137 	Test Loss: 0.012251
Epoch: 2 	Training Loss: 0.011642 	Test Loss: 0.011503
Epoch: 3 	Training Loss: 0.010595 	Test Loss: 0.010200
Epoch: 4 	Training Loss: 0.009587 	Test Loss: 0.009344
Epoch: 5 	Training Loss: 0.008823 	Test Loss: 0.008757
All done.


In [491]:
#Format the test_data:
test_df["word_formatted"] = test_df["word"].apply(replace_unk_dev)
test_df["word_formatted"] = test_df["word_formatted"].astype(str)

test_formatted = format_data_test(test_df)
test_padded = pad_test_sentences(test_formatted)

In [675]:
blstm.load_state_dict(torch.load('blstm1.pt'))
testloader = torch.utils.data.DataLoader(test_padded[0:100], batch_size=1, num_workers=1)

In [497]:
# Calculate Accuracy from trained model:
def predict_test(model, dataloader):
    prediction_list = []
    with torch.no_grad():
        for data in dataloader:
            output = model(torch.cat(data,dim=0))
            _, predicted = torch.max(output.data, 1) 
            prediction_list.append(predicted)
    return prediction_list

In [496]:
# Calculate Accuracy from trained model:
def predict(model, dataloader):
    prediction_list = []
    with torch.no_grad():
        for data, target in dataloader:
            output = model(torch.cat(data,dim=0))
            _, predicted = torch.max(output.data, 1) 
            prediction_list.append(predicted)
    return prediction_list

In [498]:
def unravel_predictions_test(data, pred):
    overall_pred = []
    for i, sentence in enumerate(data):
        non_padded_pred = len(np.nonzero(sentence)[0])
        pred_i = pred[i].tolist()[0:non_padded_pred]
        overall_pred.append(pred_i)
    return overall_pred

In [523]:
def unravel_predictions(data, pred):
    overall_pred = []
    for i, sentence in enumerate(data):
        actual_sentence = sentence[0]
        non_padded_pred = len(np.nonzero(actual_sentence)[0])
        pred_i = pred[i].tolist()[0:non_padded_pred]
        overall_pred.append(pred_i)
    return overall_pred

In [637]:
def convert_predictions(pred):
    overall_pred = []
    for sentence in pred:
        for idx in sentence:
            overall_pred.append(list(ner_map_without_pad.keys())[idx])
    return overall_pred

In [501]:
def accuracy(y_true, y_pred):
    score = sum(y_true == y_pred)/len(y_pred)
    return score

In [676]:
#predict on dev:
devloader = torch.utils.data.DataLoader(dev_padded, batch_size=1, num_workers=1) #need to do 1 at a time:
predictions_dev = predict(blstm, devloader)
predictions_dev = unravel_predictions(dev_padded, predictions_dev)
predictions_dev = convert_predictions(predictions_dev)

In [669]:
predictions_dev

['I-LOC',
 'O',
 'I-ORG',
 'I-LOC',
 'O',
 'O',
 'O',
 'O',
 'B-PER',
 'B-LOC',
 'O',
 'B-LOC',
 'O',
 'B-MISC',
 'O',
 'O',
 'B-LOC',
 'I-ORG',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-PER',
 'O',
 'B-MISC',
 'I-PER',
 'B-ORG',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-ORG',
 'O',
 'B-LOC',
 'O',
 'B-ORG',
 'B-LOC',
 'O',
 'B-MISC',
 'O',
 'O',
 'B-MISC',
 'I-LOC',
 'O',
 'O',
 'B-LOC',
 'O',
 'O',
 'I-LOC',
 'B-LOC',
 'B-LOC',
 'B-LOC',
 'B-ORG',
 'O',
 'I-ORG',
 'B-PER',
 'O',
 'B-ORG',
 'B-ORG',
 'O',
 'O',
 'O',
 'B-PER',
 'B-ORG',
 'B-LOC',
 'O',
 'O',
 'O',
 'I-LOC',
 'B-LOC',
 'O',
 'B-LOC',
 'I-LOC',
 'O',
 'O',
 'O',
 'B-MISC',
 'B-ORG',
 'B-MISC',
 'B-LOC',
 'B-ORG',
 'O',
 'O',
 'O',
 'B-ORG',
 'O',
 'O',
 'O',
 'B-PER',
 'O',
 'B-MISC',
 'B-LOC',
 'O',
 'B-PER',
 'B-ORG',
 'O',
 'B-MISC',
 'B-PER',
 'B-LOC',
 'I-PER',
 'O',
 'B-ORG',
 'O',
 'B-ORG',
 'O',
 'B-LOC',
 'O',
 'O',
 'O',
 'O',
 'B-MISC',
 'B-LOC',
 'B-LOC',
 'O',
 'B-PER',
 'O',
 'O',
 'I-LOC',
 'O',
 'O',
 'B-MISC',
 '

In [677]:
y_true = np.array(dev_df["ner_tag"])
print(len(y_true))
print(len(predictions_dev))
print("Dev Accuracy:", accuracy(y_true, predictions_dev))

51578
51578
Dev Accuracy: 0.8618209314048625


In [678]:
def write_results(name, y_true, y_pred, df):
    with open(name, 'w') as f:
        for row in df.iloc[0:].iterrows():
            f.write(str(row[1]["index"]))
            f.write(" ")
            f.write(row[1]["word"])
            f.write(" ")
            f.write(y_true[row[0]])
            f.write(" ")
            f.write(y_pred[row[0]])
            f.write("\n")

In [679]:
write_results("dev.out", y_true, predictions_dev, dev_df)

In [504]:
#predict on test:
predictions = predict_test(blstm, testloader)
predictions = unravel_predictions_test(test_padded[0:100], predictions)
predictions = convert_predictions(predictions)

In [518]:
len(predictions)

1395

In [469]:
ner_map.keys()

dict_keys(['<pad>', 'B-PER', 'I-ORG', 'I-PER', 'I-LOC', 'O', 'B-MISC', 'I-MISC', 'B-ORG', 'B-LOC'])

In [470]:
list(ner_map.keys())

['<pad>',
 'B-PER',
 'I-ORG',
 'I-PER',
 'I-LOC',
 'O',
 'B-MISC',
 'I-MISC',
 'B-ORG',
 'B-LOC']

In [472]:
convert_predictions(predictions[1:2])

['O', 'O']

In [439]:
predictions[0].tolist()

[5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 0,
 5,
 5,
 5,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [331]:
predictions

[tensor([4, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 tensor([5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 tensor([5, 4, 8, 5, 8, 8, 5, 8, 5, 8, 5, 5, 8, 5, 8, 8, 5, 5, 5, 8, 8, 5, 8, 8,
         8, 5, 4, 5, 8, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [323]:
d = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
len(d)

113