## Download dataset from Google Drive

In [None]:
# Download the dataset archive from Google Drive into the Colab runtime.
# This cell relies on google.colab for interactive authentication.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
# Drive file id of the archive fetched below.
# NOTE(review): `id` shadows the Python builtin of the same name.
id = '1fto6uicrRohYAp-Yo9gAAf_27t2btCp6'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('2020-comp5046-a2.zip')
# Extract the archive into /content/, discarding unzip's console output.
!unzip -d /content/ /content/2020-comp5046-a2.zip > /dev/null

In [None]:
# Number of lines in each split file.
!cat train.csv | wc -l
!cat val.csv | wc -l
!cat test.csv | wc -l
# Peek at the first lines of the test set and of the sample submission.
!head -n 10 test.csv
!head -n 10 sample\ submission.csv

3001
701
3685
Sentence,NER
-docstart-,
"soccer - japan get lucky win , china in surprise defeat .",
nadim ladki,
"al-ain , united arab emirates 1996-12-06",
japan began the defence of their asian cup title with a lucky 2-1 win against syria in a group c championship match on friday .,
"but china saw their luck desert them in the second match of the group , crashing to a surprise 2-0 defeat to newcomers uzbekistan .",
china controlled most of the match and saw several chances missed until the 78th minute when uzbek striker igor shkvyrin took advantage of a misdirected defensive header to lob the ball over the advancing chinese keeper and into an empty net .,
"oleg shatskiku made sure of the win in injury time , hitting an unstoppable left foot shot from just outside the area .",
the former soviet republic was playing in an asian cup finals tie for the first time .,
Id,Predicted
0,O
1,O
2,O
3,O
4,O
5,O
6,O
7,O
8,O


In [None]:
import pandas as pd
# Read the three splits with identical parser settings (first row is the header).
train_df, val_df, test_df = (
    pd.read_csv(csv_name, header=0, sep=',')
    for csv_name in ('train.csv', 'val.csv', 'test.csv')
)
train_df

Unnamed: 0,Sentence,NER
0,-docstart-,O
1,eu rejects german call to boycott british lamb .,I-ORG O I-MISC O O O I-MISC O O
2,peter blackburn,I-PER I-PER
3,brussels 1996-08-22,I-LOC O
4,the european commission said on thursday it di...,O I-ORG I-ORG O O O O O O I-MISC O O O O O I-M...
...,...,...
2995,hovercrafts will soon be plying the waters of ...,O O O O O O O O O I-LOC O O O O O O O O O O O ...
2996,"two russian-built hovercrafts , capable of car...",O I-MISC O O O O O O O O O O O O O O O O O O O...
2997,the use of riverways in the region has been ma...,O O O O O O O O O O O O O O O O O O I-LOC O O ...
2998,-docstart-,O


In [None]:
def read_data(sentences,ners=None):
  """Split sentences (and optionally their label strings) on single spaces.

  Args:
    sentences: iterable of strings, one sentence per item.
    ners: optional iterable of label strings, one per sentence. Pass None
      (the default) when no labels are available, e.g. for the test split.

  Returns:
    (input_data, target_data): input_data is a list of token lists;
    target_data is a list of label lists, or None when `ners` is None.
  """
  input_data = [sent.split(' ') for sent in sentences]
  # Test against None explicitly: an empty label list is still "labels given"
  # and must yield [] rather than None.
  target_data = None if ners is None else [ner.split(' ') for ner in ners]
  return input_data,target_data

# Tokenise every split; the test split is read without labels.
train_data, target_y_train = read_data(train_df['Sentence'].to_list(), train_df['NER'].to_list())
validation_data, target_y_validation = read_data(val_df['Sentence'].to_list(), val_df['NER'].to_list())
test_data, _ = read_data(test_df['Sentence'].to_list())

In [None]:
# Word vocabulary over all three splits; ids are assigned in order of first
# appearance of the lower-cased token.
word_to_ix = {}
for sentence in train_data + validation_data + test_data:
    for word in sentence:
        word_to_ix.setdefault(word.lower(), len(word_to_ix))
word_list = list(word_to_ix)

# Tag vocabulary: the two CRF boundary tags take ids 0 and 1, followed by
# every label seen in the training and validation targets.
START_TAG = "<START>"
STOP_TAG = "<STOP>"
tag_to_ix = {START_TAG: 0, STOP_TAG: 1}
for tags in target_y_train + target_y_validation:
    for tag in tags:
        tag_to_ix.setdefault(tag, len(tag_to_ix))

## Pretrained word embedding (GloVe)

In [None]:
import gensim.downloader as api
# Load pretrained word vectors through gensim's downloader; the model name
# indicates GloVe vectors (wiki + gigaword, 300 dimensions).
word_emb_model = api.load("glove-wiki-gigaword-300") 



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
import numpy as np
EMBEDDING_DIM = word_emb_model.vector_size
# One shared random vector stands in for every word that the pretrained
# model does not contain.
oov = np.random.uniform(-0.25, 0.25, EMBEDDING_DIM).round(6)
word_embedding_matrix = []
for word in word_list:
    try:
        word_embedding_matrix.append(word_emb_model.get_vector(word))
    except KeyError:
        # Only a missing vocabulary entry falls back to the OOV vector; any
        # other failure should surface instead of being silently swallowed.
        word_embedding_matrix.append(oov)
# Row i holds the vector of word_list[i].
word_embedding_matrix = np.array(word_embedding_matrix)
word_embedding_matrix.shape

(13972, 300)

In [None]:
def to_index(data, to_ix):
    """Map every item of every sequence in `data` through the lookup `to_ix`.

    Returns a list of lists with the same nesting as `data`. A KeyError is
    raised for any item that is missing from `to_ix`.
    """
    return [[to_ix[item] for item in sequence] for sequence in data]

# Convert tokens to word ids and labels to tag ids for each split.
# The test split has no labels, so only its inputs are indexed.
# NOTE(review): word_to_ix is keyed by lower-cased tokens while the lookup here
# uses the tokens as-is; this presumably relies on the data being lower-case.
train_input_index =  to_index(train_data,word_to_ix)
train_output_index = to_index(target_y_train,tag_to_ix)
val_input_index = to_index(validation_data,word_to_ix)
val_output_index = to_index(target_y_validation,tag_to_ix)
test_input_index = to_index(test_data,word_to_ix)

## PoS Tag

In [None]:
import nltk
from nltk import word_tokenize
from nltk.corpus import treebank,brown

# Fetch the NLTK resources in the same order as before.
for resource in ('punkt', 'averaged_perceptron_tagger', 'treebank', 'universal_tagset', 'brown'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


In [None]:
# PoS-tag every sentence of every split with NLTK's default tagger.
train_tag = [[tag for _,tag in nltk.pos_tag(sentence)] for sentence in train_data]
validation_tag = [[tag for _,tag in nltk.pos_tag(sentence)] for sentence in validation_data]
test_tag = [[tag for _,tag in nltk.pos_tag(sentence)] for sentence in test_data]
# Distinct tags observed across all splits.
tag_list = set(tag for tags in train_tag + validation_tag + test_tag for tag in tags)

# Number the tags in sorted order. Iterating the set directly yields an order
# that can change between interpreter sessions (string hash randomisation),
# which would silently change the meaning of each one-hot column and make
# previously saved models inconsistent with a freshly built index.
pos_tag_to_idx = {t: i for i, t in enumerate(sorted(tag_list))}

In [None]:
import torch.nn as nn
class PoSTagger(nn.Module):
    """Parameter-free module turning word ids into one-hot PoS-tag features.

    The word ids are mapped back to strings through the notebook-level
    `word_list`, tagged with `nltk.pos_tag`, and each tag is one-hot encoded
    according to the tag -> column mapping given at construction.
    """
    def __init__(self, pos_tag_idx):
        """pos_tag_idx: dict mapping a PoS tag string to its one-hot column."""
        super(PoSTagger, self).__init__()
        self.tag_2_index = pos_tag_idx

    def tag_to_one_hot(self,tag):
        """Return a 1-D numpy vector with a single 1 at the column of `tag`.

        Raises KeyError when `tag` is not in the mapping.
        """
        one_hot = np.zeros(len(self.tag_2_index))
        one_hot[self.tag_2_index[tag]] = 1
        return one_hot

    def forward(self,x):
        """Return a float tensor of shape (len(x), number_of_tags).

        x: 1-D tensor of word ids (indices into `word_list`).
        The result is placed on the same device as `x`, so it can be
        concatenated with the other embeddings when the model runs on GPU.
        """
        # Remember where the input lives before it is copied to the CPU.
        source_device = x.device
        tokens = [word_list[idx] for idx in x.cpu().numpy().tolist()]
        # Pre-sizing the matrix keeps the (0, n_tags) shape for empty input.
        pos_tag_one_hot = np.zeros((len(tokens), len(self.tag_2_index)))
        for row, (_, tag) in enumerate(nltk.pos_tag(tokens)):
            pos_tag_one_hot[row, self.tag_2_index[tag]] = 1
        return torch.from_numpy(pos_tag_one_hot).float().to(source_device)

## Character embedding

In [None]:
# Sorted list of every distinct character occurring in any split.
char_arr = sorted({char for words in train_data + validation_data + test_data for char in ''.join(words)})
# Character -> integer id; ids start at 1 so that 0 stays free for padding.
num_dic = dict(zip(char_arr, range(1, len(char_arr) + 1)))
num_dic['P'] = 0 #encoding for padding
dic_len = len(num_dic)

In [None]:
from collections import Counter
# (word length, number of vocabulary words with that length), shortest first.
word_length_cnt = sorted(Counter(map(len, word_list)).items())
total_amt = 0
ratio = 0.9
# Keep adding length buckets while the covered share of the vocabulary is
# still <= ratio. max_word_len is the last *length* that was added, not the
# position of its bucket, so gaps in the observed lengths (or lengths that do
# not start at 1) cannot skew it. An empty vocabulary yields 0.
max_word_len = 0
for length, amt in word_length_cnt:
    if total_amt / len(word_list) > ratio:
        break
    total_amt += amt
    max_word_len = length
max_word_len

10

In [None]:
def add_padding(word):
  """Return `word` cut or right-padded with 'P' to exactly max_word_len characters."""
  clipped = word[:max_word_len]
  return clipped.ljust(max_word_len, 'P')
def make_batch(seq_data):
    """Build (input, target) pairs for the character-level model.

    Args:
        seq_data: list of word strings.

    Returns:
        (input_batch, target_batch):
        input_batch[i] is the one-hot matrix of the padded/truncated
        characters of seq_data[i] (one row per character, dic_len columns);
        target_batch[i] is the row of word_embedding_matrix at the index of
        the first occurrence of seq_data[i] in seq_data.
    """
    # Row c of the identity matrix is the one-hot vector of character id c,
    # e.g. ids [3, 4, 4] select rows 3, 4 and 4. Built once, not per word.
    one_hot_rows = np.eye(dic_len)

    # First position of each word: same result as seq_data.index(seq) but
    # without rescanning the list for every word (which was quadratic).
    first_position = {}
    for position, seq in enumerate(seq_data):
        first_position.setdefault(seq, position)

    input_batch = []
    target_batch = []
    for seq in seq_data:
        char_ids = [num_dic[n] for n in add_padding(seq)]
        input_batch.append(one_hot_rows[char_ids])
        target_batch.append(word_embedding_matrix[first_position[seq]])

    return input_batch, target_batch

In [None]:
# Hyperparameters of the character-level embedding model.
# From earlier experiments, a learning rate above 0.01 may lead to a large loss.
# NOTE(review): `learning_rate` is reassigned later in the notebook for the tagger.

learning_rate = 0.01
n_hidden = 100
total_epoch = 500
# Input width is the size of the character dictionary (one-hot characters).
n_input = dic_len
# Output width equals the word-embedding dimension.
n_class = word_embedding_matrix.shape[1]

In [None]:
import torch.nn.functional as F
import torch
class Net(nn.Module):
    """Character-level BiLSTM mapping a word's one-hot characters to a vector.

    Layer sizes come from the notebook-level n_input, n_hidden and n_class.
    """
    def __init__(self):
        super(Net, self).__init__()
        # Single-layer bidirectional LSTM. nn.LSTM applies `dropout` only
        # between stacked layers, so passing it here had no effect and merely
        # triggered a UserWarning; it is therefore omitted.
        self.lstm = nn.LSTM(n_input, n_hidden, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(n_hidden*2,n_class)

    def forward(self, sentence):
        """Return (log_output, hidden_out).

        sentence: float tensor laid out batch-first for the LSTM.
        log_output: log-softmax over the linear projection, shape (batch, n_class).
        hidden_out: final hidden states of both directions concatenated,
            shape (batch, 2 * n_hidden).
        """
        # h_n has shape (num_layers * num_directions, batch, hidden_size) and
        # holds the hidden state at the last time step of each direction.
        lstm_out, (h_n,c_n) = self.lstm(sentence)
        # Concatenate the last hidden state of the two directions.
        hidden_out = torch.cat((h_n[0,:,:],h_n[1,:,:]),1)
        z = self.linear(hidden_out)
        # NOTE(review): the training cell regresses this output onto word
        # vectors with MSELoss, but log-softmax values are never positive;
        # confirm whether the raw projection `z` was intended.
        log_output = F.log_softmax(z, dim=1)
        return log_output,hidden_out


In [None]:
# Prepare the character-model inputs and targets for the whole vocabulary.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_batch, target_batch = make_batch(word_list)
# Convert the batches into float tensors and move them to the selected device
# using tensor.to(device).
input_batch_torch = torch.from_numpy(np.array(input_batch)).float().to(device)
target_batch_torch = torch.from_numpy(np.array(target_batch)).float().to(device)

In [None]:
# Train the character-level model on the full vocabulary as a single batch.
# Move the model to the selected device.
net = Net().to(device)
# Loss function and optimizer. torch.optim is referenced through `torch`
# because the `optim` alias is only imported in a later cell, which made this
# cell fail with NameError on a fresh kernel.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
for epoch in range(total_epoch):

    # Training step: forward + backward + optimize.
    net.train()
    optimizer.zero_grad()
    outputs,_ = net(input_batch_torch)
    loss = criterion(outputs, target_batch_torch)
    loss.backward()
    optimizer.step()

    # Report the evaluation-mode loss every 20 epochs. The extra forward pass
    # is skipped on the other epochs and never tracks gradients.
    if epoch % 20 == 19:
        net.eval()
        with torch.no_grad():
            outputs,_ = net(input_batch_torch)
            loss = criterion(outputs, target_batch_torch)
        print('Epoch: %d, loss: %.5f' %(epoch + 1, loss.item()))

# Leave the model in evaluation mode, as the original loop did.
net.eval()
print('Finished Training')

In [None]:
# Persist the whole trained module object (not just its state_dict).
# NOTE(review): a later cell downloads a file with the same name from Drive,
# which overwrites this one.
torch.save(net,'char_embedding_model.pt')

  "type " + obj.__name__ + ". It won't be checked "


In [None]:
# Fetch a previously trained character model from Drive. This overwrites any
# local 'char_embedding_model.pt' written by the training cells.
id = '1DwHh0Zcvrs28q3vVvJcilc6coUP-ZSJq'
downloaded = drive.CreateFile({'id':id})
downloaded.GetContentFile('char_embedding_model.pt')
import torch
# SECURITY: torch.load unpickles a full module object, which can execute
# arbitrary code; only load checkpoints from a trusted source.
# map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
net = torch.load('char_embedding_model.pt', map_location=device)

# Inference only: evaluation mode and no gradient tracking.
net.eval()
with torch.no_grad():
    _,hidden_state = net(input_batch_torch)
# One row per vocabulary word: the concatenated final hidden states.
char_embedding_matrix = hidden_state.data
char_embedding_matrix.shape

torch.Size([13972, 200])

## Model

In [None]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
torch.manual_seed(1)

def argmax(vec):
    """Return, as a Python int, the index of the maximum along dim 1 of `vec`.

    `vec` must have exactly one row, otherwise `.item()` raises.
    """
    best_index = torch.max(vec, 1)[1]
    return best_index.item()


# Numerically stable log-sum-exp used by the CRF forward algorithm.
def log_sum_exp(vec):
    """Return log(sum(exp(vec))) for a (1, n) tensor.

    The maximum entry is subtracted before exponentiating and added back
    afterwards, which avoids overflow for large scores.
    """
    _, peak_index = torch.max(vec, 1)
    peak = vec[0, peak_index.item()]
    shifted = vec - peak.view(1, -1).expand(1, vec.size()[1])
    return peak + torch.log(torch.sum(torch.exp(shifted)))

class BiLSTM_CRF(nn.Module):
    """BiLSTM encoder with self-attention and a CRF layer for sequence tagging.

    Each word id is embedded with the pretrained word vectors and, depending
    on `config`, with the character-model vectors and one-hot PoS features.
    The concatenation goes through a (stacked) bidirectional LSTM; the two
    directions are summed, every position attends over the whole sentence,
    and the result is projected to per-tag emission scores consumed by the CRF.

    Relies on notebook-level names: word_embedding_matrix,
    char_embedding_matrix, PoSTagger, START_TAG, STOP_TAG, argmax and
    log_sum_exp. Tensors created internally follow the device of the inputs.
    """

    ATTN_TYPE_DOT_PRODUCT = "Dot Product"
    ATTN_TYPE_SCALE_DOT_PRODUCT = "Scale Dot Product"


    def __init__(self, vocab_size, tag_to_ix, pos_tag_idx, hidden_dim,config):
        """
        Args:
            vocab_size: number of rows of the embedding tables.
            tag_to_ix: dict tag -> id, including START_TAG and STOP_TAG.
            pos_tag_idx: dict PoS tag -> one-hot column, passed to PoSTagger.
            hidden_dim: attention output width; each LSTM direction uses
                hidden_dim // 2 units.
            config: dict with keys 'nlayers', 'attn_method', 'use_char'
                and 'use_pos'.
        """
        super(BiLSTM_CRF, self).__init__()
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.nlayers = config['nlayers']
        self.attn_method = config['attn_method']
        self.use_char = config['use_char']
        self.use_pos = config['use_pos']

        self.word_embeds = nn.Embedding(vocab_size, word_embedding_matrix.shape[1])
        self.char_embeds = nn.Embedding(vocab_size,char_embedding_matrix.shape[1])
        self.pos_tagger = PoSTagger(pos_tag_idx)
        # The precomputed matrices are the initial weights of the embeddings.
        self.word_embeds.weight.data.copy_(torch.from_numpy(word_embedding_matrix))
        self.char_embeds.weight.data.copy_(char_embedding_matrix)
        # LSTM input width = word vector (+ char vector) (+ one-hot PoS).
        self.embedding_dim = word_embedding_matrix.shape[1]
        if config['use_char']:
            self.embedding_dim += char_embedding_matrix.shape[1]
        if config['use_pos']:
            self.embedding_dim += len(pos_tag_idx)

        # Inter-layer dropout only exists for stacked LSTMs; passing a
        # non-zero value with a single layer just raises a warning.
        self.lstm = nn.LSTM(self.embedding_dim
                            , hidden_dim // 2,
                            num_layers=self.nlayers,
                            dropout=0.3 if self.nlayers > 1 else 0.0,
                            bidirectional=True)
        self.dropout = nn.Dropout(p=0.3)
        # Maps the attention output into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()


    def init_hidden(self, device=None):
        """Return a fresh random (h_0, c_0) pair for a batch of one sentence.

        device: where to place the states; defaults to the device of the
        model parameters.
        NOTE(review): the states are random in evaluation mode as well, so
        repeated predictions for the same sentence may differ.
        """
        if device is None:
            device = self.transitions.device
        shape = (2 * self.nlayers, 1, self.hidden_dim // 2)
        return (torch.randn(*shape).to(device),
                torch.randn(*shape).to(device))

    def _forward_alg(self, feats):
        """Forward algorithm: log partition function over all tag paths."""
        init_alphas = torch.full((1, self.tagset_size), -10000., device=feats.device)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def cal_attention(self, q, k, method):
        """Attend from the queries `q` over the keys/values `k`.

        Args:
            q: tensor of shape (1, n_queries, d).
            k: tensor of shape (seq_len, 1, d); used as keys and as values.
            method: ATTN_TYPE_DOT_PRODUCT or ATTN_TYPE_SCALE_DOT_PRODUCT.

        Returns:
            Tensor of shape (n_queries, 2 * d): the attention context of each
            query concatenated with the query itself.

        Raises:
            ValueError: if `method` is not one of the two supported names.
        """
        scores = torch.bmm(q, k.permute(1,2,0))
        if method == BiLSTM_CRF.ATTN_TYPE_DOT_PRODUCT:
            pass
        elif method == BiLSTM_CRF.ATTN_TYPE_SCALE_DOT_PRODUCT:
            # Scale by the square root of the key dimension.
            scores = scores / (k.shape[2] ** 0.5)
        else:
            raise ValueError(
                "Unknown attention method %r; expected %r or %r" % (
                    method,
                    BiLSTM_CRF.ATTN_TYPE_DOT_PRODUCT,
                    BiLSTM_CRF.ATTN_TYPE_SCALE_DOT_PRODUCT))
        attn_weights = F.softmax(scores, dim=-1)
        attn_output = torch.bmm(attn_weights, k.permute(1,0,2))
        return torch.cat((attn_output[0], q[0]), 1)

    def _get_lstm_features(self, sentence):
        """Return emission scores of shape (len(sentence), tagset_size).

        sentence: 1-D LongTensor of word ids.
        """
        seq_len = len(sentence)
        self.hidden = self.init_hidden(sentence.device)
        stacked_embeds = [self.word_embeds(sentence)]
        if self.use_char:
            stacked_embeds.append(self.char_embeds(sentence))
        if self.use_pos:
            # The tagger builds its features with numpy on the CPU; make sure
            # they sit next to the other embeddings before concatenating.
            stacked_embeds.append(self.pos_tagger(sentence).to(sentence.device))
        embeds = torch.cat(stacked_embeds, 1).view(seq_len, 1, -1)
        embeds = self.dropout(embeds)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        # Sum the forward and backward halves: (seq_len, 1, hidden_dim // 2).
        half = self.hidden_dim // 2
        lstm_out = lstm_out[:,:,:half] + lstm_out[:,:,half:]

        # Self-attention for all positions at once: every time step is a
        # query over the whole sentence. Result: (seq_len, hidden_dim).
        attn_output = self.cal_attention(lstm_out.permute(1,0,2), lstm_out, self.attn_method)
        lstm_feats = self.hidden2tag(attn_output)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        """Score of the given tag sequence (transitions + emissions)."""
        score = torch.zeros(1, device=feats.device)
        start = torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long, device=tags.device)
        tags = torch.cat([start, tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        """Return (best path score, best tag-id path) for the emissions."""
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000., device=feats.device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        """CRF loss: log partition function minus the gold path score."""
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        """Return (score, tag_seq) of the best tag path for `sentence`."""
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)
        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

### Functions for F1 evaluation and prediction

In [None]:
import numpy as np
from sklearn.metrics import f1_score
def cal_f1(model,input_index,output_index):
  """Predict every sequence and compute the micro-averaged token-level F1.

  Args:
    model: callable returning (score, tag_id_list) for a 1-D LongTensor.
    input_index: list of word-id lists.
    output_index: list of gold tag-id lists aligned with input_index.

  Returns:
    (ground_truth, predicted, f1), where the first two are flat lists of tag
    ids over all sequences.
  """
  ground_truth = list()
  predicted = list()
  # Pure inference: skip autograd bookkeeping to save time and memory.
  with torch.no_grad():
    for train_idx,target in zip(input_index,output_index):
      input_torch = torch.tensor(train_idx,dtype=torch.long).to(device)
      _,output = model(input_torch)
      predicted += output
      ground_truth += target
  f1 = f1_score(ground_truth,predicted,average='micro')

  return ground_truth, predicted, f1

def predict(model,input_index):
   """Return the flat list of predicted tag ids for all sequences.

   Args:
     model: callable returning (score, tag_id_list) for a 1-D LongTensor.
     input_index: list of word-id lists.
   """
   predicted = list()
   # Pure inference: skip autograd bookkeeping to save time and memory.
   with torch.no_grad():
     for train_idx in input_index:
       input_torch = torch.tensor(train_idx,dtype=torch.long).to(device)
       _,output = model(input_torch)
       predicted += output
   return predicted

### Initialize Model

In [None]:
# Model options: LSTM depth, which extra features to use, attention variant.
config = dict(
    nlayers=2,
    use_char=True,
    use_pos=True,
    attn_method='Scale Dot Product',
)

In [None]:
# Build the tagger, its SGD optimizer and a step learning-rate schedule.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
HIDDEN_DIM = 150
learning_rate = 0.05
model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, pos_tag_to_idx, HIDDEN_DIM,config).to(device)
optimizer = optim.SGD(model.parameters(), lr=learning_rate, weight_decay=1e-4)
# Multiplies the learning rate by 0.1 every 10 scheduler steps.
# NOTE(review): the training cell below has its lr_scheduler.step() call
# commented out, so this schedule is created but not applied there.
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,step_size=10,gamma=0.1)

### Train the model

In [None]:
# One independent list per logged column; a row is appended per epoch.
logs = {
    column: []
    for column in (
        'epoch',
        'train_loss',
        'val_loss',
        'train_f1',
        'val_f1',
        'nlayers',
        'use_char',
        'use_pos',
        'attn_method',
    )
}

In [None]:
from tqdm import tqdm
import datetime
epochs = 20
print(config)
for epoch in range(epochs):
    time1 = datetime.datetime.now()

    # ---- training: one optimizer step per sentence ----
    train_loss = 0
    model.train()
    for idxs, tags_index in zip(train_input_index, train_output_index):
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()
        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)
        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets)
        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    # NOTE(review): the scheduler step is disabled here, although the
    # hyperparameter-tuning loop further down does call it; confirm which
    # behaviour is intended.
    # lr_scheduler.step()

    # ---- evaluation: no gradients are needed, so do not track them ----
    model.eval()
    with torch.no_grad():
        _, _, train_f1 = cal_f1(model,train_input_index,train_output_index)
        _, _, val_f1 = cal_f1(model,val_input_index,val_output_index)
        val_loss = 0
        for idxs, tags_index in zip(val_input_index, val_output_index):
            sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
            targets = torch.tensor(tags_index, dtype=torch.long).to(device)
            loss = model.neg_log_likelihood(sentence_in, targets)
            val_loss += loss.item()
    time2 = datetime.datetime.now()

    # Record the metrics of this epoch together with the configuration used.
    logs['epoch'].append(epoch+1)
    logs['train_loss'].append(train_loss)
    logs['val_loss'].append(val_loss)
    logs['train_f1'].append(train_f1)
    logs['val_f1'].append(val_f1)
    logs['nlayers'].append(config['nlayers'])
    logs['use_char'].append(config['use_char'])
    logs['use_pos'].append(config['use_pos'])
    logs['attn_method'].append(config['attn_method'])

    print("Epoch:%d, Training loss: %.2f, train f1: %.4f, val loss: %.2f, val f1: %.4f, time: %.2fs" %(epoch+1, train_loss,train_f1, val_loss, val_f1, (time2-time1).total_seconds()))


In [None]:
import pandas as pd
df = pd.DataFrame(logs)

import plotly.graph_objects as go
# One separate line chart per logged metric, in the same order as before.
for column, trace_name in (
    ('train_loss', 'training loss'),
    ('val_loss', 'validation loss'),
    ('train_f1', 'training f1 score'),
    ('val_f1', 'validation f1 score'),
):
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=df.epoch, y=df[column], mode='lines', name=trace_name))
    fig.show()

## Hyperparameter tuning

In [None]:
# Configurations to compare: [nlayers, use_char, use_pos, attn_method].
combinations = [
    [2, use_char, False, 'Dot Product']
    for use_char in (False, True)
]
# Fresh log table for the tuning runs: one independent list per column.
logs = {
    column: []
    for column in (
        'epoch',
        'train_loss',
        'val_loss',
        'train_f1',
        'val_f1',
        'nlayers',
        'use_char',
        'use_pos',
        'attn_method',
    )
}

In [None]:
import datetime
# Settings shared by every run; they do not change between configurations.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
HIDDEN_DIM = 150
learning_rate = 0.05
epochs = 20
for p in combinations:
    config = dict(zip(['nlayers','use_char','use_pos','attn_method'],p))
    # Printed once per run (the configuration used to be printed twice).
    print(config)
    # A fresh model, optimizer and schedule for each configuration.
    model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, pos_tag_to_idx, HIDDEN_DIM,config).to(device)
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, weight_decay=1e-4)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,step_size=10,gamma=0.1)

    for epoch in range(epochs):
        time1 = datetime.datetime.now()

        # ---- training: one optimizer step per sentence ----
        train_loss = 0
        model.train()
        for idxs, tags_index in zip(train_input_index, train_output_index):
            # Step 1. Remember that Pytorch accumulates gradients.
            # We need to clear them out before each instance
            model.zero_grad()
            # Step 2. Get our inputs ready for the network, that is,
            # turn them into Tensors of word indices.
            sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
            targets = torch.tensor(tags_index, dtype=torch.long).to(device)
            # Step 3. Run our forward pass.
            loss = model.neg_log_likelihood(sentence_in, targets)
            # Step 4. Compute the loss, gradients, and update the parameters by
            # calling optimizer.step()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        lr_scheduler.step()

        # ---- evaluation: no gradients are needed, so do not track them ----
        model.eval()
        with torch.no_grad():
            _, _, train_f1 = cal_f1(model,train_input_index,train_output_index)
            _, _, val_f1 = cal_f1(model,val_input_index,val_output_index)
            val_loss = 0
            for idxs, tags_index in zip(val_input_index, val_output_index):
                sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
                targets = torch.tensor(tags_index, dtype=torch.long).to(device)
                loss = model.neg_log_likelihood(sentence_in, targets)
                val_loss += loss.item()
        time2 = datetime.datetime.now()

        # Record the metrics of this epoch together with the configuration.
        logs['epoch'].append(epoch+1)
        logs['train_loss'].append(train_loss)
        logs['val_loss'].append(val_loss)
        logs['train_f1'].append(train_f1)
        logs['val_f1'].append(val_f1)
        logs['nlayers'].append(config['nlayers'])
        logs['use_char'].append(config['use_char'])
        logs['use_pos'].append(config['use_pos'])
        logs['attn_method'].append(config['attn_method'])

        print("Epoch:%d, Training loss: %.2f, train f1: %.4f, val loss: %.2f, val f1: %.4f, time: %.2fs" %(epoch+1, train_loss,train_f1, val_loss, val_f1, (time2-time1).total_seconds()))
    # Save the whole trained module; the file name embeds the configuration.
    torch.save(model,'bilstm-crf_{}'.format(str(config)))

{'nlayers': 2, 'use_char': False, 'use_pos': False, 'attn_method': 'Dot Product'}
{'nlayers': 2, 'use_char': False, 'use_pos': False, 'attn_method': 'Dot Product'}
Epoch:1, Training loss: 13124.55, train f1: 0.9361, val loss: 1485.18, val f1: 0.9247, time: 226.71s
Epoch:2, Training loss: 5493.02, train f1: 0.9612, val loss: 1159.76, val f1: 0.9418, time: 231.86s
Epoch:3, Training loss: 3966.42, train f1: 0.9678, val loss: 1188.08, val f1: 0.9452, time: 229.27s
Epoch:4, Training loss: 2830.68, train f1: 0.9731, val loss: 1195.75, val f1: 0.9513, time: 226.53s
Epoch:5, Training loss: 2376.78, train f1: 0.9818, val loss: 1273.66, val f1: 0.9545, time: 224.97s
Epoch:6, Training loss: 1916.87, train f1: 0.9816, val loss: 1184.75, val f1: 0.9582, time: 227.15s
Epoch:7, Training loss: 1581.11, train f1: 0.9875, val loss: 1081.51, val f1: 0.9615, time: 227.84s
Epoch:8, Training loss: 1374.28, train f1: 0.9844, val loss: 1268.17, val f1: 0.9550, time: 227.24s
Epoch:9, Training loss: 1111.09, tr


Couldn't retrieve source code for container of type BiLSTM_CRF. It won't be checked for correctness upon loading.


Couldn't retrieve source code for container of type PoSTagger. It won't be checked for correctness upon loading.



{'nlayers': 2, 'use_char': True, 'use_pos': False, 'attn_method': 'Dot Product'}
Epoch:1, Training loss: 12480.52, train f1: 0.9388, val loss: 1525.73, val f1: 0.9284, time: 257.43s
Epoch:2, Training loss: 5198.79, train f1: 0.9610, val loss: 1110.91, val f1: 0.9457, time: 256.46s
Epoch:3, Training loss: 3663.09, train f1: 0.9651, val loss: 1253.01, val f1: 0.9464, time: 257.05s
Epoch:4, Training loss: 2718.10, train f1: 0.9801, val loss: 1031.46, val f1: 0.9562, time: 257.12s
Epoch:5, Training loss: 2061.73, train f1: 0.9821, val loss: 1071.90, val f1: 0.9567, time: 257.00s
Epoch:6, Training loss: 1609.68, train f1: 0.9870, val loss: 1122.02, val f1: 0.9625, time: 256.58s
Epoch:7, Training loss: 1413.68, train f1: 0.9855, val loss: 1159.78, val f1: 0.9627, time: 258.31s
Epoch:8, Training loss: 1286.19, train f1: 0.9912, val loss: 1064.41, val f1: 0.9668, time: 263.43s
Epoch:9, Training loss: 1032.25, train f1: 0.9929, val loss: 1105.16, val f1: 0.9647, time: 260.74s
Epoch:10, Training

## Testing

In [None]:
# Predict tag ids for the test split with whichever `model` was trained last.
y_pred= predict(model,test_input_index)

def decode_output(output_list):
    """Translate a list of tag ids back into tag strings via tag_to_ix."""
    ix_to_tag = dict((ix, tag) for tag, ix in tag_to_ix.items())
    return list(map(ix_to_tag.__getitem__, output_list))

# The test split has no gold labels in this notebook, so only the predictions
# are decoded back into tag strings.
# y_true_decode = decode_output(y_true)
y_pred_decode = decode_output(y_pred)

In [None]:
# One row per predicted token: a running Id and the predicted tag string,
# using the same column names as the sample submission shown earlier.
submission_df = pd.DataFrame(dict(Id=range(len(y_pred_decode)),Predicted=y_pred_decode))

In [None]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv('submission.csv',index=False)