In [21]:
import torch.utils.data as data
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from sklearn.metrics import f1_score, accuracy_score
import numpy as np
from torch.optim.lr_scheduler import StepLR

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [22]:
## Reading the train data
train_data=[]
filePath="train"
        
with open(filePath, "r") as file:
    for x in file:
        x=x.rstrip()
        train_data.append(x.split(" "))
        
train_words=list()
temp=[]
for i in train_data:
    if len(i)<2:
        train_words.append(temp)
        temp=[]
    else:
        temp.append(i[1])
        
t_words=set()
for i in train_data:
    if len(i)>1:
        t_words.add(i[1])
        
t_tags=set()
for i in train_data:
    if len(i)>1:
        t_tags.add(i[2])
c=2      
train_word_idx={}
for i in t_words:
    train_word_idx[i]=c
    c+=1
    
train_word_idx['<PAD>'] = 0
train_word_idx['<UNK>'] = 1  


train_ner_tag=list()
temp=[]
for i in train_data:
    if len(i)<2:
        train_ner_tag.append(temp)
        temp=[]
    else:
        temp.append(i[2])

        
c=1      
train_label_idx={}
for i in t_tags:
    train_label_idx[i]=c
    c+=1
    
train_label_idx['<PAD>'] = 0  

In [23]:
train_data

[['1', 'EU', 'B-ORG'],
 ['2', 'rejects', 'O'],
 ['3', 'German', 'B-MISC'],
 ['4', 'call', 'O'],
 ['5', 'to', 'O'],
 ['6', 'boycott', 'O'],
 ['7', 'British', 'B-MISC'],
 ['8', 'lamb', 'O'],
 ['9', '.', 'O'],
 [''],
 ['1', 'Peter', 'B-PER'],
 ['2', 'Blackburn', 'I-PER'],
 [''],
 ['1', 'BRUSSELS', 'B-LOC'],
 ['2', '1996-08-22', 'O'],
 [''],
 ['1', 'The', 'O'],
 ['2', 'European', 'B-ORG'],
 ['3', 'Commission', 'I-ORG'],
 ['4', 'said', 'O'],
 ['5', 'on', 'O'],
 ['6', 'Thursday', 'O'],
 ['7', 'it', 'O'],
 ['8', 'disagreed', 'O'],
 ['9', 'with', 'O'],
 ['10', 'German', 'B-MISC'],
 ['11', 'advice', 'O'],
 ['12', 'to', 'O'],
 ['13', 'consumers', 'O'],
 ['14', 'to', 'O'],
 ['15', 'shun', 'O'],
 ['16', 'British', 'B-MISC'],
 ['17', 'lamb', 'O'],
 ['18', 'until', 'O'],
 ['19', 'scientists', 'O'],
 ['20', 'determine', 'O'],
 ['21', 'whether', 'O'],
 ['22', 'mad', 'O'],
 ['23', 'cow', 'O'],
 ['24', 'disease', 'O'],
 ['25', 'can', 'O'],
 ['26', 'be', 'O'],
 ['27', 'transmitted', 'O'],
 ['28', 'to', '

In [24]:
train_label_idx

{'B-PER': 1,
 'I-MISC': 2,
 'I-LOC': 3,
 'B-LOC': 4,
 'I-PER': 5,
 'O': 6,
 'B-MISC': 7,
 'I-ORG': 8,
 'B-ORG': 9,
 '<PAD>': 0}

In [25]:
train_word_idx

{'cable': 2,
 'replacements': 3,
 '1-96': 4,
 'cop': 5,
 'cheques': 6,
 'partly': 7,
 'courts': 8,
 'Hang': 9,
 'Boutros-Ghali': 10,
 'accident': 11,
 'Trade': 12,
 '876': 13,
 'taken': 14,
 'Chiquinho': 15,
 'Petrobulk': 16,
 'puts': 17,
 '14,196': 18,
 'natural': 19,
 'conquering': 20,
 'RESULTS': 21,
 '146.2': 22,
 'Konan': 23,
 '340': 24,
 'Hendriks': 25,
 'sentences': 26,
 'enable': 27,
 'Trnava': 28,
 'patrolled': 29,
 'UGANDA': 30,
 'paratroop': 31,
 'Opera': 32,
 'Cambodian': 33,
 'findings': 34,
 'Beaver': 35,
 'Dondo': 36,
 'prisoner': 37,
 'Fortuna': 38,
 'GROWTH': 39,
 '1:52.318': 40,
 'Arulanandam': 41,
 'doomed': 42,
 'ever': 43,
 'der': 44,
 'Amsterdam-Rotterdam': 45,
 'YEAR': 46,
 'TAX': 47,
 'COPENHAGEN': 48,
 '377': 49,
 '8.05': 50,
 'TIMES': 51,
 'murders': 52,
 'supermodel': 53,
 'commitments': 54,
 'structures': 55,
 '125CC': 56,
 'freedom': 57,
 'Therese': 58,
 'carpenter': 59,
 'Walsall': 60,
 'Escravos': 61,
 'devote': 62,
 'Keegan': 63,
 'degrading': 64,
 'Mist

In [26]:
t_words

{'cable',
 'replacements',
 '1-96',
 'cop',
 'cheques',
 'partly',
 'courts',
 'Hang',
 'Boutros-Ghali',
 'accident',
 'Trade',
 '876',
 'taken',
 'Chiquinho',
 'Petrobulk',
 'puts',
 '14,196',
 'natural',
 'conquering',
 'RESULTS',
 '146.2',
 'Konan',
 '340',
 'Hendriks',
 'sentences',
 'enable',
 'Trnava',
 'patrolled',
 'UGANDA',
 'paratroop',
 'Opera',
 'Cambodian',
 'findings',
 'Beaver',
 'Dondo',
 'prisoner',
 'Fortuna',
 'GROWTH',
 '1:52.318',
 'Arulanandam',
 'doomed',
 'ever',
 'der',
 'Amsterdam-Rotterdam',
 'YEAR',
 'TAX',
 'COPENHAGEN',
 '377',
 '8.05',
 'TIMES',
 'murders',
 'supermodel',
 'commitments',
 'structures',
 '125CC',
 'freedom',
 'Therese',
 'carpenter',
 'Walsall',
 'Escravos',
 'devote',
 'Keegan',
 'degrading',
 'Mister',
 'TEGUCIGALPA',
 'Tacis',
 'roads',
 'Sporting',
 'Available',
 'hinting',
 'Jenson',
 'expectations',
 'Gilbert',
 'Topolcany',
 'Khmer',
 'contributed',
 'Mideast',
 '2-98',
 'resumed',
 '605M',
 'Indiana',
 'evaluates',
 '246,077',
 'Ji

In [27]:
train_words

[['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'],
 ['Peter', 'Blackburn'],
 ['BRUSSELS', '1996-08-22'],
 ['The',
  'European',
  'Commission',
  'said',
  'on',
  'Thursday',
  'it',
  'disagreed',
  'with',
  'German',
  'advice',
  'to',
  'consumers',
  'to',
  'shun',
  'British',
  'lamb',
  'until',
  'scientists',
  'determine',
  'whether',
  'mad',
  'cow',
  'disease',
  'can',
  'be',
  'transmitted',
  'to',
  'sheep',
  '.'],
 ['Germany',
  "'s",
  'representative',
  'to',
  'the',
  'European',
  'Union',
  "'s",
  'veterinary',
  'committee',
  'Werner',
  'Zwingmann',
  'said',
  'on',
  'Wednesday',
  'consumers',
  'should',
  'buy',
  'sheepmeat',
  'from',
  'countries',
  'other',
  'than',
  'Britain',
  'until',
  'the',
  'scientific',
  'advice',
  'was',
  'clearer',
  '.'],
 ['"',
  'We',
  'do',
  "n't",
  'support',
  'any',
  'such',
  'recommendation',
  'because',
  'we',
  'do',
  "n't",
  'see',
  'any',
  'grounds',
  'fo

In [28]:
train_ner_tag

[['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O'],
 ['B-PER', 'I-PER'],
 ['B-LOC', 'O'],
 ['O',
  'B-ORG',
  'I-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-LOC',
  'O',
  'O',
  'O',
  'O',
  'B-ORG',
  'I-ORG',
  'O',
  'O',
  'O',
  'B-PER',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-ORG',
  'O',
  'O',
  'O',
  'B-PER',
  'I-PER',
  'I-PER',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-ORG',
  'I-ORG',
 

In [29]:
embedding_dim = 100
lstm_hidden_dim = 256
lstm_layers = 1
lstm_dropout = 0.33
linear_output_dim = 128
batch_size = 16
learning_rate = 0.05
num_epochs = 10
# Define the dataset class
class NERDataset(data.Dataset):
    def __init__(self, filename, max_len):
        self.sentences = []
        self.tags = []
        with open(filename, 'r', encoding='utf-8') as f:
            words = []
            ner_tags = []
            for line in f:
                if line == '\n':
                    self.sentences.append(words)
                    self.tags.append(ner_tags)
                    words = []
                    ner_tags = []
                else:
                    items = line.strip().split()
                    word = items[1]
                    ner_tag = items[2]
                    words.append(word)
                    ner_tags.append(ner_tag)
        self.word2idx = {word: i+2 for i, word in enumerate(set([word for sentence in self.sentences for word in sentence]))}
        self.word2idx['<PAD>'] = 0
        self.word2idx['<UNK>'] = 1
        self.tag2idx = {tag: i+1 for i, tag in enumerate(set([tag for tags in self.tags for tag in tags]))}
        self.max_len = max_len
    
    def __getitem__(self, index):
        x = [self.word2idx.get(word, 1) for word in self.sentences[index]]
        y = [self.tag2idx[tag] for tag in self.tags[index]]
        x = x + [0] * (self.max_len - len(x))  # Pad the sequence with zeros
        y = y + [0] * (self.max_len - len(y))  # Pad the sequence with zeros
        return torch.LongTensor(x), torch.LongTensor(y)
    
    def __len__(self):
        return len(self.sentences)



# Define the train dataset and data loader
train_dataset = NERDataset('train',max_len=113)
dev_dataset = NERDataset('dev',max_len=113)
train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = data.DataLoader(dev_dataset, batch_size=batch_size, drop_last=True)

# Define the number of words and classes
num_words = len(train_dataset.word2idx) + 1
num_classes = len(train_dataset.tag2idx) + 1

In [30]:
num_words

23627

In [31]:
num_classes

10

In [32]:
train_dataset[1]

(tensor([ 8873, 13591,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0]),
 tensor([1, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 

In [33]:
%%time
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score, accuracy_score

# Define the hyperparameters
embedding_dim = 100
lstm_hidden_dim = 256
lstm_layers = 2
lstm_dropout = 0.33
linear_output_dim = 128
batch_size = 16
learning_rate = 0.05
num_epochs = 10

# Define the model architecture
class BLSTM(nn.Module):
    def __init__(self, embedding_dim, lstm_hidden_dim, lstm_layers, lstm_dropout, linear_output_dim):
        super(BLSTM, self).__init__()
        self.embedding_dim = embedding_dim
        self.lstm_hidden_dim = lstm_hidden_dim
        self.lstm_layers = lstm_layers
        self.lstm_dropout = lstm_dropout
        self.linear_output_dim = linear_output_dim
        
        self.embedding = nn.Embedding(num_embeddings=num_words, embedding_dim=self.embedding_dim)
        self.lstm = nn.LSTM(input_size=self.embedding_dim, hidden_size=self.lstm_hidden_dim, num_layers=self.lstm_layers, dropout=self.lstm_dropout, bidirectional=True)
        self.linear = nn.Linear(self.lstm_hidden_dim*2, self.linear_output_dim)
        self.activation = nn.ELU()
        self.classifier = nn.Linear(self.linear_output_dim, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.linear(x)
        x = self.activation(x)
        x = self.classifier(x)
        return x





CPU times: user 28 µs, sys: 16 µs, total: 44 µs
Wall time: 46 µs


In [34]:
word_to_idx = {}
word_to_idx['<PAD>'] = 0
word_to_idx['<UNK>'] = 1 # Add <UNK> key to the dictionary
for sentence in train_dataset:
    for word in sentence:
        if word not in word_to_idx:
            word_to_idx[word] = len(word_to_idx)
            
label_to_idx = {}
for label in train_dataset:
    if label not in label_to_idx:
        label_to_idx[label] = len(label_to_idx)

            
# for sentence, _ in train_dataset:
#     for word in sentence.split():
#         if word not in word_to_idx:
#             word_to_idx[word] = len(word_to_idx)
# word_to_idx['<UNK>'] = 1 # Add <UNK> key to the dictionary


In [35]:
idx_to_label={}
for key,val in train_label_idx.items():
    idx_to_label[val]=key

In [36]:
idx_to_label

{1: 'B-PER',
 2: 'I-MISC',
 3: 'I-LOC',
 4: 'B-LOC',
 5: 'I-PER',
 6: 'O',
 7: 'B-MISC',
 8: 'I-ORG',
 9: 'B-ORG',
 0: '<PAD>'}

In [37]:
def calculate_metrics(true_labels, predicted_labels):
    """
    Calculates precision, recall and F1 score for the given predictions and targets.
    """
    # Flatten the predictions and targets
    predicted_labels = np.array(predicted_labels)
    predictions_flat = predicted_labels.flatten()
    true_labels = np.array(true_labels)
    targets_flat = true_labels.flatten()

    # Calculate the number of true positives, false positives and false negatives
    tp = ((predictions_flat == 1) & (targets_flat == 1)).sum().item()
    fp = ((predictions_flat == 1) & (targets_flat == 0)).sum().item()
    fn = ((predictions_flat == 0) & (targets_flat == 1)).sum().item()

    # Calculate precision, recall and F1 score
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1

In [38]:
# Define the training loop
        
def train(model, train_loader, loss_fn, optimizer, scheduler, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        for i, (inputs, targets) in enumerate(train_loader):
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = loss_fn(outputs.view(-1, num_classes), targets.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            scheduler.step(total_loss/len(train_loader))
        print("Epoch: {}, Loss: {}".format(epoch+1, total_loss/(i+1)))
        
        model.eval()
        dev_loss = 0.0
        true_labels = []
        predicted_labels = []
        with torch.no_grad():
            
            for i, (inputs, targets) in enumerate(dev_loader):

                outputs = model(inputs)
                loss = loss_fn(outputs.view(-1, num_classes), targets.view(-1))
                dev_loss += loss.item()
                scheduler.step(dev_loss/len(dev_loader))
                true_labels.extend(targets.cpu().numpy())
                predicted_labels.extend(torch.argmax(outputs, axis=-1).cpu().numpy())
    
    
    with open('Predicted.txt', 'w') as pred:
        xP=[]
        for pred_tags in predicted_labels:
            t=[]
            for i in pred_tags:
                t.append(idx_to_label[i])
                
            xP.append(t)
        for inp in xP:
            pred.write(' '.join(inp)+'\n')      

    # Calculate metrics on dev set
    precision, recall, f1_score = calculate_metrics(true_labels, predicted_labels)
    print(f"Epoch {epoch+1}: train loss = {total_loss/len(train_dataset)}, dev loss = {dev_loss/len(dev_dataset)}, precision = {precision}, recall = {recall}, F1 score = {f1_score}")




In [39]:
# Train the model
model = BLSTM(embedding_dim, lstm_hidden_dim, lstm_layers, lstm_dropout, linear_output_dim)
# Define the loss function and optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)
scheduler = StepLR(optimizer, step_size=2, gamma=0.1)
train(model, train_loader, loss_fn, optimizer, )


Epoch: 1, Loss: 0.2195237660412791
Epoch 1: train loss = 0.013725728598737389, dev loss = 0.03187401251058386, precision = 0, recall = 0.0, F1 score = 0
