In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
from sklearn.metrics import f1_score
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
import gensim.downloader
import pandas as pd
from sklearn.model_selection import train_test_split
from seqeval.metrics import f1_score as seqeval_f1_score
from tqdm import tqdm
import time
import os
import multiprocessing
import numpy as np


In [2]:
num_cores = multiprocessing.cpu_count()
print(f"Number of CPU cores available: {num_cores}")

Number of CPU cores available: 16


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
pretrained_model = gensim.downloader.load('word2vec-google-news-300')

In [6]:
train_file = "eng.train"
dev_file = "eng.testa"
test_file = "eng.testb"

label_to_index = {"O": 0, "B-PER": 1, "I-PER": 2, "B-LOC": 3, "I-LOC": 4, "B-ORG": 5, "I-ORG": 6, "B-MISC": 7, "I-MISC": 8}
index_to_label = { 0: "O", 1:"B-PER", 2:"I-PER", 3:"B-LOC", 4:"I-LOC", 5:"B-ORG", 6:"I-ORG", 7:"B-MISC", 8:"I-MISC"}
word_to_index = pretrained_model.key_to_index
OOV_INDEX = 3000000

In [7]:
class CoNLL2003Dataset(torch.utils.data.Dataset):
    def __init__(self, filename, word_to_index, label_to_index, max_sequence_length):
        super(CoNLL2003Dataset, self).__init__()

        self.sentences = []  # List to store sentences
        self.labels = []     # List to store labels

        with open(filename, "r") as f:
            sentence = []
            labels_new = []
            for line in f:
                if line == "\n":
                    if sentence:  # Ignore empty lines
                        words = [word_to_index.get(word,OOV_INDEX) for word in sentence]
                        labels_index = [label_to_index[label] for label in labels_new]

                    # Applying Padding to ensure same input length
                        if len(words) < max_sequence_length:
                            for i in range(len(words), max_sequence_length):
                                words.append(0)
                                labels_index.append(0)
                        elif len(words) > max_sequence_length:
                            words = words[:max_sequence_length]
                            labels_index = labels_index[:max_sequence_length]

                        self.sentences.append(torch.LongTensor(np.array(words)))
                        self.labels.append(torch.LongTensor(np.array(labels_index)))
                    sentence = []
                    labels_new = []
                else:
                    parts = line.split()
                    if len(parts) > 1:
                        word, label = parts[0], parts[3]  # Assuming CoNLL-2003 format
                        sentence.append(word)
                        labels_new.append(label)

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, index):
        return self.sentences[index], self.labels[index]

train_dataset = CoNLL2003Dataset(train_file, word_to_index, label_to_index,100)
dev_dataset = CoNLL2003Dataset(dev_file, word_to_index, label_to_index,100)
test_dataset = CoNLL2003Dataset(test_file, word_to_index, label_to_index,100)

In [8]:
pretrained_embeddings =pretrained_model.vectors
pretrained_embeddings = np.append(pretrained_embeddings, [[0] *300], axis=0)
pretrained_embeddings = torch.FloatTensor(pretrained_embeddings)

In [9]:
import torch.nn.utils.rnn as rnn_utils
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    # Convert sequences to tensors
    inputs, labels = zip(*[(torch.LongTensor(input_seq), torch.LongTensor(label_seq)) for input_seq, label_seq in batch])
    inputs = rnn_utils.pad_sequence(inputs, batch_first=True, padding_value=0)
    labels = rnn_utils.pad_sequence(labels, batch_first=True, padding_value=0)
    return inputs, labels

# Hyperparameters
batch_size = 4
num_epochs = 10
learning_rate = 0.001

# Mini-batch data loaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=batch_size, collate_fn=collate_fn)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn)

In [10]:
# Define the NERModel class
class NERModel(nn.Module):
    def __init__(self, embedding_dim, final_vector_dim, pretrained_embeddings):
        super(NERModel, self).__init__()

        # Create an embedding layer using the pre-trained Word2Vec embeddings.
        self.embedding_layer = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=True)

        # Create a linear layer.
        self.linear_layer = nn.Linear(embedding_dim, final_vector_dim)

    def forward(self, inputs):
        # Get the word embeddings.
        embeddings = self.embedding_layer(inputs)

        # Apply the linear layer.
        final_vector_representations = self.linear_layer(embeddings)

        return final_vector_representations

In [11]:
# Create a model.
model = NERModel(embedding_dim=300, final_vector_dim=300, pretrained_embeddings=pretrained_embeddings).to(device)

# Define the optimizer.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Early stopping setup
best_dev_f1 = 0.0
patience = 3  # Number of epochs without improvement to wait before early stopping

# Initialize a variable to track the best model state
best_model_state = model.state_dict()

start_time = time.time()

In [13]:
index_to_label

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-LOC',
 4: 'I-LOC',
 5: 'B-ORG',
 6: 'I-ORG',
 7: 'B-MISC',
 8: 'I-MISC'}

In [23]:
# Training loop
for epoch in range(num_epochs):
    model.train()

    # Create a tqdm progress bar for the training data
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}', unit='batch', leave=False)

    for inputs, labels in progress_bar:
        inputs, labels = inputs.to(device), labels.to(device)  # Move data to GPU
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = nn.functional.cross_entropy(outputs.view(-1, outputs.size(-1)), labels.view(-1))
        loss.backward()
        optimizer.step()
        progress_bar.set_postfix(loss=loss.item())

    # Evaluation on the development set
    model.eval()
    dev_predictions = []
    dev_labels = []
    with torch.no_grad():
        for inputs, labels in dev_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            dev_predictions.extend(outputs.argmax(dim=2).tolist())
            dev_labels.extend(labels.tolist())

        converted_dev_predictions = []
        converted_dev_labels = []
        for sent in dev_predictions:
            converted_sent = []
            for i in sent:
                # Find the corresponding text label for the numerical label 'i'
                text_label = list(label_to_index.keys())[list(label_to_index.values()).index(i)]
                converted_sent.append(text_label)
            converted_dev_predictions.append(converted_sent)

        for sent in dev_labels:
            converted_sent = []
            for i in sent:
                # Find the corresponding text label for the numerical label 'i'
                text_label = list(label_to_index.keys())[list(label_to_index.values()).index(i)]
                converted_sent.append(text_label)
            converted_dev_labels.append(converted_sent)   
    

    dev_f1 = seqeval_f1_score(converted_dev_labels,converted_dev_predictions)

    print(f'Epoch {epoch + 1}, Dev F1: {dev_f1:.4f}')

    # Early stopping check
    if dev_f1 > best_dev_f1:
        best_dev_f1 = dev_f1
        # Save the model or its parameters if it improves

    # Early stopping condition
    if epoch > patience and dev_f1 <= best_dev_f1:
        print("Early stopping...")
        break

# Save the best model
torch.save(best_model_state, 'best_model_linear.pth')

# Evaluation on the test set
model.eval()

# Create a tqdm progress bar for the test data
test_predictions = []
test_labels = []
with torch.no_grad():
    for inputs, labels in tqdm(test_loader, desc='Testing', unit='batch', leave=False):
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        test_predictions.extend(outputs.argmax(dim=2).tolist())
        test_labels.extend(labels.tolist())


converted_test_predictions = []
converted_test_labels = []
for sent in test_predictions:
    converted_sent = []
    for i in sent:
        # Find the corresponding text label for the numerical label 'i'
        text_label = list(label_to_index.keys())[list(label_to_index.values()).index(i)]
        converted_sent.append(text_label)
    converted_test_predictions.append(converted_sent)

for sent in test_labels:
    converted_sent = []
    for i in sent:
        # Find the corresponding text label for the numerical label 'i'
        text_label = list(label_to_index.keys())[list(label_to_index.values()).index(i)]
        converted_sent.append(text_label)
    converted_test_labels.append(converted_sent)   
    

# test_f1 = f1_score(test_labels, test_predictions, average='weighted')
test_f1 = seqeval_f1_score(converted_test_labels, converted_test_predictions)

print(f'Test F1: {test_f1:.4f}')

end_time = time.time()
print(f"Total running time: {end_time - start_time:.2f} seconds")


                                                                                                                       

Epoch 1, Dev F1: 0.6941


                                                                                                                       

Epoch 2, Dev F1: 0.6966


                                                                                                                       

Epoch 3, Dev F1: 0.6907


                                                                                                                       

Epoch 4, Dev F1: 0.6961


                                                                                                                       

Epoch 5, Dev F1: 0.6984
Early stopping...


                                                                                                                       

Test F1: 0.6451
Total running time: 933.38 seconds
