In [3]:
import pandas as pd
import itertools
import numpy as np
import os
import itertools
from collections import Counter


In [5]:
label_mapping = {
    'O': 0,
    'B-ORG': 1,
    'I-ORG': 2,
    'B-PER': 3,
    'I-PER': 4,
    'B-LOC': 5,
    'I-LOC': 6,
    'B-MISC': 7,
    'I-MISC': 8
}

In [6]:
from datasets import Dataset, DatasetDict
def file_to_dataset_for_test(file_path):
    tokens = []
    data = {'id': [], 'tokens': []}
    current_id = 0
    
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if line.strip():  # If the line is not empty, process it
                parts = line.strip().split(' ')
                if len(parts) == 2:  # Ensure the line has three parts: index, word
                    _, token = parts
                    tokens.append(token)
            else:  # If the line is empty, it indicates the end of a sentence
                if tokens:  # If there are tokens collected, save the current sentence
                    data['id'].append(str(current_id))
                    data['tokens'].append(tokens)
                    current_id += 1
                    tokens = []
                
        # Add the last sentence if the file doesn't end with a blank line
        if tokens:
            data['id'].append(str(current_id))
            data['tokens'].append(tokens)
    
    dataset = Dataset.from_dict(data)
    return dataset

test_dataset = file_to_dataset_for_test('data/test')
test_dataset

Dataset({
    features: ['id', 'tokens'],
    num_rows: 3684
})

In [7]:
def file_to_dataset(file_path, label_mapping):
    tokens = []
    labels = []
    data = {'id': [], 'tokens': [], 'labels': []}
    current_id = 0
    
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if line.strip():  # If the line is not empty, process it
                parts = line.strip().split(' ')
                if len(parts) == 3:  # Ensure the line has three parts: index, word, NER tag
                    _, token, label = parts
                    tokens.append(token)
                    labels.append(label_mapping[label])  # Convert label to integer
            else:  # If the line is empty, it indicates the end of a sentence
                if tokens:  # If there are tokens collected, save the current sentence
                    data['id'].append(str(current_id))
                    data['tokens'].append(tokens)
                    data['labels'].append(labels)
                    current_id += 1
                    tokens, labels = [], []  # Reset for the next sentence
                
        # Add the last sentence if the file doesn't end with a blank line
        if tokens:
            data['id'].append(str(current_id))
            data['tokens'].append(tokens)
            data['labels'].append(labels)
    
    dataset = Dataset.from_dict(data)
    return dataset

train_path = 'data/train'
validation_path = 'data/dev'

# Converting files to Hugging Face Datasets
train_dataset = file_to_dataset(train_path, label_mapping)
validation_dataset = file_to_dataset(validation_path, label_mapping)

# Combining datasets into a DatasetDict
dataset = DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset
})



In [8]:
tag2idx = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}
idx2tag = {v:k for k, v in tag2idx.items()}
idx2tag

In [9]:
from itertools import chain
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
import itertools
import numpy as np
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

def calculate_metrics(labels, preds):
    # Flatten the lists
    flat_labels = list(chain(*labels))
    flat_preds = list(chain(*preds))

    # Initialize counts
    TP = FP = FN = 0

    # Calculate TP, FP, FN
    for true_label, pred_label in zip(flat_labels, flat_preds):
        if true_label != 'O':  # Entity present in the ground truth
            if true_label == pred_label:
                TP += 1  # Correctly identified entity
            else:
                FN += 1  # Missed entity
        if pred_label != 'O':  # Entity predicted
            if true_label != pred_label:
                FP += 1  # Incorrectly identified entity

    # Calculate precision, recall, and F1 score
    precision = (TP / (TP + FP) if (TP + FP) else 0) * 100
    recall = (TP / (TP + FN) if (TP + FN) else 0) * 100
    f1 = (2 * (precision * recall) / (precision + recall)) if (precision + recall) else 0

    return precision, recall, f1

cuda


In [10]:
d = {'preds': [], 'labels': [], 'mask': [], 'loss': []}
def eval_dataloader(loader, model, loss_fn, verbose=False, name=''):
    
    # using cuda only as required. Only GPU
    for batch in loader:
        input_ids = batch['input_ids'].to('cuda')
        labels = batch['labels'].to('cuda')
        mask = batch['mask'].to('cuda')
        
        predictions = model(input_ids)
        # print(predictions[0])
        loss = loss_fn(predictions.transpose(-1, -2), labels.to(torch.long))
        loss = torch.masked_select(loss, mask.bool()).mean()
        
        d['preds'].extend(predictions.argmax(-1).tolist())
        d['labels'].extend(labels.tolist())
        d['mask'].extend(mask.tolist())
        d['loss'].append(loss.item())
        
    preds, labels = [], []
    # print(d['preds'])
    # create eval fun that support data
    for i in range(len(d['preds'])):
        pred = [idx2tag[k] for k, m in zip(d['preds'][i], d['mask'][i]) if m > 0]
        label = [idx2tag[k] for k, m in zip(d['labels'][i], d['mask'][i]) if m > 0]
        preds.append(pred)
        labels.append(label)

    
    precision, recall, f1 = calculate_metrics(labels, preds)
    # print/return the average loss and f1
    print(f'name: {name}, f1: {f1}, precision: {precision}, recall: {recall}, loss: {sum(d["loss"])/len(d["loss"])}')
    return (f1, precision, recall), sum(d['loss'])/len(d['loss'])

In [11]:
# iterate on tokens and count using Counter class
word_frequency = Counter(itertools.chain(*dataset['train']['tokens']))
word2idx = {
    word: frequency
    for word, frequency in word_frequency.items()
    if frequency >= 3
}

word2idx = {
    word: index
    for index, word in enumerate(word2idx.keys(), start=2)
}
print('vocab count', len(word2idx))

vocab count 8127


In [12]:
word2idx['[PAD]'] = 0
word2idx['[UNK]'] = 1
idx2word = {v:k for k, v in word2idx.items()}

In [13]:
def convert_word_to_id(sample):
    return {
        'input_ids': [
        (word2idx[token] if token in word2idx else word2idx['[UNK]'])
        for token in sample['tokens']
        ]
}
dataset_ids = dataset.map(convert_word_to_id)
test_dataset_ids = test_dataset.map(convert_word_to_id)

Map:   0%|          | 0/14987 [00:00<?, ? examples/s]

Map:   0%|          | 0/3466 [00:00<?, ? examples/s]

Map:   0%|          | 0/3684 [00:00<?, ? examples/s]

In [14]:
class MaxLenFinder:
    def __init__(self):
        self.max_len = 0

    def get_max_len(self, sample):
        if len(sample['tokens']) > self.max_len: 
            self.max_len = len(sample['tokens'])
        # It's important for the map function that we return the sample unchanged.
        return sample

# Usage for dataset 1
max_len_finder = MaxLenFinder()
dataset.map(max_len_finder.get_max_len)
train_max_len = max_len_finder.max_len

test_len_finer = MaxLenFinder()
test_dataset.map(max_len_finder.get_max_len)
test_max_len = max_len_finder.max_len


max_len = max(train_max_len, test_max_len)


Map:   0%|          | 0/14987 [00:00<?, ? examples/s]

Map:   0%|          | 0/3466 [00:00<?, ? examples/s]

Map:   0%|          | 0/3684 [00:00<?, ? examples/s]

In [15]:
# dataset class. NERDataset
class NER_Dataset_test(Dataset):
    def __init__(self, dataset, vocab, max_len=128):
        self.dataset = dataset
        self.vocab = vocab
        self.max_len = max_len
    def __len__(self):
        return self.dataset.num_rows
    def __getitem__(self, idx):
        data = self.dataset[idx]
        input_ids = data['input_ids']
        mask = [1] * len(data['input_ids'])
        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.int32),
            'mask': torch.tensor(mask, dtype=torch.int8)
        }

NERtest_dataset = NER_Dataset_test(test_dataset_ids, word2idx)

In [16]:
# dataset class. NERDataset
class NER_Dataset(Dataset):
    def __init__(self, dataset, vocab, max_len=128):
        self.dataset = dataset
        self.vocab = vocab
        self.max_len = max_len
    def __len__(self):
        return self.dataset.num_rows
    def __getitem__(self, idx):
        data = self.dataset[idx]
        input_ids = data['input_ids']
        label = data['labels']
        mask = [1] * len(data['input_ids'])
        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.int32),
            'labels': torch.tensor(label, dtype=torch.int8),
            'mask': torch.tensor(mask, dtype=torch.int8)
        }

In [17]:
# create datasets. Train, validation, test
train_dataset = NER_Dataset(dataset_ids['train'], word2idx)
val_dataset = NER_Dataset(dataset_ids['validation'], word2idx)

In [18]:
# define collate function. Passing the collate function can inputs
def collate_fn(inputs, pad_token_id = 0):
    return {
        'input_ids': nn.utils.rnn.pad_sequence([i['input_ids'] for i in inputs], batch_first=True, padding_value=pad_token_id),
        'labels': nn.utils.rnn.pad_sequence([i['labels'] for i in inputs], batch_first=True, padding_value=pad_token_id),
        'mask': nn.utils.rnn.pad_sequence([i['mask'] for i in inputs], batch_first=True, padding_value=pad_token_id),
    }

In [19]:

def collate_fn_test(batch):
    input_ids = [item['input_ids'] for item in batch]
    masks = [item['mask'] for item in batch]

    input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=0)
    masks_padded = pad_sequence(masks, batch_first=True, padding_value=0)

    return {'input_ids': input_ids_padded, 'mask': masks_padded}


In [20]:
# create data loader
batch_size=32
train_loader = DataLoader(train_dataset, collate_fn=collate_fn, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, collate_fn=collate_fn, batch_size=batch_size)

In [21]:
test_loader = DataLoader(NERtest_dataset, collate_fn=collate_fn_test, batch_size=batch_size)

In [23]:
class BiLSTM(nn.Module):
        def __init__(self, input_dim, output_dim, embed_dim=100, lstm_dim=256, dropout=0.50, linear_dim=128):
                super().__init__()
                self.input_dim = input_dim
                self.embed_dim = embed_dim
                self.embedding = nn.Embedding(self.input_dim, self.embed_dim)
                self.lstm_dim = lstm_dim
                self.lstm = nn.LSTM(self.embed_dim, self.lstm_dim, batch_first=True, bidirectional=True)
                self.dropout = nn.Dropout(dropout)
                self.linear_dim = linear_dim
                self.fc2 = nn.Linear(self.lstm_dim * 2, self.linear_dim)
                self.elu = nn.ELU()
                self.output_dim = output_dim
                self.fc3 = nn.Linear(self.linear_dim, self.output_dim)
            
        def forward(self, input_ids):
                batch_size = input_ids.shape[0]
                emds = self.embedding(input_ids)
                h0 = torch.randn(2, batch_size, self.lstm_dim).to(input_ids.device)
                c0 = torch.randn(2, batch_size, self.lstm_dim).to(input_ids.device)
                output, (hn, cn) = self.lstm(emds, (h0, c0))
                output = self.dropout(output)
                linear_out = self.fc2(output)
                linear_out = self.dropout(linear_out)
                elu_out = self.elu(linear_out)
                results = self.fc3(elu_out)
                return results

In [24]:
bilstm_model = BiLSTM(input_dim=len(word2idx), output_dim=len(tag2idx)).to(device)
def create_hyperparameters(model, epochs):
        loss_fn = nn.CrossEntropyLoss(reduction='none')
        optimizer = torch.optim.Adam(model.parameters(), lr=0.002)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100, eta_min=1e-8)
        num_epochs = epochs
        return loss_fn, optimizer, scheduler, num_epochs

In [25]:
loss_fn, optimizer, scheduler, num_epochs = create_hyperparameters(bilstm_model, 10)
eval_dataloader(val_loader, bilstm_model, loss_fn=loss_fn, name='test')

name: test, f1: 3.2961477536988983, precision: 1.927511381777331, recall: 11.368127397419505, loss: 2.228275550614803


((3.2961477536988983, 1.927511381777331, 11.368127397419505),
 2.228275550614803)

In [26]:
def training_loop(loss_fn, optimizer, scheduler, num_epochs, model, train_loader):
        losses = []
        vlosses = []
        lrs = []
        for epoch in range(num_epochs):
                model.train()
                total_loss = 0.0
                loss = 0.0
                pbar = tqdm(train_loader)
                for batch in pbar:
                        input_ids = batch['input_ids'].to(device)
                        labels = batch['labels'].to(device)
                        mask = batch['mask'].to(device)
                        optimizer.zero_grad()
                        predictions = model(input_ids)
                        loss = loss_fn(predictions.transpose(-1, -2), labels.to(torch.long))
                        loss = torch.masked_select(loss, mask.bool()).mean()
                        loss.backward()
                        optimizer.step()
                        total_loss += loss.item()
                        pbar.set_postfix({'loss': loss.item()})
                lrs.append(scheduler.get_last_lr())
                scheduler.step()
                average_loss = total_loss / len(train_loader)
                print(f'Epoch [{epoch + 1}/{num_epochs}] - Loss: {average_loss:.4f}')
                model.eval()
                f1, l = eval_dataloader(train_loader, model, loss_fn=loss_fn, name='train')
                losses.append(l)
        return losses, vlosses, lrs

In [27]:
losses, vlosses, lrs = training_loop(*create_hyperparameters(bilstm_model, 25), bilstm_model, train_loader)

  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [1/25] - Loss: 0.4628
name: train, f1: 39.19411827105892, precision: 30.05644343766817, recall: 56.3147774703372, loss: 0.5883450008361604


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [2/25] - Loss: 0.1878
name: train, f1: 55.68650496847954, precision: 47.168547664515664, recall: 67.95889892944228, loss: 0.3718691422731225


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [3/25] - Loss: 0.1178
name: train, f1: 65.29989112082433, precision: 57.97492294760437, recall: 74.74352490698263, loss: 0.27679784102291655


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [4/25] - Loss: 0.0870
name: train, f1: 71.31294655259059, precision: 65.05444315360259, recall: 78.90381626662062, loss: 0.22274377053129657


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [5/25] - Loss: 0.0661
name: train, f1: 75.56167085300655, precision: 70.17496751425297, recall: 81.84410965339059, loss: 0.18697064916466452


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [6/25] - Loss: 0.0549
name: train, f1: 78.71219332048106, precision: 73.96592467734206, recall: 84.10934835409023, loss: 0.1612691531094665


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [7/25] - Loss: 0.0441
name: train, f1: 81.15035171109936, precision: 76.91451081673219, recall: 85.87993714156109, loss: 0.14190986242856696


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [8/25] - Loss: 0.0358
name: train, f1: 83.08023825496535, precision: 79.26683799641178, recall: 87.27909534538543, loss: 0.12665787590589664


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [9/25] - Loss: 0.0309
name: train, f1: 84.6502016971428, precision: 81.17403657026298, recall: 88.43741071145116, loss: 0.11442547418087329


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [10/25] - Loss: 0.0279
name: train, f1: 85.94575011330186, precision: 82.7680258102859, recall: 89.37722221108034, loss: 0.10445767650376303


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [11/25] - Loss: 0.0246
name: train, f1: 87.03355894071602, precision: 84.0985829432228, recall: 90.18079963244891, loss: 0.0961708495179238


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [12/25] - Loss: 0.0228
name: train, f1: 87.96037839606291, precision: 85.22013780390482, recall: 90.88269774332984, loss: 0.08913552586473161


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [13/25] - Loss: 0.0217
name: train, f1: 88.76137465884088, precision: 86.20341050556024, recall: 91.4757891843728, loss: 0.08306508978481508


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [14/25] - Loss: 0.0191
name: train, f1: 89.45176515386851, precision: 87.05114827636648, recall: 91.98854092600035, loss: 0.07782477639869759


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [15/25] - Loss: 0.0179
name: train, f1: 90.07030322133976, precision: 87.80843817335095, recall: 92.4517764151234, loss: 0.07317819356272341


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [16/25] - Loss: 0.0166
name: train, f1: 90.61672458811984, precision: 88.47698358578032, recall: 92.86252622941635, loss: 0.0690941501688288


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [17/25] - Loss: 0.0150
name: train, f1: 91.10150702766558, precision: 89.073549579389, recall: 93.22395774806158, loss: 0.0654524054361975


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [18/25] - Loss: 0.0143
name: train, f1: 91.54090110291568, precision: 89.61232351540208, recall: 93.5543156570005, loss: 0.062161642055134476


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [19/25] - Loss: 0.0139
name: train, f1: 91.93664458418806, precision: 90.09688829650895, recall: 93.85310182783559, loss: 0.05921358211919012


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [20/25] - Loss: 0.0127
name: train, f1: 92.29434557438461, precision: 90.53341443271356, recall: 94.12513796969526, loss: 0.05654616384807784


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [21/25] - Loss: 0.0131
name: train, f1: 92.6189545994693, precision: 90.9331973075417, recall: 94.3683950098548, loss: 0.05413599193343904


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [22/25] - Loss: 0.0132
name: train, f1: 92.91476110885635, precision: 91.29765604196369, recall: 94.59018492533157, loss: 0.051941631767740776


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [23/25] - Loss: 0.0122
name: train, f1: 93.18969976162491, precision: 91.63616825111889, recall: 94.79681452061163, loss: 0.049897433566591674


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [24/25] - Loss: 0.0117
name: train, f1: 93.43814821742814, precision: 91.93898760502552, recall: 94.9870099983649, loss: 0.04806991260588346


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [25/25] - Loss: 0.0120
name: train, f1: 93.6708962213383, precision: 92.22424634586812, recall: 95.1636542984699, loss: 0.046346160613460904


In [28]:
models_dir = 'models'
os.makedirs(models_dir, exist_ok=True)

# It's good practice to include the '.pt' extension for PyTorch model files
model_path = os.path.join(models_dir, 'blstm1.pt')  # Updated to include the .pt extension

# Save the model state dictionary, overwriting any existing file
torch.save(bilstm_model.state_dict(), model_path)

print(f"Model saved to {model_path}")

Model saved to models\blstm1


In [29]:
# F1, precision, and recall result for eval
f1 = eval_dataloader(val_loader, bilstm_model.eval(), loss_fn=loss_fn, name='Dev', verbose=True)

name: test, f1: 93.57272255628227, precision: 92.15702041828331, recall: 95.03259889367612, loss: 0.048918584548751


In [30]:
def write_predictions_to_file(dataloader, model, idx2tag, file_path):
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad(), open(file_path, 'w', encoding='utf-8') as file:
        for batch in dataloader:
            input_ids = batch['input_ids'].to('cuda')
            # No labels are needed, as we're just predicting
            mask = batch['mask'].to('cuda')

            predictions = model(input_ids)
            predictions = predictions.argmax(-1)  # Get the most likely prediction indices

            # Convert the batch of predictions to tags
            for sentence_preds, sentence_mask in zip(predictions, mask):
                idx = 1  # Start index for each new sentence
                for pred, m in zip(sentence_preds, sentence_mask):
                    if m == 0:  # Skip padding
                        continue
                    tag = idx2tag[int(pred)]
                    
                    file.write(f"{idx} TOKEN {tag}\n")
                    idx += 1
                file.write("\n")  # Separate sentences by a blank line

In [31]:
def process_predictions(loader, model, idx2tag, dataset_partition, output_file, temp_file='temp_predictions.txt'):
    """
    Args:
    - loader: DataLoader for the dataset.
    - model: The model to make predictions.
    - idx2tag: Dictionary to map prediction indices to tags.
    - dataset_partition: Part of the dataset to use (e.g., 'validation', 'test').
    - output_file: File path for the output file.
    """
    # Step 1: Write predictions to a temporary file
    write_predictions_to_file(loader, model, idx2tag, temp_file)

    # Step 2: Read tokens from the dataset
    token_lists = [dataset_partition[i]['tokens'] for i in range(len(dataset_partition))]

    # Step 3: Read and process the temporary file content
    with open(temp_file, 'r') as file:
        content = file.read()

    blocks = content.strip().split('\n\n')
    assert len(blocks) == len(token_lists), "The number of blocks and token lists do not match."

    processed_blocks = []
    for block, tokens in zip(blocks, token_lists):
        lines = block.split('\n')
        new_lines = []
        for line, token in zip(lines, tokens):
            new_line = line.replace('TOKEN', token)
            new_lines.append(new_line)
        processed_blocks.append('\n'.join(new_lines))

    output_content = '\n\n'.join(processed_blocks)

    # Step 4: Write the processed content to the specified output file
    with open(output_file, 'w') as file:
        file.write(output_content)

    # Step 5: Clean up the temporary file
    if os.path.exists(temp_file):
        os.remove(temp_file)

In [33]:
from torch.nn.utils.rnn import pad_sequence

process_predictions(val_loader, bilstm_model, idx2tag, dataset['validation'], 'dev1.out')
process_predictions(test_loader, bilstm_model, idx2tag, test_dataset, 'test1.out')

In [34]:
glove_path = 'glove.6B.100d.txt'
glove = {}
with open(glove_path, 'r', encoding='utf-8') as file:
    for line in file:
        values = line.split()
        word = values[0]
        vector = np.array(values[1:], dtype='float32')
        glove[word] = [-0.1, *vector]
        if word[0].upper() != word[0]:
            glove[word[0].upper()+word[1:]] = [0.1, *vector]
            if len(word) > 1:
                glove[word.upper()] = [0.2, *vector]

In [35]:
pu_emb = torch.zeros(2, 101)
pu_emb.shape

torch.Size([2, 101])

In [36]:
glove_tensored = torch.tensor(list(glove.values()))
glove_tensored.mean(), glove_tensored.std()

(tensor(0.0049), tensor(0.4074))

In [37]:
glove_embeddings = torch.cat([pu_emb, glove_tensored], dim=0)

In [38]:
gword2idx = {
    word: index
    for index, word in enumerate(glove.keys(), start=2)
}
gword2idx['[PAD]'] = 0
gword2idx['[UNK]'] = 1

In [39]:
# Convert all tokens to their respective indx
def gconvert_word_to_id(sample):
    return {
        'input_ids': [
            (gword2idx[token] if token in gword2idx else gword2idx['[UNK]'])
            for token in sample['tokens']
    ]
}
gdataset_ids = dataset.map(gconvert_word_to_id)

Map:   0%|          | 0/14987 [00:00<?, ? examples/s]

Map:   0%|          | 0/3466 [00:00<?, ? examples/s]

In [40]:
test_gdataset_ids = test_dataset.map(gconvert_word_to_id)

Map:   0%|          | 0/3684 [00:00<?, ? examples/s]

In [41]:
gtrain_dataset = NER_Dataset(gdataset_ids['train'], gword2idx)
gval_dataset = NER_Dataset(gdataset_ids['validation'], gword2idx)

In [42]:
gtest_dataset = NER_Dataset_test(test_gdataset_ids, gword2idx)

In [43]:
# create data loader
gtrain_loader = DataLoader(gtrain_dataset, collate_fn=collate_fn, batch_size=batch_size, shuffle=True)
gval_loader = DataLoader(gval_dataset, collate_fn=collate_fn, batch_size=batch_size)

In [44]:
gtest_loader = DataLoader(gtest_dataset, collate_fn=collate_fn_test, batch_size=batch_size)

In [45]:
class GloveBiLSTM(nn.Module):
    def __init__(self, output_dim, glove_embeddings, embed_dim=101, lstm_dim=256,
                dropout=0.50, linear_dim=128):
        super().__init__()
        self.embed_dim = embed_dim
        self.embedding = nn.Embedding.from_pretrained(glove_embeddings, freeze=True)
        self.lstm_dim = lstm_dim
        self.lstm = nn.LSTM(self.embed_dim, self.lstm_dim, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        self.linear_dim = linear_dim
        self.fc2 = nn.Linear(self.lstm_dim * 2, self.linear_dim)
        self.elu = nn.ELU()
        self.output_dim = output_dim
        self.fc3 = nn.Linear(self.linear_dim, self.output_dim)

    def forward(self, input_ids):
        batch_size = input_ids.shape[0]
        emds = self.embedding(input_ids)
        h0 = torch.randn(2, batch_size, self.lstm_dim).to(input_ids.device)
        c0 = torch.randn(2, batch_size, self.lstm_dim).to(input_ids.device)
        output, (hn, cn) = self.lstm(emds, (h0, c0))
        output = self.dropout(output)
        linear_out = self.fc2(output)
        linear_out = self.dropout(linear_out)
        elu_out = self.elu(linear_out)
        results = self.fc3(elu_out)
        return results

In [46]:
# train model
gbilstm_model = GloveBiLSTM(output_dim=len(tag2idx), glove_embeddings=glove_embeddings).to(device)
losses, vlosses, lrs = training_loop(*create_hyperparameters(gbilstm_model, 25), gbilstm_model, gtrain_loader)

  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [1/25] - Loss: 0.2495
name: train, f1: 93.30477285953549, precision: 92.02812082642116, recall: 94.61734365926209, loss: 0.05088261464442016


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [2/25] - Loss: 0.1082
name: train, f1: 93.22215527924445, precision: 92.03826097817375, recall: 94.43690347908459, loss: 0.05145425501858018


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [3/25] - Loss: 0.0816
name: train, f1: 93.22498549367319, precision: 92.10731707317073, recall: 94.37011160231242, loss: 0.0514117906873285


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [4/25] - Loss: 0.0662
name: train, f1: 93.29199415925093, precision: 92.2163120050031, recall: 94.39306766966698, loss: 0.050918564567029256


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [5/25] - Loss: 0.0562
name: train, f1: 93.37339618860199, precision: 92.33140934454148, recall: 94.43916972236774, loss: 0.050274979134563046


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [6/25] - Loss: 0.0475
name: train, f1: 93.47320508067892, precision: 92.46003621255035, recall: 94.50882438773787, loss: 0.04950516607013134


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [7/25] - Loss: 0.0407
name: train, f1: 93.58757728880354, precision: 92.59828993296665, recall: 94.59823131046772, loss: 0.048618305919922


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [8/25] - Loss: 0.0362
name: train, f1: 93.71215112217936, precision: 92.76075463919297, recall: 94.68326575342466, loss: 0.04765335966539777


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [9/25] - Loss: 0.0320
name: train, f1: 93.84542640193314, precision: 92.9225844033803, recall: 94.78678230785209, loss: 0.0466542746977541


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [10/25] - Loss: 0.0280
name: train, f1: 93.97108730669838, precision: 93.07193987144291, recall: 94.88777714441252, loss: 0.0456862085998661


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [11/25] - Loss: 0.0267
name: train, f1: 94.10139005581586, precision: 93.22979600540194, recall: 94.98943475539005, loss: 0.04470715560675574


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [12/25] - Loss: 0.0235
name: train, f1: 94.22795883854474, precision: 93.3807473905476, recall: 95.0906839536747, loss: 0.043738315638063484


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [13/25] - Loss: 0.0212
name: train, f1: 94.35213886932382, precision: 93.52688057599468, recall: 95.19209056788014, loss: 0.04280115883456432


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [14/25] - Loss: 0.0194
name: train, f1: 94.46872060692374, precision: 93.658963664759, recall: 95.29260166125975, loss: 0.04192177358612203


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [15/25] - Loss: 0.0194
name: train, f1: 94.58459456456828, precision: 93.79243120983274, recall: 95.39025299399678, loss: 0.04104639045533439


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [16/25] - Loss: 0.0180
name: train, f1: 94.69719170190535, precision: 93.92456739310836, recall: 95.48263266922345, loss: 0.04019742836018451


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [17/25] - Loss: 0.0160
name: train, f1: 94.80815503897226, precision: 94.05262885868797, recall: 95.57591782238157, loss: 0.03936398553587851


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [18/25] - Loss: 0.0149
name: train, f1: 94.9132865497938, precision: 94.17475147361061, recall: 95.66349662909211, loss: 0.03856768512052801


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [19/25] - Loss: 0.0148
name: train, f1: 95.01543084544791, precision: 94.29200416502549, recall: 95.75004389155025, loss: 0.037801812855466244


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [20/25] - Loss: 0.0137
name: train, f1: 95.11051029021465, precision: 94.40367462547233, recall: 95.82801049097533, loss: 0.03708856678931319


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [21/25] - Loss: 0.0140
name: train, f1: 95.20354432856016, precision: 94.51010464412781, recall: 95.90723504027328, loss: 0.03638357081493175


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [22/25] - Loss: 0.0130
name: train, f1: 95.29502730480922, precision: 94.6155243370815, recall: 95.98436088440275, loss: 0.03569285177138105


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [23/25] - Loss: 0.0129
name: train, f1: 95.38196490637291, precision: 94.71502188464731, recall: 96.05836719615812, loss: 0.035038427038201


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [24/25] - Loss: 0.0122
name: train, f1: 95.46743444933671, precision: 94.81401975650537, recall: 96.12991770668118, loss: 0.03439770329748724


  0%|          | 0/469 [00:00<?, ?it/s]

Epoch [25/25] - Loss: 0.0130
name: train, f1: 95.54805925676884, precision: 94.90548813200849, recall: 96.19939093474534, loss: 0.0337908519162668


In [47]:
models_dir = 'models'
os.makedirs(models_dir, exist_ok=True)

# Save the model state dictionary, overwriting any existing file
model_path = os.path.join(models_dir, 'blstm2.pt')
torch.save(gbilstm_model.state_dict(), model_path)

print(f"Model saved to {model_path}")


Model saved to models\blstm2


In [48]:
# F1, precision, and recall result for eval
f1 = eval_dataloader(gval_loader, gbilstm_model.eval(), loss_fn=loss_fn, name='Dev', verbose=True)

name: test, f1: 95.53320150832337, precision: 94.89385925895897, recall: 96.1812172626781, loss: 0.034131143978649976


In [49]:
process_predictions(gval_loader, gbilstm_model, idx2tag, dataset['validation'], 'dev2.out')
process_predictions(gtest_loader, gbilstm_model, idx2tag, test_dataset, 'test2.out')