In [21]:
import os
import json
from tqdm import tqdm
from collections import Counter
import numpy as np

import torch
from torch.utils.data import DataLoader
import torch.utils.tensorboard as tensorboard
from seqeval.metrics import classification_report

from utils import set_random_seed, Config, load_ner_config
from dataset import io2df, io2bio, padding, NERDataset
from model import BiLSTM_CRF

%load_ext autoreload
%autoreload 2

set_random_seed(seed=0)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Build the run configuration from 'config.yaml' and show its attributes
# as the cell output.
config = Config('config.yaml')
vars(config)

In [None]:
# Read the training and validation splits with the project's io2df helper.
tr_titles, va_titles = [io2df(path) for path in (config.TR_PATH, config.VA_PATH)]

In [None]:
# Show the training frame returned by io2df as the cell output.
tr_titles

# Tag -> tagID

In [None]:
# Tag -> id table loaded from the fine-grained tag set, plus its inverse
# (id -> tag) used to turn id sequences back into tag strings.
TAG2IDX = load_ner_config('ner_tags/ner_fine_grained.json')
IDX2TAG = dict(zip(TAG2IDX.values(), TAG2IDX.keys()))

In [None]:
# Show the tag -> id mapping as the cell output.
TAG2IDX

In [None]:
# Add a 'tags_ids' column to both splits: every fine-grained tag is replaced
# by its id from TAG2IDX (an unknown tag raises KeyError).
for frame in (tr_titles, va_titles):
    frame['tags_ids'] = frame['tags_fine_grained'].transform(
        lambda tags: [TAG2IDX[tag] for tag in tags]
    )

# Token -> tokenID

In [None]:
def calc_token_cntr(filepath):
    """Count how often each token occurs in a two-column annotation file.

    Every non-blank line must hold exactly two whitespace-separated fields,
    ``<token> <tag>``; only the token is counted. Blank lines are skipped.

    Args:
        filepath: Path to a UTF-8 encoded text file.

    Returns:
        collections.Counter mapping token -> number of occurrences.

    Raises:
        ValueError: If a non-blank line does not have exactly two fields.
    """
    # First pass only counts lines so tqdm can display a total. The context
    # manager closes the handle deterministically instead of leaking it.
    with open(filepath, "r", encoding="utf-8") as f:
        num_lines = sum(1 for _ in f)

    token_cntr = Counter()
    with open(filepath, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(tqdm(f, total=num_lines), start=1):
            fields = line.split()
            if not fields:
                continue
            if len(fields) != 2:
                # Same exception type as a failed tuple unpacking, but the
                # message now says where the malformed line is.
                raise ValueError(
                    f"{filepath}, line {line_no}: expected '<token> <tag>', "
                    f"got {len(fields)} field(s)"
                )
            token_cntr[fields[0]] += 1

    return token_cntr

# Token frequencies are taken from the training split only.
token_cntr = calc_token_cntr(filepath=config.TR_PATH)

# Vocabulary cap: keep at most the MC most frequent training tokens.
MC = 50_000
top_tokens = [token for token, _ in token_cntr.most_common(MC)]

# Ids 0 and 1 are reserved for the special entries, so corpus tokens start at 2.
# NOTE(review): a corpus token literally spelled 'PAD' or 'UKN' would be
# overwritten by the two assignments below and share the special id.
TOKEN2IDX = {token: idx for idx, token in enumerate(top_tokens, start=2)}
TOKEN2IDX['PAD'] = 0
TOKEN2IDX['UKN'] = 1

# Persist the vocabulary. The target directory is created first so a fresh
# checkout does not fail with FileNotFoundError.
os.makedirs('tokenizers', exist_ok=True)
with open('tokenizers/token2idx.json', 'w', encoding='utf-8') as f:
    json.dump(TOKEN2IDX, f, indent=4)

In [None]:
# Add a 'tokens_ids' column to both splits: tokens found in the vocabulary get
# their own id, everything else falls back to the id of the 'UKN' entry.
unk_id = TOKEN2IDX['UKN']
for frame in (tr_titles, va_titles):
    frame['tokens_ids'] = frame['tokens'].transform(
        lambda tokens: [TOKEN2IDX.get(token, unk_id) for token in tokens]
    )

# Padding

In [None]:
# Run the project's `padding` helper with max_len=config.SEQ_LEN over both id
# columns of both splits (train first, then validation).
for frame in (tr_titles, va_titles):
    for column in ('tokens_ids', 'tags_ids'):
        frame[column] = frame[column].transform(padding, max_len=config.SEQ_LEN)

In [None]:
# Show the training frame again, now with the padded id columns.
tr_titles

# Data loader

In [None]:
# Wrap the prepared frames in the project's Dataset class.
tr_dataset = NERDataset(tr_titles)
va_dataset = NERDataset(va_titles)

# Training batches are reshuffled every epoch. Validation keeps a fixed order:
# the metrics do not depend on order, and the first few sequences printed per
# epoch by the training cell stay comparable across epochs.
tr_dataloader = DataLoader(dataset=tr_dataset, batch_size=config.BATCH_SIZE, shuffle=True, num_workers=4)
va_dataloader = DataLoader(dataset=va_dataset, batch_size=config.BATCH_SIZE, shuffle=False, num_workers=4)

In [None]:
# Fetch one training item (index 1) to inspect what the Dataset yields.
ex = tr_dataset[1]
ex

# Model

In [None]:
# Use the GPU when torch can see one, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Vocabulary sizes come from the lookup tables; the remaining
# hyper-parameters are read from the config object.
model_kwargs = dict(
    embed_size=config.EMBED_SIZE,
    hidden_size=config.HIDDEN_SIZE,
    dropout=config.DROPOUT,
    token_voc_size=len(TOKEN2IDX),
    tag_voc_size=len(TAG2IDX),
)
model = BiLSTM_CRF(**model_kwargs).to(device)
print(model)

# Train

In [None]:
# Model checkpoints are saved under this folder after every epoch.
weights_folder = 'weights'
os.makedirs(weights_folder, exist_ok=True)

# TensorBoard event files. The name matches the SummaryWriter log_dir
# ('./runs'); the previous value '.runs' created an unused hidden folder.
runs_folder = 'runs'
os.makedirs(runs_folder, exist_ok=True)

In [None]:
# Make optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=config.LR)

# Make tensorboard writer
writer = tensorboard.SummaryWriter(log_dir='./runs')


def _ids_to_bio(id_rows, keep_mask):
    """Convert a batch of tag-id rows into BIO tag lists for seqeval.

    For each row, only positions where `keep_mask` is True are kept; the ids
    are mapped through IDX2TAG and the result is passed to io2bio.
    """
    return [
        io2bio([IDX2TAG[idx] for idx in row[keep_mask[row_id]].tolist()])
        for row_id, row in enumerate(id_rows)
    ]


# try/finally: the writer is flushed and closed even when training is
# interrupted (e.g. the cell is stopped by hand) or an epoch raises.
try:
    for epoch in range(config.NUM_EPOCHS):

        # TRAINING PHASE

        tr_losses = []

        model.train()
        for tr_batch in tqdm(tr_dataloader, total=len(tr_dataloader)):
            optimizer.zero_grad()

            tr_xs = tr_batch['tokens_ids'].to(device)
            tr_ys = tr_batch['tags_ids'].to(device)

            # Calculate loss. Positions with tag id 0 are excluded through the mask
            # (presumes id 0 marks padding; confirm against `padding` / TAG2IDX).
            # Per the original note the scores are [batch, seq_len, num_tags]; TODO confirm.
            tr_emission_scores = model(x=tr_xs).to(device)
            tr_loss = model.loss_fn(emission_scores=tr_emission_scores, tags=tr_ys, mask=(tr_ys > 0))
            tr_losses.append(tr_loss.item())

            # Total loss = sequence loss + the model's regularization term
            total_loss = tr_loss + model.regularization_loss_fn(lam=config.REG_LAMBDA, alpha=config.REG_ALPHA)

            # Backward pass: compute gradient of the loss w.r.t. all learnable parameters
            total_loss.backward()

            # Clip computed gradients; the returned value is the norm before clipping
            grad_norm = torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=config.MAX_GRAD_NORM)

            # Optimize: update the weights using Adam optimizer
            optimizer.step()

        # END TRAINING PHASE AND UPDATE LOG

        with torch.no_grad():
            # "NLL" and the logged grad norm are the values of the LAST batch of the
            # epoch; 'tr/loss' is the mean over all batches.
            print(f"Epoch: {epoch:02d} NLL:{tr_loss.item()}")
            writer.add_scalar('tr/loss', np.mean(tr_losses), global_step=epoch)
            writer.add_scalar('tr/total_grad_norm', grad_norm, global_step=epoch)
            for name, param in model.named_parameters():
                writer.add_histogram('tr/' + name, param.data, global_step=epoch)
            print("tr loss", np.mean(tr_losses))

        # VALIDATION PHASE

        va_losses = []

        # Despite the names, these accumulate over ALL validation batches of the epoch.
        batch_preds = []
        batch_trues = []

        model.eval()
        with torch.no_grad():
            for va_batch in tqdm(va_dataloader, total=len(va_dataloader)):
                va_xs = va_batch['tokens_ids'].to(device)
                va_ys = va_batch['tags_ids'].to(device)

                # Computed once and shared by the loss and by the metric filtering.
                mask = va_ys > 0

                # Forward pass: compute predicted output by passing input to the model
                va_emission_scores = model(x=va_xs).to(device)
                # torch.tensor requires decode to return equal-length rows
                # NOTE(review): confirm model.decode always returns full-length sequences.
                va_preds = torch.tensor(model.decode(va_emission_scores)).to(device)
                va_loss = model.loss_fn(emission_scores=va_emission_scores, tags=va_ys, mask=mask)
                va_losses.append(va_loss.item())

                # Drop masked-out positions, map ids to tags, convert to the format expected by seqeval
                batch_trues.extend(_ids_to_bio(va_ys, mask))
                batch_preds.extend(_ids_to_bio(va_preds, mask))

            # Show up to five examples; slicing avoids an IndexError on tiny validation sets.
            for pred_tags, true_tags in zip(batch_preds[:5], batch_trues[:5]):
                print('pred:', pred_tags)
                print('true:', true_tags)
                print()

            print("va loss", np.mean(va_losses))
            writer.add_scalar('va/loss', np.mean(va_losses), global_step=epoch)

            report = classification_report(y_true=batch_trues, y_pred=batch_preds, zero_division=0)
            print(report)

        # One checkpoint per epoch.
        torch.save(model.state_dict(), f"weights/model_epoch_{epoch:02d}.pt")
finally:
    writer.close()

In [20]:
# !tensorboard --logdir=artefacts/fine_grained/runs