# Many-to-many NLP task: part-of-speech tagging with a BiLSTM

In [1]:
!pip install torch -f https://data.pyg.org/whl/torch-2.2.0+cpu.html
!pip install torch-geometric -f https://data.pyg.org/whl/torch-2.2.0+cpu.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-2.2.0+cpu.html
!pip install torch-scatter -f https://data.pyg.org/whl/torch-2.2.0+cpu.html

Looking in links: https://data.pyg.org/whl/torch-2.2.0+cpu.html
Looking in links: https://data.pyg.org/whl/torch-2.2.0+cpu.html
Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m63.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.6.1
Looking in links: https://data.pyg.org/whl/torch-2.2.0+cpu.html
Collecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.2.0%2Bcpu/torch_sparse-0.6.18%2Bpt22cpu-cp310-cp310-linux_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch-sparse
Succ

In [2]:
!pip install -U torchtext==0.17.0

Collecting torchtext==0.17.0
  Downloading torchtext-0.17.0-cp310-cp310-manylinux1_x86_64.whl.metadata (7.6 kB)
Collecting torch==2.2.0 (from torchtext==0.17.0)
  Downloading torch-2.2.0-cp310-cp310-manylinux1_x86_64.whl.metadata (25 kB)
Collecting torchdata==0.7.1 (from torchtext==0.17.0)
  Downloading torchdata-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.2.0->torchtext==0.17.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.2.0->torchtext==0.17.0)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.2.0->torchtext==0.17.0)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.2.0->torcht

In [3]:
import pandas as pd
import torch
import warnings

warnings.filterwarnings('ignore')

## Data reading and preprocessing

In [4]:
# Token-level data, one row per token. Per the `.head()` previews, both files
# carry `sentence_id`, `entity_id` and `entity`; the training file also has the
# gold `tag`, while the test file has an `id` column instead.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [5]:
train.head()

Unnamed: 0,sentence_id,entity_id,entity,tag
0,0,0,It,PRON
1,0,1,is,VERB
2,0,2,true,ADJ
3,0,3,that,ADP
4,0,4,his,DET


In [6]:
test.head()

Unnamed: 0,id,sentence_id,entity_id,entity
0,0,0,0,In
1,1,0,1,another
2,2,0,2,setback
3,3,0,3,yesterday
4,4,0,4,","


In [7]:
from sklearn.model_selection import train_test_split

# Share of sentences held out for validation.
VALIDATION_RATIO = 0.2

# Split by sentence rather than by token, so that every sentence stays whole
# in exactly one of the two subsets.
# The ids are taken from the data itself: `range(max_id)` stops one short of
# the largest id (that sentence would end up in neither subset) and it also
# assumes the ids are contiguous and start at 0.
sentence_ids = sorted(train['sentence_id'].unique())
train_split, val_split = train_test_split(
    sentence_ids,
    test_size=VALIDATION_RATIO,
    random_state=420,
)

In [8]:
# Select the token rows of each subset by sentence id.
# `.copy()` gives each subset its own data, so any later in-place
# preprocessing of a subset cannot affect `train` and does not trigger
# pandas' SettingWithCopyWarning.
train_dataframe = train[train['sentence_id'].isin(train_split)].copy()
val_dataframe = train[train['sentence_id'].isin(val_split)].copy()

In [9]:
# Tag inventory: the position of a tag in this list is its class index.
pos_tags = [
    'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN',
    'NUM', 'PRT', 'PRON', 'VERB', '.', 'X',
]
idx2cat = dict(enumerate(pos_tags))
cat2idx = {tag: idx for idx, tag in idx2cat.items()}

# Reserved vocabulary entries; each index is the position of the matching
# symbol in `special_symbols`.
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = range(4)

In [10]:
import torch
torch.manual_seed(420)
from torchtext.vocab import build_vocab_from_iterator


class PosTaggingDataset(torch.utils.data.Dataset):
    """Sentence-level POS-tagging dataset built from a token-per-row dataframe.

    Rows are grouped by ``sentence_id``; item ``i`` is the pair
    ``(token indices, tag indices)`` of the i-th group.

    Args:
        dataframe: frame with ``sentence_id``, ``entity`` and ``tag`` columns.
            It is copied, so the caller's frame is left untouched.
        vocab: callable mapping a list of tokens to a list of indices (for
            example the vocab of the training dataset). When ``None`` a new
            vocabulary is built from the sentences of ``dataframe``.
        max_size: currently unused; kept so that existing calls stay valid.
    """

    def __init__(self, dataframe: pd.DataFrame, vocab = None, max_size=100):
        # Preprocessing rewrites columns, so work on a private copy instead of
        # silently modifying (a slice of) the caller's dataframe.
        self.dataframe = dataframe.copy()
        self._preprocess()
        # Compare against None rather than relying on truthiness: the vocab
        # supports len(), so an empty one would be falsy and get replaced.
        self.vocab = vocab if vocab is not None else self._create_vocab()

    def _preprocess(self):
        """Clean the token/tag columns and collect them per sentence."""
        frame = self.dataframe

        # Missing tokens become empty strings, then everything is lower-cased.
        frame['entity'] = frame['entity'].fillna('').str.lower()

        # Missing tags fall back to `other` - `X`. `fillna` returns a new
        # Series, so the result has to be assigned back to take effect.
        frame['tag'] = frame['tag'].fillna('X')

        # Group the rows so that the sentence and its tags share one index.
        grouped_dataframe = frame.groupby(by='sentence_id')

        self.sentences = grouped_dataframe['entity'].apply(list).to_list()
        self.tags = grouped_dataframe['tag'].apply(list).to_list()

    def _create_vocab(self):
        """Build the token vocabulary used to encode the sentences.

        The special symbols come first and unknown tokens map to ``UNK_IDX``.
        """
        vocab = build_vocab_from_iterator(
            self.sentences,
            specials=special_symbols,
            special_first=True,
        )
        vocab.set_default_index(UNK_IDX)
        return vocab

    def _get_sentence(self, index: int) -> list:
        """Return sentence ``index`` encoded by the vocab (raw tokens if there is none)."""
        sent = self.sentences[index]
        if self.vocab is None:
            return sent
        return self.vocab(sent)

    def _get_labels(self, index: int) -> list:
        """Return the tag indices of sentence ``index``; unknown tags map to ``X``."""
        tags = self.tags[index]
        return [cat2idx.get(tag, cat2idx['X']) for tag in tags]

    def __getitem__(self, index) -> tuple[list, list]:
        return self._get_sentence(index), self._get_labels(index)

    def __len__(self) -> int:
        return len(self.sentences)

In [11]:
# Create the train dataset; no vocab is passed, so it builds one from its own sentences.
train_dataset = PosTaggingDataset(dataframe=train_dataframe)
# The validation dataset reuses the training vocabulary, so both encode tokens identically.
val_dataset = PosTaggingDataset(dataframe=val_dataframe, vocab=train_dataset.vocab)

In [12]:
# Number of sentences per batch.
batch_size = 128
# Fixed sequence length: `collate_batch` cuts longer sentences to this many
# tokens and pads shorter ones up to it.
max_size = 50

# Use the GPU when torch reports one, otherwise stay on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def collate_batch(batch: list):
    """Collate dataset samples into fixed-length training tensors.

    Args:
        batch: list of ``(token_indices, tag_indices)`` pairs as produced by
            the dataset, e.g. ``[([ent1, ent2, ...], [tag1, tag2, ...]), ...]``.

    Returns:
        ``(sentences, tags)``, both of shape ``[max_size, len(batch)]`` and
        moved to ``device``. ``sentences`` is int32 and padded with
        ``PAD_IDX``; ``tags`` is int64 and padded with the index of ``X``.
        Samples longer than ``max_size`` are truncated.
    """
    n_samples = len(batch)

    # Start from fully padded integer tensors and copy each sample in.
    # Building integer tensors directly avoids the float32 round trip of the
    # legacy `torch.Tensor(list)` constructor, which is lossy above 2**24.
    sentences = torch.full((n_samples, max_size), PAD_IDX, dtype=torch.int32)
    # NOTE(review): padded tag positions carry the real class `X`, so a loss
    # or accuracy computed over the whole tensor also counts the padding.
    # Consider padding with an ignore index and masking it in the metrics.
    postags = torch.full((n_samples, max_size), cat2idx['X'], dtype=torch.long)

    for row, (_sent, _postags) in enumerate(batch):
        length = min(len(_sent), max_size)
        sentences[row, :length] = torch.tensor(_sent[:length], dtype=torch.int32)
        postags[row, :length] = torch.tensor(_postags[:length], dtype=torch.long)

    # Transpose to [sent len, batch size].
    return sentences.T.to(device), postags.T.to(device)

# Both loaders use the same batch size and collate function; only the
# training loader is created with shuffling enabled.
train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_batch,
)
val_dataloader = torch.utils.data.DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_batch,
)

## Creating the network


In [13]:
import torch.nn as nn

class BiLSTMPOSTagger(nn.Module):
    """Embedding -> (bi)LSTM -> linear layer, producing one score vector per token.

    Args:
        input_dim: number of rows of the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of scores produced per token.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM also reads the sequence backwards.
        dropout: dropout probability for the embeddings, the LSTM outputs and
            (when ``n_layers > 1``) the LSTM itself.
        pad_idx: passed to the embedding as ``padding_idx``.
    """

    def __init__(self,
                 input_dim,
                 embedding_dim,
                 hidden_dim,
                 output_dim,
                 n_layers,
                 bidirectional,
                 dropout,
                 pad_idx):

        super().__init__()

        # The LSTM only receives a dropout value when it has more than one layer.
        lstm_dropout = dropout if n_layers > 1 else 0
        # A bidirectional LSTM emits both directions, doubling the feature size.
        fc_in_features = hidden_dim * 2 if bidirectional else hidden_dim

        self.embedding = nn.Embedding(
            num_embeddings=input_dim,
            embedding_dim=embedding_dim,
            padding_idx=pad_idx,
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            dropout=lstm_dropout,
        )
        self.fc = nn.Linear(fc_in_features, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Score every token of ``text``.

        ``text`` is ``[sent len, batch size]``; the result is
        ``[sent len, batch size, output dim]``.
        """
        # [sent len, batch size] -> [sent len, batch size, emb dim]
        token_vectors = self.dropout(self.embedding(text))

        # Per-step outputs of the last layer:
        # [sent len, batch size, hid dim * n directions].
        # The final hidden/cell states are not needed for tagging.
        states, _ = self.lstm(token_vectors)

        # One prediction per token from the per-step outputs.
        return self.fc(self.dropout(states))

## Training

In [14]:
from tqdm.autonotebook import tqdm

def train_one_epoch(
    model,
    loader,
    optimizer,
    loss_fn,
    epoch_num=-1
):
    """Run one optimisation pass over ``loader``.

    Args:
        model: network called as ``model(texts)``; its output is flattened to
            ``[positions, classes]`` before the loss is computed.
        loader: iterable of ``(texts, labels)`` batches that supports ``len()``.
        optimizer: optimizer stepping the parameters of ``model``.
        loss_fn: callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        epoch_num: epoch number shown in the progress bar.

    Returns:
        float: loss averaged over all label positions seen in the epoch
        (``0.0`` for an empty loader).
    """
    loop = tqdm(
        enumerate(loader, 1),
        total=len(loader),
        # Use the argument; the notebook-global `epoch` only exists while the
        # training loop cell is running.
        desc=f"Epoch {epoch_num}: train",
        leave=True,
    )
    model.train()
    loss_sum = 0.0
    total = 0
    for i, batch in loop:
        texts, labels = batch
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward pass and loss calculation
        outputs = model(texts)
        outputs = outputs.view(-1, outputs.shape[-1])

        labels = labels.reshape(-1)
        loss = loss_fn(outputs, labels)

        # backward pass and optimizer step
        loss.backward()
        optimizer.step()

        # Weight each batch loss by its number of positions so that
        # loss_sum / total is a proper running average. This assumes loss_fn
        # returns a batch mean, as CrossEntropyLoss does by default.
        n_positions = labels.size(0)
        total += n_positions
        loss_sum += loss.item() * n_positions
        loop.set_postfix({"loss": loss_sum / total})

    return loss_sum / total if total else 0.0

def val_one_epoch(
    model,
    loader,
    loss_fn,
    epoch_num=-1,
    best_so_far=0.0,
    ckpt_path='best.pt'
):
    """Evaluate ``model`` on ``loader`` and checkpoint it when accuracy improves.

    Args:
        model: network called as ``model(texts)``.
        loader: iterable of ``(texts, labels)`` batches that supports ``len()``.
        loss_fn: callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        epoch_num: epoch number shown in the progress bar.
        best_so_far: best accuracy reached before this call.
        ckpt_path: where ``model.state_dict()`` is saved on improvement.

    Returns:
        float: the accuracy of this epoch if it is greater than
        ``best_so_far`` (the checkpoint is written in that case), otherwise
        ``best_so_far`` unchanged.
    """
    loop = tqdm(
        enumerate(loader, 1),
        total=len(loader),
        # Use the argument, not the notebook-global `epoch`.
        desc=f"Epoch {epoch_num}: val",
        leave=True,
    )
    loss_sum = 0.0
    correct = 0
    total = 0
    model.eval()  # evaluation mode
    with torch.no_grad():
        for i, batch in loop:
            texts, labels = batch

            # forward pass and loss calculation
            outputs = model(texts)
            outputs = outputs.view(-1, outputs.shape[-1])

            labels = labels.reshape(-1)
            loss = loss_fn(outputs, labels)

            predicted = outputs.argmax(dim=1)
            n_positions = labels.size(0)
            total += n_positions
            correct += (predicted == labels).sum().item()

            # Weight the batch loss by its size so that loss_sum / total is a
            # running average (assumes loss_fn returns a batch mean).
            loss_sum += loss.item() * n_positions
            loop.set_postfix({"loss": loss_sum / total, "acc": correct / total})

    # Nothing was evaluated: there is no accuracy to compare.
    if total == 0:
        return best_so_far

    accuracy = correct / total
    # Compare against the argument; a global `best` may not exist or may be stale.
    if accuracy > best_so_far:
        torch.save(model.state_dict(), ckpt_path)
        return accuracy

    return best_so_far

In [15]:
# Sizes derived from the data.
INPUT_DIM = len(train_dataset.vocab)   # one embedding row per vocabulary entry
OUTPUT_DIM = len(pos_tags)             # one score per tag

# Architecture and regularisation settings.
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.25
PAD_IDX = 1

num_epochs = 20

model = BiLSTMPOSTagger(
    input_dim=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)
model = model.to(device)

# Adam with its default settings and a plain cross-entropy objective.
optimizer = torch.optim.Adam(model.parameters())
loss_fn = torch.nn.CrossEntropyLoss()

In [16]:
# Track the best validation accuracy across epochs. The value returned by
# `val_one_epoch` has to be fed back in as the next threshold; if `best` stays
# at -inf, every epoch counts as an improvement and 'best.pt' ends up holding
# the last model instead of the best one.
best = -float('inf')
for epoch in range(num_epochs):
    train_one_epoch(model, train_dataloader, optimizer, loss_fn, epoch_num=epoch)
    # `best_so_far` is kept as an alias for any later cell that reads it.
    best = best_so_far = val_one_epoch(model, val_dataloader, loss_fn, epoch, best_so_far=best)

Epoch 0: train:   0%|          | 0/361 [00:00<?, ?it/s]

Epoch 0: val:   0%|          | 0/91 [00:00<?, ?it/s]

Epoch 1: train:   0%|          | 0/361 [00:00<?, ?it/s]

Epoch 1: val:   0%|          | 0/91 [00:00<?, ?it/s]

Epoch 2: train:   0%|          | 0/361 [00:00<?, ?it/s]

Epoch 2: val:   0%|          | 0/91 [00:00<?, ?it/s]

Epoch 3: train:   0%|          | 0/361 [00:00<?, ?it/s]

Epoch 3: val:   0%|          | 0/91 [00:00<?, ?it/s]

Epoch 4: train:   0%|          | 0/361 [00:00<?, ?it/s]

Epoch 4: val:   0%|          | 0/91 [00:00<?, ?it/s]

Epoch 5: train:   0%|          | 0/361 [00:00<?, ?it/s]

Epoch 5: val:   0%|          | 0/91 [00:00<?, ?it/s]

Epoch 6: train:   0%|          | 0/361 [00:00<?, ?it/s]

Epoch 6: val:   0%|          | 0/91 [00:00<?, ?it/s]

Epoch 7: train:   0%|          | 0/361 [00:00<?, ?it/s]

Epoch 7: val:   0%|          | 0/91 [00:00<?, ?it/s]

Epoch 8: train:   0%|          | 0/361 [00:00<?, ?it/s]

Epoch 8: val:   0%|          | 0/91 [00:00<?, ?it/s]

Epoch 9: train:   0%|          | 0/361 [00:00<?, ?it/s]

Epoch 9: val:   0%|          | 0/91 [00:00<?, ?it/s]

Epoch 10: train:   0%|          | 0/361 [00:00<?, ?it/s]

Epoch 10: val:   0%|          | 0/91 [00:00<?, ?it/s]

Epoch 11: train:   0%|          | 0/361 [00:00<?, ?it/s]

Epoch 11: val:   0%|          | 0/91 [00:00<?, ?it/s]

Epoch 12: train:   0%|          | 0/361 [00:00<?, ?it/s]

Epoch 12: val:   0%|          | 0/91 [00:00<?, ?it/s]

Epoch 13: train:   0%|          | 0/361 [00:00<?, ?it/s]

Epoch 13: val:   0%|          | 0/91 [00:00<?, ?it/s]

Epoch 14: train:   0%|          | 0/361 [00:00<?, ?it/s]

Epoch 14: val:   0%|          | 0/91 [00:00<?, ?it/s]

Epoch 15: train:   0%|          | 0/361 [00:00<?, ?it/s]

Epoch 15: val:   0%|          | 0/91 [00:00<?, ?it/s]

Epoch 16: train:   0%|          | 0/361 [00:00<?, ?it/s]

Epoch 16: val:   0%|          | 0/91 [00:00<?, ?it/s]

Epoch 17: train:   0%|          | 0/361 [00:00<?, ?it/s]

Epoch 17: val:   0%|          | 0/91 [00:00<?, ?it/s]

Epoch 18: train:   0%|          | 0/361 [00:00<?, ?it/s]

Epoch 18: val:   0%|          | 0/91 [00:00<?, ?it/s]

Epoch 19: train:   0%|          | 0/361 [00:00<?, ?it/s]

Epoch 19: val:   0%|          | 0/91 [00:00<?, ?it/s]

## Predictions

In [17]:
# The test file has no `tag` column, but PosTaggingDataset reads one for every
# frame, so add a placeholder column (this modifies `test` in place).
test['tag'] = 'X'
# Encode the test tokens with the vocabulary built from the training sentences.
test_dataset = PosTaggingDataset(test, vocab=train_dataset.vocab)

In [18]:
# Number of sentences per prediction batch.
batch_size = 128

# Use the GPU when torch reports one, otherwise stay on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def collate_batch(batch: list):
    """Collate samples for prediction, padding to the longest sentence.

    Unlike a fixed-length collate, nothing is truncated: the true length of
    every sentence is returned so that the padded positions can be dropped
    from the predictions afterwards.

    Args:
        batch: list of ``(token_indices, tag_indices)`` pairs; the tags are
            ignored.

    Returns:
        ``(sentences, lengths)``: an int32 tensor of shape
        ``[longest sentence, len(batch)]`` on ``device`` padded with
        ``PAD_IDX``, and the list of original sentence lengths.
    """
    sentences_lengths = [len(_sent) for _sent, _ in batch]
    # default=0 keeps an empty batch from raising inside max().
    longest = max(sentences_lengths, default=0)

    # Build an integer tensor directly; the legacy `torch.Tensor(list)`
    # constructor goes through float32, which is lossy above 2**24.
    sentences_batch = torch.full((len(batch), longest), PAD_IDX, dtype=torch.int32)
    for row, (_sent, _) in enumerate(batch):
        sentences_batch[row, :len(_sent)] = torch.tensor(_sent, dtype=torch.int32)

    # Transpose to [sent len, batch size].
    return sentences_batch.T.to(device), sentences_lengths

# Reuse `batch_size` instead of repeating the literal 128, so the setting only
# has to be changed in one place. shuffle=False keeps the predictions in
# dataset order.
test_dataloader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_batch,
)

In [19]:
def predict(
    model,
    loader,
):
    """Predict a tag index for every real (non-padded) token in ``loader``.

    Args:
        model: network called as ``model(texts)``, returning scores of shape
            ``[sent len, batch size, classes]``.
        loader: iterable of ``(texts, real_lengths)`` batches that supports
            ``len()``; ``real_lengths[k]`` is the unpadded length of sample ``k``.

    Returns:
        list[int]: predicted class indices, sample after sample, with each
        sample cut to its real length.
    """
    progress = tqdm(
        enumerate(loader, 1),
        total=len(loader),
        desc="Predictions",
        leave=True,
    )
    flat_predictions = []
    model.eval()  # evaluation mode
    with torch.no_grad():
        for _, (texts, real_lengths) in progress:
            scores = model(texts)

            # Best class per position for the whole batch: [sent len, batch size]
            best_classes = scores.max(dim=-1).indices

            # Keep only the first `length` positions of each sample (drop padding).
            for column, length in enumerate(real_lengths):
                flat_predictions.extend(best_classes[:length, column].tolist())

    return flat_predictions

In [20]:
# Restore the checkpoint written during validation.
# - map_location remaps the stored tensors onto the current `device`, so a
#   checkpoint saved on a GPU also loads on a CPU-only runtime.
# - weights_only=True restricts unpickling to tensors and plain containers,
#   which is all a state_dict holds; nothing else in the file is executed.
ckpt = torch.load("best.pt", map_location=device, weights_only=True)
model.load_state_dict(ckpt)

predictions = predict(model, test_dataloader)
predictions[:10]

Predictions:   0%|          | 0/113 [00:00<?, ?it/s]

[1, 4, 5, 5, 10, 5, 7, 5, 5, 9]

In [21]:
# Translate every predicted class index back to its tag string.
results = pd.Series(predictions).map(
    lambda tag_idx: idx2cat[tag_idx]
)

In [22]:
results

Unnamed: 0,0
0,ADP
1,DET
2,NOUN
3,NOUN
4,.
...,...
303020,NOUN
303021,PRT
303022,VERB
303023,NOUN
