# Assignment 1 - Part of Speech Tagging

## Dependencies

In [40]:
# !pip install lightning
# !pip install torchtext.data
# !pip install torchtext
# !pip install torch

In [41]:
# TODO: remove unused dependencies

# file management
import sys
import shutil
import urllib
import tarfile
from pathlib import Path
import zipfile

# dataframe management
import pandas as pd

# data manipulation
import numpy as np

# for readability
from typing import Iterable
from tqdm import tqdm

## TASK 1: Corpus

### Instructions

* **Download** the corpus.
* **Encode** the corpus into a pandas.DataFrame object.
* **Split** it in training, validation, and test sets.

#### Download the corpus

In [42]:
class DownloadProgressBar(tqdm):
    def update_to(self, b=1, bsize=1, tsize=None):
        if tsize is not None:
            self.total = tsize
        self.update(b * bsize - self.n)
        
def download_url(download_path: Path, url: str):
    with DownloadProgressBar(unit='B', unit_scale=True,
                             miniters=1, desc=url.split('/')[-1]) as t:
        urllib.request.urlretrieve(url, filename=download_path, reporthook=t.update_to)

        
def download_dataset(download_path: Path, url: str):
    print("Downloading dataset...")
    download_url(url=url, download_path=download_path)
    print("Download complete!")

def extract_dataset(download_path: Path, extract_path: Path):
    print("Extracting dataset... (it may take a while...)")
    with zipfile.ZipFile(download_path, 'r') as zip_file:
        zip_file.extractall(extract_path)

    print("Extraction completed!")

    Path.unlink(download_path)
    print("Deleted .zip dataset file")

In [44]:
url = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip"
dataset_name = "dependency_treebank"

print(f"Current work directory: {Path.cwd()}")

dataset_folder = Path.cwd().joinpath("Datasets")

if not dataset_folder.exists():
    dataset_folder.mkdir(parents=True)

dataset_zip_path = dataset_folder.joinpath("dependency_treebank.zip")
dataset_path = dataset_folder.joinpath(dataset_name)

if not dataset_zip_path.exists():
    download_dataset(dataset_zip_path, url)

if not dataset_path.exists():
    extract_dataset(dataset_zip_path, dataset_folder)
  

Current work directory: c:\Users\merli\OneDrive\Documenti\University\Anno 5\Natural Language Processing\NLP\A1
Downloading dataset...


dependency_treebank.zip: 459kB [00:00, 1.15MB/s]                            


Download complete!
Extracting dataset... (it may take a while...)
Extraction completed!
Deleted .zip dataset file


#### Encode the corpus into a pandas.DataFrame object and split it into train, validation and test sets

The corpus contains 200 documents.

   * **Train**: Documents 1-100
   * **Validation**: Documents 101-150
   * **Test**: Documents 151-199

In [30]:
dataframe_rows = []  # list for DataFrame rows
id = 0

for i, file_path in enumerate(sorted(dataset_path.iterdir())):
    if file_path.is_file(): # split corpus documents in the tree categories: train, validation, tests
        if 1 <= i + 1 <= 100:
            split = 'train'
        elif 101 <= i + 1 <= 150:
            split = 'validation'
        else:
            split = 'test'

        with file_path.open(mode='r', encoding='utf-8') as text_file: # read corpus lines
            lines = text_file.readlines()
                
        for line in lines:
            fields = line.strip().split('\t')
            if len(fields) == 1:
                id = id + 1
            if len(fields) >= 2:
                text = fields[0]  # store the first field as 'text'
                POS = fields[1]   # store the second field as 'POS'
                dataframe_row = {  #build DataFrame rows
                    "text": text,
                    "POS": POS,
                    "split": split,
                    "id": id
                }

                dataframe_rows.append(dataframe_row) #append rows
# corpus DataFrame
corpus_df = pd.DataFrame(dataframe_rows) 

#### Data inspection

In [31]:
corpus_df.head(10)

Unnamed: 0,text,POS,split,id
0,Pierre,NNP,train,0
1,Vinken,NNP,train,0
2,",",",",train,0
3,61,CD,train,0
4,years,NNS,train,0
5,old,JJ,train,0
6,",",",",train,0
7,will,MD,train,0
8,join,VB,train,0
9,the,DT,train,0


In [32]:
# Train, test, validation split
df_train = corpus_df[corpus_df['split'] == 'train'].drop(columns=['split'])
df_test = corpus_df[corpus_df['split'] == 'test'].drop(columns=['split'])
df_val = corpus_df[corpus_df['split'] == 'validation'].drop(columns=['split'])

In [33]:
print("Dataframe structure:")
print(corpus_df)
print()

print("Total rows %d" % (len(corpus_df)))
print()

Dataframe structure:
          text  POS  split    id
0       Pierre  NNP  train     0
1       Vinken  NNP  train     0
2            ,    ,  train     0
3           61   CD  train     0
4        years  NNS  train     0
...        ...  ...    ...   ...
94079  quarter   NN   test  3715
94080       of   IN   test  3715
94081     next   JJ   test  3715
94082     year   NN   test  3715
94083        .    .   test  3715

[94084 rows x 4 columns]

Total rows 94084



## TASK 2: Text encoding

### Instructions

* Embed words using **GloVe embeddings**.
* You are **free** to pick any embedding dimension.
* [Optional] You are free to experiment with text pre-processing: **make sure you do not delete any token!**

### Embed words unsing GloVe embeddings

Encode text into numerical format

In [34]:
from torchtext.vocab import GloVe, build_vocab_from_iterator

import torch
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import pytorch_lightning as pl
from lightning.pytorch import loggers as pl_logger

In [35]:
def load_embedding_model(embedding_dimension: int = 300):
    emb_model = GloVe(name="6B", dim=embedding_dimension)
    return emb_model

In [36]:
iterator = ([text] for text in corpus_df["POS"].unique())
vocab = build_vocab_from_iterator(iterator)


class CorpusDataset(Dataset):
    def __init__(self, dataframe: pd.DataFrame, embedder):
        min_id = dataframe['id'].min()
        dataframe['id'] = dataframe['id'] - min_id
        self.dataframe = dataframe.groupby("id")
        self.embedder = embedder

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        sentence = self.dataframe.get_group(idx)
        text = sentence['text'].to_list()
        POS = sentence['POS'].to_list()
        
        POS = torch.Tensor([vocab[token] for token in POS])
        
        POS_one_hot = torch.nn.functional.one_hot(POS.to(torch.int64), num_classes=len(vocab))
        embedded_text = self.embedder.get_vecs_by_tokens(text)
        
        return embedded_text, POS_one_hot


In [39]:
# Definition of the dataset
embedder = load_embedding_model(50)
dataset_train = CorpusDataset(df_train, embedder)
dataset_test = CorpusDataset(df_test, embedder)
dataset_val = CorpusDataset(df_val, embedder)


# TODO - test if it works in the LSTM training
def my_collate(batch):
    sequences, labels = zip(*batch)
    sequences_padded = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=0)
    labels_padded = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=0)
    
    
    sequences_padded = torch.tensor(sequences_padded, dtype=torch.float)
    labels_padded = torch.tensor(labels_padded, dtype=torch.long)
    
    
    return [sequences_padded, labels_padded]

train_loader = DataLoader(dataset_train, batch_size=3, collate_fn=my_collate, num_workers=0)
val_loader = DataLoader(dataset_val, batch_size=3, collate_fn=my_collate, num_workers=0)
test_loader = DataLoader(dataset_test, batch_size=3, collate_fn=my_collate, num_workers=0)

.vector_cache\glove.6B.zip:  15%|█▌        | 130M/862M [14:20<1:20:44, 151kB/s]    


ContentTooShortError: <urlopen error retrieval incomplete: got only 130105085 out of 862182613 bytes>

In [308]:
next(train_loader.__iter__())

  sequences_padded = torch.tensor(sequences_padded, dtype=torch.float)
  labels_padded = torch.tensor(labels_padded, dtype=torch.long)


[tensor([[[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0134,  0.2368, -0.1690,  ..., -0.5666,  0.0447,  0.3039],
          ...,
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
 
         [[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.6185,  0.6425, -0.4655,  ..., -0.2756,  0.3090,  0.4850],
          ...,
          [ 1.0590,  0.0835,  0.4888,  ...,  0.7931, -0.7122,  0.1770],
          [ 0.5957, -0.2463,  0.7457,  ...,  1.4178,  0.4205, -0.0281],
          [ 0.1516,  0.3018, -0.1676,  ..., -0.3565,  0.0164,  0.1022]],
 
         [[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.4703,  0.0987,

In [309]:
class BiLSTMModel(pl.LightningModule):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super(BiLSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size=input_dim, 
                            hidden_size=hidden_dim, 
                            num_layers=num_layers, 
                            batch_first=True, 
                            bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # Multiplied by 2 due to the bidirectionality

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        # lstm_out (batch_size, seq_length, hidden_size * 2)
        out = self.fc(lstm_out)
        # out (batch_size, seq_length, output_dim)
        return out

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = nn.functional.cross_entropy(y_hat, y)
        
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = nn.functional.cross_entropy(y_hat, y)
        
        self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = nn.functional.cross_entropy(y_hat, y)
        
        self.log('test_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer


In [310]:
len(df_train["POS"].unique())

45

In [311]:
output_dim = len(df_train["POS"].unique())
input_dim = 50  # Edo: shouldn't this be equal to the embedding dimension (300)?

model = BiLSTMModel(input_dim=input_dim, hidden_dim=128, output_dim=output_dim, num_layers=1)


tb_logger = pl_loggers.TensorBoardLogger("logs/")
trainer = pl.Trainer(max_epochs=10, logger=tb_logger)

trainer.fit(model, train_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
C:\Users\Matteo\AppData\Local\Programs\Python\Python310\lib\site-packages\pytorch_lightning\trainer\configuration_validator.py:74: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.

  | Name | Type   | Params
--------------------------------
0 | lstm | LSTM   | 184 K 
1 | fc   | Linear | 11.6 K
--------------------------------
195 K     Trainable params
0         Non-trainable params
195 K     Total params
0.784     Total estimated model params size (MB)
C:\Users\Matteo\AppData\Local\Programs\Python\Python310\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

  sequences_padded = torch.tensor(sequences_padded, dtype=torch.float)
  labels_padded = torch.tensor(labels_padded, dtype=torch.long)


torch.Size([3, 75, 256])
torch.Size([3, 75, 45])
y shape: torch.Size([3, 75, 45])
y_hat shape: torch.Size([3, 75, 45])


RuntimeError: Expected floating point type for target with class probabilities, got Long