##### Libraries

In [1]:
########################## UTILITY AND SYSTEM ##########################

import os                       # filesystem operations
import csv                      # reading/writing CSV files
import json                     # JSON parsing and serialization
import math                     # basic math functions
import random                   # random number generation
import time                     # time-related functions
import tempfile                 # temporary file management
import tarfile                  # tar archive handling
import io                       # input/output streams
import pickle                   # object serialization
import importlib                # dynamic import of modules
import multiprocessing          # parallel process management
import pkg_resources            # package and dependency management
from copy import deepcopy       # deep copy of objects
from pathlib import Path        # filesystem paths handling (cross-platform)

########################## DOWNLOAD ##########################

import requests                 # HTTP requests library
import wget                     # file downloads from URLs
from urllib.request import urlopen  # open URLs (alternative to requests)

########################## VISUALIZATION ##########################

import matplotlib.pyplot as plt # basic plotting library
import plotly.graph_objs as go  # interactive plotting
from tqdm.notebook import tqdm  # progress bars for loops in notebooks
from pprint import pprint       # formatted pretty-printing of objects

########################## DATAFRAME ##########################

import numpy as np              # numerical arrays and operations
import pandas as pd             # dataframes and data manipulation

########################## TEXT PROCESSING ##########################

import re                      # regular expressions
import string                  # string constants and operations
from itertools import chain, islice  # advanced iteration and chaining

########################## TOKENIZATION ##########################

from collections import Counter, OrderedDict  # frequency counts and ordered dictionaries
import nltk                                   # natural language processing toolkit
from nltk.tokenize import word_tokenize       # word tokenization
import spacy                                  # advanced NLP (tokenization, parsing)
from torchtext.data.utils import get_tokenizer       # torchtext tokenizers
from torchtext.data.functional import to_map_style_dataset

from torchtext.vocab import build_vocab_from_iterator # build vocabulary from iterator

########################## DATASET AND DATALOADER ##########################

from torch.utils.data import Dataset, DataLoader, random_split   # datasets and data loading utilities
from torch.nn.utils.rnn import pad_sequence                      # padding variable-length sequences
from datasets import load_dataset, DatasetDict                   # HuggingFace datasets loading
from torchtext.datasets import AG_NEWS                           # torchtext built-in datasets

########################## PYTORCH AND DEEP LEARNING ##########################

import torch                             # PyTorch main library
from torch import nn, Tensor             # neural network modules and tensors
from torch.nn import CrossEntropyLoss    # common loss function for classification

########################## WORD EMBEDDING ##########################

from torchtext.vocab import GloVe        # pretrained GloVe embeddings
# from gensim.models import Word2Vec     # word2vec embeddings from corpus (commented out)

########################## HUGGING FACE ##########################

import transformers                      # transformers library core
from transformers import (
    GPT2Tokenizer, GPT2LMHeadModel,     # GPT-2 tokenizer and model
    BertTokenizer, BertTokenizerFast, BertConfig, BertForMaskedLM,  # BERT components
    XLNetTokenizer,                     # XLNet tokenizer
    DistilBertForSequenceClassification, DistilBertTokenizer, AutoModelForSequenceClassification,
    pipeline,                          # easy pipelines for inference
    AutoTokenizer,                    # auto tokenizer loader
    AutoModelForCausalLM, GPT2ForSequenceClassification,
    DataCollatorForLanguageModeling, TrainingArguments, Trainer,  # training utilities
    set_seed, GenerationConfig,
    BertModel                        # BERT base model
)
from datasets import DatasetDict         # HuggingFace dataset dictionaries

######################### TRL & PEFT (TRAINING & PARAMETER EFFICIENT FINE-TUNING) ##########################

from trl import (
    SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM,
    DPOConfig, DPOTrainer,
    RewardTrainer, RewardConfig
)
from peft import get_peft_model, LoraConfig, TaskType
from torchmetrics import Accuracy        # metrics for evaluation

########################## RAG ##########################

from transformers import (
    DPRQuestionEncoder, DPRQuestionEncoderTokenizer,
    DPRContextEncoder, DPRContextEncoderTokenizer
)
import faiss                              # similarity search library

########################## EVALUATION ##########################

import evaluate



  import pkg_resources            # package and dependency management


# 0) Observation on yeld_tokens function

```
tokenizer = get_tokenizer('basic_english')
tokenized_dataset = map(lambda x: tokenizer(x[1]), dataset)  # dataset[i][1] is the text
vocab = build_vocab_from_iterator(tokenized_dataset, specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"]) 
```

and 
```
def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(dataset), specials=["<unk>"])
```
are equivalent. If we have the dataset already in the memory (es. dataset = [(1,"Introduction to NLP"),(2,"Basics of PyTorch")] ) we can use both, but if we are extracting a dataset from torchtext for example, **we must use yeld_tokens**, because the dataset itself is an iterator and we can not obtain the sentences as a list.

In [2]:
def yield_tokens(data_iter): #this build an iterator
    for index,text in data_iter:
        yield tokenizer(text)

# 1) Tokenization

<center>
<img src="https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMSkillsNetwork-AI0201EN-Coursera/images/Tokenization%20lab%20Diagram%202.png" width="50%" alt="Image Description">
</center>

## Word-based (spacy, nltk)
Preserves semantic meaning, increases overall vocabulary; it assigns different IDs to 'unicorn' and 'unicorns'

Import spacy models if necessary:

In [3]:
# import subprocess

# DO_INSTALL = 0

# if DO_INSTALL:
#     subprocess.check_call([
#         "python", "-m", "spacy", "download", "de_core_news_sm"
#     ])

In [4]:
text = "Unicorns are real. I saw a unicorn yesterday. I couldn't see it today"

nlp = spacy.load('en_core_web_sm')
doc = nlp(text)
token_list = [token.text for token in doc]
print(f"Tokens (spacy): {token_list}")
print(doc[0].text, doc[0].pos_, doc[0].dep_) #details of a token

nltk.download("punkt_tab")
token_list = word_tokenize(text)
print(f"Tokens (nltk ): {token_list}")

Tokens (spacy): ['Unicorns', 'are', 'real', '.', 'I', 'saw', 'a', 'unicorn', 'yesterday', '.', 'I', 'could', "n't", 'see', 'it', 'today']
Unicorns NOUN nsubj
Tokens (nltk ): ['Unicorns', 'are', 'real', '.', 'I', 'saw', 'a', 'unicorn', 'yesterday', '.', 'I', 'could', "n't", 'see', 'it', 'today']


[nltk_data] Downloading package punkt_tab to /Users/alex/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Character-based
Small vocabulary but without semantic meaning.

## Subword-based (WordPiece, Unigram, SentencePiece)
Frequently used words stay unplit, while breaking down infrequent words. 'Unicorns' becomes 'unicorn' and 's'.


The ## means that they **are not** new words; the _ means that they **are** new words.

In [5]:
text = "Unicorns are real. I saw a unicorn yesterday. I couldn't see it today"

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") # the WordPiece tokenizer is implemented in BertTokenizer
token_list = tokenizer.tokenize(text)
print(f"Tokens (bert): {token_list}")

tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased") # Unigram and SentencePiece 
token_list = tokenizer.tokenize(text)
print(f"Tokens (xlnet): {token_list}")


Tokens (bert): ['unicorn', '##s', 'are', 'real', '.', 'i', 'saw', 'a', 'unicorn', 'yesterday', '.', 'i', 'couldn', "'", 't', 'see', 'it', 'today']


spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

Tokens (xlnet): ['▁Uni', 'corn', 's', '▁are', '▁real', '.', '▁I', '▁saw', '▁a', '▁', 'uni', 'corn', '▁yesterday', '.', '▁I', '▁couldn', "'", 't', '▁see', '▁it', '▁today']


## Using PyTorch (Words and sub-words)

In PyTorch, especially with the torchtext library, the tokenizer breaks down text from a data set into individual words or subwords, facilitating their conversion into numerical format. After tokenization, the vocab (vocabulary) maps these tokens to unique integers, allowing them to be fed into neural networks. This process is vital because deep learning models operate on numerical data and cannot process raw text directly. Thus, tokenization and vocabulary mapping serve as a bridge between human-readable text and machine-operable numerical data.

In [6]:
dataset = [ #it is an iterable, like a dataloader (think to the future application in DL)
    (1,"Introduction to NLP"), #the first entry is a sentiment label
    (2,"Basics of PyTorch"),
    (1,"NLP Techniques for Text Classification"),
    (3,"Named Entity Recognition with PyTorch")]

tokenizer = get_tokenizer('basic_english') # TorchText tokenizer
token_list = tokenizer(dataset[0][1])
print(f"Tokens (PyTorch): {token_list}")

def yield_tokens(data_iter): #this build an iterator
    for sentiment, text in data_iter:
        yield tokenizer(text)

my_iterator = yield_tokens(dataset) 
print(next(my_iterator))


Tokens (PyTorch): ['introduction', 'to', 'nlp']
['introduction', 'to', 'nlp']


Then we build a vocabulary:

In [7]:
vocab = build_vocab_from_iterator(yield_tokens(dataset), specials = ['<unk>']) #assigns numbers to token, and '<unk>' to words out of vocabulary
vocab.set_default_index(vocab['<unk>'])

print('Number of words in the vocabulary:',len(vocab), '\n')
vocab.get_itos() #**index_to_string** this is a list of the words in the vocabulary, already sorted by indexing (see the last two lines here)
print('Vocabulary:',vocab.get_stoi(),'\n') #this is a dictionary {'word in vocabulary': integer ID}

print(vocab.get_stoi()['with'])
print(vocab.get_itos()[14])

Number of words in the vocabulary: 15 

Vocabulary: {'with': 14, 'to': 13, 'of': 9, 'nlp': 1, 'classification': 4, 'named': 8, 'for': 6, 'text': 12, 'entity': 5, 'techniques': 11, '<unk>': 0, 'basics': 3, 'recognition': 10, 'pytorch': 2, 'introduction': 7} 

14
with


In [8]:
my_iterator = yield_tokens(dataset) 

def get_tokenized_sentences_and_IDs(iterator, vocab):
    for tokenized_sentence in iterator:
        token_indices = [vocab[token] for token in tokenized_sentence]
        print("Tokenized Sentence:", tokenized_sentence)
        print("Token Indices:", token_indices)
        print()  # riga vuota per separare

get_tokenized_sentences_and_IDs(my_iterator,vocab)

Tokenized Sentence: ['introduction', 'to', 'nlp']
Token Indices: [7, 13, 1]

Tokenized Sentence: ['basics', 'of', 'pytorch']
Token Indices: [3, 9, 2]

Tokenized Sentence: ['nlp', 'techniques', 'for', 'text', 'classification']
Token Indices: [1, 11, 6, 12, 4]

Tokenized Sentence: ['named', 'entity', 'recognition', 'with', 'pytorch']
Token Indices: [8, 5, 10, 14, 2]



In [9]:
text = """
Going through the world of tokenization has been like walking through a huge maze made of words, symbols, and meanings. Each turn shows a bit more about the cool ways computers learn to understand our language. And while I'm still finding my way through it, the journey’s been enlightening and, honestly, a bunch of fun.
Eager to see where this learning path takes me next!"
"""

# Counting and displaying tokens and their frequency

def show_frequencies(tokens, method_name):
    print(f"{method_name} Token Frequencies: {dict(Counter(tokens))}\n")

# 2) Dataset and Dataloader

## General

In the Dataset class, the methods len e getitem are not defined because they depend strictly on the type on objects we have in the databse. So it is common practice to define a CustomDataset class, with the methods len e getitem!

DataLoader is an iterable but not an iterator:
1. data_iterator = iter(dataloader)
2. first_batch = next(data_iterator)



In [10]:
sentences = [
    "If you want to know what a man's like, take a good look at how he treats his inferiors, not his equals.",
    "Fame's a fickle friend, Harry.",
    "It is our choices, Harry, that show what we truly are, far more than our abilities.",
    "Soon we must all face the choice between what is right and what is easy.",
    "Youth can not know how age thinks and feels. But old men are guilty if they forget what it was to be young.",
    "You are awesome!"
]

class CustomDataset(Dataset):
    def __init__(self, sentences, tokenizer = None, vocabulary = None):
        self.sentences = sentences

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx]
    

# Create an instance of your custom dataset
custom_dataset = CustomDataset(sentences)
len(custom_dataset) ==  custom_dataset.__len__() #number of sentences, method __len__
# custom_dataset[i] == custom_dataset.__getitem__(i) #is the i-th sentence, method __getitem

dataloader = DataLoader(dataset = custom_dataset, batch_size = 2, shuffle = True)
len(dataloader) # number of batches, of dimension 'batch_size'

# Iterate through the DataLoader
for batch in dataloader:
    print(batch)

['You are awesome!', "Fame's a fickle friend, Harry."]
['Soon we must all face the choice between what is right and what is easy.', 'It is our choices, Harry, that show what we truly are, far more than our abilities.']
['Youth can not know how age thinks and feels. But old men are guilty if they forget what it was to be young.', "If you want to know what a man's like, take a good look at how he treats his inferiors, not his equals."]


## For NLP purposes

### Modifying the dataset (not good)

For a NLP task, we initialize the class by passing the sentences, a tokenizer, and a vocabulary. Problem is thata dataloader expects all data to have the same lenght, so we have to add padding manually, ptherwise we eill encounter an error:

In [11]:
sentences = [
    "If you want to know what a man's like, take a good look at how he treats his inferiors, not his equals.",
    "Fame's a fickle friend, Harry.",
    "It is our choices, Harry, that show what we truly are, far more than our abilities.",
    "Soon we must all face the choice between what is right and what is easy.",
    "Youth can not know how age thinks and feels. But old men are guilty if they forget what it was to be young.",
    "You are awesome!"
]

class CustomDataset(Dataset):
    def __init__(self, sentences, tokenizer, vocabulary):
        self.sentences = sentences
        self.tokenizer = tokenizer
        self.vocabulary = vocabulary

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        tokens = self.tokenizer(self.sentences[idx])
        # Convert tokens to tensor indices using vocab
        tensor_indices = [self.vocabulary[token] for token in tokens]
        return torch.tensor(tensor_indices)
    
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(map(tokenizer, sentences))

custom_dataset = CustomDataset(sentences, tokenizer, vocab)
custom_dataset[5] #IDs of the 5-th sentence

# dataloader = DataLoader(custom_dataset, batch_size= 2 , shuffle=True)
# for batch in dataloader:
#     print(batch)              #this will arise an error

def collate_fn(batch):
    padded_batch = pad_sequence(batch, batch_first = True, padding_value = 0) # padding_value numerical value of the padding
    return padded_batch

dataloader = DataLoader(dataset = custom_dataset, batch_size = 2 , shuffle = True, collate_fn = collate_fn)
for batch in dataloader:
    print(batch) #batch is [batch_size x sequence_lenght] if batch_first = True, otherwise (default) [sequence_lenght x batch_size]
    print(f'batch shape: {batch.shape}')
    for row in batch:
        for idx in row:
            words = [vocab.get_itos()[idx] for idx in row]
        print(words)
    print()

tensor([[12,  5, 15, 31,  0,  8,  0, 57, 53,  2, 18, 62,  4,  0, 36, 49, 56, 15,
         21,  1],
        [19,  4, 25, 20,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0]])
batch shape: torch.Size([2, 20])
['it', 'is', 'our', 'choices', ',', 'harry', ',', 'that', 'show', 'what', 'we', 'truly', 'are', ',', 'far', 'more', 'than', 'our', 'abilities', '.']
['you', 'are', 'awesome', '!', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',']

tensor([[54, 18, 50, 23, 34, 58, 30, 27,  2,  5, 52,  7,  2,  5, 32,  1,  0,  0,
          0,  0,  0,  0,  0,  0,  0],
        [66, 29, 14, 13, 10, 22, 60,  7, 37,  1, 28, 51, 48,  4, 42, 11, 59, 39,
          2, 12, 64, 17, 26, 65,  1]])
batch shape: torch.Size([2, 25])
['soon', 'we', 'must', 'all', 'face', 'the', 'choice', 'between', 'what', 'is', 'right', 'and', 'what', 'is', 'easy', '.', ',', ',', ',', ',', ',', ',', ',', ',', ',']
['youth', 'can', 'not', 'know', 'how', 'age', 'thinks', 'and', 'fee

### Without modifying the dataset: best practice

We have the option to utilize the collate function for tasks such as tokenization, converting tokenized indices, and transforming the result into a tensor. It's important to note that the original data set remains untouched by these transformations. In this way we can still access the raw data as custom_dataset[i]

In [12]:
sentences = [
    "If you want to know what a man's like, take a good look at how he treats his inferiors, not his equals.",
    "Fame's a fickle friend, Harry.",
    "It is our choices, Harry, that show what we truly are, far more than our abilities.",
    "Soon we must all face the choice between what is right and what is easy.",
    "Youth can not know how age thinks and feels. But old men are guilty if they forget what it was to be young.",
    "You are awesome!"
]

class CustomDataset(Dataset):
    def __init__(self, sentences):
        self.sentences = sentences

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx]

custom_dataset = CustomDataset(sentences) #still have the raw text!

def my_collate_fn(batch):
    tensor_batch = []
    for sentence in batch:
        tokens = tokenizer(sentence)
        tensor_batch.append(torch.tensor([vocab[token] for token in tokens]))
    padded_batch = pad_sequence(tensor_batch, batch_first=True)
    return padded_batch

tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(map(tokenizer, sentences))

dataloader = DataLoader(
    dataset = custom_dataset,   # Custom PyTorch Dataset containing your data
    batch_size = 2,     # Number of samples in each mini-batch
    shuffle = True,              # Shuffle the data at the beginning of each epoch
    collate_fn = my_collate_fn      # Custom collate function for processing batches
)

for batch in dataloader:
    print(batch)

print()
for batch in dataloader:
    print(batch) #batch is [batch_size x sequence_lenght] if batch_first = True, otherwise (default) [sequence_lenght x batch_size]
    print(f'batch shape: {batch.shape}')
    for row in batch:
        for idx in row:
            words = [vocab.get_itos()[idx] for idx in row]
        print(words)
    print()

tensor([[35,  6, 16,  3, 38, 40,  0,  8,  1],
        [19,  4, 25, 20,  0,  0,  0,  0,  0]])
tensor([[12,  5, 15, 31,  0,  8,  0, 57, 53,  2, 18, 62,  4,  0, 36, 49, 56, 15,
         21,  1],
        [54, 18, 50, 23, 34, 58, 30, 27,  2,  5, 52,  7,  2,  5, 32,  1,  0,  0,
          0,  0]])
tensor([[11, 19, 63, 17, 13,  2,  3, 47,  6, 16, 45,  0, 55,  3, 41, 46, 24, 10,
         43, 61,  9, 44,  0, 14,  9, 33,  1],
        [66, 29, 14, 13, 10, 22, 60,  7, 37,  1, 28, 51, 48,  4, 42, 11, 59, 39,
          2, 12, 64, 17, 26, 65,  1,  0,  0]])

tensor([[11, 19, 63, 17, 13,  2,  3, 47,  6, 16, 45,  0, 55,  3, 41, 46, 24, 10,
         43, 61,  9, 44,  0, 14,  9, 33,  1],
        [54, 18, 50, 23, 34, 58, 30, 27,  2,  5, 52,  7,  2,  5, 32,  1,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0]])
batch shape: torch.Size([2, 27])
['if', 'you', 'want', 'to', 'know', 'what', 'a', 'man', "'", 's', 'like', ',', 'take', 'a', 'good', 'look', 'at', 'how', 'he', 'treats', 'his', 'inferiors', ',', 'n