As it is (without anything downloaded and always with 1 epoch when needed), the running time of the whole notebook is approximately <span style="background-color: lightblue">1 minute</span>.

<span style="background-color: yellow"> </span>

#### Libraries

In [1]:
########################## UTILITY AND SYSTEM ##########################

import os                       # filesystem operations
import csv                      # reading/writing CSV files
import json                     # JSON parsing and serialization
import math                     # basic math functions
import random                   # random number generation
import time                     # time-related functions
import tempfile                 # temporary file management
import tarfile                  # tar archive handling
import io                       # input/output streams
import pickle                   # object serialization
import importlib                # dynamic import of modules
import multiprocessing          # parallel process management
import pkg_resources            # package and dependency management
from copy import deepcopy       # deep copy of objects
from pathlib import Path        # filesystem paths handling (cross-platform)

########################## DOWNLOAD ##########################

import requests                 # HTTP requests library
import wget                     # file downloads from URLs
from urllib.request import urlopen  # open URLs (alternative to requests)

########################## VISUALIZATION ##########################

import matplotlib.pyplot as plt # basic plotting library
import plotly.graph_objs as go  # interactive plotting
from tqdm.notebook import tqdm  # progress bars for loops in notebooks
from pprint import pprint       # formatted pretty-printing of objects

########################## DATAFRAME ##########################

import numpy as np              # numerical arrays and operations
import pandas as pd             # dataframes and data manipulation
from sklearn.manifold import TSNE

########################## TEXT PROCESSING ##########################

import re                      # regular expressions
import string                  # string constants and operations
from itertools import chain, islice  # advanced iteration and chaining

########################## TOKENIZATION ##########################

from collections import Counter, OrderedDict  # frequency counts and ordered dictionaries
import nltk                                   # natural language processing toolkit
from nltk.tokenize import word_tokenize       # word tokenization
import spacy                                  # advanced NLP (tokenization, parsing)
from torchtext.data.utils import get_tokenizer       # torchtext tokenizers
from torchtext.data.functional import to_map_style_dataset

from torchtext.vocab import build_vocab_from_iterator # build vocabulary from iterator

########################## DATASET AND DATALOADER ##########################

from torch.utils.data import Dataset, DataLoader, random_split   # datasets and data loading utilities
from torch.nn.utils.rnn import pad_sequence                      # padding variable-length sequences
from datasets import load_dataset, DatasetDict                   # HuggingFace datasets loading
from torchtext.datasets import AG_NEWS                           # torchtext built-in datasets

########################## PYTORCH AND DEEP LEARNING ##########################

import torch                             # PyTorch main library
from torch import nn, Tensor             # neural network modules and tensors
from torch.nn import CrossEntropyLoss    # common loss function for classification
from torchsummary import summary as torchsummary
from torchinfo import summary as torchinfosummary

########################## WORD EMBEDDING ##########################

from torchtext.vocab import GloVe        # pretrained GloVe embeddings
# from gensim.models import Word2Vec     # word2vec embeddings from corpus (commented out)

########################## HUGGING FACE ##########################

import transformers                      # transformers library core
from transformers import (
    GPT2Tokenizer, GPT2LMHeadModel,     # GPT-2 tokenizer and model
    BertTokenizer, BertTokenizerFast, BertConfig, BertForMaskedLM,  # BERT components
    XLNetTokenizer,                     # XLNet tokenizer
    DistilBertForSequenceClassification, DistilBertTokenizer, AutoModelForSequenceClassification,
    pipeline,                          # easy pipelines for inference
    AutoTokenizer,                    # auto tokenizer loader
    AutoModelForCausalLM, GPT2ForSequenceClassification,
    DataCollatorForLanguageModeling, TrainingArguments, Trainer,  # training utilities
    set_seed, GenerationConfig,
    BertModel,                        # BERT base model
    PreTrainedTokenizerBase
)
from datasets import DatasetDict         # HuggingFace dataset dictionaries

######################### TRL & PEFT (TRAINING & PARAMETER EFFICIENT FINE-TUNING) ##########################

from trl import (
    SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM,
    DPOConfig, DPOTrainer,
    RewardTrainer, RewardConfig
)
from peft import get_peft_model, LoraConfig, TaskType
from torchmetrics import Accuracy        # metrics for evaluation

########################## RAG ##########################

from transformers import (
    DPRQuestionEncoder, DPRQuestionEncoderTokenizer,
    DPRContextEncoder, DPRContextEncoderTokenizer
)
import faiss                              # similarity search library

########################## EVALUATION ##########################

import evaluate



  import pkg_resources            # package and dependency management
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [2]:
def accelerator(where = "mps"):
    """Pick the torch device to run on and print the choice.

    Parameters
    ----------
    where : str
        Preferred backend: "mps", "cuda" or "cpu". If the preferred backend
        is not available on this machine, the CPU is used instead.

    Returns
    -------
    torch.device
        The selected device.

    Raises
    ------
    ValueError
        If `where` is not one of the supported backend names (previously the
        function silently returned None in that case).
    """
    # Availability checks are wrapped in lambdas so that a backend is only
    # probed when it is actually requested.
    availability_checks = {
        "mps": lambda: torch.backends.mps.is_available(),
        "cuda": lambda: torch.cuda.is_available(),
        "cpu": lambda: True,
    }
    if where not in availability_checks:
        raise ValueError(
            "Unsupported device {!r}; expected one of {}".format(where, sorted(availability_checks))
        )

    device = torch.device(where if availability_checks[where]() else "cpu")
    print("Which device we are on: {}".format(device))
    return device

device = accelerator("cpu")

Which device we are on: cpu


# 0) CONCEPTS: RAG, FAISS, Prompt Engineering, LangChain

## Retrieval-Augmented Generation (RAG) and Facebook AI Similarity Search (FAISS)

**RAG** is a framework that improves the output of LLMs without re-training the model, by letting them use external knowledge (for example, a company's internal database). To do this, RAG comprises two main components:
1. **The retriever**: the retriever combines:
    1. **encoded prompt**: a high-dimensional vector representation of the prompt (the prompt is turned into a vector by a **question encoder**, the gray box in the figure below). At its end, the question encoder computes an average;
    2. **relevant context**: an 'internal' database of vectors (obtained with a **context encoder**, the light-blue box in the figure, from the company's internal documents).

   The retriever matches the prompt with the relevant context by looking for similar vectors in the embedding space. As similarity measures we can use the dot product (which also accounts for magnitude) or the cosine similarity (which only accounts for direction).

2. **The generator**: using the data from the retriever, it answers the user through a **decoder** (e.g. `BartForConditionalGeneration` and `BartTokenizer`).



<img src="https://miro.medium.com/v2/resize:fit:1400/1*wMNhcGsiwDqVyxG1CFCj4w.jpeg" width="500">


For the context encoder use `DPRContextEncoderTokenizer` from `transformers`, which reads list of tuples, and `DPRContextEncoder` .

For the question encoder use `DPRQuestionEncoderTokenizer` and `DPRQuestionEncoder`

To compute the distances between vectors, import the `faiss` library.

## In-context learning and prompt engineering

**In-context learning** is a prompt-engineering method in which we give the model demonstrations of the task inside the prompt.
- Advantages: 
    1. Does not require fine-tuning --> reduce time
    2. Improve performances
- Disadvantages:
    1. Limited to what fit in-context (what example can I realistically include in the prompt?)
    2. Complex tasks may require gradient steps and adjustments based on gradients

**Prompt Engineering**: prompts are divided into instructions and context (the background necessary to do the task). Prompt engineering is about how to ask an LLM questions in the best way possible. Common techniques are:
1. One-shot prompt: give one example of i.e. translation before asking to translate a sentence;
2. Few-shot prompts: giving some example of sentiment analysis before asking a new one;
3. Chain of thought: give an example and break it into steps for the solution to be effective;
4. Self consistency: 'when I was 6 my sister was half my age. Now I am 50, what age is my sister? Provide N independent calculations and explanations, then determine the most consistent result'. The model at the end choose the most frequent answer.

Where can we test the prompts?
1. Playground;
2. LangChain: uses prompt templates, which include few shot examples.
3. HuggingFace;
4. IBM AI classroom;

## LangChain (chain of commands!)

Components of LangChain:

1. Language model: foundation of LLMs, using IBM, OpenAI, Google and Meta as primary language models
2. Chat model: efficient conversation
3. Chat message: efficient messages
4. Prompt templates: translate user questions into clear instructions
5. Output parser: transforms the output in suitable structured data


We can use LangChain Documents to use RAG, and also to build applications (unifying chains, or sequence of calls!).

In recent years, the development of Large Language Models (LLMs) like GPT-3 and GPT-4 has revolutionized the field of natural language processing (NLP). These models are capable of performing a wide range of tasks, from generating coherent text to answering questions and summarizing information. Their effectiveness, however, is not without limitations. One significant constraint is the context window length, which affects how much information can be processed at once. LLMs operate within a fixed context window, measured in tokens, with GPT-3 having a limit of 4096 tokens and GPT-4 extending to 8192 tokens. When dealing with lengthy documents, attempting to input the entire text into the model's prompt can lead to truncation, where essential information is lost, and increased computational costs due to the processing of large inputs.

These limitations become particularly pronounced when creating a retrieval-based question-answering (QA) assistant. The context length constraint restricts the ability to input all content into the prompt simultaneously, leading to potential loss of critical context and details. This necessitates the development of sophisticated strategies for selectively retrieving and processing relevant sections of the document. Techniques such as chunking the document into manageable parts, employing summarization methods, and using external retrieval systems are crucial to address these challenges. Understanding and mitigating these limitations are essential for designing effective QA systems that leverage the full potential of LLMs while navigating their inherent constraints.

# 1) RAG with HuggingFace

For the RAG we will use:

```
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
context_encoder = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
```

```
question_encoder = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
```

```
decoder_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
decoder = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
```

## Dataset (context) preparation

We will use the following text:

In [3]:
filename = 'companyPolicies.txt'
url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/6JDbUb_L3egv_eOkouY71A.txt'

# Download only when the file is not there yet: re-running the cell should not
# hit the network again, and the `wget` package saves a second copy under a
# numbered name instead of overwriting an existing target.
if not os.path.exists(filename):
    wget.download(url, out = filename)
    print('file downloaded')
else:
    print('file already present, download skipped')

# Read the file back through the same `filename` variable used for the
# download, so the two can never point at different paths.
with open(filename, 'r', encoding='utf-8') as file:
    text = file.read()
print(text)

file downloaded
1.	Code of Conduct

Our Code of Conduct outlines the fundamental principles and ethical standards that guide every member of our organization. We are committed to maintaining a workplace that is built on integrity, respect, and accountability.
Integrity: We hold ourselves to the highest ethical standards. This means acting honestly and transparently in all our interactions, whether with colleagues, clients, or the broader community. We respect and protect sensitive information, and we avoid conflicts of interest.
Respect: We embrace diversity and value each individual's contributions. Discrimination, harassment, or any form of disrespectful behavior is unacceptable. We create an inclusive environment where differences are celebrated and everyone is treated with dignity and courtesy.
Accountability: We take responsibility for our actions and decisions. We follow all relevant laws and regulations, and we strive to continuously improve our practices. We report any potentia

We split it into paragraphs, with `text.split('\n')`:

In [4]:
def read_and_split_text(filename):
    """Load a UTF-8 text file and return its non-blank lines.

    The file content is split on newline characters; every piece is stripped
    of surrounding whitespace and pieces that end up empty are discarded.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    list of str
        The stripped, non-empty paragraphs in file order.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        raw_lines = handle.read().split('\n')

    # Strip first, then keep only what is left non-empty
    stripped_lines = (line.strip() for line in raw_lines)
    return [line for line in stripped_lines if len(line) > 0]

# Read the text file and split it into paragraphs (one entry per non-blank line)

paragraphs = read_and_split_text('companyPolicies.txt')
# Shuffle in place so that the samples are not ordered by the policy section
# they belong to. NOTE(review): no seed is set, so the order (and therefore
# every paragraph index shown later) changes from run to run.
random.shuffle(paragraphs)
paragraphs[2]  # peek at one paragraph after shuffling

'8.\tAnti-discrimination and Harassment Policy'

## Define the pre-trained context encoder

Let's use the Dense Passage Retriever (DPR) model, specifically the context encoder, to convert the preprocessed text data into dense vector embeddings. These embeddings capture the semantic meaning of the texts, enabling effective similarity-based retrieval. DPR models, such as the `DPRContextEncoder` and `DPRContextEncoderTokenizer`, are built on the BERT architecture but specialize in dense passage retrieval. They differ from BERT in their training, which focuses on contrastive learning for retrieving relevant passages, while BERT is more general-purpose and handles various NLP tasks. Passages are:
1. tokenized
2. encoded
3. aggregated into a single array (one embedding vector per passage)

At the end of this section, the function `encode_contexts` does everything together.

Ignore the warnings:

In [5]:
# Tokenizer loaded from the DPR context-encoder checkpoint
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
# DPR context encoder from the same checkpoint; the embedding dimension is 768
context_encoder = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.
Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.weight', 'ctx_encoder.bert_model.pooler.dense.bias']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mod

Usage example of the tokenizer:

In [6]:
# The input must be a list of pairs: [("sentence A", "sentence B"), (,), ..., (,)]
text_list = [("How are you?", "I am fine, I believe."), ("What's up?", "Not much.")]

# Tokenize the whole batch at once: shorter pairs are padded to the longest one
# and anything beyond 256 tokens is truncated
tokens_info = context_tokenizer(text_list, return_tensors = 'pt', padding = True, truncation = True, max_length = 256)

print(f"\t\t\t\t\t\tResult of 'context_tokenizer': \n\n {tokens_info} \n\n", "-"*160)

# The three tensors below all have size torch.Size([len(text_list), max length of sentence A + B]).
# These are bare expressions kept for reference: nothing is printed for them.
tokens_info['input_ids'].shape # vocabulary ids of the tokens
tokens_info['token_type_ids'].shape # BERT-like segment ids: they tell sentence A and sentence B of each pair apart
tokens_info['attention_mask'].shape # 1 for actual tokens, 0 for the padding added by padding = True

print(f"\t\t\t\t\tResult of 'context_tokenizer.convert_ids_to_tokens'\n\n")
for s in tokens_info['input_ids']: # map the ids back to readable tokens
   print(f"{context_tokenizer.convert_ids_to_tokens(s)}")

						Result of 'context_tokenizer': 

 {'input_ids': tensor([[ 101, 2129, 2024, 2017, 1029,  102, 1045, 2572, 2986, 1010, 1045, 2903,
         1012,  102],
        [ 101, 2054, 1005, 1055, 2039, 1029,  102, 2025, 2172, 1012,  102,    0,
            0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])} 

 ----------------------------------------------------------------------------------------------------------------------------------------------------------------
					Result of 'context_tokenizer.convert_ids_to_tokens'


['[CLS]', 'how', 'are', 'you', '?', '[SEP]', 'i', 'am', 'fine', ',', 'i', 'believe', '.', '[SEP]']
['[CLS]', 'what', "'", 's', 'up', '?', '[SEP]', 'not', 'much', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]']


Usage example of the encoder:

`context_encoder(**tokens_info)`is equivalent to 
```
context_encoder(tokens_info['input_ids'],
                tokens_info['token_type_ids'],
                tokens_info['attention_mask'])
```

In [7]:
# Run the encoder a single time and reuse its output: calling it once per
# f-string placeholder repeats the same forward pass for no benefit.
encoder_output = context_encoder(**tokens_info) # full model output
pooled_embeddings = encoder_output.pooler_output # only the embedding tensor, without the other information

print(f"\t\t\t\t\tEmbedding tensors with shape {pooled_embeddings.shape}: \n\n {pooled_embeddings} \n ")

					Embedding tensors with shape torch.Size([2, 768]): 

 tensor([[ 0.1607,  0.7112, -0.0994,  ..., -0.3211,  0.6649, -0.0329],
        [ 0.6606,  0.3294,  0.3890,  ..., -0.0723,  0.3644, -0.1266]],
       grad_fn=<SliceBackward0>) 
 


Usage example of the preparation of the context encoding. Notice that if we write `segments` instead of `[segments]`, the resulting shape would be [2,768] instead of the correct [1,768]. This is because Hugging Face interprets the tuple `(,)` as two distinct samples, instead of a single sentence pair.

In [8]:
# Encode each sentence pair separately and collect the resulting embeddings
text_list = [("How are you?", "I am fine, I believe."), ("What's up?", "Not much.")]
embeddings = []
for segments in text_list:

    # Wrapping the pair in a list makes the tokenizer see one sample made of two sentences
    input_of_encoder = context_tokenizer([segments], return_tensors = 'pt', padding = True, truncation = True, max_length = 256)
    embedding_tensor = context_encoder(**input_of_encoder).pooler_output # embedding_tensor.shape is torch.Size([1, 768])
    embeddings.append(embedding_tensor)


# Aggregation: stack the per-pair embeddings and convert to a NumPy array of shape (2, 768)
torch.cat(embeddings).detach().numpy()

array([[ 0.16070484,  0.711192  , -0.0993585 , ..., -0.3211333 ,
         0.66492313, -0.03289769],
       [ 0.6606474 ,  0.32937145,  0.389035  , ..., -0.07225327,
         0.36436284, -0.1265702 ]], dtype=float32)

The previous cell but converted in a function:

In [9]:
def encode_contexts(text_list):
    """Encode passages into dense vectors with the DPR context encoder.

    Every element of `text_list` is tokenized and encoded on its own (so no
    padding from other passages is involved), and the per-passage embeddings
    are stacked into one array.

    Parameters
    ----------
    text_list : list
        Passages to encode. Each element is either a single string or a
        ("sentence A", "sentence B") tuple.

    Returns
    -------
    numpy.ndarray
        Array with one row per passage; with the checkpoint loaded in this
        notebook each row has 768 components.

    Notes
    -----
    Relies on the module-level `context_tokenizer` and `context_encoder`.
    NOTE(review): an empty `text_list` is not handled; `torch.cat` is
    expected to reject an empty list.
    """
    embeddings = []
    # Pure inference: without no_grad every forward pass would keep its
    # autograd graph alive until the final detach, wasting time and memory.
    with torch.no_grad():
        for segments in text_list:
            # [segments] -> one sample; a bare tuple would be read as two samples
            input_of_encoder = context_tokenizer([segments], return_tensors = 'pt', padding = True, truncation = True, max_length = 256)
            embedding_tensor = context_encoder(**input_of_encoder).pooler_output
            embeddings.append(embedding_tensor)
    return torch.cat(embeddings).detach().numpy() # aggregate

Usage example:

In [10]:
# Same toy sentence pairs as before, this time encoded through the helper
text_list = [("How are you?", "I am fine, I believe."), ("What's up?", "Not much.")]
encode_contexts(text_list)  # shown as the cell result: one row per pair

array([[ 0.16070484,  0.711192  , -0.0993585 , ..., -0.3211333 ,
         0.66492313, -0.03289769],
       [ 0.6606474 ,  0.32937145,  0.389035  , ..., -0.07225327,
         0.36436284, -0.1265702 ]], dtype=float32)

So for the paragraphs in our original text:

In [11]:
# this takes 45 sec 

# Encode every paragraph of the policy document (one encoder call per paragraph)
context_embeddings = encode_contexts(paragraphs)
context_embeddings.shape # (len(paragraphs) = 76, 768)

(76, 768)

## FAISS index / vector space

`vector_space = faiss.IndexFlatL2(d)` initializes a d-dimensional vector space, so that after we can add the vectorial representation that we have obtained with the `encode_contexts` function in the previous step. After we have filled out our `vector_space`, we can perform a similarity search (faiss uses squared L2 distance).

In [12]:
# FAISS works on a float32 matrix of shape (n_vectors, dim); make that explicit
context_embeddings_np = np.ascontiguousarray(context_embeddings, dtype = 'float32')
context_embeddings_np.shape # (len(paragraphs) = 76, 768)

# Take the dimension from the embeddings themselves instead of hard-coding 768,
# so the index always matches whatever encoder produced them
embedding_dim = context_embeddings_np.shape[1]
# Flat (exact) index using squared L2 distance.
# NOTE(review): DPR encoders are usually compared by inner product; consider
# faiss.IndexFlatIP if ranking quality matters - confirm before switching.
vector_space = faiss.IndexFlatL2(embedding_dim)
vector_space.add(context_embeddings_np)

If we want, we can also compute the maximum distance among the embedded context vectors. This can be useful to normalize the distances in the retrieval process:

In [13]:
from scipy.spatial.distance import pdist, squareform

# Pairwise Euclidean distances between all context embeddings, expanded from
# the condensed form returned by pdist into a full square matrix
dist_matrix = squareform(pdist(context_embeddings_np, metric = 'euclidean'))

# Largest distance between any two contexts; it is squared because the FAISS
# index above reports squared L2 distances
max_distance = np.max(dist_matrix)
max_distance_squared = max_distance ** 2
# NOTE(review): this bound is computed among context vectors only; a
# question-to-context distance is not guaranteed to stay below it.

print(f"Max squared L2 distance: {max_distance_squared:.4f}")

Max squared L2 distance: 209.3562


## Define the pre-trained  question encoder

In [14]:
# DPR question encoder and its tokenizer, both loaded from the same checkpoint
question_encoder = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')

Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.weight', 'question_encoder.bert_model.pooler.dense.bias']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Retrieval process

Now we have all the encoders and the distance metric (faiss), so we can retrieve. First, process an example query by converting the raw text question into a format that the DPR question encoder can understand, and then encode it into a dense vector. Using the encoded question, search the prebuilt FAISS index to find the most relevant contexts. This step showcases the practical use of the FAISS index in retrieving information based on query similarity.

After conducting the search for relevant contexts based on the question embedding, the output consists of two key components:

- **D (Distances)**: This array contains the distances between the query embedding and the retrieved document embeddings. The distances measure the similarity between the query and each document, where lower distances indicate higher relevance. These values help determine how closely each retrieved context matches the query.

- **I (Indices)**: This array holds the indices of the paragraphs within the `paragraphs` array that have been identified as the most relevant to the query. These indices correspond to the positions of the paragraphs in the original data array, allowing for easy retrieval of the actual text content.

The combination of `D` and `I` provides both a quantitative measure of relevance and the specific content that is most relevant, enabling a comprehensive response to the user's query.

**Source of confusion**: even though the following cell never mentions the context explicitly, the context embeddings are already stored in the FAISS index, which is queried with `vector_space.search`.

In [15]:
# Encode the question with the DPR question encoder
question = 'Drug and Alcohol Policy'
question_inputs = question_tokenizer(question, return_tensors = 'pt')
question_embedding = question_encoder(**question_inputs).pooler_output
question_embedding_np = question_embedding.detach().numpy() # FAISS is queried with NumPy arrays

# Search on the vector_space
number_of_top_results = 5
# D holds the distances and I the paragraph indices; row 0 belongs to our single question
D, I = vector_space.search(question_embedding_np, k = number_of_top_results)  # Retrieve top 5 relevant contexts
print(f"Distances: {D[0]}")
# Distances expressed as a percentage of the largest squared distance among the contexts
print(f"Normalized distances (percentages): {np.round(D[0]/max_distance_squared*100,2)}%")
print(f"Indices: {I[0]}")

Distances: [72.76531  74.7162   84.388115 88.36438  90.287125]
Normalized distances (percentages): [34.76 35.69 40.31 42.21 43.13]%
Indices: [45 21  9 30 18]


In [16]:
# Show the retrieved paragraphs in the order returned by the search,
# each with its normalized distance
print(f"\t\t\t\t\t\t\tTop {number_of_top_results} relevant contexts:")
for i, idx in enumerate(I[0]):
    print("-"*160)
    print(f"Result number {i+1} with distance  {np.round(D[0][i]/max_distance_squared*100,2)}%: \n {paragraphs[idx]}")

							Top 5 relevant contexts:
----------------------------------------------------------------------------------------------------------------------------------------------------------------
Result number 1 with distance  34.76%: 
 6.	Drug and Alcohol Policy
----------------------------------------------------------------------------------------------------------------------------------------------------------------
Result number 2 with distance  35.69%: 
 Policy Objective: The Drug and Alcohol Policy is established to establish clear expectations and guidelines for the responsible use of drugs and alcohol within the organization. This policy aims to maintain a safe, healthy, and productive workplace.
----------------------------------------------------------------------------------------------------------------------------------------------------------------
Result number 3 with distance  40.31%: 
 Testing and Searches: The organization reserves the right to conduct drug and alcohol

Let's convert the above to a function:

In [17]:
def search_relevant_contexts(question, question_tokenizer, question_encoder, vector_space, k = 5, display = False):
    """Retrieve the `k` contexts closest to a question from a FAISS index.

    Parameters
    ----------
    question : str
        The user question.
    question_tokenizer : callable
        Tokenizer matching `question_encoder`; called with
        `return_tensors='pt'`.
    question_encoder : callable
        Encoder whose output exposes the embedding as `.pooler_output`.
    vector_space : faiss index
        Index already filled with the context embeddings.
    k : int
        Number of contexts to retrieve.
    display : bool
        If true, also print the retrieved paragraphs with their normalized
        distances. This reads the module-level `paragraphs` and
        `max_distance_squared`.

    Returns
    -------
    (D, I)
        The distances and the indices returned by `vector_space.search`.
    """
    question_inputs = question_tokenizer(question, return_tensors='pt')
    # Inference only: no autograd graph is needed for the question embedding
    with torch.no_grad():
        question_embedding = question_encoder(**question_inputs).pooler_output
    question_embedding_np = question_embedding.detach().numpy()

    # Search the index to retrieve top k relevant contexts
    D, I = vector_space.search(question_embedding_np, k)
    if display:
        # Use the function's own `k` here, not a notebook-level variable
        print(f"\t\t\t\t\t\t\tTop {k} relevant contexts:")
        for i, idx in enumerate(I[0]):
            # FAISS pads with -1 when the index holds fewer than k vectors;
            # without this guard -1 would silently print the last paragraph
            if idx < 0:
                continue
            print("-"*160)
            print(f"Result number {i+1} with distance  {np.round(D[0][i]/max_distance_squared*100,2)}%: \n {paragraphs[idx]}")

    return D, I

Usage example:

In [18]:
# Same query as in the step-by-step cells, now through the helper with printing enabled
question = 'Drug and Alcohol Policy'
number_of_top_results = 5
D, I = search_relevant_contexts(question, question_tokenizer, question_encoder, vector_space, k = number_of_top_results, display = True)

							Top 5 relevant contexts:
----------------------------------------------------------------------------------------------------------------------------------------------------------------
Result number 1 with distance  34.76%: 
 6.	Drug and Alcohol Policy
----------------------------------------------------------------------------------------------------------------------------------------------------------------
Result number 2 with distance  35.69%: 
 Policy Objective: The Drug and Alcohol Policy is established to establish clear expectations and guidelines for the responsible use of drugs and alcohol within the organization. This policy aims to maintain a safe, healthy, and productive workplace.
----------------------------------------------------------------------------------------------------------------------------------------------------------------
Result number 3 with distance  40.31%: 
 Testing and Searches: The organization reserves the right to conduct drug and alcohol

## From retrieval to final answer: decoder

We will use GPT-2, but it is not the best choice: GPT-2 is trained for next-token prediction, not for context-based question answering. Better choices could be "google/flan-t5-base", "facebook/bart-large", "allenai/t5-small-squad2" or "deepset/tinyroberta-squad2".

In [19]:
# GPT-2 tokenizer and causal language model used as the generator
decoder_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
decoder = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
# Reuse the end-of-sequence token as padding token, and tell the model about it
decoder_tokenizer.pad_token = decoder_tokenizer.eos_token
decoder.config.pad_token_id = decoder_tokenizer.pad_token_id

Now we build the final answer using the decoder. The function `generate_answer` will receive the question (the same question received by `search_relevant_contexts`) and the contexts coming from `search_relevant_contexts`.

In [20]:
# Approach from the IBM course: the question and all the top contexts are
# concatenated into one prompt. It is fast and simple, but less precise.

def generate_answer(question, contexts, max_new_tokens = 50):
    """Generate a continuation for the question followed by all the contexts.

    Parameters
    ----------
    question : str
        The user question.
    contexts : list of str
        Retrieved passages, appended to the question separated by spaces.
    max_new_tokens : int
        Number of tokens to generate after the prompt (default 50).

    Returns
    -------
    str
        The decoded output sequence. It starts with the prompt itself
        (question + contexts), followed by the generated text.

    Notes
    -----
    Relies on the module-level `decoder` and `decoder_tokenizer`.
    """
    input_text = question + ' ' + ' '.join(contexts) # Concatenate the retrieved contexts to form the input to GPT2

    # The prompt and the generated tokens share the model's position window.
    # Truncating the prompt to the full window and then asking for more tokens
    # would run past it, so leave room for the generation.
    context_window = getattr(decoder.config, "max_position_embeddings", 1024)
    prompt_budget = max(1, context_window - max_new_tokens)
    inputs = decoder_tokenizer(input_text, return_tensors = 'pt', max_length = prompt_budget, truncation = True)

    summary_ids = decoder.generate(
                            inputs['input_ids'],
                            attention_mask = inputs['attention_mask'],  # tells generate which positions are real tokens
                            max_new_tokens = max_new_tokens, min_length = 40,
                            length_penalty = 2.0,
                            num_beams = 4,
                            early_stopping = True,
                            pad_token_id = decoder_tokenizer.pad_token_id)

    return decoder_tokenizer.decode(summary_ids[0], skip_special_tokens = True)

# More precise function, but slower

def my_generate_answer(question, contexts):
    """Generate one candidate answer per retrieved context.

    Each context is paired with the question (joined by a newline) and sent to
    the decoder on its own, so the decoder is called once per context.

    Args:
        question: the user question (string).
        contexts: iterable of context strings.

    Returns:
        A list with one decoded string per context, in the same order.
    """
    generation_options = dict(
        max_new_tokens = 50, min_length = 40,
        length_penalty = 2.0,
        num_beams = 4,
        early_stopping = True,
        pad_token_id = decoder_tokenizer.pad_token_id,
    )

    def _answer_for(context):
        # Build the prompt for this single context and decode the best beam.
        encoded = decoder_tokenizer(question + '\n' + context,
                                    return_tensors = 'pt',
                                    max_length = 512,
                                    truncation = True)
        output_ids = decoder.generate(encoded['input_ids'],
                                      attention_mask = encoded['attention_mask'],
                                      **generation_options)
        return decoder_tokenizer.decode(output_ids[0], skip_special_tokens = True)

    # A scoring metric could be applied here to pick the best candidate;
    # for now every candidate is returned.
    return [_answer_for(context) for context in contexts]

In [21]:
# End-to-end demo: retrieve contexts for a question, then generate answers with both strategies.
# Runtime: about 1 minute.
question = "what is mobile policy?"

# Retrieve the 5 closest paragraphs; only the second returned value (the indices) is used.
# I[0] is presumably the index row for this single query -- confirm against search_relevant_contexts.
_, I = search_relevant_contexts(question, question_tokenizer, question_encoder, vector_space, k = 5, display = False)
top_contexts = [paragraphs[idx] for idx in I[0]] 

# One answer from all contexts concatenated vs. one candidate answer per context.
answer = generate_answer(question, top_contexts)
my_answers = my_generate_answer(question, top_contexts)
print(f"\t\t\t\t\t\t\tgenerate_answer Results:\n\n{answer}\n","-"*160)
print(f"\t\t\t\t\t\t\tmy_generate_answer Results:\n\n")
for i, my_answer in enumerate(my_answers):
    print(f"Result {i}: {my_answer}")



							generate_answer Results:

what is mobile policy? 4.	Mobile Phone Policy The Mobile Phone Policy sets forth the standards and expectations governing the appropriate and responsible usage of mobile devices in the organization. The purpose of this policy is to ensure that employees utilize mobile phones in a manner consistent with company values and legal compliance. Monitoring: The company retains the right to monitor internet and email usage for security and compliance purposes. Acceptable Use: Mobile devices are primarily intended for work-related tasks. Limited personal usage is allowed, provided it does not disrupt work obligations. The Mobile Phone Policy is aimed at promoting the responsible and secure use of mobile devices in line with legal and ethical standards. Every employee is expected to comprehend and abide by these guidelines. Regular reviews of the policy ensure its ongoing alignment with evolving technology and security best practices.

The Mobile Phone Policy set

# 2) (simplified) RAG with PyTorch 

Similarity-based retrieval of static answers using BERT mean embeddings.

In this chapter we want to understand whether a song is suitable for children, and to do so we use similarity-based retrieval. We have:
1. a pre-defined set of questions, `song_questions`;
2. the lyrics of a song, such as `sesame_street_lyrics` and `my_shoe_lyrics`;
3. a list of predefined answers, **not generated**, which is `yes_responses`.

**Very important**: notice that the questions and answers are paired one-to-one (the i-th answer belongs to the i-th question); this is why the answer embeddings will not be used.

## Dataset: questions and answers

Questions and answers:

In [22]:
# Fixed questions asked about every song. Each entry combines the question
# with an "Example:" rephrasing in the same string.
song_questions = [
    "Does this song contain any violent themes, such as references to guns, killing, or physical aggression? Example: Does the song describe or promote physical violence, like fighting or shootings?",
    "Are there any explicit lyrics or bad words used in this song that might be considered offensive or inappropriate? Example: Does the song use language commonly recognized as profanity or derogatory terms?",
    "Is the overall content of this song suitable for children, considering its themes, language, and messages? Example: Are there elements in the song that could be deemed too mature or unsuitable for young listeners?",
    "Does this song explicitly mention weapons, such as guns, knives, or other similar items? Example: Are specific types of weapons described or glorified in the lyrics?",
    "Are the messages conveyed in this song positive and uplifting for children? Example: Does the song promote values like kindness, friendship, and positivity?",
    "Does this song include any sexual content, references to sexual behavior, or suggestive language? Example: Are there lyrics that explicitly or implicitly discuss sexual themes or experiences?",
    "Does this song offer any educational value, such as teaching the alphabet, basic math, or other learning content? Example: Are there educational segments in the song that could help children learn fundamental skills like the ABCs or counting?",
    "Does this song promote emotional resilience and social skills among children? Example: Does the song include themes of overcoming challenges or building friendships?"
]

# Predefined responses, aligned by position with song_questions: the response
# at index i is printed when question i is among the best matches for a song.
# NOTE(review): despite the list name, the entry at index 2 starts with "No,".
yes_responses = [
    "Yes, this song contains violent themes, including references to guns, killing, or physical aggression, and is not suitable for children.",
    "Yes, this song includes explicit lyrics or bad words that might be considered offensive or inappropriate for young audiences.",
    "No, the overall content of this song is not suitable for children as it includes themes, language, and messages that are too mature or unsuitable for young listeners.",
    "Yes, this song explicitly mentions weapons, such as guns and knives, which could be disturbing or inappropriate for children’s entertainment.",
    "Yes, the messages conveyed in this song are positive and uplifting, promoting values like kindness, friendship, and positivity, beneficial for children.",
    "Yes, this song includes sexual content and references to sexual behavior or suggestive language, which are inappropriate for a child-friendly environment.",
    "Yes, this song offers significant educational value, including segments that teach the alphabet, basic math, and other learning content, making it both fun and educational for children.",
    "Yes, this song promotes emotional resilience and social skills, incorporating themes about overcoming challenges and building friendships, which are essential for children's development."
]

Song lyrics:

In [23]:
# Raw lyrics of the first example song, one line per verse line
# (the literal starts and ends with a newline).
sesame_street_lyrics = """
Sunny day
Sweepin' the clouds away
On my way to where the air is sweet
Can you tell me how to get
How to get to Sesame Street?

Come and play
Everything's A-okay
Friendly neighbors there
That's where we meet
Can you tell me how to get
How to get to Sesame Street?

It's a magic carpet ride
Every door will open wide
To happy people like you
Happy people like
What a beautiful

Sunny day
Sweepin' the clouds away
On my way to where the air is sweet
Can you tell me how to get
How to get to Sesame Street?
How to get to Sesame Street?
How to get to Sesame Street?
How to get to Sesame Street?
How to get to Sesame Street?
"""

# Raw lyrics of the second example song.
# NOTE(review): the variable is called my_shoe_lyrics, but the text below is
# about Barney -- confirm whether the name or the lyrics are the intended ones.
my_shoe_lyrics = """Barney is a dinosaur from our imagination
And when he's tall
He's what we call a dinosaur sensation
Barney's friends are big and small
They come from lots of places
After school they meet to play
And sing with happy faces
Barney shows us lots of things
Like how to play pretend
ABC's, and 123's
And how to be a friend
Barney comes to play with us
Whenever we may need him
Barney can be your friend too
If you just make-believe him!"""

In [24]:
def process_song(song):
    """Flatten song lyrics into a one-element list holding a single string.

    Every newline is turned into a space and every single quote (') is
    dropped; nothing else is changed.

    Args:
        song: the raw lyrics as one string.

    Returns:
        A list containing exactly one string: the cleaned lyrics.
    """
    # One translation table performs both substitutions in a single pass.
    cleanup_table = {ord("\n"): " ", ord("'"): None}
    return [song.translate(cleanup_table)]

# Replace each raw lyrics string with its processed form; after this cell both
# names refer to the one-element list returned by process_song.
sesame_street_lyrics = process_song(sesame_street_lyrics)
my_shoe_lyrics = process_song(my_shoe_lyrics)

## Tokenizer and Model (for both questions and context)

Why Use BERT Instead of DPR?
Because:

- The task is more semantic and classification-oriented:
The goal is to check whether a piece of text has certain attributes (e.g., violent content), not to retrieve the most relevant document from a large corpus.

- BERT is sufficient and more general-purpose for computing mean embeddings:
It captures general semantic information well for tasks like similarity and classification.

- DPR would be overkill for this use case:
It's optimized for retrieval scenarios involving millions of documents and queries, not for small-scale semantic matching.

- BERT is more stable on small datasets:
Since it's not fine-tuned for any specific retrieval task, it performs more consistently across varied inputs.

In [25]:
# Load the pretrained 'bert-base-uncased' tokenizer and model; the model is moved to `device`.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)

Usage example of tokenizer:

In [26]:
# Batch of two inputs: a sentence pair (tuple) and a single sentence.
# Note that ("There are other models") is a plain string, not a 1-tuple.
input_text = [("This is an example sentence for BERT embeddings.", "How do you like it "),("There are other models")]
tokens_info = bert_tokenizer(input_text,
                            add_special_tokens = True,
                            padding = True,
                            truncation = True,
                            return_tensors = 'pt')

print(f"Mask: {tokens_info['attention_mask']}")
text = bert_tokenizer.decode(tokens_info['input_ids'][0])
print(text)

# bert_model was moved to `device`, so its inputs must be on the same device.
tokens_info = tokens_info.to(device)

# A single forward pass yields both outputs; no gradients are needed for this demo.
with torch.no_grad():
    bert_outputs = bert_model(**tokens_info)

word_embedding = bert_outputs.pooler_output      # shape torch.Size([2, 768])
token_embedding = bert_outputs.last_hidden_state # shape torch.Size([2, 20, 768])
                                                 # 20 max len(tokens_info['input_ids'][i])

Mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
[CLS] this is an example sentence for bert embeddings. [SEP] how do you like it [SEP]


The following function will not be used:

In [27]:
def text_to_emb(input_text, tokenizer = bert_tokenizer, model = bert_model, max_length_input = 512):
    """Return the pooled (`pooler_output`) embedding of each input text.

    Args:
        input_text: a string, or a list of strings / (text, text_pair) tuples,
            as accepted by the tokenizer.
        tokenizer: tokenizer used to encode the text (defaults to `bert_tokenizer`).
        model: encoder whose `pooler_output` is returned (defaults to `bert_model`).
        max_length_input: inputs are truncated to this many tokens.

    Returns:
        Tensor of shape [len(input_text), hidden_dim]. Gradients are tracked,
        since the forward pass is not wrapped in `torch.no_grad()`.
    """
    tokens_info = tokenizer(input_text,
                        add_special_tokens = True,
                        padding = True,
                        truncation = True,
                        max_length = max_length_input,
                        return_tensors='pt')

    input_ids = tokens_info['input_ids'].to(device)
    attention_mask = tokens_info['attention_mask'].to(device)

    # Call the model supplied by the caller, not the module-level bert_model,
    # so that the `model` argument is actually honoured.
    pooled_embedding = model(input_ids, attention_mask = attention_mask).pooler_output
    return pooled_embedding  # shape: [len(input_text), hidden_dim]

Usage example:

In [28]:
# Same two-item batch as before: one sentence pair and one single sentence.
input_text = [("This is an example sentence for BERT embeddings.", "How do you like it "),("There are other models")]

# Pooled embedding for each of the two inputs.
word_embedding = text_to_emb(input_text, bert_tokenizer, bert_model, 512)

# Last expression of the cell, so the notebook displays the tensor.
word_embedding

tensor([[-0.8875, -0.4939, -0.8772,  ..., -0.8416, -0.7311,  0.8905],
        [-0.7748, -0.2365,  0.1868,  ...,  0.1140, -0.5093,  0.7607]],
       grad_fn=<TanhBackward0>)

## Aggregate with mean for retrieval

Here, you'll compute aggregated mean embeddings for input sequences using the BERT model you just loaded. It processes each pair of token IDs and attention masks from the input data, extracts word embeddings for non-padded tokens, and calculates their mean. The result is a list of mean embeddings for each sequence, which is then concatenated into a single tensor. This process allows for the generation of simplified yet informative representations of the input sequences, useful for tasks like clustering, similarity search, or as input to downstream models. Each document must be under 512 tokens.

Below, we define a function that does the same.

In [29]:
def aggregate_embeddings(input_text, tokenizer = bert_tokenizer, model = bert_model, max_length_input = 512):
    """Return one mean-pooled embedding per input text.

    The texts are tokenized as a padded batch, encoded by `model` without
    gradient tracking, and the last hidden states of the non-padding tokens
    are averaged for each text.

    Args:
        input_text: a string or list of strings / text pairs accepted by the tokenizer.
        tokenizer: tokenizer used to encode the text (defaults to `bert_tokenizer`).
        model: encoder providing `last_hidden_state` (defaults to `bert_model`).
        max_length_input: inputs are truncated to this many tokens.

    Returns:
        Tensor of shape [len(input_text), hidden_dim].
    """
    encoded = tokenizer(input_text,
                        add_special_tokens = True,
                        padding = True,
                        truncation = True,
                        max_length = max_length_input,
                        return_tensors = 'pt')

    ids = encoded['input_ids'].to(device)        # [batch, max_seq_len]
    mask = encoded['attention_mask'].to(device)  # [batch, max_seq_len]

    with torch.no_grad():
        hidden_states = model(ids, mask).last_hidden_state  # [batch, max_seq_len, hidden_dim]

    # Zero out padding positions, then average over the real tokens only.
    mask_3d = mask.unsqueeze(-1)                      # [batch, max_seq_len, 1]
    summed = (hidden_states * mask_3d).sum(dim = 1)   # [batch, hidden_dim]
    token_counts = mask_3d.sum(dim = 1)               # [batch, 1]
    return summed / token_counts                      # [batch, hidden_dim]

Usage example:

In [30]:
# Same two-item batch as before: one sentence pair and one single sentence.
input_text = [("This is an example sentence for BERT embeddings.", "How do you like it "),("There are other models")]

# Mean of the non-padding token embeddings for each of the two inputs.
mean_embeddings = aggregate_embeddings(input_text, bert_tokenizer, bert_model, 512)

# Last expression of the cell, so the notebook displays the tensor.
mean_embeddings

tensor([[ 0.0122, -0.3505, -0.0670,  ..., -0.0368, -0.1658,  0.3197],
        [-0.1538, -0.1765,  0.1797,  ..., -0.1320,  0.3122,  0.0646]])

## Embeddings and answer retrieval

**Very important**: notice that the questions and answers are paired one-to-one; this is why the answer embeddings will not be used. In general, if the answers are not associated 1-1 with the questions, or if each question has multiple answers, it would be useful to compute all three similarity products (Q-A, Q-song, A-song) and combine the results.

In [31]:
# Mean-pooled embeddings of the predefined questions (default tokenizer and model).
embeddings_questions = aggregate_embeddings(song_questions) #torch.Size([8, 768])
# The responses are not embedded: they are looked up by question index instead.
# embeddings_responses = aggregate_embeddings(yes_responses) #torch.Size([8, 768]) --> not used

# Mean-pooled embeddings of the two processed songs (each a one-element list).
#songs
embeddings_sesame_street = aggregate_embeddings(sesame_street_lyrics) #torch.Size([1, 768])
embeddings_my_shoe = aggregate_embeddings(my_shoe_lyrics) #torch.Size([1, 768])

In [32]:
def semantic_QA_with_similarity(embeddings_questions, embeddings_songs, n_responses = 3, distance = "dot", responses = None):
    """Print the predefined responses whose questions best match the song(s).

    Every question embedding is scored against every song embedding; the
    (question, song) pairs are ranked by score, highest first, and for each of
    the top `n_responses` pairs the response paired with that question is printed.

    Args:
        embeddings_questions: tensor of shape [n_questions, hidden_dim].
        embeddings_songs: tensor of shape [n_songs, hidden_dim].
        n_responses: number of top-ranked pairs to print.
        distance: "dot" for the raw dot product, "cosine" for cosine similarity.
        responses: sequence of responses aligned by index with the questions.
            Defaults to the module-level `yes_responses`.

    Raises:
        ValueError: if `distance` is neither "dot" nor "cosine".
    """
    if distance not in ("dot", "cosine"):
        raise ValueError(f"Unknown distance {distance!r}; expected 'dot' or 'cosine'.")

    if responses is None:
        responses = yes_responses

    # Pairwise dot products, shape [n_questions, n_songs].
    scores = embeddings_questions @ embeddings_songs.T

    if distance == "cosine":
        question_norms = torch.norm(embeddings_questions, dim = 1, keepdim = True)  # [n_questions, 1]
        song_norms = torch.norm(embeddings_songs, dim = 1, keepdim = True)          # [n_songs, 1]
        # Transpose the song norms so the product broadcasts to [n_questions, n_songs].
        scores = scores / (question_norms * song_norms.T)

    # Rank all (question, song) pairs from most to least similar.
    ranked_flat_indices = torch.argsort(scores.reshape(-1), descending = True).tolist()

    # The scores were flattened row by row, so integer division by the number
    # of songs recovers the question (row) index of each pair.
    n_songs = embeddings_songs.shape[0]
    for flat_index in ranked_flat_indices[:n_responses]:
        print(responses[flat_index // n_songs])

In [33]:
# Print the 3 predefined responses whose questions score highest (dot product) against the Sesame Street lyrics.
semantic_QA_with_similarity(embeddings_questions, embeddings_sesame_street, n_responses = 3, distance = 'dot')

Yes, the messages conveyed in this song are positive and uplifting, promoting values like kindness, friendship, and positivity, beneficial for children.
Yes, this song offers significant educational value, including segments that teach the alphabet, basic math, and other learning content, making it both fun and educational for children.
Yes, this song includes explicit lyrics or bad words that might be considered offensive or inappropriate for young audiences.
