#### Libraries

In [1]:
########################## UTILITY AND SYSTEM ##########################

import os                       # filesystem operations
import csv                      # reading/writing CSV files
import json                     # JSON parsing and serialization
import math                     # basic math functions
import random                   # random number generation
import time                     # time-related functions
import tempfile                 # temporary file management
import tarfile                  # tar archive handling
import io                       # input/output streams
import pickle                   # object serialization
import importlib                # dynamic import of modules
import multiprocessing          # parallel process management
import pkg_resources            # package and dependency management
from copy import deepcopy       # deep copy of objects
from pathlib import Path        # filesystem paths handling (cross-platform)

########################## DOWNLOAD ##########################

import requests                 # HTTP requests library
import wget                     # file downloads from URLs
from urllib.request import urlopen  # open URLs (alternative to requests)

########################## VISUALIZATION ##########################

import matplotlib.pyplot as plt # basic plotting library
import plotly.graph_objs as go  # interactive plotting
from tqdm.notebook import tqdm  # progress bars for loops in notebooks
from pprint import pprint       # formatted pretty-printing of objects

########################## DATAFRAME ##########################

import numpy as np              # numerical arrays and operations
import pandas as pd             # dataframes and data manipulation
from sklearn.manifold import TSNE

########################## TEXT PROCESSING ##########################

import re                      # regular expressions
import string                  # string constants and operations
from itertools import chain, islice  # advanced iteration and chaining

########################## TOKENIZATION ##########################

from collections import Counter, OrderedDict  # frequency counts and ordered dictionaries
import nltk                                   # natural language processing toolkit
from nltk.tokenize import word_tokenize       # word tokenization
import spacy                                  # advanced NLP (tokenization, parsing)
from torchtext.data.utils import get_tokenizer       # torchtext tokenizers
from torchtext.data.functional import to_map_style_dataset

from torchtext.vocab import build_vocab_from_iterator # build vocabulary from iterator

########################## DATASET AND DATALOADER ##########################

from torch.utils.data import Dataset, DataLoader, random_split   # datasets and data loading utilities
from torch.nn.utils.rnn import pad_sequence                      # padding variable-length sequences
from datasets import load_dataset, DatasetDict                   # HuggingFace datasets loading
from torchtext.datasets import AG_NEWS                           # torchtext built-in datasets

########################## PYTORCH AND DEEP LEARNING ##########################

import torch                             # PyTorch main library
from torch import nn, Tensor             # neural network modules and tensors
from torch.nn import CrossEntropyLoss    # common loss function for classification

########################## WORD EMBEDDING ##########################

from torchtext.vocab import GloVe        # pretrained GloVe embeddings
# from gensim.models import Word2Vec     # word2vec embeddings from corpus (commented out)

########################## HUGGING FACE ##########################

import transformers                      # transformers library core
from transformers import (
    GPT2Tokenizer, GPT2LMHeadModel,     # GPT-2 tokenizer and model
    BertTokenizer, BertTokenizerFast, BertConfig, BertForMaskedLM,  # BERT components
    XLNetTokenizer,                     # XLNet tokenizer
    DistilBertForSequenceClassification, DistilBertTokenizer, AutoModelForSequenceClassification,
    pipeline,                          # easy pipelines for inference
    AutoTokenizer,                    # auto tokenizer loader
    AutoModelForCausalLM, GPT2ForSequenceClassification,
    DataCollatorForLanguageModeling, TrainingArguments, Trainer,  # training utilities
    set_seed, GenerationConfig,
    BertModel                        # BERT base model
)
from datasets import DatasetDict         # HuggingFace dataset dictionaries

######################### TRL & PEFT (TRAINING & PARAMETER EFFICIENT FINE-TUNING) ##########################

from trl import (
    SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM,
    DPOConfig, DPOTrainer,
    RewardTrainer, RewardConfig
)
from peft import get_peft_model, LoraConfig, TaskType
from torchmetrics import Accuracy        # metrics for evaluation

########################## RAG ##########################

from transformers import (
    DPRQuestionEncoder, DPRQuestionEncoderTokenizer,
    DPRContextEncoder, DPRContextEncoderTokenizer
)
import faiss                              # similarity search library

########################## EVALUATION ##########################

import evaluate



  import pkg_resources            # package and dependency management


In [2]:
def accelerator(where = "mps"):
    if where == "mps":
        device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
        print("Which device we are on: {}".format(device))
        return device
    if where == "cuda":
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print("Which device we are on: {}".format(device))
        return device
    if where == "cpu":
        device = torch.device("cpu")
        print("Which device we are on: {}".format(device))
        return device

device = accelerator("cpu")

Which device we are on: cpu


<span style="background-color: yellow"></span>

# 0) CONCEPTS: RAG, FAISS, Prompt Engineering, LangChain

# 1) RAG with HuggingFace

The relevant context usually is in the form of type_of_file-content (es code_of_conduct-content)

In [3]:
filename = 'companyPolicies.txt'
url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/6JDbUb_L3egv_eOkouY71A.txt'

# Use wget to download the file
wget.download(url, out=filename)
print('file downloaded')

def read_and_split_text(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        text = file.read()
    # Split the text into paragraphs (simple split by newline characters)
    paragraphs = text.split('\n')
    # Filter out any empty paragraphs or undesired entries
    paragraphs = [para.strip() for para in paragraphs if len(para.strip()) > 0]
    return paragraphs

# Read the text file and split it into paragraphs

paragraphs = read_and_split_text('companyPolicies.txt')
random.shuffle(paragraphs) #shuffling samples so that the samples are not ordered based on the category they belong to
paragraphs[0]

file downloaded


'Treatment and Assistance: Employees with substance abuse issues are encouraged to seek help. The organization is committed to providing support, resources, and information to assist those seeking treatment.'

## Build the context encoder

Let's use the Dense Passage Retriever (DPR) model, specifically the context encoder, to convert your preprocessed text data into dense vector embeddings. These embeddings capture the semantic meanings of the texts, enabling effective similarity-based retrieval. DPR models, such as the the DPRContextEncoder and DPRContextEncoderTokenizer, are built on the BERT architecture but specialize in dense passage retrieval. They differ from BERT in their training, which focuses on contrastive learning for retrieving relevant passages, while BERT is more general-purpose, handling various NLP tasks. Passages are:
1. tokenize
2. encode
3. aggregate in a single vector

At the end of this section there is a function which does everything together

Ignore the warnings:

In [4]:
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', use_safetensors = True)
context_encoder = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', use_safetensors = True)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/492 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Usage Example:

In [5]:
text = [("How are you?", "I am fine."), ("What's up?", "Not much.")]

tokens_info = context_tokenizer(text, return_tensors = 'pt', padding = True, truncation = True, max_length = 256)

print(tokens_info,'\n')

for s in tokens_info['input_ids']: #get the original text
   print(context_tokenizer.convert_ids_to_tokens(s))

context_encoder(**tokens_info) #embedding
print(context_encoder(**tokens_info).pooler_output,'\n') #embedding PyTorch tensor without other informations

embeddings=[]
for text in text:
    inputs = context_tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=256)
    outputs = context_encoder(**inputs)
    embeddings.append(outputs.pooler_output)
    print("number of samples:")
    print(len(embeddings))
    print(" samples shape:")
    print(outputs.pooler_output.shape) #the shape is [2,768]: 2 is len(text[i]) while 768 is the embedding dimension

torch.cat(embeddings).detach().numpy().shape #this is the aggregation

  return forward_call(*args, **kwargs)


{'input_ids': tensor([[ 101, 2129, 2024, 2017, 1029,  102, 1045, 2572, 2986, 1012,  102],
        [ 101, 2054, 1005, 1055, 2039, 1029,  102, 2025, 2172, 1012,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])} 

['[CLS]', 'how', 'are', 'you', '?', '[SEP]', 'i', 'am', 'fine', '.', '[SEP]']
['[CLS]', 'what', "'", 's', 'up', '?', '[SEP]', 'not', 'much', '.', '[SEP]']
tensor([[ 0.1901,  0.6006, -0.1140,  ..., -0.3477,  0.6554,  0.0928],
        [ 0.6606,  0.3294,  0.3890,  ..., -0.0723,  0.3644, -0.1266]],
       grad_fn=<SliceBackward0>) 

number of samples:
1
 samples shape:
torch.Size([2, 768])
number of samples:
2
 samples shape:
torch.Size([2, 768])


(4, 768)

In [6]:
def encode_contexts(text_list):
    embeddings = []
    for text in text_list:
        inputs = context_tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=256) #tokenized input
        outputs = context_encoder(**inputs)
        embeddings.append(outputs.pooler_output) #embedding vectors in
    return torch.cat(embeddings).detach().numpy() #aggregate

Example of usage:

In [7]:
encode_contexts(text)

array([[ 0.70083404,  0.5377584 ,  0.5114269 , ..., -0.32141268,
         0.4081831 , -0.07802014],
       [ 0.53100616, -0.2688355 , -0.08287751, ...,  0.24389295,
         0.11508092,  0.06219082]], dtype=float32)

So for the paragraphs in our original text:

In [8]:
context_embeddings = encode_contexts(paragraphs)

## FAISS index

In [9]:
embedding_dim = 768  # This should match the dimension of your embeddings
context_embeddings_np = np.array(context_embeddings).astype('float32')

index = faiss.IndexFlatL2(embedding_dim)
index.add(context_embeddings_np)  # Add the context embeddings to the index

## Build the question encoder

In [10]:
question_encoder = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base', use_safetensors = True)
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base', use_safetensors = True)

config.json:   0%|          | 0.00/493 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

## Retrival process

Now we have all the encoders and the distance metric (faiss), so now we can retrive. First, process an example query by converting the raw text question into a format that the DPR question encoder can understand and then encode it into a dense vector. Using the encoded question, search your prebuilt FAISS index to find the most relevant contexts. This step showcases the practical use of the FAISS index in retrieving information based on query similarity.

After conducting the search for relevant contexts based on the question embedding, the output consists of two key components:

- **D (Distances)**: This array contains the distances between the query embedding and the retrieved document embeddings. The distances measure the similarity between the query and each document, where lower distances indicate higher relevance. These values help determine how closely each retrieved context matches the query.

- **I (Indices)**: This array holds the indices of the paragraphs within the `paragraphs` array that have been identified as the most relevant to the query. These indices correspond to the positions of the paragraphs in the original data array, allowing for easy retrieval of the actual text content.

The combination of `D` and `I` provides both a quantitative measure of relevance and the specific content that is most relevant, enabling a comprehensive response to the user's query.

**Source of confusion**: even if in the following cell there is no direct mention to the context, it is already encapsulated in the faiss, which is called with `index.search`

In [11]:
question = 'Drug and Alcohol Policy'
question_inputs = question_tokenizer(question, return_tensors='pt')
question_embedding = question_encoder(**question_inputs).pooler_output.detach().numpy()

# Search the index
D, I = index.search(question_embedding, k=5)  # Retrieve top 5 relevant contexts
print("D:",D)
print("I:",I)

D: [[72.76533 74.71622 84.3881  88.36439 90.28713]]
I: [[ 3 33 55 29 40]]


In [12]:
print("Top 5 relevant contexts:")
for i, idx in enumerate(I[0]):
    print(f"{i+1}: {paragraphs[idx]}")
    print(f"distance {D[0][i]}\n")

Top 5 relevant contexts:
1: 6.	Drug and Alcohol Policy
distance 72.76532745361328

2: Policy Objective: The Drug and Alcohol Policy is established to establish clear expectations and guidelines for the responsible use of drugs and alcohol within the organization. This policy aims to maintain a safe, healthy, and productive workplace.
distance 74.71621704101562

3: Testing and Searches: The organization reserves the right to conduct drug and alcohol testing as per applicable laws and regulations. Employees may be subject to testing in cases of reasonable suspicion, post-accident, or as part of routine workplace safety measures.
distance 84.38809967041016

4: 9.	Discipline and Termination Policy
distance 88.36438751220703

5: Monitoring: The company retains the right to monitor internet and email usage for security and compliance purposes.
distance 90.2871322631836



Let's convert the above to a function:

In [13]:
def search_relevant_contexts(question, question_tokenizer, question_encoder, index, k = 5):
    # Tokenize the question
    question_inputs = question_tokenizer(question, return_tensors='pt')

    # Encode the question to get the embedding
    question_embedding = question_encoder(**question_inputs).pooler_output.detach().numpy()

    # Search the index to retrieve top k relevant contexts
    D, I = index.search(question_embedding, k)

    return D, I

## From retrival to final answer: decoder

In [14]:
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
model.generation_config.pad_token_id = tokenizer.pad_token_id

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Now we compare the answer from the very same question: the first answer comes directly from the decoder (without context), the second uses the retrival process we have built:

In [15]:
def generate_answer_without_context(question):
    # Tokenize the input question
    inputs = tokenizer(question, return_tensors = 'pt', max_length = 1024, truncation = True)
    
    # Generate output directly from the question without additional context
    summary_ids = model.generate(inputs['input_ids'], max_length = 150, min_length = 40, length_penalty = 2.0,
                                 num_beams = 4, early_stopping = True, pad_token_id = tokenizer.eos_token_id)
    
    # Decode and return the generated text
    answer = tokenizer.decode(summary_ids[0], skip_special_tokens = True)
    return answer

def generate_answer(question, contexts):
    # Concatenate the retrieved contexts to form the input to GPT2
    input_text = question + ' ' + ' '.join(contexts)
    inputs = tokenizer(input_text, return_tensors = 'pt', max_length = 1024, truncation = True)

    # Generate output using GPT2
    summary_ids = model.generate(inputs['input_ids'], max_new_tokens = 50, min_length = 40, length_penalty = 2.0,
                                 num_beams = 4, early_stopping = True, pad_token_id = tokenizer.eos_token_id)
    return tokenizer.decode(summary_ids[0], skip_special_tokens = True)

In [16]:
question = "what is mobile policy?"
answer = generate_answer_without_context(question)

print("Answer:", answer)

_, I = search_relevant_contexts(question, question_tokenizer, question_encoder, index, k=5)
top_contexts = [paragraphs[idx] for idx in I[0]] 
answer = generate_answer(question, top_contexts)
print("Answer:", answer)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Answer: what is mobile policy?

Mobile policy is a set of rules and regulations that govern the use of mobile phones and other electronic devices in the United States. Mobile policy is a set of rules and regulations that govern the use of mobile phones and other electronic devices in the United States. Mobile policy is a set of rules and regulations that govern the use of mobile phones and other electronic devices in the United States. Mobile policy is a set of rules and regulations that govern the use of mobile phones and other electronic devices in the United States. Mobile policy is a set of rules and regulations that govern the use of mobile phones and other electronic devices in the United States. Mobile policy is a set of rules and regulations that govern the use of mobile phones and other
Answer: what is mobile policy? 4.	Mobile Phone Policy The Mobile Phone Policy sets forth the standards and expectations governing the appropriate and responsible usage of mobile devices in the 

# 2) RAG with PyTorch (Similarity-based retrieval of static answers using BERT mean embeddings)

In this notebook 3) we want to understand if a song is ok for children, and for doing this we use a similarity-based retrival. We have:
1. pre-defined set of questions `song_questions`;
2. lyrics of a song as `sesame_street` and `my_shoe_lyrics`;
3. list of predefined answers, not generated, which is `yes_responses`.

**Very important**: notice the Q and A are associated, this is the reason for which the answers embedding will not be used.

In [17]:
song_questions = [
    "Does this song contain any violent themes, such as references to guns, killing, or physical aggression? Example: Does the song describe or promote physical violence, like fighting or shootings?",
    "Are there any explicit lyrics or bad words used in this song that might be considered offensive or inappropriate? Example: Does the song use language commonly recognized as profanity or derogatory terms?",
    "Is the overall content of this song suitable for children, considering its themes, language, and messages? Example: Are there elements in the song that could be deemed too mature or unsuitable for young listeners?",
    "Does this song explicitly mention weapons, such as guns, knives, or other similar items? Example: Are specific types of weapons described or glorified in the lyrics?",
    "Are the messages conveyed in this song positive and uplifting for children? Example: Does the song promote values like kindness, friendship, and positivity?",
    "Does this song include any sexual content, references to sexual behavior, or suggestive language? Example: Are there lyrics that explicitly or implicitly discuss sexual themes or experiences?",
    "Does this song offer any educational value, such as teaching the alphabet, basic math, or other learning content? Example: Are there educational segments in the song that could help children learn fundamental skills like the ABCs or counting?",
    "Does this song promote emotional resilience and social skills among children? Example: Does the song include themes of overcoming challenges or building friendships?"
]

sesame_street = """
Sunny day
Sweepin' the clouds away
On my way to where the air is sweet
Can you tell me how to get
How to get to Sesame Street?

Come and play
Everything's A-okay
Friendly neighbors there
That's where we meet
Can you tell me how to get
How to get to Sesame Street?

It's a magic carpet ride
Every door will open wide
To happy people like you
Happy people like
What a beautiful

Sunny day
Sweepin' the clouds away
On my way to where the air is sweet
Can you tell me how to get
How to get to Sesame Street?
How to get to Sesame Street?
How to get to Sesame Street?
How to get to Sesame Street?
How to get to Sesame Street?
"""

my_shoe_lyrics="""Barney is a dinosaur from our imagination
And when he's tall
He's what we call a dinosaur sensation
Barney's friends are big and small
They come from lots of places
After school they meet to play
And sing with happy faces
Barney shows us lots of things
Like how to play pretend
ABC's, and 123's
And how to be a friend
Barney comes to play with us
Whenever we may need him
Barney can be your friend too
If you just make-believe him!"""

yes_responses = [
    "Yes, this song contains violent themes, including references to guns, killing, or physical aggression, and is not suitable for children.",
    "Yes, this song includes explicit lyrics or bad words that might be considered offensive or inappropriate for young audiences.",
    "No, the overall content of this song is not suitable for children as it includes themes, language, and messages that are too mature or unsuitable for young listeners.",
    "Yes, this song explicitly mentions weapons, such as guns and knives, which could be disturbing or inappropriate for children’s entertainment.",
    "Yes, the messages conveyed in this song are positive and uplifting, promoting values like kindness, friendship, and positivity, beneficial for children.",
    "Yes, this song includes sexual content and references to sexual behavior or suggestive language, which are inappropriate for a child-friendly environment.",
    "Yes, this song offers significant educational value, including segments that teach the alphabet, basic math, and other learning content, making it both fun and educational for children.",
    "Yes, this song promotes emotional resilience and social skills, incorporating themes about overcoming challenges and building friendships, which are essential for children's development."
]

In [18]:
def process_song(song):
    # Remove line breaks from the song
    song_new = re.sub(r'[\n]', ' ', song)
    
    # Remove single quotes from the song
    processed_song = [song_new.replace("\'", "")]
    
    return processed_song

sesame_street= process_song(sesame_street)
my_shoe_lyrics= process_song(my_shoe_lyrics)

## Tokenizer and Model (for both questions and context)

Why Use BERT Instead of DPR?
Because:

- The task is more semantic and classification-oriented:
The goal is to check whether a piece of text has certain attributes (e.g., violent content), not to retrieve the most relevant document from a large corpus.

- BERT is sufficient and more general-purpose for computing mean embeddings:
It captures general semantic information well for tasks like similarity and classification.

- DPR would be overkill for this use case:
It's optimized for retrieval scenarios involving millions of documents and queries, not for small-scale semantic matching.

- BERT is more stable on small datasets:
Since it's not fine-tuned for any specific retrieval task, it performs more consistently across varied inputs.

`batch_encode_plus` method is used for tokenizing text. It automatically handles padding and truncation to ensure uniformity in input length, which is crucial for batch processing in models like BERT. `attention_mask`: Identifies which tokens should be focused on, differentiating real content from padding.

In [19]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)

Usage example:

In [20]:
input_text = [("This is an example sentence for BERT embeddings.", "How do you like it "),("There are other models")]
input_ids = tokenizer.batch_encode_plus(input_text,add_special_tokens=True,padding=True,truncation=True)
print(input_ids['attention_mask'])
text=tokenizer.decode(input_ids['input_ids'][0])
print(text)

input_ids_tensors = torch.tensor(input_ids['input_ids']).to(device)
mask_tensors = torch.tensor(input_ids['attention_mask']).to(device)
word_embding = bert_model(input_ids_tensors,mask_tensors)

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
[CLS] this is an example sentence for bert embeddings. [SEP] how do you like it [SEP]


## Aggregate with mean for retrival

Here, you'll compute aggregated mean embeddings for input sequences using the BERT model you just loaded. It processes each pair of token IDs and attention masks from the input data, extracts word embeddings for non-padded tokens, and calculates their mean. The result is a list of mean embeddings for each sequence, which is then concatenated into a single tensor. This process allows for the generation of simplified yet informative representations of the input sequences, useful for tasks like clustering, similarity search, or as input to downstream models. Each document must be under 512 tokens.

Importantly, the output of this function, as for the case 2) with Hugging face has dimension [2,768].

Below, we define a function that does the same.

In [21]:
# Initialize a list to store the mean embeddings for each input sequence
aggregated_mean_embeddings = []

# Loop over each pair of input_ids and attention_masks
for token_ids, attention_mask in tqdm(zip(input_ids['input_ids'], input_ids['attention_mask'])):
    # Convert list of token ids and attention mask to tensors
    token_ids_tensor = torch.tensor([token_ids]).to(device)
    attention_mask_tensor = torch.tensor([attention_mask]).to(device)
    print("token_ids_tensor shape:",token_ids_tensor.shape, attention_mask_tensor.shape)  # Print the shapes of the input tensors
    with torch.no_grad():  # Disable gradient calculations for faster execution
        # Retrieve the batch of word embeddings from the BERT model
        embeddings = bert_model(token_ids_tensor, attention_mask=attention_mask_tensor)[0].squeeze(0)
        print("Word embeddings shape:", embeddings.shape)
        
        # Count and print the number of zero-padding embeddings
        num_zero_paddings = (attention_mask_tensor == 0).sum().item()
        print("Number of zero padding embeddings:", num_zero_paddings)
        
        # Create a mask for positions that are not zero-padded
        valid_embeddings_mask = attention_mask_tensor[0] != 0
        print("valid_embeddings_mask:",valid_embeddings_mask)
        
        # Filter out the embeddings corresponding to zero-padded positions
        filtered_embeddings = embeddings[valid_embeddings_mask, :]
        print("Word embeddings after zero padding embeddings removed:", filtered_embeddings.shape)
        
        # Compute the mean of the filtered embeddings
        mean_embedding = filtered_embeddings.mean(axis=0)
        print("Mean embedding shape:", mean_embedding.shape)
    
        # Append the mean embedding to the list, adding a batch dimension
        aggregated_mean_embeddings.append(mean_embedding.unsqueeze(0))

# Concatenate all mean embeddings to form a single tensor
aggregated_mean_embeddings = torch.cat(aggregated_mean_embeddings)
print('All mean embeddings shape:', aggregated_mean_embeddings.shape)

0it [00:00, ?it/s]

token_ids_tensor shape: torch.Size([1, 20]) torch.Size([1, 20])
Word embeddings shape: torch.Size([20, 768])
Number of zero padding embeddings: 0
valid_embeddings_mask: tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True])
Word embeddings after zero padding embeddings removed: torch.Size([20, 768])
Mean embedding shape: torch.Size([768])
token_ids_tensor shape: torch.Size([1, 20]) torch.Size([1, 20])
Word embeddings shape: torch.Size([20, 768])
Number of zero padding embeddings: 14
valid_embeddings_mask: tensor([ True,  True,  True,  True,  True,  True, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False])
Word embeddings after zero padding embeddings removed: torch.Size([6, 768])
Mean embedding shape: torch.Size([768])
All mean embeddings shape: torch.Size([2, 768])


In [22]:
def aggregate_embeddings(input_ids, attention_masks, bert_model=bert_model):
    mean_embeddings = []
    # Process each sequence in the batch
    print('number of inputs',len(input_ids))
    for input_id, mask in tqdm(zip(input_ids, attention_masks)):
        input_ids_tensor = torch.tensor([input_id]).to(device)
        mask_tensor = torch.tensor([mask]).to(device)

        with torch.no_grad():
            # Obtain the word embeddings from the BERT model
            word_embeddings = bert_model(input_ids_tensor, attention_mask=mask_tensor)[0].squeeze(0)

            # Filter out the embeddings at positions where the mask is zero 
            valid_embeddings_mask=mask_tensor[0] != 0 
            valid_embeddings = word_embeddings[valid_embeddings_mask,:]
            # Compute the mean of the filtered embeddings
            mean_embedding = valid_embeddings.mean(dim=0)
            mean_embeddings.append(mean_embedding.unsqueeze(0))

    # Concatenate the mean embeddings from all sequences in the batch
    aggregated_mean_embeddings = torch.cat(mean_embeddings)
    return aggregated_mean_embeddings

In [23]:
def text_to_emb(list_of_text, max_input = 512):
    data_token_index  = tokenizer.batch_encode_plus(list_of_text, add_special_tokens=True,padding=True,truncation=True,max_length=max_input)
    question_embeddings=aggregate_embeddings(data_token_index['input_ids'], data_token_index['attention_mask'])
    return question_embeddings

## Embeddings and answer retrival

**Very important**: notice the Q and A are associated, this is the reason for which the answers embedding will not be used. In general, if the answers are not associated 1-1 to the questions, or if for each question there are multiple answers, then it should be useful to compute all the 3 products (Q-A, Q-song, A-song), and combine the results.

In [24]:
embeddings_questions = text_to_emb(song_questions) #torch.Size([8, 768])
# embeddings_responses = text_to_emb(yes_responses) #torch.Size([8, 768]) --> not used

#songs
embeddings_sesame_street = text_to_emb(sesame_street) #torch.Size([1, 768])
embeddings_my_shoe = text_to_emb(my_shoe_lyrics) #torch.Size([1, 768])

number of inputs 8


0it [00:00, ?it/s]

number of inputs 1


0it [00:00, ?it/s]

number of inputs 1


0it [00:00, ?it/s]

In [25]:
def RAG_QA(embeddings_questions, embeddings, n_responses = 3, distance = "dot"):
    question_norms = torch.norm(embeddings_questions, dim=1, keepdim=True)
    response_norms = torch.norm(embeddings, dim=1, keepdim=True)


    # Calculate the dot product between the question embeddings and the provided embeddings (transpose of the second matrix for proper alignment).
    dot_product = embeddings_questions @ embeddings.T

    
    # Calculate cosine similarity by dividing the dot product by the product of the magnitudes
    cosine_similarity = dot_product / (question_norms * response_norms)

    # Flatten the cosine similarity tensor to a 1D tensor for easier processing
    cosine_similarity = cosine_similarity.reshape(-1)
    
    # Reshape the dot product results to a 1D tensor for easier processing.
    dot_product = dot_product.reshape(-1)

    if distance == "dot":
        # Sort the indices of the dot product results in descending order (setting descending to False should be True for typical similarity tasks).
        sorted_indices = torch.argsort(dot_product, descending = True)

    if distance == "cosine":
        sorted_indices = torch.argsort(cosine_similarity, descending =True)

    # Convert sorted indices to a list for easier iteration.
    sorted_indices = sorted_indices.tolist()

    # Print the top 'n_responses' responses from the sorted list, which correspond to the highest dot product values.
    for index in sorted_indices[:n_responses]:
        print(yes_responses[index])

In [26]:
RAG_QA(embeddings_questions, embeddings_sesame_street, n_responses = 3, distance = 'dot')

Yes, the messages conveyed in this song are positive and uplifting, promoting values like kindness, friendship, and positivity, beneficial for children.
Yes, this song offers significant educational value, including segments that teach the alphabet, basic math, and other learning content, making it both fun and educational for children.
Yes, this song includes explicit lyrics or bad words that might be considered offensive or inappropriate for young audiences.


## Retrival Augmented Generation (RAG) and Facebook AI Similarity (FAISS)

**RAG** is a framework that helps optimize the output of LLMs without re-training the model, and by using an internal database of a company for example. To do this, RAG comprises two main components:
1. **The retriver**: the retriver combine:
    1. **encoded prompt**: a high-dimensional vectorial representation of the prompt (which is translated in a vector using a **question encoder**). The question encoder, at it ends, does an average;
    2. **relevant context**: an 'internal' database (obtained using a **context encoder** from internal documents of the company). ;

   The retrival combine the relevant context and the prompt matching similar vectors in the embedded spaces. As vector similarities, we can use the dot product for the magnitude and the cosine similarity for the direction.

2. **The generator**: using the data from the retriver, it answers to the user using a **decoder** (use `BartForConditionalGeneration` and `BartTokenizer`)

For the context encoder use `DPRContextEncoderTokenizer` from `transformers`, which reads list of tuples, and `DPRContextEncoder` .

For the question encoder use `DPRQuestionEncoderTokenizer` and `DPRQuestionEncoder`

Library for compute the distance importing `faiss`. 

## In-context learning and prompt engineering

**In-context learning** is a method of doing prompt engineering, and in particular we give to the model demonstration of the task provided.
- Advantages: 
    1. Does not require fine-tuning --> reduce time
    2. Improve performances
- Disadvantages:
    1. Limited to what fit in-context (what example can I realistically include in the prompt?)
    2. Complex tasks may require gradient steps and adjustments based on gradients
**Prompt Engineering**: prompts are divided in instructions and context (necessary background to do the task). PI is about how to ask a LLM questions in the best way possible. It is crucial to:
1. One-shot prompt: give one example of i.e. translation before asking to translate a sentence;
2. Few-shot prompts: giving some example of sentiment analysis before asking a new one;
3. Chain of thought: give an example and break it into steps for the solution to be effective;
4. Self consistency: 'when I was 6 my sister was half my age. Now I am 50, what age is my sister? Provide N independent calculations and explanations, then determine the most consistent result'. The model at the end choose the most frequent answer.

Where test the prompts? 
1. Playground;
2. LangChain: uses prompt templates, which include few shot examples.
3. HuggingFace;
4. IBM AI classroom;

## LangChain (chain of commands!)

Components of LangChain:

1. Language model: foundation of LLMs, using IBM, OpenAI, Google and Meta as primary language models
2. Chat model: efficient conversation
3. Chat message: efficient messages
4. Prompt templates: translate user questions into clear instructions
5. Output parser: transforms the output in suitable structured data


We can use LangChain Documents to use RAG, and also to build applications (unifying chains, or sequence of calls!).

In recent years, the development of Large Language Models (LLMs) like GPT-3 and GPT-4 has revolutionized the field of natural language processing (NLP). These models are capable of performing a wide range of tasks, from generating coherent text to answering questions and summarizing information. Their effectiveness, however, is not without limitations. One significant constraint is the context window length, which affects how much information can be processed at once. LLMs operate within a fixed context window, measured in tokens, with GPT-3 having a limit of 4096 tokens and GPT-4 extending to 8192 tokens. When dealing with lengthy documents, attempting to input the entire text into the model's prompt can lead to truncation, where essential information is lost, and increased computational costs due to the processing of large inputs.

These limitations become particularly pronounced when creating a retrieval-based question-answering (QA) assistant. The context length constraint restricts the ability to input all content into the prompt simultaneously, leading to potential loss of critical context and details. This necessitates the development of sophisticated strategies for selectively retrieving and processing relevant sections of the document. Techniques such as chunking the document into manageable parts, employing summarization methods, and using external retrieval systems are crucial to address these challenges. Understanding and mitigating these limitations are essential for designing effective QA systems that leverage the full potential of LLMs while navigating their inherent constraints.