 # Exercise 1: Tokenization with BERT

In [4]:
import torch
import transformers
from transformers import BertTokenizer

In [8]:
# Define the sentence
sentence = "This is a test sentence for tokenization"

In [21]:
# Load the BERT tokenizer
tokenizer_instance =  BertTokenizer.from_pretrained("bert-base-uncased")
sentence_tokenized = tokenizer_instance.tokenize(sentence)

In [27]:
# Print original and tokenized sentences
print("Original sentence:", sentence)
print("Tokenized sentence:", sentence_tokenized)

Original sentence: This is a test sentence for tokenization
Tokenized sentence: ['this', 'is', 'a', 'test', 'sentence', 'for', 'token', '##ization']


In [28]:
sentence_encoded = tokenizer_instance.encode_plus(sentence,
                                      add_special_tokens = True, # Add [CLS] and [SEP]
                                      padding = "max_length",  # Corrected spelling
                                      max_length = 15, # Limt sequence lenght
                                      return_tensors="pt") # convert to pytorch tensor

In [29]:
# Print encoded sentence (IDs)
print("\nEncoded sentence (token IDs):", sentence_encoded["input_ids"])


Encoded sentence (token IDs): tensor([[  101,  2023,  2003,  1037,  3231,  6251,  2005, 19204,  3989,   102,
             0,     0,     0,     0,     0]])


In [30]:
# Decode token IDs back into tokens to see special tokens added
decoded_tokens = tokenizer_instance.convert_ids_to_tokens(sentence_encoded["input_ids"][0])
print("\nDecoded Tokens:", decoded_tokens)


Decoded Tokens: ['[CLS]', 'this', 'is', 'a', 'test', 'sentence', 'for', 'token', '##ization', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']


 # Exercise 2: Sentiment Analysis with BERT Pipeline

 Objective: Utilize a pre-trained BERT model for sentiment analysis.

In [32]:
from transformers import pipeline

In [38]:
# Sentiment analysis pipeline
sentiment_pipeline = pipeline('sentiment-analysis')

# Question answering pipeline, specifying the checkpoint identifier
pipeline('question-answering', model='distilbert-base-cased-distilled-squad', tokenizer='bert-base-cased')


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu
Device set to use cpu


<transformers.pipelines.question_answering.QuestionAnsweringPipeline at 0x7d8b2fffff10>

In [34]:
text = "I really enjoyed this movie! It was fantastic."

In [39]:
#Get the sentiment of the text
result = sentiment_pipeline(text)

In [40]:
# Print the sentiment and confidence score
print("Predicted Sentiment:", result[0]["label"])
print("Confidence Score:", result[0]["score"])

Predicted Sentiment: POSITIVE
Confidence Score: 0.9998762607574463


 # Exercise 3: Building a Custom Sentiment Analyzer

 Objective: Develop a more robust sentiment analyzer with direct control over components.

In [41]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [63]:
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import re

class BertSentimentAnalyzer:
    def __init__(self, model_name="distilbert-base-uncased-finetuned-sst-2-english"):
        """Initialize the sentiment analyzer with BERT model and tokenizer."""

        # Define the model name
        self.model_name = model_name

        # Load tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)

        # Set device (use GPU if available)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)  # Move model to device

        # Define labels
        self.labels = ["Negative", "Positive"]

    def preprocess_text(self, text):
        """Preprocess the input text: clean, tokenize, pad, and convert to tensor."""

        # Step 1: Clean text (remove extra spaces)
        text = re.sub(r'\s+', ' ', text).strip()  # Fix: space removal should preserve words

        # Step 2: Tokenize using BERT tokenizer
        inputs = self.tokenizer(
            text,
            padding=True,
            truncation=True,
            return_tensors="pt"
        )

        # Step 3: Move inputs to the correct device (CPU/GPU)
        inputs = {key: value.to(self.device) for key, value in inputs.items()}

        return inputs

    def predict_sentiment(self, text):
        """Predict the sentiment of the input text."""

        # Preprocess the text
        inputs = self.preprocess_text(text)

        # Get model prediction
        with torch.no_grad():  # Disable gradient calculation (faster inference)
            outputs = self.model(**inputs)

        # Apply Softmax to get probabilities
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)  # Fix: nn.functional.softmax()

        # Extract sentiment prediction
        confidence, predicted_class = torch.max(probabilities, dim=1)
        sentiment_label = self.labels[predicted_class.item()]

        # Convert probabilities to list
        probs = probabilities.squeeze().tolist()

        return {
            "text": text,
            "predicted_sentiment": sentiment_label,
            "confidence_score": confidence.item(),
            "probabilities": {"NEGATIVE": probs[0], "POSITIVE": probs[1]}
        }



In [64]:
# Create an instance of the sentiment analyzer
sentiment_analyzer = BertSentimentAnalyzer()

# Test with sample texts
sample_texts = [
    "I love this product! It's amazing.",
    "This is the worst experience I've ever had.",
    "I'm feeling okay about the situation, but not great."
]

# Run predictions
for text in sample_texts:
    result = sentiment_analyzer.predict_sentiment(text)
    print("\nText:", result["text"])
    print("Predicted Sentiment:", result["predicted_sentiment"])
    print("Confidence Score:", result["confidence_score"])
    print("Detailed Probabilities:", result["probabilities"])


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]


Text: I love this product! It's amazing.
Predicted Sentiment: Positive
Confidence Score: 0.9998866319656372
Detailed Probabilities: {'NEGATIVE': 0.00011340079072397202, 'POSITIVE': 0.9998866319656372}

Text: This is the worst experience I've ever had.
Predicted Sentiment: Negative
Confidence Score: 0.9997679591178894
Detailed Probabilities: {'NEGATIVE': 0.9997679591178894, 'POSITIVE': 0.00023206845798995346}

Text: I'm feeling okay about the situation, but not great.
Predicted Sentiment: Negative
Confidence Score: 0.9990149736404419
Detailed Probabilities: {'NEGATIVE': 0.9990149736404419, 'POSITIVE': 0.0009850439382717013}


 # Exercise 4: Understanding BERT for Named Entity Recognition (NER)

  Explore how BERT can be used for NER tasks and understand the B-I-O tagging scheme.

In [69]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

class BERTNamedEntityRecognizer:
    def __init__(self):
        """Initialize the Named Entity Recognizer with BERT model and tokenizer."""

        # Define model name (pre-trained on NER tasks)
        model_name = "dslim/bert-base-NER"

        # Load tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForTokenClassification.from_pretrained(model_name)

        # Set device (use GPU if available)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)  # Move model to the device

        # Define an inference pipeline for convenience
        self.ner_pipeline = pipeline("ner", model=self.model, tokenizer=self.tokenizer, device=0 if torch.cuda.is_available() else -1)

    def recognize_entities(self, text):
        """Recognize named entities in the given text using the BERT NER model."""

        # Get predictions from the pipeline
        ner_results = self.ner_pipeline(text)

        # Format results for readability
        entities = []
        for entity in ner_results:
            entities.append({
                "word": entity["word"],
                "entity": entity["entity"],
                "score": round(entity["score"], 4),  # Round confidence score
                "start": entity["start"],
                "end": entity["end"]
            })

        return entities


In [70]:
# Create an instance of the NER model
ner_recognizer = BERTNamedEntityRecognizer()

# Test with a sample text
sample_text = "Albert Einstein was a German physicist who developed the theory of relativity. He was born in Ulm, Germany, and worked at the Institute for Advanced Study in Princeton, USA."

# Run entity recognition
entities = ner_recognizer.recognize_entities(sample_text)

# Print detected named entities
print("\nNamed Entities Detected:")
for entity in entities:
    print(entity)


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu



Named Entities Detected:
{'word': 'Albert', 'entity': 'B-PER', 'score': 0.9995, 'start': 0, 'end': 6}
{'word': 'Einstein', 'entity': 'I-PER', 'score': 0.9983, 'start': 7, 'end': 15}
{'word': 'German', 'entity': 'B-MISC', 'score': 0.9997, 'start': 22, 'end': 28}
{'word': 'U', 'entity': 'B-LOC', 'score': 0.9989, 'start': 94, 'end': 95}
{'word': '##lm', 'entity': 'I-LOC', 'score': 0.9731, 'start': 95, 'end': 97}
{'word': 'Germany', 'entity': 'B-LOC', 'score': 0.9996, 'start': 99, 'end': 106}
{'word': 'Institute', 'entity': 'B-ORG', 'score': 0.9991, 'start': 126, 'end': 135}
{'word': 'for', 'entity': 'I-ORG', 'score': 0.9989, 'start': 136, 'end': 139}
{'word': 'Advanced', 'entity': 'I-ORG', 'score': 0.999, 'start': 140, 'end': 148}
{'word': 'Study', 'entity': 'I-ORG', 'score': 0.9989, 'start': 149, 'end': 154}
{'word': 'Princeton', 'entity': 'B-LOC', 'score': 0.9942, 'start': 158, 'end': 167}
{'word': 'USA', 'entity': 'B-LOC', 'score': 0.9995, 'start': 169, 'end': 172}


# Exercise 5: Comparing BERT and GPT

Objective: Differentiate between BERT and GPT models based on their architecture and purpose.

1. GPT and BERT use the Transformer architecture, a neural network architecture designed to learn contextual relationships between words in a text using attention mechanisms. The attention mechanism allows the model to focus on specific parts of the text that are more relevant in the context of a given task.

In [75]:
import pandas as pd

# Créer un dictionnaire avec les données
data = {
    "Feature": [
        "Architecture", "Primary Purpose", "Training Approach",
        "Directionality", "Common Use Cases", "Strengths", "Weaknesses"
    ],
    "BERT": [
        "Encoder-only (Bidirectional)", "Text understanding (NLP comprehension)",
        "Masked Language Model (MLM)", "Bidirectional (full sentence context)",
        "Sentiment Analysis, NER, Question Answering, Text Classification",
        "- Deep context understanding\n- Good for classification/extraction",
        "- Cannot generate text\n- Needs task-specific fine-tuning"
    ],
    "GPT": [
        "Decoder-only (Autoregressive)", "Text generation (NLP creation)",
        "Causal Language Model (CLM)", "Unidirectional (left-to-right)",
        "Chatbots, Story Generation, Summarization, Translation",
        "- Fluent text generation\n- Can create human-like responses",
        "- Less accurate in comprehension tasks\n- Can generate hallucinations"
    ]
}

# Convertir en DataFrame
df = pd.DataFrame(data)


In [79]:
df.head()

Unnamed: 0,Feature,BERT,GPT
0,Architecture,Encoder-only (Bidirectional),Decoder-only (Autoregressive)
1,Primary Purpose,Text understanding (NLP comprehension),Text generation (NLP creation)
2,Training Approach,Masked Language Model (MLM),Causal Language Model (CLM)
3,Directionality,Bidirectional (full sentence context),Unidirectional (left-to-right)
4,Common Use Cases,"Sentiment Analysis, NER, Question Answering, T...","Chatbots, Story Generation, Summarization, Tra..."


Analysis of Differences and Similarities

1. Similarities

  *  Based on Transformers: BERT and GPT use the Transformer architecture, but they exploit different components (Encoder vs. Decoder).
  *  Use Pre-training and Fine-tuning: Both models are pre-trained on large amounts of data and then fine-tuned for specific tasks.
  
2.  Major Differences

  * BERT is an analysis model, while GPT is a generation model.
  * BERT is bidirectional (understands the entire sentence), whereas GPT is unidirectional (predicts one word at a time).
  * BERT is used for text understanding and classification, whereas GPT is used for interactive text generation.

# Exercise 6: Exploring BERT Applications in Retrieval-Augmented Generation (RAG)

Objective: Understand how BERT is used in RAG systems for information retrieval.

2. Explain how BERT is used in the retrieval component of a RAG system.
3. Describe the role of BERT in generating embeddings for documents and queries.
4. Explain how a vector database is used to store and search for relevant documents based on BERT embeddings.

1. Research the concept of Retrieval-Augmented Generation (RAG) and its components.

Retrieval-Augmented Generation (RAG) is an AI framework that combines information retrieval and text generation to improve the accuracy of responses in NLP tasks. Instead of relying solely on a language model’s internal knowledge, RAG retrieves relevant documents from an external database and uses them to generate fact-based answers. The two key components of RAG are the retriever, which finds relevant documents, and the generator, which synthesizes a response based on retrieved information.

2.  How is BERT Used in the Retrieval Component of a RAG System?


BERT is primarily used as a retriever in RAG systems. It encodes both queries and documents into vector embeddings, capturing deep semantic relationships rather than relying on keyword matching. This allows the system to retrieve the most contextually relevant documents. BERT’s bidirectional nature helps better understand the query’s intent, improving retrieval accuracy in comparison to traditional search algorithms.

3. Role of BERT in Generating Embeddings for Documents and Queries

BERT generates high-dimensional vector embeddings that represent the meaning of words and sentences. When a user submits a query, BERT converts it into an embedding and compares it to precomputed embeddings of documents stored in a database. This semantic matching technique ensures that the most contextually relevant documents are retrieved, even if they don’t share exact keywords with the query.

4. How a Vector Database is Used in a RAG System

A vector database stores precomputed BERT embeddings of documents, allowing for efficient similarity searches. When a user submits a query, its embedding is compared to stored embeddings using similarity metrics (e.g., cosine similarity) to identify the most relevant documents. Tools like FAISS, Pinecone, or ChromaDB enable fast searches across large datasets, significantly improving retrieval speed and accuracy in RAG systems.



5. Provide an example of how BERT and an LLM like GPT can work together in a RAG system to answer a user’s question.

 Scenario: A User Asks a Question

A retriever (using BERT embeddings + FAISS) fetches the most relevant documents from a database.

A generator (GPT model) reads these documents and constructs a coherent answer.