In [56]:
! pip install -q -U langchain langchain-community langchain-core
! pip install -q -U langchain-google-genai
! pip install -q -U langchain-qdrant langchain-huggingface qdrant-client

# **Part 01 Data Processing:**

In [3]:
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import google.generativeai as genai



In [4]:
# List of URLs for Formula 1 data
F1_URLS = [
    "https://en.wikipedia.org/wiki/Formula_One",
    "https://www.formula1.com/en/latest/all",
    "https://www.forbes.com/sites/brettknight/2023/11/29/formula-1s-highest-paid-drivers-2023/?sh=12bdb942463f",
    "https://www.autosport.com/f1/news/history-of-female-f1-drivers-including-grand-prix-starters-and-test-drivers/10584871/",
    "https://en.wikipedia.org/wiki/2023_Formula_One_World_Championship",
    "https://en.wikipedia.org/wiki/2022_Formula_One_World_Championship",
    "https://en.wikipedia.org/wiki/List_of_Formula_One_World_Drivers_Champions",
    "https://en.wikipedia.org/wiki/2024_Formula_One_World_Championship",
    "https://www.formula1.com/en/results.html/2024/races.html",
    "https://www.formula1.com/en/racing/2024.html",
]

In [5]:
# Load all the Web Pages
def load_documents(urls):
    """Loads documents from a list of URLs."""
    documents = []
    for url in urls:
        try:
            loader = WebBaseLoader(web_path=url)
            documents.extend(loader.load())
        except Exception as e:
            print(f"Error loading URL {url}: {e}")
    return documents

documents = load_documents(F1_URLS)
print(f"Loaded {len(documents)} documents from the folder.")

Loaded 10 documents from the folder.


In [6]:
# Splits this docs into chunks
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Splits documents into smaller chunks."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return splitter.split_documents(documents)

splits = split_documents(documents)
print(f"Split the documents into {len(splits)} chunks.")
print(splits)


Split the documents into 764 chunks.


In [7]:
# Create Embeddings of Splited Chunks

from google.colab import userdata
from langchain_google_genai import GoogleGenerativeAIEmbeddings

google_api_key = userdata.get('GOOGLE_API_KEY')

embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key = google_api_key)
document_embeddings = embeddings.embed_documents([split.page_content for split in splits])

print(f"Created embeddings for {len(document_embeddings)} document chunks.")
print(document_embeddings[0])

Created embeddings for 764 document chunks.
[0.007917269133031368, -0.08173193037509918, -0.08270405977964401, -0.03033973090350628, 0.06464509665966034, 0.010346879251301289, 0.040835585445165634, -0.007492202799767256, 0.004414689261466265, 0.032761190086603165, 0.014121844433248043, 0.020291652530431747, 0.009965235367417336, 0.013625736348330975, -0.037469372153282166, -0.01456529088318348, -0.012319916859269142, 0.00753260962665081, -0.044514965265989304, -0.05203144997358322, -0.009901804849505424, -0.0038192588835954666, -0.01097399927675724, 0.0009307469008490443, -0.0017920704558491707, -0.0453893318772316, -0.007304938975721598, -0.04263836890459061, 0.014059865847229958, 0.015230563469231129, -0.056148283183574677, 0.03578706458210945, 0.0009425699827261269, 0.006455218885093927, 0.01059637125581503, -0.05288814753293991, 0.018626417964696884, 0.09817661345005035, -0.01681940257549286, 0.08958519995212555, -0.03145492821931839, -0.04616164788603783, -0.007061547599732876, 0.

In [8]:
# Setup Quadrant Vetor Database

from qdrant_client import QdrantClient
from langchain_community.vectorstores import Qdrant
from qdrant_client.http.models import Distance, VectorParams

qudrant_url = userdata.get('QDRANT_URL')
qudrant_api_key = userdata.get('QDRANT_API_KEY')


def initialize_qdrant_collection(client, collection_name, vector_size):
    """Initializes or recreates a Qdrant collection."""
    client.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(
            size=vector_size, distance=Distance.COSINE)
    )
    print(f"Collection '{collection_name}' initialized successfully.")

qdrant_client = QdrantClient(url=qudrant_url, api_key=qudrant_api_key)

collection = initialize_qdrant_collection(client=qdrant_client, collection_name="f1_gpt", vector_size=384)

  client.recreate_collection(


Collection 'f1_gpt' initialized successfully.


In [9]:
# Create Embedding and Add this embedding to Vector Store

from langchain_qdrant import QdrantVectorStore
from langchain_huggingface import HuggingFaceEmbeddings

qudrant_url = userdata.get('QDRANT_URL')
qudrant_api_key = userdata.get('QDRANT_API_KEY')

def embed_and_store_documents(documents, collection_name):
    """Generates embeddings for documents and stores them in Qdrant."""

    embeddings = HuggingFaceEmbeddings(model_name='BAAI/bge-small-en-v1.5')

    QdrantVectorStore.from_documents(
        documents=documents,
        embedding=embeddings,
        url=qudrant_url,
        api_key=qudrant_api_key,
        collection_name=collection_name
    )
    print("Documents successfully stored in Qdrant.")

store_create = embed_and_store_documents(splits, "f1_gpt")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Documents successfully stored in Qdrant.


# **Part 02 Perform Retrieval and Generation:**

In [25]:
# use Vector store as a retevier
from langchain_qdrant import QdrantVectorStore, RetrievalMode
from google.colab import userdata

qudrant_url = userdata.get('QDRANT_URL')
qudrant_api_key = userdata.get('QDRANT_API_KEY')
google_api_key = userdata.get('GOOGLE_API_KEY')

# Assuming 'embeddings' was defined earlier with GoogleGenerativeAIEmbeddings
embeddings = HuggingFaceEmbeddings(model_name='BAAI/bge-small-en-v1.5')

vector_store = QdrantVectorStore.from_existing_collection(
    embedding=embeddings,
    url=qudrant_url,
    api_key=qudrant_api_key,
    collection_name="f1_gpt",
    retrieval_mode=RetrievalMode.DENSE,
)

query = "Who is the highest-paid F1 driver?"
found_docs = vector_store.similarity_search(query,k=5)

print(found_docs)

[Document(metadata={'source': 'https://en.wikipedia.org/wiki/Formula_One', 'title': 'Formula One - Wikipedia', 'language': 'en', '_id': 'e48966e9-128b-47e5-baee-2b2dd5b78dba', '_collection_name': 'f1_gpt'}, page_content='A number of Formula One drivers earn the highest salary of any drivers in auto racing. The highest-paid driver in 2021 is Lewis Hamilton, who received $55\xa0million in salary from Mercedes AMG Petronas F1 – a record for any driver.[227] The very top Formula One drivers get paid more than IndyCar or NASCAR drivers; however, the earnings immediately fall off after the top three F1 drivers, and the majority of NASCAR racers will make more money than their F1 counterparts.[228] Most top IndyCar drivers are paid around a tenth of their Formula One counterparts.[227]\nIn the second quarter of 2020, Formula One reported a loss revenue of $122\xa0million and an income of $24\xa0million. This was a result of the delay of the racing championship start as a result of the COVID-1

In [19]:
print(found_docs[0].metadata)
print(found_docs[0].metadata['source'])
print(found_docs[0].metadata['title'])
print(found_docs[0].metadata['_collection_name'])

{'source': 'https://en.wikipedia.org/wiki/Formula_One', 'title': 'Formula One - Wikipedia', 'language': 'en', '_id': 'e48966e9-128b-47e5-baee-2b2dd5b78dba', '_collection_name': 'f1_gpt'}
https://en.wikipedia.org/wiki/Formula_One
Formula One - Wikipedia
f1_gpt


In [21]:
print(found_docs[0].page_content)

A number of Formula One drivers earn the highest salary of any drivers in auto racing. The highest-paid driver in 2021 is Lewis Hamilton, who received $55 million in salary from Mercedes AMG Petronas F1 – a record for any driver.[227] The very top Formula One drivers get paid more than IndyCar or NASCAR drivers; however, the earnings immediately fall off after the top three F1 drivers, and the majority of NASCAR racers will make more money than their F1 counterparts.[228] Most top IndyCar drivers are paid around a tenth of their Formula One counterparts.[227]
In the second quarter of 2020, Formula One reported a loss revenue of $122 million and an income of $24 million. This was a result of the delay of the racing championship start as a result of the COVID-19 pandemic. The company grossed revenues of $620 million for the same quarter the previous year.[229]


In [24]:
print(found_docs[1].page_content)
print(found_docs[1].metadata.get('source'))
print(found_docs[1].metadata.get('title'))

Formula 1’s Highest-Paid Drivers 2023Subscribe To NewslettersBETATHIS IS A BETA EXPERIENCE. OPT-OUT HEREForbesBusinessSportsMoneyEdit StoryDaily CoverFormula 1’s Highest-Paid Drivers 2023Grid Ironmen: Max Verstappen and Lewis Hamilton are F1's two top earners for the third straight year... [+]Grid Ironmen: Max Verstappen and Lewis Hamilton are F1's two top earners for the third straight year. [-] Minas Panagiotakis/GettyBrett KnightForbes StaffBrett Knight is an assistant managing editor covering sports business.FollowingNov 29, 2023,02:00am ESTVirtually unbeatable on the racetrack, Max Verstappen is top of the paycheck podium as well, leading a group of 10 drivers who collectively made $258 million this year.
By Brett Knight, Forbes Staff
https://www.forbes.com/sites/brettknight/2023/11/29/formula-1s-highest-paid-drivers-2023/?sh=12bdb942463f
Formula 1’s Highest-Paid Drivers 2023


In [29]:
# Creating a Retriever

retriever = vector_store.as_retriever(search_kwargs={"k": 2})
retriever_results = retriever.invoke("Who is the highest-paid F1 driver?")

print(retriever_results)

page_content='Formula 1’s Highest-Paid Drivers 2023Subscribe To NewslettersBETATHIS IS A BETA EXPERIENCE. OPT-OUT HEREForbesBusinessSportsMoneyEdit StoryDaily CoverFormula 1’s Highest-Paid Drivers 2023Grid Ironmen: Max Verstappen and Lewis Hamilton are F1's two top earners for the third straight year... [+]Grid Ironmen: Max Verstappen and Lewis Hamilton are F1's two top earners for the third straight year. [-] Minas Panagiotakis/GettyBrett KnightForbes StaffBrett Knight is an assistant managing editor covering sports business.FollowingNov 29, 2023,02:00am ESTVirtually unbeatable on the racetrack, Max Verstappen is top of the paycheck podium as well, leading a group of 10 drivers who collectively made $258 million this year.
By Brett Knight, Forbes Staff' metadata={'source': 'https://www.forbes.com/sites/brettknight/2023/11/29/formula-1s-highest-paid-drivers-2023/?sh=12bdb942463f', 'title': 'Formula 1’s Highest-Paid Drivers 2023', 'description': 'Virtually unbeatable on the racetrack, M

## **Building the RAG Chain**

In [39]:
prompt_template = """
You are an AI Assistant who knows everything about Formula One.
Use the blow context to augment what you know about formula one racing.
The context will provide you with the most resent pages data from wikipedia, the official F1 Website and other.
If the context dosen't include the information you need answer based on your existing knowledge and don't mention the source of your information or what the context does or dosen't include.
Format responses using markdown where applicable and don't return images.
-------
START CONTEXT
{context}
END CONTEXT
-------
QUESTION: {question}
ANSWER:
"""

In [41]:
from google.colab import userdata

google_api_key = userdata.get('GOOGLE_API_KEY')

# =================== Initialize Google GenAI LLM ===================

from langchain_google_genai import ChatGoogleGenerativeAI

llm:ChatGoogleGenerativeAI = ChatGoogleGenerativeAI(api_key=google_api_key, model="gemini-2.0-flash-exp")

In [42]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

prompt = ChatPromptTemplate.from_template(prompt_template)
rag_chain = (

    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [54]:
# Using the RAG Chain

# question = "What is Formula 1, and how is it different from other motorsports?"
# question = "How many teams and drivers currently participate in an F1 season?"
# question = "What is the role of a pit stop in a Formula 1 race?"
# question = "Who is the youngest F1 driver to ever win a Grand Prix?"
# question = "Who is the highest-paid F1 driver?"
question = "What is DRS (Drag Reduction System), and how does it work in F1?"

In [55]:
response = rag_chain.invoke(question)
print(f"Question: {question}")
print(f"Answer: {response}")

Question: What is DRS (Drag Reduction System), and how does it work in F1?
Answer: The Drag Reduction System (DRS) is an adjustable aerodynamic device on Formula One cars. It allows drivers to reduce drag and increase top speed, aiding in overtaking. When activated, the DRS opens a flap in the rear wing, reducing downforce and drag.

In 2024, the rules for DRS usage were adjusted slightly. Drivers are now allowed to use DRS one lap after a race start, safety car restart, or red flag restart, which is one lap earlier than in previous seasons. This change was tested during the sprints of 2023.
