In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

import os
import tempfile
import uuid
import pandas as pd
import re
from dotenv import load_dotenv


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [2]:
load_dotenv()

True

In [3]:
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [11]:
llm = ChatOpenAI(model="gpt-4o-mini", api_key=OPENAI_API_KEY)
llm.invoke("Tell me a joke about cats")

AIMessage(content='Why was the cat sitting on the computer?\n\nBecause it wanted to keep an eye on the mouse!', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 21, 'prompt_tokens': 13, 'total_tokens': 34, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_0392822090', 'id': 'chatcmpl-BPWgHQ6iQNzyIwIj3c0kk8PMLcG9i', 'finish_reason': 'stop', 'logprobs': None}, id='run-1f1bda49-cc75-47ef-9fb2-d9642a452423-0', usage_metadata={'input_tokens': 13, 'output_tokens': 21, 'total_tokens': 34, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

## 1. Process PDF

The PDF is loaded and split into pages; these pages are then split into chunks of size n (here, at most 1500 characters with a 200-character overlap). Each chunk stores its text content and some metadata, such as the page and the document of origin.


In [12]:
# Path of the PDF to index, relative to the notebook's working directory.
file_path = "../documents/FIFA MEMO QA.pdf"
loader = PyPDFLoader(file_path)
# Load the PDF as a list of documents (the metadata shown below carries a
# per-page "page" entry).
pages = loader.load()
# Keep only the file name (no directory part) as a human-readable label.
document_name = os.path.basename(file_path)
# Tag every page with the file name so the label travels along with the
# metadata of each page.
for page in pages:
    page.metadata["document_name"] = document_name

In [13]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1500, 
                                               chunk_overlap=200, 
                                               length_function=len, separators = ["\n\n", "\n", ""])
chunks = text_splitter.split_documents(pages)

In [14]:
chunks[:5]

[Document(metadata={'producer': 'Mac OS X 10.10.5 Quartz PDFContext', 'creator': 'Word', 'creationdate': '2016-06-27T12:13:07+00:00', 'aapl:keywords': '[]', 'keywords': '', 'moddate': '2017-03-21T14:43:23-04:00', 'title': 'Microsoft Word - IFAB_LoG_FAQ_v4.0 (clean).docx', 'source': '../documents/FIFA MEMO QA.pdf', 'total_pages': 12, 'page': 0, 'page_label': '1', 'document_name': 'FIFA MEMO QA.pdf'}, page_content='The International Football Association Board Münstergasse 9    8001 Zurich    Switzerland  T: +41 (0)44 245 1886    F: +41 (0)44 245 1887    theifab.com \n1/12'),
 Document(metadata={'producer': 'Mac OS X 10.10.5 Quartz PDFContext', 'creator': 'Word', 'creationdate': '2016-06-27T12:13:07+00:00', 'aapl:keywords': '[]', 'keywords': '', 'moddate': '2017-03-21T14:43:23-04:00', 'title': 'Microsoft Word - IFAB_LoG_FAQ_v4.0 (clean).docx', 'source': '../documents/FIFA MEMO QA.pdf', 'total_pages': 12, 'page': 0, 'page_label': '1', 'document_name': 'FIFA MEMO QA.pdf'}, page_content='Rev

In [None]:
def calculate_chunk_ids(chunks):
    """Assign a deterministic ``id`` to the metadata of every chunk.

    IDs have the form ``"<source>:<page>:<chunk index>"``, for example
    ``"data/monopoly.pdf:6:2"``. The chunk index restarts at 0 whenever the
    source/page pair differs from the previous chunk's, so chunks belonging to
    the same page must be adjacent in ``chunks`` to receive consecutive
    indices.

    Args:
        chunks: Sequence of chunk objects exposing a ``metadata`` dict with
            ``"source"`` and ``"page"`` entries (missing entries are rendered
            as ``None`` in the ID).

    Returns:
        The same ``chunks`` object; each chunk's ``metadata["id"]`` is set in
        place.
    """
    last_page_id = None
    current_chunk_index = 0

    for chunk in chunks:
        # Take the origin from the chunk itself instead of a notebook-level
        # global: the function then works on a fresh kernel and chunks coming
        # from different documents get distinct ID prefixes.
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        current_page_id = f"{source}:{page}"

        # Same page as the previous chunk -> next index; new page -> restart.
        if current_page_id == last_page_id:
            current_chunk_index += 1
        else:
            current_chunk_index = 0

        chunk_id = f"{current_page_id}:{current_chunk_index}"
        last_page_id = current_page_id

        # Store the ID alongside the rest of the chunk metadata.
        chunk.metadata["id"] = chunk_id

    return chunks

In [9]:
calculate_chunk_ids(chunks)

[Document(metadata={'producer': 'Mac OS X 10.10.5 Quartz PDFContext', 'creator': 'Word', 'creationdate': '2016-06-27T12:13:07+00:00', 'aapl:keywords': '[]', 'keywords': '', 'moddate': '2017-03-21T14:43:23-04:00', 'title': 'Microsoft Word - IFAB_LoG_FAQ_v4.0 (clean).docx', 'source': '../documents/FIFA MEMO QA.pdf', 'total_pages': 12, 'page': 0, 'page_label': '1', 'id': '../documents/FIFA MEMO QA.pdf:0:0'}, page_content='The International Football Association Board Münstergasse 9    8001 Zurich    Switzerland  T: +41 (0)44 245 1886    F: +41 (0)44 245 1887    theifab.com \n1/12'),
 Document(metadata={'producer': 'Mac OS X 10.10.5 Quartz PDFContext', 'creator': 'Word', 'creationdate': '2016-06-27T12:13:07+00:00', 'aapl:keywords': '[]', 'keywords': '', 'moddate': '2017-03-21T14:43:23-04:00', 'title': 'Microsoft Word - IFAB_LoG_FAQ_v4.0 (clean).docx', 'source': '../documents/FIFA MEMO QA.pdf', 'total_pages': 12, 'page': 0, 'page_label': '1', 'id': '../documents/FIFA MEMO QA.pdf:0:1'}, page_

In [10]:
len(chunks)

47

## 1.1 Store chunks in a Chroma db

The chunks are embedded with an OpenAI embedding function and stored in a local Chroma database. Each chunk is identified by an id so that we avoid uploading the same chunk to the database twice.

In [11]:
def get_embeddings():
    """Return an OpenAI embeddings client for the "text-embedding-ada-002" model.

    The API key is taken from the notebook-level ``OPENAI_API_KEY`` variable.
    """
    return OpenAIEmbeddings(
        model="text-embedding-ada-002",
        openai_api_key=OPENAI_API_KEY,
    )
embedding_function = get_embeddings()
#test_vector = embedding_function.embed_query("cat")

In [12]:
from langchain.evaluation import load_evaluator

evaluator = load_evaluator(evaluator = "embedding_distance", embeddings = embedding_function)
evaluator.evaluate_strings(prediction="Amsterdam", reference = "coffeeshop")

{'score': 0.1745443723078154}

In [13]:
def add_to_chroma(chunks: list, vectorstore_path: str):
    """Add chunks to the Chroma store at ``vectorstore_path``, skipping known IDs.

    Every chunk first receives an ``id`` via ``calculate_chunk_ids``; only
    chunks whose ID is not yet present in the store are added. Progress is
    reported with ``print``.

    Args:
        chunks: Chunks to store; their metadata is updated with an ``id``.
        vectorstore_path: Directory in which the Chroma database is persisted.
    """
    # Open the store that lives in the given directory.
    db = Chroma(
        persist_directory=vectorstore_path,
        embedding_function=get_embeddings(),
    )

    chunks_with_ids = calculate_chunk_ids(chunks)

    # An empty `include` list requests no extra fields; IDs are always
    # included by default.
    existing_ids = set(db.get(include=[])["ids"])
    print(f"Number of existing documents in DB: {len(existing_ids)}")

    # Keep only the chunks the store does not know about yet.
    new_chunks = [
        chunk
        for chunk in chunks_with_ids
        if chunk.metadata["id"] not in existing_ids
    ]

    if not new_chunks:
        print("✅ No new documents to add")
        return

    print(f"👉 Adding new documents: {len(new_chunks)}")
    new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
    db.add_documents(new_chunks, ids=new_chunk_ids)
    db.persist()

In [14]:
# Create vectorstore
add_to_chroma(chunks, "vectorstore_chroma")

  db = Chroma(


Number of existing documents in DB: 334
✅ No new documents to add


## 2. Access to database

We access the Chroma db by creating a variable `db` and setting up a retriever. The retriever performs a similarity search between the input text and the database and returns the k (default 4) chunks most similar to the input.

In [None]:
db = Chroma(persist_directory="vectorstore_chroma", embedding_function=get_embeddings())

In [27]:
## Query relevant data
# Note: the keyword is `search_type`; it was previously misspelled as
# `seach_type`, which is not an option the retriever knows about.
retriever = db.as_retriever(search_type="similarity")
relevant_chunks = retriever.invoke("A defender touches the ball with his hand in his own penalty area, however the ball was deflected by another player. Should this be ruled as a penalty?")
# Collect the chunk IDs of the retrieved documents (None when a chunk carries
# no "id" in its metadata) so the answer can be traced back to its sources.
sources = [doc.metadata.get("id", None) for doc in relevant_chunks]

In [28]:
sources

[]

## 3. Prompt creation

The prompt is a copy of the one Than Vu used in her video on RAG elements. It defines *instructions* telling the model not to make up any piece of information, and forms the answer from a *context*, which is the k most relevant pieces of information from the PDF, and a *question*, which is the user's input.

In [18]:
# Prompt template
PROMPT_TEMPLATE = """
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

{context}

---

Answer the question based on the above context: {question}
"""

In [19]:
# Concatenate context text
# Concatenate the text of the retrieved chunks, separating them with a
# "---" line so the individual chunks stay distinguishable in the prompt.
context_text = "\n\n---\n\n".join([doc.page_content for doc in relevant_chunks])

# Create prompt: fill the {context} and {question} placeholders of
# PROMPT_TEMPLATE. The question is the same text that was passed to the
# retriever when `relevant_chunks` was computed.
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, 
                                question="A defender touches the ball with his hand in his own penalty area, however the ball was deflected by another player. Should this be ruled as a penalty?")
# Print the final prompt to inspect exactly what will be sent to the model.
print(prompt)

Human: 
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

player outside the penalty area.  If the goalkeeper handles the ball inside their 
penalty area when not permitted to do so, an indirect free kick is awarded but 
there is no disciplinary sanction. However, if the offence is playing the ball a 
second time (with or without the hand/arm) after a restart before it touches 
another player, the goalkeeper must be sanctioned if the offence stops a 
promising attack or denies an opponent or the opposing team a goal or an 
obvious goal-scoring opportunity.
11
No handball
Handball
1111
No handball
Handball
Handball

---

Where a player denies the opposing team a goal or an obvious goal-scoring 
opportunity by committing a non-deliberate handball offence and the referee 
awards a penalty kick, the offender is cautioned.

---

128
If, after

In [20]:
llm.invoke(prompt)

AIMessage(content="Based on the context provided, if the defender's handball was non-deliberate and it does not deny an obvious goal-scoring opportunity, it would not necessarily result in a penalty kick being awarded. However, if the referee determines that the handball meets the criteria for a penalty due to denying a goal or opportunity, a penalty may be awarded. Since the situation involves a deflection by another player, a penalty may not be given, but it ultimately depends on the referee's interpretation of the circumstances. Therefore, the decision on whether it should be ruled as a penalty is not straightforward and would depend on additional factors.", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 126, 'prompt_tokens': 580, 'total_tokens': 706, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cach

In [21]:
sources

['../documents/Laws_of_the Game_2024_25.pdf:105:1',
 '../documents/Laws_of_the Game_2024_25.pdf:163:1',
 '../documents/Laws_of_the Game_2024_25.pdf:127:0',
 '../documents/Laws_of_the Game_2024_25.pdf:107:1']

In [22]:
relevant_chunks

[Document(metadata={'creationdate': '2024-05-22T13:59:41+02:00', 'creator': 'Adobe InDesign 19.4 (Windows)', 'id': '../documents/Laws_of_the Game_2024_25.pdf:105:1', 'moddate': '2024-05-22T14:00:15+02:00', 'page': 105, 'page_label': '106', 'producer': 'Adobe PDF Library 17.0', 'source': '../documents/Laws_of_the Game_2024_25.pdf', 'total_pages': 230, 'trapped': '/False'}, page_content='player outside the penalty area.  If the goalkeeper handles the ball inside their \npenalty area when not permitted to do so, an indirect free kick is awarded but \nthere is no disciplinary sanction. However, if the offence is playing the ball a \nsecond time (with or without the hand/arm) after a restart before it touches \nanother player, the goalkeeper must be sanctioned if the offence stops a \npromising attack or denies an opponent or the opposing team a goal or an \nobvious goal-scoring opportunity.\n11\nNo handball\nHandball\n1111\nNo handball\nHandball\nHandball'),
 Document(metadata={'creationda

In [23]:
relevant_chunks[0].page_content

'player outside the penalty area.  If the goalkeeper handles the ball inside their \npenalty area when not permitted to do so, an indirect free kick is awarded but \nthere is no disciplinary sanction. However, if the offence is playing the ball a \nsecond time (with or without the hand/arm) after a restart before it touches \nanother player, the goalkeeper must be sanctioned if the offence stops a \npromising attack or denies an opponent or the opposing team a goal or an \nobvious goal-scoring opportunity.\n11\nNo handball\nHandball\n1111\nNo handball\nHandball\nHandball'

In [24]:
relevant_chunks

[Document(metadata={'creationdate': '2024-05-22T13:59:41+02:00', 'creator': 'Adobe InDesign 19.4 (Windows)', 'id': '../documents/Laws_of_the Game_2024_25.pdf:105:1', 'moddate': '2024-05-22T14:00:15+02:00', 'page': 105, 'page_label': '106', 'producer': 'Adobe PDF Library 17.0', 'source': '../documents/Laws_of_the Game_2024_25.pdf', 'total_pages': 230, 'trapped': '/False'}, page_content='player outside the penalty area.  If the goalkeeper handles the ball inside their \npenalty area when not permitted to do so, an indirect free kick is awarded but \nthere is no disciplinary sanction. However, if the offence is playing the ball a \nsecond time (with or without the hand/arm) after a restart before it touches \nanother player, the goalkeeper must be sanctioned if the offence stops a \npromising attack or denies an opponent or the opposing team a goal or an \nobvious goal-scoring opportunity.\n11\nNo handball\nHandball\n1111\nNo handball\nHandball\nHandball'),
 Document(metadata={'creationda

This auxiliary function stores information from the k most relevant chunks in a dataframe. The information comprises the document name (in case there are multiple documents in the db), the document page where the information can be found, and the content of the chunk.

In [25]:
def process_relevant_chunks(relevant_chunks):
    """Summarise retrieved chunks as a DataFrame with one row per chunk.

    Args:
        relevant_chunks: Iterable of chunk objects exposing a ``metadata``
            dict and a ``page_content`` string.

    Returns:
        A ``pandas.DataFrame`` with the columns ``document_name`` (last
        ``/``-separated component of the ``source`` metadata, empty string if
        absent), ``page`` (``page`` metadata, empty string if absent) and
        ``content`` (chunk text with surrounding whitespace stripped).
    """
    rows = [
        {
            # Last path component of the source, i.e. the file name.
            "document_name": doc.metadata.get("source", "").split("/")[-1],
            "page": doc.metadata.get("page", ""),
            "content": doc.page_content.strip(),
        }
        for doc in relevant_chunks
    ]
    return pd.DataFrame(rows)

process_relevant_chunks(relevant_chunks)

Unnamed: 0,document_name,page,content
0,Laws_of_the Game_2024_25.pdf,105,player outside the penalty area. If the goalk...
1,Laws_of_the Game_2024_25.pdf,163,Where a player denies the opposing team a goal...
2,Laws_of_the Game_2024_25.pdf,127,"128\nIf, after the penalty kick has been taken..."
3,Laws_of_the Game_2024_25.pdf,107,or attempted to kick the ball to release it in...
