In [7]:
import os
import re

from langchain.chat_models import init_chat_model
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import PyPDFLoader, PDFMinerLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

from langsmith import traceable

from ragas import EvaluationDataset
import json
import pandas as pd
import faiss

# openAI embeddings
from langchain_openai.embeddings import OpenAIEmbeddings

# vector store
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

# load api keys
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# Load the rulebook PDF (the loop below treats each loaded Document as one page).
file = "../documents/2025-26_iihf_rulebook.pdf"
loader = PyPDFLoader(file)
docs = loader.load()

# Keep only the rule pages: drops the table of contents and the appendix.
docs_cropped = docs[15:160]

# Stitch the kept pages back together. Each page is preceded by a marker that is
# numbered from 1 within the cropped range (not the PDF's own page number).
parts = [
    f"\n\n<<<PAGE {page_no}>>>\n{page.page_content.strip()}"
    for page_no, page in enumerate(docs_cropped, start=1)
]
merged_text = "".join(parts)

# Wrap the merged text in a single-element Document list with fresh metadata.
merged_doc = [
    Document(
        page_content=merged_text,
        metadata={
            "source": "IIHF Rulebook 2025-26",
            "page_count": len(docs_cropped),
        },
    )
]


In [3]:
#print(merged_doc)

In [4]:
# Regexes used for splitting the merged rulebook text into rule / sub-rule chunks.
# NOTE: the possessive quantifiers (`{3,}+`, `{2,}+`) require Python 3.11 or newer.
#
# MAIN_RE matches a main-rule heading: the literal "RULE", one or more spaces/NBSPs,
# a 1-3 digit id (group `main_id`), more spaces/NBSPs, then the rule name (group
# `main_name`). The name is one or more words built from capitals, hyphen/dashes,
# apostrophes, slash and curly quotes; the first word is at least 3 characters long
# and later words are each separated by a single space/NBSP.
MAIN_RE = re.compile(r"RULE[ \u00A0]+(?P<main_id>\d{1,3})[ \u00A0]+(?P<main_name>[A-Z-'´’–/”“]{3,}+(?:[ \u00A0][A-Z-'´’–/”“]+)*)")
# SUB_RE matches a sub-rule heading: an id such as "63.1" (1-3 digits, a dot, 1-2
# digits; group `sub_id`), optional trailing dots/spaces, at least one space/NBSP,
# then the name (group `sub_name`). The first word is at least 2 characters long;
# later words may also contain digits.
SUB_RE = re.compile(r"(?P<sub_id>\d{1,3}\.\d{1,2})[.\u00A0 ]*[ \u00A0]+(?P<sub_name>[A-Z-'´’–/”“]{2,}+(?:[ \u00A0][A-Z-'´’–/”“0-9]+)*)")


def normalize_ocr(text):
    """Clean up spacing artefacts in text extracted from the PDF.

    Runs of regular and non-breaking spaces are collapsed to a single regular
    space, then a few upper-case words that the extraction sometimes splits in
    two (e.g. 'PENAL TY') are glued back together.

    Args:
        text: The raw extracted text.

    Returns:
        The cleaned text.
    """
    cleaned = re.sub(r"[ \u00A0]+", " ", text)

    # (broken pattern, repaired word) pairs, applied in this order
    split_word_fixes = (
        (r"\bPENAL\s*TY\b", "PENALTY"),
        (r"\bPENAL\s*TIES\b", "PENALTIES"),
        (r"\bAL\s*TERNATE\b", "ALTERNATE"),
    )
    for broken, repaired in split_word_fixes:
        cleaned = re.sub(broken, repaired, cleaned)

    return cleaned


# add rule id and name to metadata
def add_rule_metadata(m, prefix):
    """Build the metadata entries describing one matched rule heading.

    Args:
        m: A regex match whose group 0 is the whole heading, group 1 the rule
            id and group 2 the rule name.
        prefix: Key prefix, e.g. 'main' or 'sub'.

    Returns:
        A dict with the keys '<prefix>_rule', '<prefix>_rule_id' and
        '<prefix>_rule_name', in that order.
    """
    # the position in this tuple is the match-group index the key is filled from
    key_suffixes = ("rule", "rule_id", "rule_name")
    return {
        f"{prefix}_{suffix}": m.group(group_no)
        for group_no, suffix in enumerate(key_suffixes)
    }
        




# split on rules and subrules
def slice_on_regex(docs: list[Document], pattern, prefix):
    """Cut each document into one chunk per regex match.

    The page content is normalized with `normalize_ocr` and left-stripped
    before matching. Every chunk runs from the start of one match to the start
    of the next (the last chunk runs to the end of the text), so text located
    before the first match is not emitted. Each chunk gets a copy of the
    source document's metadata extended with the rule fields produced by
    `add_rule_metadata`.

    Args:
        docs: Documents to split.
        pattern: Regex (string or compiled) marking the start of each section.
        prefix: Metadata key prefix passed on to `add_rule_metadata`.

    Returns:
        A flat list of Documents. A document without any match is passed
        through as the original, un-normalized object.
    """
    regex = re.compile(pattern)
    sliced = []

    for source_doc in docs:
        body = normalize_ocr(source_doc.page_content).lstrip()
        hits = list(regex.finditer(body))

        # nothing to split on: keep the document as it came in
        if not hits:
            sliced.append(source_doc)
            continue

        # A section ends where the next one begins; the last one ends with the text.
        starts = [hit.start() for hit in hits]
        stops = starts[1:] + [len(body)]

        for hit, begin, stop in zip(hits, starts, stops):
            chunk_meta = {**source_doc.metadata, **add_rule_metadata(hit, prefix)}
            sliced.append(Document(page_content=body[begin:stop], metadata=chunk_meta))

    return sliced






In [22]:
# Split the merged rulebook on main-rule headings first, then split each of those
# chunks on sub-rule headings. Chunks carry the matched rule id/name in their metadata.
main_split = slice_on_regex(merged_doc, MAIN_RE, prefix='main')
sub_split = slice_on_regex(main_split, SUB_RE, prefix='sub')


# embeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
# Embed a throwaway string once to learn the vector size the FAISS index needs.
embedding_dim = len(embeddings.embed_query("dim"))

# vectorstore: starts as an empty flat L2 index; filled by retriever.add_documents below
vs_index = faiss.IndexFlatL2(embedding_dim)
vectorstore = FAISS(
    embedding_function=embeddings,
    index=vs_index,
    docstore=InMemoryDocstore({}),
    index_to_docstore_id={}
)

# parent store
parent_store = InMemoryStore()

# child splitter (splits sub-rule chunks into smaller chunks)
child_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 800,
    chunk_overlap = 120
)

# Keeps the parent references for the smaller child chunks so they can be used at retrieval.
# The vectorstore search is configured with k=8.
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=parent_store,
    child_splitter=child_splitter,
    search_kwargs={"k": 8}
)


retriever.add_documents(sub_split)
# NOTE(review): only the FAISS vectorstore is written to disk here; parent_store is an
# InMemoryStore and is not saved by this cell.
vectorstore.save_local("../vectorstore")

In [23]:
# Generation step

# formatting for llm context
def format_docs(docs):
    """Render retrieved documents as one context string for the LLM.

    Each document becomes a 'Metadata: {...}' line holding the rule ids/names
    and the source (missing values are shown as 'N/A'), followed by the
    stripped page content. Documents are separated by a blank line.

    Args:
        docs: Iterable of objects exposing `.metadata` (a dict) and
            `.page_content` (a string).

    Returns:
        The formatted context string; empty when `docs` is empty.
    """
    # metadata fields exposed to the model, in the order they are printed
    shown_keys = (
        "main_rule_id",
        "main_rule_name",
        "sub_rule_id",
        "sub_rule_name",
        "source",
    )

    def render(doc):
        meta = {key: doc.metadata.get(key, "N/A") for key in shown_keys}
        return f"Metadata: {meta}\n{doc.page_content.strip()}"

    return "\n\n".join(render(doc) for doc in docs)


# Defining the system template. `{context}` is filled with the formatted retrieved
# documents; the model is told to answer only from that context and to cite rules.
system_template = """You are an ice hockey rule assistant.

Follow these rules:
- Answer ONLY using the provided context below. If the answer is unknown or not in the context, say "I don't know".
- Use bulletpoints. After each bullet, include a citation using the metadata main rule and sub rule


Rule citation format:
[<sub_rule_id> <main_rule_name> - <sub_rule_name>]


Context (use only what is inside the markers):
---
{context}
---"""

# Defining the prompt template: system instructions plus the user's question.
prompt_template = ChatPromptTemplate.from_messages([
    ("system", system_template),
    ("user", "Original question: {question}"),
])


# retriever
# NOTE: `retriever` comes from the indexing cell (ParentDocumentRetriever). The line
# below is a disabled alternative that would use a plain vectorstore retriever with k=6.
#retriever = vectorstore.as_retriever(search_kwargs={"k": 6})



# llm (temperature 0)
llm = init_chat_model("gpt-4o-mini", model_provider="openai", temperature=0)




# System prompt for the multi-query step: asks for 3 rephrasings of the question,
# one per line.
multi_query_system_template = """You are an AI language model assistant. Your task is
    to generate 3 different versions of the given user
    question to retrieve relevant documents from a vector database.
    By generating multiple perspectives on the user question,
    your goal is to help the user overcome some of the limitations
    of distance-based similarity search. Provide these alternative
    questions separated by newlines. The questions are all about ice hockey.
"""

multi_query_prompt_template = ChatPromptTemplate.from_messages([
    ("system", multi_query_system_template),
    ("user", "{question}")
])


# Wraps `retriever` so the LLM-generated question variants are also searched;
# include_original=True keeps the user's original question among the queries.
multi_retriever = MultiQueryRetriever.from_llm(retriever=retriever, llm=llm, include_original=True, prompt=multi_query_prompt_template)


# Generation chain: fill the prompt, call the model, return the reply as a plain string.
rag_chain = (
    prompt_template
    | llm
    | StrOutputParser()
)

@traceable
def rag_bot(question: str):
    """Answer a rulebook question with retrieval-augmented generation.

    Retrieves documents through `multi_retriever`, formats them into a context
    string with `format_docs`, and asks `rag_chain` for the answer.

    Args:
        question: The user's question.

    Returns:
        A dict with 'answer' (the chain's output) and 'documents' (the
        retrieved documents the answer was based on).
    """
    retrieved = multi_retriever.invoke(question)

    # the chain expects both prompt variables: the question and the formatted context
    chain_input = {"question": question, "context": format_docs(retrieved)}
    reply = rag_chain.invoke(chain_input)

    return {"answer": reply, "documents": retrieved}

In [31]:
# Turn on INFO-level logging for the multi-query retriever module so the
# queries it works with show up in the cell output.
import logging

logging.basicConfig()
logging.getLogger("langchain.retrievers.multi_query").setLevel("INFO")

In [29]:
# rag test: run the pipeline once on a single question
# alternative test question (disabled):
#result = rag_bot("If the puck is shot before the red line, is touched after the red line and the goalie freezes it, can the defensive team change?")
result = rag_bot("Can i score a goal with my nose?")

In [30]:
# Show only the generated answer; uncomment the second line to inspect the retrieved documents.
print(result.get('answer'))
#result.get('documents')

- A goal cannot be scored when the puck has been directed with any part of the body (excluding skates) into the net by an attacking Player other than with a stick. If it is deemed to be done deliberately, then the decision shall be NO GOAL. [78.5 GOALS - DISALLOWED GOALS]


In [None]:
#- No, the defensive team shall not be permitted to make any Player substitutions prior to the “face-off” in this situation. <63.1 DELAYING THE GAME> 
#- However, they can substitute a Player to replace an injured Player or when a penalty has been assessed which affects the “on-ice strength” of either team. <82.1 LINE CHANGE>

In [26]:
# setup data for ragas testing

# Load the evaluation questions (JSON Lines: one JSON object per line).
# The encoding is pinned to UTF-8 so non-ASCII characters decode the same way on
# every platform, and blank lines are skipped so that e.g. a trailing empty line
# does not make json.loads raise.
test_questions = []
with open("eval_questions.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            test_questions.append(json.loads(line))

# Run the RAG pipeline on every question and collect one ragas sample per example.
# Only 'question' and 'ground truth' are read from each example. The other fields
# are not needed for the samples, and binding them here would shadow the builtin
# `id` for the rest of the kernel session.
dataset = []
for example in test_questions:
    question = example.get('question')
    ground_truth = example.get('ground truth')

    # run the ragbot and get output answer
    rag_bot_output = rag_bot(question)
    answer = rag_bot_output.get('answer')

    # Get the retrieved context texts. A dedicated name is used so the global
    # `docs` holding the loaded PDF pages is not overwritten.
    retrieved_docs = rag_bot_output.get('documents')
    contexts = [doc.page_content for doc in retrieved_docs]

    dataset.append({
        "user_input": question,
        "retrieved_contexts": contexts,
        "response": answer,
        "reference": ground_truth,
    })

    


In [27]:
from ragas import EvaluationDataset, evaluate
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness

# Wrap the collected samples in a ragas EvaluationDataset and score them with three
# metrics; `llm` (the chat model that also generates the answers) is passed to evaluate.
evaluation_dataset = EvaluationDataset.from_list(dataset)
# NOTE(review): this rebinds `result`, which the rag test cell also assigns.
result = evaluate(
    dataset=evaluation_dataset,
    metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness()],
    llm=llm)
result

Evaluating:   0%|          | 0/63 [00:00<?, ?it/s]

{'context_recall': 0.8571, 'faithfulness': 0.8413, 'factual_correctness(mode=f1)': 0.7129}

In [None]:
# Scores noted down from evaluation runs, one dict per configuration, kept for comparison.
# The trailing comment on each line names the configuration that produced it.
{'context_recall': 0.8571, 'faithfulness': 0.8571, 'factual_correctness(mode=f1)': 0.6186} # single query with top 4 docs retrieved
{'context_recall': 0.8571, 'faithfulness': 0.8608, 'factual_correctness(mode=f1)': 0.6614} # single query with top 6 docs retrieved
{'context_recall': 0.8571, 'faithfulness': 0.7579, 'factual_correctness(mode=f1)': 0.6886} # single query with top 8 docs retrieved
{'context_recall': 0.8571, 'faithfulness': 0.7540, 'factual_correctness(mode=f1)': 0.6400} # multi query with top 4 docs retrieved

{'context_recall': 0.8571, 'faithfulness': 0.8413, 'factual_correctness(mode=f1)': 0.7129} # multi query, parentDocRetriever, top 8 docs, new chunking method
{'context_recall': 0.8571, 'faithfulness': 0.9083, 'factual_correctness(mode=f1)': 0.5881} # single query, parentDocRetriever, top 8 docs, new chunking method
