In [306]:
import os
import re

from langchain.chat_models import init_chat_model
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import PyPDFLoader, PDFMinerLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

# openAI embeddings
from langchain_openai.embeddings import OpenAIEmbeddings

# vector store
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

# Load API keys: load_dotenv() is called purely for its side effect.
# NOTE(review): presumably it reads a local .env file so the OpenAI clients
# created later can find their key in the environment — confirm.
from dotenv import load_dotenv
load_dotenv()

True

In [307]:
# Load the rulebook with PDFMiner, inserting a custom marker between pages.
# NOTE(review): `loader` and `docs` are reassigned by the PyPDFLoader cell that
# follows, so this result is not what the rest of the notebook consumes.
file_path = "../documents/2025-26_iihf_rulebook.pdf"

loader = PDFMinerLoader(
    file_path=file_path,
    mode="single",
    pages_delimiter="\n-------THIS IS A CUSTOM END OF PAGE-------\n",
)

docs = loader.load()



Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P1' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P2' is an invalid float value
Cannot set gray stroke color because /'P3' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P4' is an invalid float value
Cannot set gray stroke color because /'P5' is an invalid float value
Cannot set gray stroke color because /'P6' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P1' is an invalid float value
Cannot set gray stroke color because /'P2' is an invalid float value
Cannot set gray stroke color because /'P3' is an invalid float value
Cannot set gray stroke color because /'P4' is an invalid float value
Cannot set gray stroke color becau

In [308]:
# Load the rulebook with PyPDFLoader, relabel its source, and keep a page range.
file = "../documents/2025-26_iihf_rulebook.pdf"

loader = PyPDFLoader(file)

docs = loader.load()

# Give every loaded Document the same human-readable source label
# (no index is needed here, so iterate the documents directly).
for d in docs:
    d.metadata['source'] = "IIHF Rulebook 2025-26"

# Range of loaded documents to keep (0-based, end-exclusive).
# NOTE(review): presumably this trims front matter and trailing appendices —
# confirm the bounds against the PDF.
RULE_PAGES = slice(15, 160)

docs_cropped = docs[RULE_PAGES]

In [309]:
# Render each cropped page as a "<<<PAGE n>>>" marker followed by its stripped
# text (n counts from 1 within the cropped range), then glue them together.
parts = [
    f"\n\n<<<PAGE {page_no}>>>\n{page.page_content.strip()}"
    for page_no, page in enumerate(docs_cropped, start=1)
]

merged_text = "".join(parts)

In [None]:
# Wrap the merged text in one Document so the splitters below can work on the
# whole rulebook at once; metadata records the source label and page count.
merged_meta = {
    "source": "IIHF Rulebook 2025-26",
    "page_count": len(docs_cropped),
}
merged_doc = Document(page_content=merged_text, metadata=merged_meta)





In [311]:
# Rule-level splitter: cut the merged text at each top-level rule heading.

# Heading shape: "RULE", a 1-3 digit number, then one or more upper-case words,
# separated by regular or non-breaking (U+00A0) spaces.
outer_rule_sep = r"RULE[ \u00A0]+\d{1,3}[ \u00A0]+[A-Z]+(?:[ \u00A0][A-Z]+)*"

rule_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1600,
    chunk_overlap=200,
    separators=[outer_rule_sep],
    is_separator_regex=True,
    keep_separator=True,
    add_start_index=True,
)

rule_splits = rule_splitter.split_documents([merged_doc])
len(rule_splits)

92

In [312]:
# Tag each rule-level chunk with the first rule heading found in its text.

# Compiled from outer_rule_sep so that splitting and tagging always share one
# heading pattern instead of two regex literals that can drift apart.
rule_pattern = re.compile(outer_rule_sep)

for chunk in rule_splits:
    match = rule_pattern.search(chunk.page_content)
    # Chunks with no recognizable heading are labelled "UNKNOWN".
    chunk.metadata["rule"] = match.group(0).strip() if match else "UNKNOWN"

In [313]:
# Sub-rule splitter: cut each rule-level chunk at numbered sub-rule headings.

# Heading shape: "<1-3 digits>.<1-2 digits>." followed by whitespace and one or
# more upper-case words (later words may contain hyphens).
inner_rule_sep = r"\d{1,3}\.\d{1,2}\.\s+[A-Z]+(?:[ \u00A0][A-Z\-]+)*"

inner_rule_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=80,
    separators=[inner_rule_sep],
    is_separator_regex=True,
    keep_separator=True,
    add_start_index=True,
)

inner_rule_splits = inner_rule_splitter.split_documents(rule_splits)
len(inner_rule_splits)

432

In [314]:
# Tag each sub-rule chunk with the first sub-rule heading found in its text.

# Compiled from inner_rule_sep so that splitting and tagging always share one
# heading pattern instead of two regex literals that can drift apart.
inner_rule_pattern = re.compile(inner_rule_sep)

for chunk in inner_rule_splits:
    match = inner_rule_pattern.search(chunk.page_content)
    # Chunks with no recognizable sub-rule heading are labelled "UNKNOWN".
    chunk.metadata["inner_rule"] = match.group(0).strip() if match else "UNKNOWN"

In [315]:
# Embedding model handed to the FAISS vector store when indexing the rule chunks.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [316]:
# Vector store: build a FAISS index from the sub-rule chunks using the
# embedding model above, then save it locally under "vs_faiss".
vector_store = FAISS.from_documents(
    documents=inner_rule_splits,
    embedding=embeddings,
)
vector_store.save_local("vs_faiss")


In [317]:
# System instructions for the rule assistant.
# The citation rule spells out the exact "(rule: <inner_rule>)" form with
# balanced parentheses and points at the "Inner" label that each formatted
# context entry carries, so the model knows which value to cite.
system_template = """You are an ice hockey rule assistant.

Follow these rules:
- Answer ONLY using the provided context below. If the answer is unknown or not in the context, say "I don't know".
- Be concise and use bullet points.
- After each bullet, include a citation in the form (rule: <inner_rule>), where <inner_rule> is the "Inner" value of the context entry that supports the bullet.
- Do not use outside knowledge.

You must format the answer as:
• <point> (rule: <inner_rule>)
• <point> (rule: <inner_rule>)
"""

# Chat prompt: the fixed instructions, then the retrieved context wrapped in
# '---' markers, then the user's question.
context_message = "Context (use only what is inside the markers):\n---\n{context}\n---"

prompt_template = ChatPromptTemplate.from_messages(
    [
        ("system", system_template),
        ("system", context_message),
        ("user", "{question}"),
    ]
)

In [None]:
# Formatting for what the LLM receives as context
def format_docs(docs):
    """Render retrieved documents as one context string for the prompt.

    Each document becomes a numbered entry (counting from 1) whose header line
    shows its "rule" and "inner_rule" metadata ("N/A" when a key is missing),
    followed by the document text with surrounding whitespace stripped.
    Entries are separated by a blank line.

    Args:
        docs: iterable of objects exposing ``metadata`` (a dict) and
            ``page_content`` (a string).

    Returns:
        The joined entries as a single string; empty when ``docs`` is empty.
    """
    def _render(position, doc):
        # One entry: "[n] Rule: ... | Inner: ..." header, then the stripped body.
        meta = doc.metadata
        header = (
            f"[{position}] Rule: {meta.get('rule', 'N/A')}"
            f" | Inner: {meta.get('inner_rule', 'N/A')}"
        )
        return f"{header}\n{doc.page_content.strip()}"

    return "\n\n".join(
        _render(position, doc) for position, doc in enumerate(docs, start=1)
    )
    

# Initialize the chat model and the retriever (k=4 results per query).
llm = init_chat_model("gpt-4o-mini", model_provider="openai")
retriever = vector_store.as_retriever(search_kwargs={"k": 4})

# Chain inputs: the raw question is passed through unchanged, while the
# retriever's results are rendered to text by format_docs.
chain_inputs = {
    "question": RunnablePassthrough(),
    "context": retriever | RunnableLambda(format_docs),
}

# Inputs -> prompt -> model -> plain string output.
rag_chain = chain_inputs | prompt_template | llm | StrOutputParser()



In [328]:
# Example query: run one question through the RAG chain and print the answer
# text (print keeps the bullet lines readable instead of showing a quoted repr).
question = "When is a minor penalty like tripping or slashing a penaly shot instead?"
answer = rag_chain.invoke(question)
print(answer)

• A minor penalty for an infraction like tripping or slashing can lead to a penalty shot if the infraction occurs when the puck is in the goal crease. (rule: 63.6)
• If a player's action causes a penalty shot, the minor penalty associated with that infraction will not be served unless it is a major or misconduct penalty. (rule: 24.6)
