In [8]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

import os
import glob
from pathlib import Path

In [9]:
# get the pdf files path
docs_dir = "../documents"
pdf_files = glob.glob(f"{docs_dir}/*.pdf")


# add pdf files to docs after loading with pdfLoader.
docs = []
for f in pdf_files:
    docs += PyPDFLoader(f).load()
    

In [None]:

docs[0].metadata

{'producer': 'Adobe PDF Library 17.0',
 'creator': 'Adobe InDesign 19.5 (Windows)',
 'creationdate': '2024-10-07T12:50:09+03:00',
 'moddate': '2024-10-07T12:50:21+03:00',
 'trapped': '/False',
 'source': '../documents\\2024_iihf_situationhandbook_07102024-v2_0.pdf',
 'total_pages': 168,
 'page': 0,
 'page_label': '1'}

In [35]:
# split text into chunks

HEADING_SEPARATORS = [
    r"\nSITUATION\s+\d+\.\d+[^\n]*\n",     # Situation Handbook entries
    r"\nRULE\s+\d+[^\n]*\n",               # Rulebook: "RULE 60 High-sticking"
    r"\nSECTION\s+\d+[^\n]*\n",            # "SECTION 08. STICK INFRACTIONS"
    r"\n[A-Z][A-Z &/’'–\-]{4,}\n",         # ALL-CAPS headings like "EQUIPMENT"
    r"\n{2,}",                             # blank lines (paragraph breaks)
    r"\n",                                 # single newline
    r" "                                   # as last resort
]

coarse_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1800,
    chunk_overlap=50,
    separators=HEADING_SEPARATORS,
    keep_separator=True,
    is_separator_regex=True
    )


fine_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100,
    separators=[r"\n{2,}", r"\n", r" "],
    keep_separator=True,
    is_separator_regex=True
)

coarse_chunks = coarse_text_splitter.split_documents(docs)
final_chunks = []
for d in coarse_chunks:
    
    # defines metadata
    meta = d.metadata
    print(meta)
    
    # split content of each coarse chunk further
    sub_docs = fine_splitter.split_documents([d])
    final_chunks.extend(sub_docs)



print(f"Coarse chunks: {len(coarse_chunks)} | Final chunks: {len(final_chunks)}")


{'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 19.5 (Windows)', 'creationdate': '2024-10-07T12:50:09+03:00', 'moddate': '2024-10-07T12:50:21+03:00', 'trapped': '/False', 'source': '../documents\\2024_iihf_situationhandbook_07102024-v2_0.pdf', 'total_pages': 168, 'page': 0, 'page_label': '1'}
{'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 19.5 (Windows)', 'creationdate': '2024-10-07T12:50:09+03:00', 'moddate': '2024-10-07T12:50:21+03:00', 'trapped': '/False', 'source': '../documents\\2024_iihf_situationhandbook_07102024-v2_0.pdf', 'total_pages': 168, 'page': 1, 'page_label': '2'}
{'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 19.5 (Windows)', 'creationdate': '2024-10-07T12:50:09+03:00', 'moddate': '2024-10-07T12:50:21+03:00', 'trapped': '/False', 'source': '../documents\\2024_iihf_situationhandbook_07102024-v2_0.pdf', 'total_pages': 168, 'page': 2, 'page_label': '3'}
{'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesig

In [31]:

print(final_chunks[102])

page_content='the legal puck is located in the neutral zone. Rule 13.3.\nRULE 14  ADJUSTMENT TO CLOTHING OR EQUIPMENT\nSITUATION 14.1\nA goalkeeper requires adjustments to its leg pads. Is the goalkeeper allowed to proceed to the Players’ Bench, or must the goal-\nkeeper remain at the net? \nANSWER\nThe goalkeeper may go to the Players’ Bench, with the officials’ permission, to have its equipment repaired provided the antici -\npated delay is minor in nature. If the delay becomes significant, the goalkeeper must be replaced by a substitute until the next \nstoppage of play. Rule 14.1.\n03   SECTION · EQUIPMENT' metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 19.5 (Windows)', 'creationdate': '2024-10-07T12:50:09+03:00', 'moddate': '2024-10-07T12:50:21+03:00', 'trapped': '/False', 'source': '../documents\\2024_iihf_situationhandbook_07102024-v2_0.pdf', 'total_pages': 168, 'page': 25, 'page_label': '26'}


page_content='TABLE OF CONTENTS
IIHF  SITUATION HANDBOOK 2024/25 – SECTION 04  28
TYPES OF PENAL TIES  
RULE 15  CALLING OF PENALTIES
SITUATION 15.1
A minor penalty is being signaled by the Referee against A44. The teammate to A44, A16, who is in its defending zone gains pos-
session and control of the puck and as the whistle blows for the delayed penalty, A16 shoots the puck which goes over the glass 
and into the crowd. How does the Referee handle this situation? Where do you find this in the rule book? 
ANSWER
Since the whistle is blown to stop play as soon as Team A gains possession and control of the puck, no additional penalty is 
assessed to Team A16 for shooting the puck over the glass into the crowd. Rule 15.1, paragraph 2. NOTE: A penalty may still be' metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 19.5 (Windows)', 'creationdate': '2024-10-07T12:50:09+03:00', 'moddate': '2024-10-07T12:50:21+03:00', 'trapped': '/False', 'source': '../documents\\2024_iihf_situationhandbook_07102024-v2_0.pdf', 'total_pages': 168, 'page': 27, 'page_label': '28'}

In [13]:
# get embeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [14]:
# Vector Store (Chroma)
vector_store = Chroma.from_documents(final_chunks, embeddings, persist_directory="../vectorstore")

In [17]:
# retriever
retriever = vector_store.as_retriever(search_kwargs={"k":6})

In [29]:
retriever.invoke("What is a tripping penalty?")

[Document(metadata={'creationdate': '2025-06-30T18:35:53+03:00', 'moddate': '2025-06-30T18:36:31+03:00', 'creator': 'Adobe InDesign 20.2 (Windows)', 'page': 101, 'trapped': '/False', 'page_label': '102', 'source': '../documents\\2025-26_iihf_rulebook_30062025-v1.pdf', 'total_pages': 228, 'producer': 'Adobe PDF Library 17.0'}, page_content='RULE 57  TRIPPING\n57.1. TRIPPING\nA Player shall not place the stick, or any part of their body in such a manner that causes their opponent to trip or fall.\nAccidental trips which occur simultaneously with a completed play will not be penalized.\nAccidental trips occurring simultaneously with or after a stoppage of play will not be penalized.\n57.2. MINOR PENAL TY\nThe Referee shall, at their discretion, assess a minor penalty, based on the severity of the infraction, to any Player who place their \nstick or any part of their body in such a manner that it shall cause their opponent to trip and fall.\n57.3. MAJOR PENAL TY\nThe Referee, at their disc

In [21]:
llm = OllamaLLM(model="llama3.1:8b")

prompt = ChatPromptTemplate.from_template(
    """
    You are a hockey rules assistant.
    Use ONLY the provided context.
    If the answer is not in the context, say you don't know.

    For each bullet:
    - First, quote the decisive line from the rule.
    - Then, one-sentence explanation.
    - End with a citation like (Rule X.Y, p.N).

    Question: {question}
    Context:
    {context}
    """
)


def format_docs(docs):
    """Format a list of documents into a readable string with sources and page numbers."""
    formatted_docs = []

    for d in docs:
        content = d.page_content
        source = d.metadata.get("source", "?")
        page = d.metadata.get("page", "?")

        formatted_doc = f"- {content}\n  [source: {source}, page {page}]"
        formatted_docs.append(formatted_doc)

    return "\n\n".join(formatted_docs)


rag_chain = (
    {"context": (lambda x: x["question"]) | retriever | format_docs, 
     "question": lambda x: x["question"]}
    | prompt
    | llm
    | StrOutputParser()
)

In [27]:
q = "Which penalties can result in a major penalty without an automatic game misconduct?"
print(rag_chain.invoke({"question": q}))


• The Referee, at their discretion, may assess a major penalty and an automatic game misconduct penalty if, in their judgment, the 
    Player kicks or attempts to kick an opponent.
    A player who kicks an opponent can be given both a major penalty and an automatic game misconduct penalty. (Rule 49.3, p.N)

Note: There is no explicit list of penalties that can result in a major penalty without an automatic game misconduct in the provided context. However, based on Rule 49.3, it seems that kicking or attempting to kick an opponent can lead to such a situation.

Therefore, I don't know the answer to the original question about which other penalties can result in a major penalty without an automatic game misconduct.
