<h2>Load in PDF</h2>
Use PyPDFLoader to load in the MTG Rules and split into pages

In [1]:
from langchain.document_loaders import PyPDFLoader
import re

loader = PyPDFLoader("MagicCompRules 20240607.pdf")
pages = loader.load_and_split()

<h2>Split the PDF into relevant sections</h2>

In [None]:
def remove_spaces_after_newline(text):
    # This regex pattern matches a newline character followed by one or more spaces
    pattern = r'\n\s+'
    # Replace the matched pattern with just a newline character
    result = re.sub(pattern, '\n', text)
    return result

table_of_contents = remove_spaces_after_newline("\n".join(page.page_content for page in pages[1:4]))
full_rules = remove_spaces_after_newline("\n".join(page.page_content for page in pages[4:269]))
table_of_contents

'Contents  \n1. Game Concepts  \n100. General  \n101. The Magic  Golden Rules  \n102. Players  \n103. Starting the Game  \n104. Ending the Game  \n105. Colors  \n106. Mana  \n107. Numbers and Symbols  \n108. Cards  \n109. Objects  \n110. Permanents  \n111. Tokens  \n112. Spells  \n113. Abilities  \n114. Emblems  \n115. Targets  \n116. Special Actions  \n117. Timing and Priority  \n118. Costs  \n119. Life  \n120. Damage \n121. Drawing a Card  \n122. Counters  \n123. Stickers  \n2. Parts of a Card  \n200. General  \n201. Name  \n202. Mana Cost and Color  \n203. Illustration  \n204. Color Indicator  \n205. Type Line  \n206. Expansion Symbol  \n207. Text Box  \n208. Power/Toughness  \n209. Loyalty  \n210. Defense  \n211. Hand Modifier  \n212. Life Modifier  \n213. Information Below the Text Box  \n3. Card Types  \n300. General  \n301. Artifacts  \n302. Creatures  \n303. Enchantments  \n304. Instants  \n305. Lands  \n306. Planeswalkers  \n307. Sorceries  \n308. Kindred s \n309. Dungeons  \n

In [None]:
def split_contents(toc_string):
    lines = toc_string.split('\n')
    chapters = {}
    current_chapter = None

    for line in lines[1:-2]:  # Skip the "Contents" line
        if line[0].isdigit() and '.' in line:
            chapter_num, chapter_name = line.split('.', 1)
            if len(chapter_num) == 1:
                current_chapter = line.strip()
                chapters[current_chapter] = []
            elif current_chapter:
                chapters[current_chapter].append(line.strip())
        elif line.strip() and current_chapter:
            chapters[current_chapter].append(line.strip())

    return chapters

table_of_contents = split_contents(table_of_contents)

AttributeError: 'dict' object has no attribute 'split'

In [None]:
def split_text_large(text):
    pattern = r'(?:^|\n)(\d{1,2}\.\s+.*?\n(?:(?!\n\d{1,2}\.\s+)[\s\S])*)'
    matches = re.findall(pattern, text, re.MULTILINE)
    return [section.strip() for section in matches]

chunks = split_text_large(full_rules)


In [None]:
def split_subsections(chunk):
    lines = chunk.split('\n')
    chapter_title = lines[0].rstrip()
    text = ('\n').join(lines[1:])
    chunk_rules = []
    rules = table_of_contents[chapter_title]
    
    for i in range(len(rules)):
        current_rule = rules[i]
        start_index = text.find(current_rule)
        #Due to some text parsing errors, random spaces are added and an exact match cannot be found, in that case, matchign the first 7 characters tends to suffice
        if (start_index == -1):
            start_index = text.find(current_rule[:7])
        
        # Find the end index of the current rule's section
        if i < len(rules) - 1:
            # If it's not the last rule, find the start of the next rule
            next_rule = rules[i + 1]
            end_index = text.find(next_rule)
        else:
            end_index = len(text)
        
        rule_section = text[start_index:end_index].rstrip()
        chunk_rules.append(chapter_title + '\n' + rule_section)
    return chunk_rules


In [None]:
import fitz

pdf_document = fitz.open("MagicCompRules 20240607 Glossary.pdf")

# Extract text from each page
glossary = ""
for page_num in range(pdf_document.page_count):
    page = pdf_document.load_page(page_num)
    glossary += page.get_text()
glossary = '\n' + ('\n').join(glossary.split('\n')[1:])

In [None]:
def split_glossary(text):
    entries = text.split('\n \n')
    glossary_entries = {}
    for entry in entries:
        parts = entry.split('\n', 1)
        if len(parts) == 2:
            term, definition = parts
            glossary_entries[term.strip()] = definition.strip()
    return glossary_entries
glossary_split = split_glossary(glossary)
glossary_split

{'Abandon': 'To turn a face-up ongoing scheme card face down and put it on the bottom of its owner’s scheme deck. See \nrule 701.26, “Abandon.”',
 'Ability': '1. Text on an object that explains what that object does or can do. \n2. An activated or triggered ability on the stack. This kind of ability is an object. \nSee rule 113, “Abilities,” and section 6, “Spells, Abilities, and Effects.”',
 'Ability Word': 'An italicized word with no rules meaning that ties together abilities on different cards that have similar \nfunctionality. See rule 207.2c.',
 'Absorb': 'A keyword ability that prevents damage. See rule 702.64, “Absorb.”',
 'Activate': 'To put an activated ability onto the stack and pay its costs, so that it will eventually resolve and have its \neffect. See rule 602, “Activating Activated Abilities.”',
 'Activated Ability': 'A kind of ability. Activated abilities are written as “[Cost]: [Effect.] [Activation instructions (if any).]” See \nrule 113, “Abilities,” and rule 602, “Ac

In [None]:
from langchain_core.documents import Document
from langchain.vectorstores import Chroma

def rules_to_docs(chunks):
    documents = []
    
    for chunk in chunks:
        chunk_rules = split_subsections(chunk)
        for rule in chunk_rules:
            rule_text = rule.split('\n')
            chapter = rule_text[0].rstrip()
            rule_title = rule_text[1].rstrip()
            id = rule_title
            #can test if removing \n chars help
            rule_text = ('\n').join(rule_text[2:])
            documents.append(Document(page_content=rule_text, metadata={"chapter": chapter, "rule_title": rule_title}, id=int(rule_title[:3])))
    return documents



In [None]:
from langchain.embeddings import SentenceTransformerEmbeddings

rule_documents = rules_to_docs(chunks)
ef = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

persist_directory_rng_ONNXMiniLM = 'db-rules-glossary-ONNXMiniLM_L6_V2'
vectordb = Chroma.from_documents(documents=rule_documents, embedding=ef, persist_directory=persist_directory_rng_ONNXMiniLM)



  warn_deprecated(
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [None]:
from langchain_nomic import NomicEmbeddings

embeddings = NomicEmbeddings(
    model='nomic-embed-text-v1.5',
    inference_mode='local',
    device='gpu',
)
persist_directory_rng_nomic1_5 = 'db-rules-glossary-nomic-embed-text-v1.5'
vectordb = Chroma.from_documents(documents=rule_documents, embedding=embeddings, persist_directory=persist_directory_rng_nomic1_5)


: 

: 