In [107]:
import os
from dotenv import load_dotenv

load_dotenv()

CLAUDE_KEY = os.getenv("CLAUDE_KEY")
MODEL = "claude-3-7-sonnet-20250219"
#MODEL = "llama3.1"


In [108]:
from langchain_anthropic import ChatAnthropic
from langchain_ollama.llms import OllamaLLM
from langchain_ollama import OllamaEmbeddings


if MODEL.startswith("claude"):
    llm = ChatAnthropic(model=MODEL, api_key=CLAUDE_KEY)
else:
    llm = OllamaLLM(model=MODEL)
    embeddings = OllamaEmbeddings(model=MODEL)
    

    
llm.invoke("tell me a joke")

AIMessage(content="Why don't scientists trust atoms?\n\nBecause they make up everything!", additional_kwargs={}, response_metadata={'id': 'msg_015dpc5KNBaKV1cNB3wgsyzz', 'model': 'claude-3-7-sonnet-20250219', 'stop_reason': 'end_turn', 'stop_sequence': None, 'usage': {'cache_creation_input_tokens': 0, 'cache_read_input_tokens': 0, 'input_tokens': 11, 'output_tokens': 17}, 'model_name': 'claude-3-7-sonnet-20250219'}, id='run-aa32c474-ac9c-4337-abc1-d060617210ca-0', usage_metadata={'input_tokens': 11, 'output_tokens': 17, 'total_tokens': 28, 'input_token_details': {'cache_read': 0, 'cache_creation': 0}})

In [109]:
from langchain_core.output_parsers import StrOutputParser

parser = StrOutputParser()

chain = llm | parser
chain.invoke("tell me a joke")

"Why don't scientists trust atoms?\n\nBecause they make up everything!"

In [63]:
from docling.document_converter import DocumentConverter
from langchain.schema import Document
import os

converter = DocumentConverter()

def load_pdfs(path):
    documents = []
    ctr = 1

    for filename in os.listdir(path):
        if filename.endswith(".pdf"):
            file_path = os.path.join(path, filename)

            result = converter.convert(file_path)
            content = result.document.export_to_markdown()

            doc = Document(
                page_content = content,
                metadata = {"source": file_path}
            )

            print(f"File {ctr}: {filename} loaded.")
            ctr += 1
            documents.append(doc)
    return documents


In [64]:
all_documents = load_pdfs("data/preprocessed")
len(all_documents)

File 1: localsearch.pdf loaded.
File 2: informed_search.pdf loaded.
File 3: naivebayes.pdf loaded.
File 4: adversarialsearch.pdf loaded.
File 5: Search.pdf loaded.
File 6: intelligentagents.pdf loaded.
File 7: markov.pdf loaded.
File 8: rl.pdf loaded.
File 9: ml.pdf loaded.
File 10: AI_Intro.pdf loaded.


10

In [65]:
def export_markdown_files(documents, output_dir="data/processed_markdown"):
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    for doc in documents:
        # Get the original filename from the source path
        file_path = doc.metadata["source"]
        base_name = os.path.splitext(os.path.basename(file_path))[0]
        
        # Create markdown file path
        markdown_path = os.path.join(output_dir, f"{base_name}.md")
        
        # Write content to markdown file
        with open(markdown_path, 'w', encoding='utf-8') as f:
            f.write(doc.page_content)
        
        # Update metadata to include markdown path
        doc.metadata["markdown_path"] = markdown_path
        
        print(f"Exported: {markdown_path}")
    
    return documents

In [66]:
all_documents = export_markdown_files(all_documents)


Exported: data/processed_markdown/localsearch.md
Exported: data/processed_markdown/informed_search.md
Exported: data/processed_markdown/naivebayes.md
Exported: data/processed_markdown/adversarialsearch.md
Exported: data/processed_markdown/Search.md
Exported: data/processed_markdown/intelligentagents.md
Exported: data/processed_markdown/markov.md
Exported: data/processed_markdown/rl.md
Exported: data/processed_markdown/ml.md
Exported: data/processed_markdown/AI_Intro.md


In [72]:
def count_characters(documents):
    char_counts = {}
    
    for doc in documents:
        source = doc.metadata.get("source", "Unknown")
        char_count = len(doc.page_content)
        char_counts[source] = char_count
        
        # Print as we go
        print(f"{os.path.basename(source)}: {char_count} characters")
    
    return char_counts

# Usage
char_counts = count_characters(all_documents)

localsearch.pdf: 10306 characters
informed_search.pdf: 5997 characters
naivebayes.pdf: 29500 characters
adversarialsearch.pdf: 4210 characters
Search.pdf: 12642 characters
intelligentagents.pdf: 8101 characters
markov.pdf: 15733 characters
rl.pdf: 11718 characters
ml.pdf: 10704 characters
AI_Intro.pdf: 6980 characters


In [73]:
from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter

def split_documents_by_structure(documents):
    # First try to split by markdown headers
    headers_to_split_on = [
        ("#", "section"),
        ("##", "subsection"),
        ("###", "subsubsection"),
    ]
    header_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    
    # For additional splitting of large sections
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500,  # Larger than default since these are educational materials
        chunk_overlap=150,
        separators=["\n\n", "\n", ". ", " ", ""]
    )
    
    all_splits = []
    
    for doc in documents:
        # Try header splitting first
        try:
            header_splits = header_splitter.split_text(doc.page_content)
            
            # Check if any splits are still too large
            final_splits = []
            for split in header_splits:
                if len(split.page_content) > 2000:  # If section is still large
                    smaller_splits = text_splitter.split_documents([Document(
                        page_content=split.page_content,
                        metadata={**doc.metadata, **split.metadata}
                    )])
                    final_splits.extend(smaller_splits)
                else:
                    final_splits.append(Document(
                        page_content=split.page_content,
                        metadata={**doc.metadata, **split.metadata}
                    ))
                    
            all_splits.extend(final_splits)
            
        except Exception as e:
            # Fallback to regular splitting if header splitting fails
            print(f"Header splitting failed for {doc.metadata.get('source')}, using regular splitting")
            regular_splits = text_splitter.split_documents([doc])
            all_splits.extend(regular_splits)
    
    return all_splits

all_splits = split_documents_by_structure(all_documents)
len(all_splits)

314

In [94]:
from langchain.prompts import PromptTemplate

template = """You are an AI study assistant designed to help students with questions about their course materials. Your primary function is to provide accurate answers based solely on the information contained in the retrieved chunks of course documents. It is crucial that you do not add any information from your own knowledge or make up any details that are not explicitly stated in the provided text.

Here are the retrieved chunks of course material:
<context>
{context}
</context>

The student has asked the following question:
<student_question>
{question}
</student_question>

To answer the student's question, follow these steps:
1. Carefully read and analyze the retrieved chunks of text.
2. Identify any information directly relevant to the student's question.
3. Formulate an answer using only the information found in the retrieved chunks.
4. If you find conflicting information in different chunks, state this clearly in your answer.

Format your response as follows:
1. Begin with a <relevant_info> tag, where you will list the specific pieces of information from the chunks that are relevant to answering the question. Include the chunk number or identifier for each piece of information.
2. Follow this with your <answer> tag, where you will provide a clear and concise answer to the student's question based solely on the information you listed in the relevant_info section.

It is imperative that you only use information explicitly stated in the retrieved chunks. Do not add any additional information, explanations, or examples that are not present in the provided text, even if you believe them to be true or helpful.

If the question cannot be fully answered using only the information in the retrieved chunks, state this clearly in your answer. Provide whatever partial information you can from the chunks, and explain what specific information is missing to fully answer the question.

If the retrieved chunks contain no information relevant to the student's question, respond with:
<answer>I apologize, but I couldn't find any information in the provided course materials that answers your question about [brief restatement of the question]. If you believe this topic should be covered in your course, you may want to consult your instructor or additional course resources.</answer>

Remember, your role is to assist based strictly on the course materials provided, not to be a general knowledge resource. Accuracy and adherence to the given information are your top priorities."""

prompt = PromptTemplate.from_template(template)
print(prompt.format(context="Here is some context", question="here is a question") )

You are an AI study assistant designed to help students with questions about their course materials. Your primary function is to provide accurate answers based solely on the information contained in the retrieved chunks of course documents. It is crucial that you do not add any information from your own knowledge or make up any details that are not explicitly stated in the provided text.

Here are the retrieved chunks of course material:
<context>
Here is some context
</context>

The student has asked the following question:
<student_question>
here is a question
</student_question>

To answer the student's question, follow these steps:
1. Carefully read and analyze the retrieved chunks of text.
2. Identify any information directly relevant to the student's question.
3. Formulate an answer using only the information found in the retrieved chunks.
4. If you find conflicting information in different chunks, state this clearly in your answer.

Format your response as follows:
1. Begin with

In [95]:
chain = prompt | llm | parser 

In [96]:
chain.invoke(
    {
        "context": "The name I was given was Ant Man",
        "question": "What is my name?",
    }
)

'<relevant_info>\n1. The name I was given was Ant Man (Context 1)\n</relevant_info>\n\n<answer>\nMy name is Ant Man.\n</answer>'

In [77]:
from langchain_community.vectorstores import DocArrayInMemorySearch

vectorstore = DocArrayInMemorySearch.from_documents(
    all_splits, 
    embedding=embeddings
)

In [111]:
retriever = vectorstore.as_retriever()

retriever.invoke("PEAS")

[Document(metadata={'source': 'data/preprocessed/intelligentagents.pdf', 'markdown_path': 'data/processed_markdown/intelligentagents.md', 'subsection': 'Intelligent Agents'}, page_content='Chapter 2'),
 Document(metadata={'source': 'data/preprocessed/rl.pdf', 'markdown_path': 'data/processed_markdown/rl.md', 'subsection': 'Example: Model-Based Learning'}, page_content='Input Policy π'),
 Document(metadata={'source': 'data/preprocessed/markov.pdf', 'markdown_path': 'data/processed_markdown/markov.md', 'subsection': 'Example: Grid World'}, page_content='<!-- image -->  \nExit'),
 Document(metadata={'source': 'data/preprocessed/AI_Intro.pdf', 'markdown_path': 'data/processed_markdown/AI_Intro.md', 'subsection': 'Artificial Intelligence'}, page_content='An Introduction  \nRussell and Norvig')]

In [112]:
from operator import itemgetter 

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

chain = (
    {
        "context": itemgetter("question") | retriever | format_docs,
        "question": itemgetter("question")
    }
    | prompt
    | llm
    | parser
)

chain.invoke({"question": "What does PEAS stand for?"})

"<relevant_info>\nI've carefully examined all the provided chunks of course material, but I don't see any information that defines or explains what the acronym PEAS stands for.\n</relevant_info>\n\n<answer>I apologize, but I couldn't find any information in the provided course materials that answers your question about what PEAS stands for. If you believe this topic should be covered in your course, you may want to consult your instructor or additional course resources.</answer>"

In [113]:
for s in chain.stream({"question": "What does PEAS stand for?"}):
    print(s, end="", flush=True)

<relevant_info>
After reviewing all the provided chunks, I cannot find any information that defines or explains what PEAS stands for.
</relevant_info>

<answer>I apologize, but I couldn't find any information in the provided course materials that answers your question about what PEAS stands for. The retrieved chunks focus on genetic algorithms, 8-queens problem, value iteration, and brief mentions of state representation, but there is no mention of PEAS or its definition. If you believe this topic should be covered in your course, you may want to consult your instructor or additional course resources.</answer>

In [None]:
#next steps:

'''
get rag to work
make the overview pdf 
start the second project
'''