In [45]:
from dotenv import load_dotenv

load_dotenv()

True

In [54]:
from glob import glob
from langchain import hub
from langchain.chat_models import init_chat_model
from langchain_openai import OpenAIEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph
from langchain.prompts import PromptTemplate

In [47]:
from langchain_core.documents import Document
from typing_extensions import List, TypedDict


class State(TypedDict):
    question: str
    context: List[Document]
    answer: str
    numDocs: int

In [None]:
class Aria:
    def __init__(self):
        self.llm = init_chat_model("gpt-4o-mini", model_provider="openai")
        self.embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
        self.vector_store = InMemoryVectorStore(self.embeddings)

        self.chunk_size = 1000
        self.chunk_overlap = 200
        self.add_start_index = True

        self.prompt = hub.pull("rlm/rag-prompt")

        graph_builder = StateGraph(State).add_sequence(
            [self.retrieve, self.generate])
        graph_builder.add_edge(START, "retrieve")
        self.graph = graph_builder.compile()

    def embed_pdf(self, path):
        loader = PyPDFLoader(path)
        docs = loader.load()

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            add_start_index=self.add_start_index,
        )

        all_splits = text_splitter.split_documents(docs)

        _ = self.vector_store.add_documents(documents=all_splits)

    def retrieve(self, state: State):
        retrieved_docs = self.vector_store.similarity_search(
            state["question"], k=state["numDocs"])
        return {"context": retrieved_docs}

    def generate(self, state: State):
        docs_content = "\n\n".join(
            doc.page_content for doc in state["context"])
        messages = self.prompt.invoke(
            {"question": state["question"], "context": docs_content})
        response = self.llm.invoke(messages)
        return {"answer": response.content}

    def invoke(self, question, numDocs):
        result = self.graph.invoke({"question": question, "numDocs": numDocs})

        print(f"Context: {result['context']}\n\n")
        print(f"Answer: {result['answer']}")

    def summarize_papers(self):
        prompt_template = PromptTemplate.from_template("""
        You are an expert resume writer tasked with summarizing academic papers in LaTeX resume format.
        
        Based on the provided papers, create a summary for each paper following this EXACT LaTeX format:

        \\resumeSubheading
            {{Paper Title}}{{\\emph{{Conference/Journal Year}} }}
            {{Author Name(s)}}{{\\href{{GitHub Link}}{{\\underline{{Code}}}} $|$ \\href{{Paper Link}}{{\\underline{{Paper}}}}}}
            \\resumeItemListStart
                \\resumeItem{{First key contribution or achievement with \\textbf{{bold}} keywords}}
                \\resumeItem{{Second key contribution or achievement with \\textbf{{bold}} keywords}}
                \\resumeItem{{Third key contribution or achievement with \\textbf{{bold}} keywords}}
                \\resumeItem{{Fourth key contribution or achievement with \\textbf{{bold}} keywords}}
            \\resumeItemListEnd

        IMPORTANT GUIDELINES:
        1. Extract the paper title, conference/journal, and year from the documents
        2. Include "Amirreza Sokhankhosh" as the primary author and list co-authors
        3. For GitHub links, use placeholder "https://github.com/amirrezaskh/[repo-name]" if not found
        4. For paper links, use placeholder "https://paperlink.com" if not found
        5. Each resumeItem should highlight a specific contribution, methodology, or achievement
        6. Use \\textbf{{}} to bold important technical terms, methodologies, or key achievements
        7. Keep each resumeItem concise but informative (1-2 lines max)
        8. Focus on technical contributions, novel approaches, and measurable results
        9. Use action verbs like "Proposed", "Developed", "Designed", "Demonstrated", "Achieved"

        Papers to summarize:
        {context}

        Generate the LaTeX resume format for each paper found in the context:
        """)

        # Get all documents from vector store for comprehensive analysis
        all_docs_query = "summarize all papers research contributions methodology results"
        retrieved_docs = self.vector_store.similarity_search(
            all_docs_query, k=20)

        docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)

        messages = prompt_template.invoke({"context": docs_content})
        response = self.llm.invoke(messages)

        return response.content

In [52]:
aria = Aria()

In [56]:
aria.summarize_papers()

"Sure! Below are the LaTeX formatted summaries for each specified paper based on the provided guidelines:\n\n```latex\n\\resumeSubheading\n    {Innovative Approaches to Algorithm Design}{\\emph{International Conference on Algorithms 2023} }\n    {Amirreza Sokhankhosh, Jane Doe, John Smith}{\\href{https://github.com/amirrezaskh/algorithm-design}{\\underline{Code}} $|$ \\href{https://paperlink.com}{\\underline{Paper}}}\n    \\resumeItemListStart\n        \\resumeItem{Developed a novel \\textbf{algorithmic framework} enhancing computational efficiency by \\textbf{30\\%} over previous methods}\n        \\resumeItem{Demonstrated the effectiveness of \\textbf{adaptive heuristics} in reducing time complexity in large-scale data processing}\n        \\resumeItem{Introduced an innovative \\textbf{multi-threading approach} that significantly improved performance in parallel computing scenarios}\n        \\resumeItem{Achieved \\textbf>state-of-the-art results} in multiple benchmark tests, validat

In [None]:
# Test the summarize_papers method
paper_summaries = aria.summarize_papers()
print(paper_summaries)

In [18]:
from langchain.chat_models import init_chat_model

llm = init_chat_model("gpt-4o-mini", model_provider="openai")

In [19]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [20]:
from langchain_core.vectorstores import InMemoryVectorStore

vector_store = InMemoryVectorStore(embeddings)

In [21]:
from langchain_community.document_loaders import PyPDFLoader

file_path = "./papers/PoCL.pdf"
loader = PyPDFLoader(file_path)
docs = loader.load()
len(docs)

8

In [22]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # chunk size (characters)
    chunk_overlap=200,  # chunk overlap (characters)
    add_start_index=True,  # track index in original document
)
all_splits = text_splitter.split_documents(docs)

print(f"Split my paper post into {len(all_splits)} sub-documents.")

Split my paper post into 52 sub-documents.


In [23]:
document_ids = vector_store.add_documents(documents=all_splits)

In [24]:
from langchain import hub

# N.B. for non-US LangSmith endpoints, you may need to specify
# api_url="https://api.smith.langchain.com" in hub.pull.
prompt = hub.pull("rlm/rag-prompt")

example_messages = prompt.invoke(
    {"context": "(context goes here)", "question": "(question goes here)"}
).to_messages()

assert len(example_messages) == 1
print(example_messages[0].content)

You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: (question goes here) 
Context: (context goes here) 
Answer:


In [25]:
from langchain_core.documents import Document
from typing_extensions import List, TypedDict


class State(TypedDict):
    question: str
    context: List[Document]
    answer: str

In [26]:
def retrieve(state: State):
    retrieved_docs = vector_store.similarity_search(state["question"])
    return {"context": retrieved_docs}


def generate(state: State):
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    messages = prompt.invoke({"question": state["question"], "context": docs_content})
    response = llm.invoke(messages)
    return {"answer": response.content}

In [27]:
from langgraph.graph import START, StateGraph

graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [29]:
result = graph.invoke({"question": "What is Proof of Collaborative Learning?"})

print(f"Context: {result['context']}\n\n")
print(f"Answer: {result['answer']}")

Context: [Document(id='b9122ada-e399-43af-ae9a-e450b34c9d5c', metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'TeX', 'creationdate': '2024-06-21T00:32:10+00:00', 'moddate': '2024-06-21T00:32:10+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'trapped': '/False', 'source': './papers/PoCL.pdf', 'total_pages': 8, 'page': 0, 'page_label': '1', 'start_index': 0}, page_content='Proof-of-Collaborative-Learning: A Multi-winner\nFederated Learning Consensus Algorithm\nAmirreza Sokhankhosh\nUniversity of Manitoba, Winnipeg, Canada\nsokhanka@myumanitoba.ca\nSara Rouhani\nUniversity of Manitoba, Winnipeg, Canada\nsara.rouhani@umanitoba.ca\nAbstract—Regardless of their variations, blockchains require\na consensus mechanism to validate transactions, supervise added\nblocks, maintain network security, synchronize the network\nstate, and distribute incentives. Proof-of-Work (PoW), one of\nthe most influential implementations of co

In [58]:
from dotenv import load_dotenv
from glob import glob
from langchain import hub
from langchain.chat_models import init_chat_model
from langchain_openai import OpenAIEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph
from langchain.prompts import PromptTemplate
from langchain_core.documents import Document
from typing_extensions import List, TypedDict
from IPython.display import Image, display


class State(TypedDict):
    job: str
    resume: str
    prompt: str
    context: List[Document]
    answer: str


class Aria:
    def __init__(self):
        self.llm = init_chat_model("gpt-4o-mini", model_provider="openai")
        self.embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
        self.vector_store = InMemoryVectorStore(self.embeddings)

        self.chunk_size = 1000
        self.chunk_overlap = 200
        self.add_start_index = True

        self.prompt = hub.pull("rlm/rag-prompt")

        self.build_graph()

    def embed_pdf(self, path):
        loader = PyPDFLoader(path)
        docs = loader.load()

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            add_start_index=self.add_start_index,
        )

        all_splits = text_splitter.split_documents(docs)

        _ = self.vector_store.add_documents(documents=all_splits)


    def retrieve(self, state: State):
        retrieved_docs = self.vector_store.similarity_search(
            state["job"])
        return {"context": retrieved_docs}

    def generate(self, state: State):
        docs_content = "\n\n".join(
            doc.page_content for doc in state["context"])
        messages = self.prompt.invoke(
            {"question": state["question"], "context": docs_content})
        response = self.llm.invoke(messages)
        return {"answer": response.content}
    

    def build_graph(self):
        graph_builder = StateGraph(State)

        graph_builder.add_node("retrieve", self.retrieve)
        graph_builder.add_node("summarize_experiences", self.summarize_experiences)
        graph_builder.add_node("summarize_technical_skills", self.summarize_technical_skills)
        graph_builder.add_node("select_projects", self.select_projects)
        graph_builder.add_node("summarize_projects", self.summarize_projects)
        graph_builder.add_node("generate_highlights", self.generate_highlights)
        graph_builder.add_node("generate_cover_letter", self.generate_cover_letter)

        graph_builder.add_edge(START, "summarize_experiences")
        graph_builder.add_edge("summarize_experiences", "summarize_technical_skills")
        graph_builder.add_edge("summarize_technical_skills", "select_projects")
        graph_builder.add_edge("select_projects", "summarize_projects")
        graph_builder.add_edge("summarize_projects", "generate_highlights")
        graph_builder.add_edge("retrieve", "generate_cover_letter")
        graph_builder.add_edge("generate_highlights", "generate_cover_letter")

        # graph_builder.set_finish("generate_cover_letter")

        self.graph = graph_builder.compile()

        display(Image(self.graph.get_graph().draw_mermaid_png()))


    def summarize_experiences(self, state: State):
        pass

    def summarize_technical_skills(self, state: State):
        pass

    def select_projects(self, state: State):
        pass

    def summarize_projects(self, state: State):
        pass

    def generate_highlights(self, state: State):
        pass

    def generate_cover_letter(self, state: State):
        pass

load_dotenv()

aria = Aria()

ValueError: Failed to reach https://mermaid.ink/ API while trying to render your graph. Status code: 204.

To resolve this issue:
1. Check your internet connection and try again
2. Try with higher retry settings: `draw_mermaid_png(..., max_retries=5, retry_delay=2.0)`
3. Use the Pyppeteer rendering method which will render your graph locally in a browser: `draw_mermaid_png(..., draw_method=MermaidDrawMethod.PYPPETEER)`