In [1]:
!pip3 install --upgrade --quiet langchain langchain-community langchain-openai chromadb
!pip3 install --upgrade --quiet pypdf pandas streamlit python-dotenv


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
# Import Langchain modules
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

# Other modules and packages
import os
import tempfile
import streamlit as st
import pandas as pd
from dotenv import load_dotenv


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [2]:
# Load environment variables from a local .env file (python-dotenv) so that
# os.environ lookups later in the notebook can find them.
load_dotenv()

True

In [3]:
# Pull the OpenAI key from the environment; the value is None when it is unset.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

Define LLM

In [4]:
# Chat model used by the direct prompt call and by both chains below.
llm = ChatOpenAI(model="gpt-4o-mini", api_key=OPENAI_API_KEY)
# Optional smoke test, left disabled so this cell makes no API call:
# llm.invoke("Tell me a joke about cats")

# Process our PDF

## Load PDF Document

In [5]:
# Load the paper with PyPDFLoader; `pages` is the list of Document objects it
# returns (each carries a `page` index in its metadata, as the output shows).
loader = PyPDFLoader("data/Oppenheimer-2006-Applied_Cognitive_Psychology.pdf")
pages = loader.load()
# Bare last expression: display the loaded documents.
pages

Ignoring wrong pointing object 18 0 (offset 0)


[Document(metadata={'producer': 'macOS Version 14.4.1 (Build 23E224) Quartz PDFContext, AppendMode 1.1', 'creator': 'Preview', 'creationdate': "D:20240909152042Z00'00'", 'author': 'Thu Vu', 'moddate': "D:20240910141854Z00'00'", 'title': 'Oppenheimer-2006-Applied_Cognitive_Psychology', 'source': 'data/Oppenheimer-2006-Applied_Cognitive_Psychology.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1'}, page_content='APPLIED COGNITIVE PSYCHOLOGY\nAppl. Cognit. Psychol. 20: 139–156 (2006)\nPublished online 31 October 2005 in Wiley InterScience\n(www.interscience.wiley.com) DOI: 10.1002/acp.1178\nConsequences of Erudite Vernacular Utilized Irrespective\nof Necessity: Problems with Using Long Words Needlessly\nDANIEL M. OPPENHEIMER*\nPrinceton University, USA\nSUMMARY\nMost texts on writing style encourage authors to avoid overly-complex words. However, a majority\nof undergraduates admit to deliberately increasing the complexity of their vocabulary so as to give\nthe impression of intelligen

## Split document

The loader has already split the document into pages, but a page is still too large to work with, so we split it further into smaller chunks. Smaller chunks are each more focused, and therefore more relevant when we query the document.

If the chunks are too big, they may contain redundant information; if they are too small, they might not contain enough context for the LLM to generate a good answer.

In [6]:
# Configure the splitter: chunk_size=1500 and chunk_overlap=200, both measured
# with len (i.e. in characters), trying the separators in the order listed:
# blank lines first, then newlines, then spaces.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=200,
    length_function=len,
    separators=["\n\n", "\n", " "],
)
# Split the loaded pages into the chunks that will be embedded.
chunks = text_splitter.split_documents(pages)

In [7]:
print(
    chunks[0].page_content
    # Text of the first chunk produced by the splitter, shown as a sanity check.
)

APPLIED COGNITIVE PSYCHOLOGY
Appl. Cognit. Psychol. 20: 139–156 (2006)
Published online 31 October 2005 in Wiley InterScience
(www.interscience.wiley.com) DOI: 10.1002/acp.1178
Consequences of Erudite Vernacular Utilized Irrespective
of Necessity: Problems with Using Long Words Needlessly
DANIEL M. OPPENHEIMER*
Princeton University, USA
SUMMARY
Most texts on writing style encourage authors to avoid overly-complex words. However, a majority
of undergraduates admit to deliberately increasing the complexity of their vocabulary so as to give
the impression of intelligence. This paper explores the extent to which this strategy is effective.
Experiments 1–3 manipulate complexity of texts and ﬁnd a negative relationship between complex-
ity and judged intelligence. This relationship held regardless of the quality of the original essay, and
irrespective of the participants’ prior expectations of essay quality. The negative impact of
complexity was mediated by processing ﬂuency. Experiment 4 di

## Create embeddings

In [8]:
def get_embedding_function():
    """Build the OpenAI embedding client used by the vector store.

    Returns:
        An ``OpenAIEmbeddings`` instance configured for the
        ``text-embedding-ada-002`` model and the notebook-level
        ``OPENAI_API_KEY``.
    """
    return OpenAIEmbeddings(
        openai_api_key=OPENAI_API_KEY,
        model="text-embedding-ada-002",
    )


# Single embedding client, reused when building and when re-opening the vector store.
embedding_function = get_embedding_function()
# Optional smoke test, left disabled so this cell makes no API call:
# test_vector = embedding_function.embed_query("cat")

In [22]:
import uuid


def create_vectorstore(chunks, embedding_function, vectorstore_path):
    """Create a Chroma vector store from de-duplicated document chunks.

    Every chunk gets a deterministic UUIDv5 derived from its text, so chunks
    with identical ``page_content`` collapse into a single entry (the first
    occurrence is kept).

    Args:
        chunks: Iterable of documents exposing a ``page_content`` string.
        embedding_function: Embedding object handed to Chroma as ``embedding``.
        vectorstore_path: Directory handed to Chroma as ``persist_directory``.

    Returns:
        The Chroma vector store built from the unique chunks.
    """
    # A dict keeps insertion order, so the i-th id always belongs to the i-th
    # chunk. (Taking the ids from a set would hand them to Chroma in arbitrary
    # order and attach them to the wrong documents.)
    unique_by_id = {}
    for chunk in chunks:
        chunk_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, chunk.page_content))
        # setdefault keeps the first chunk seen for a given id.
        unique_by_id.setdefault(chunk_id, chunk)

    # Create a new Chroma database from the unique documents and their ids.
    vectorstore = Chroma.from_documents(
        documents=list(unique_by_id.values()),
        ids=list(unique_by_id.keys()),
        embedding=embedding_function,
        persist_directory=vectorstore_path,
    )

    # NOTE(review): newer Chroma wrappers reportedly persist automatically and
    # may not expose persist(); call it only when it is available.
    persist = getattr(vectorstore, "persist", None)
    if callable(persist):
        persist()

    return vectorstore

In [23]:
# Build the vector store from the chunks, using "vectorstore_new" as the
# persistence directory.
vectorstore = create_vectorstore(
    chunks=chunks,
    embedding_function=embedding_function,
    vectorstore_path="vectorstore_new",
)

# Query for relevant data

In [24]:
# Re-open the store from its persistence directory with the same embedding
# function; this rebinds `vectorstore` to the newly constructed Chroma object.
vectorstore = Chroma(
    persist_directory="vectorstore_new", embedding_function=embedding_function
)

In [25]:
# Retrieve everything stored in the collection; the result is read below via
# its "ids", "metadatas" and "documents" keys.
docs = vectorstore.get()

# Print all three fields to inspect what was stored.
print("Document IDs:", docs["ids"])
print("Document Metadata:", docs["metadatas"])
print("Document Content:", docs["documents"])

Document IDs: ['646e5ed1-3179-5c6d-9e1b-95e301b7b677', '502a6ed1-39df-58d5-adf7-637f5af5b0ad', '34dc34b7-987c-5e3f-929f-16aabd02078b', 'be3e1e9d-bb45-5b4c-941f-601460baa26e', '867cdee6-eab7-55f7-a2d5-67f7731a7ef8', '109f7472-54d6-59fc-ace2-30d0eefd7d74', 'db286a7d-02f3-5da7-a589-65d473502de3', '8fa9073b-94ce-576e-a76f-3eb2d035cf8e', '3c80ab1f-f272-5ff3-96b1-65da3ed829ea']
Document Metadata: [{'author': 'Thu Vu', 'creationdate': "D:20240909152042Z00'00'", 'creator': 'Preview', 'moddate': "D:20240910141854Z00'00'", 'page': 0, 'page_label': '1', 'producer': 'macOS Version 14.4.1 (Build 23E224) Quartz PDFContext, AppendMode 1.1', 'source': 'data/Oppenheimer-2006-Applied_Cognitive_Psychology.pdf', 'title': 'Oppenheimer-2006-Applied_Cognitive_Psychology', 'total_pages': 3}, {'author': 'Thu Vu', 'creationdate': "D:20240909152042Z00'00'", 'creator': 'Preview', 'moddate': "D:20240910141854Z00'00'", 'page': 0, 'page_label': '1', 'producer': 'macOS Version 14.4.1 (Build 23E224) Quartz PDFContext,

In [26]:
# Create a similarity-search retriever over the store and fetch the chunks it
# returns for a sample question.
retriever = vectorstore.as_retriever(search_type="similarity")
relevant_chunks = retriever.invoke("What is the title of the paper?")
# Bare last expression: display the retrieved Documents.
relevant_chunks

[Document(metadata={'author': 'Thu Vu', 'creationdate': "D:20240909152042Z00'00'", 'creator': 'Preview', 'moddate': "D:20240910141854Z00'00'", 'page': 1, 'page_label': '2', 'producer': 'macOS Version 14.4.1 (Build 23E224) Quartz PDFContext, AppendMode 1.1', 'source': 'data/Oppenheimer-2006-Applied_Cognitive_Psychology.pdf', 'title': 'Oppenheimer-2006-Applied_Cognitive_Psychology', 'total_pages': 3}, page_content='was unnecessary and thus surprising readers with the relative disﬂuency of the text.\nBoth the experts and prevailing wisdom present plausible views, but which (if either) is\ncorrect? The present paper provides an empirical investigation of the strategy of complex-\nity, and ﬁnds such a strategy to be unsuccessful. Five studies demonstrate that the loss of\nﬂuency due to needless complexity in a text negatively impacts raters’ assessments of the\ntext’s authors.\nEXPERIMENT 1\nExperiment 1 aimed to answer several simple questions. First, does increasing the\ncomplexity of tex

In [27]:
# Prompt template with two placeholders: {context} receives the retrieved chunk
# text and {question} receives the user's query.
PROMPT_TEMPLATE = """
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

{context}

---

Answer the question based on the above context: {question}
"""

# Generate Response

In [28]:
# Concatenate the retrieved chunk texts into one context string, separating
# consecutive chunks with a "---" line.
context_text = "\n\n---\n\n".join([doc.page_content for doc in relevant_chunks])

# Create the prompt: fill the template with the context and the same question
# that was used for retrieval, then print it for inspection.
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(
    context=context_text, question="What is the title of the paper?"
)
print(prompt)

Human: 
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

was unnecessary and thus surprising readers with the relative disﬂuency of the text.
Both the experts and prevailing wisdom present plausible views, but which (if either) is
correct? The present paper provides an empirical investigation of the strategy of complex-
ity, and ﬁnds such a strategy to be unsuccessful. Five studies demonstrate that the loss of
ﬂuency due to needless complexity in a text negatively impacts raters’ assessments of the
text’s authors.
EXPERIMENT 1
Experiment 1 aimed to answer several simple questions. First, does increasing the
complexity of text succeed in making the author appear more intelligent? Second, to
what extent does the success of this strategy depend on the quality of the original, simpler
writing? Finally, if the strategy is unsuccessful, is th

In [29]:
# Send the formatted prompt to the chat model; the returned message is displayed.
llm.invoke(prompt)

AIMessage(content='The title of the paper is "Consequences of Erudite Vernacular Utilized Irrespective of Necessity: Problems with Using Long Words Needlessly."', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 33, 'prompt_tokens': 1117, 'total_tokens': 1150, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_b376dfbbd5', 'id': 'chatcmpl-BFZBsriXm5A17vvmNsa7cmuDl28mG', 'finish_reason': 'stop', 'logprobs': None}, id='run-87e63c91-2ffe-4daa-9d3b-d4ce1d902ed2-0', usage_metadata={'input_tokens': 1117, 'output_tokens': 33, 'total_tokens': 1150, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

## Using LangChain Expression Language (LCEL)

LCEL is a way to chain the retrieval, prompt-formatting, and model steps together into a single pipeline.

In [30]:
def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The page contents, in order, separated by one blank line.
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)


# Chain the steps: the input question goes to the retriever (whose results are
# joined by format_docs) to fill "context", and is passed through unchanged as
# "question"; the filled prompt template is then sent to the chat model.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt_template
    | llm
)
# Run the whole pipeline on a sample question.
rag_chain.invoke("What's the title of this paper?")

AIMessage(content='The title of the paper is "Consequences of Erudite Vernacular Utilized Irrespective of Necessity: Problems with Using Long Words Needlessly."', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 33, 'prompt_tokens': 1111, 'total_tokens': 1144, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_b376dfbbd5', 'id': 'chatcmpl-BFZBv6doN51f61DIKZ35bigvmTqU3', 'finish_reason': 'stop', 'logprobs': None}, id='run-91467c66-a3c8-4301-933a-7d0c43e8ab63-0', usage_metadata={'input_tokens': 1111, 'output_tokens': 33, 'total_tokens': 1144, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

## Generate Structured Output

In [31]:
# Schema for a single extracted answer.
# NOTE(review): the class docstring and the Field descriptions are presumably
# forwarded to the model as part of the structured-output schema, so their
# wording acts as prompt text; confirm before editing them.
class AnswerWithSources(BaseModel):
    """An answer to the question, with sources and reasoning."""

    # The answer itself.
    answer: str = Field(description="Answer to question")
    # The context text the answer was taken from.
    sources: str = Field(
        description="Full direct text chunk from the context used to answer the question"
    )
    # Explanation of how the sources support the answer.
    reasoning: str = Field(
        description="Explain the reasoning of the answer based on the sources"
    )


# Top-level schema requested from the model: one AnswerWithSources (answer,
# sources, reasoning) for each piece of paper metadata.
class ExtractedInfo(BaseModel):
    """Extracted information about the research article"""

    paper_title: AnswerWithSources
    paper_summary: AnswerWithSources
    # NOTE(review): the field says "year" while the queries in this notebook ask
    # for the "publication date"; confirm which one is intended.
    publication_year: AnswerWithSources
    paper_authors: AnswerWithSources

In [32]:
# Same retrieval and prompt steps as the earlier chain, but the model is bound
# to the ExtractedInfo schema, so the chain returns an ExtractedInfo instance
# instead of a chat message.
# NOTE: this rebinds `rag_chain`, replacing the plain-text chain defined earlier.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt_template
    | llm.with_structured_output(ExtractedInfo, strict=True)
)

# Ask for all four fields in a single call.
rag_chain.invoke(
    "Give me the title, summary, publication date, authors of the research paper."
)



ExtractedInfo(paper_title=AnswerWithSources(answer='Consequences of Erudite Vernacular Utilized Irrespective of Necessity: Problems with Using Long Words Needlessly', sources='Consequences of Erudite Vernacular Utilized Irrespective of Necessity: Problems with Using Long Words Needlessly', reasoning='The title is explicitly stated in the context provided.'), paper_summary=AnswerWithSources(answer="This paper explores the extent to which the deliberate use of complex vocabulary impacts judgments of intelligence. Experiments reveal a negative relationship between text complexity and judged intelligence, affecting perceptions regardless of essay quality or participants' expectations. Ultimately, it concludes that clarity and simplicity in writing are preferable.", sources='SUMMARY Most texts on writing style encourage authors to avoid overly-complex words. However, a majority of undergraduates admit to deliberately increasing the complexity of their vocabulary so as to give the impression

# Transform the response into a DataFrame

In [33]:
# Query the structured chain once and flatten the result for tabular display.
structured_response = rag_chain.invoke(
    "Give me the title, summary, publication date, authors of the research paper."
)
# One-row frame: each column holds the nested dict of one extracted field.
df = pd.DataFrame([structured_response.dict()])

# Pull the three parts of every field out of that single row (row label 0).
answer_row = [df.at[0, col]["answer"] for col in df.columns]
source_row = [df.at[0, col]["sources"] for col in df.columns]
reasoning_row = [df.at[0, col]["reasoning"] for col in df.columns]

# Final table: three rows ('answer', 'source', 'reasoning'), one column per field.
structured_response_df = pd.DataFrame(
    data=[answer_row, source_row, reasoning_row],
    index=["answer", "source", "reasoning"],
    columns=df.columns,
)
structured_response_df

Unnamed: 0,paper_title,paper_summary,publication_year,paper_authors
answer,Consequences of Erudite Vernacular Utilized Ir...,"The paper explores the tendency of writers, pa...",2005,Daniel M. Oppenheimer
source,Consequences of Erudite Vernacular Utilized Ir...,SUMMARY Most texts on writing style encourage ...,Appl. Cognit. Psychol. 20: 139–156 (2006) Publ...,"DANIEL M. OPPENHEIMER* Princeton University, USA"
reasoning,The title is explicitly stated in the retrieve...,The summary captures the essence of the resear...,The paper was published online in October 2005...,The author's name is specifically mentioned in...
