In [2]:
import os
from typing import List

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import (
    ConversationalRetrievalChain,
)
from langchain.chat_models import ChatOpenAI

from langchain.docstore.document import Document
from langchain.memory import ChatMessageHistory, ConversationBufferMemory
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores import Qdrant
from langchain.prompts import ChatPromptTemplate


In [2]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

In [3]:
loader = PyMuPDFLoader(
    "data/meta10k.pdf",
)

documents = loader.load()
text=''
for doc in documents:
    text+=doc.page_content

# save text to file
with open('data/meta10k.txt', 'w') as f:
    f.write(text)

In [3]:
def init():
    loader = PyMuPDFLoader(
        "data/meta10k.pdf",
    )

    documents = loader.load()
    text=''
    for doc in documents:
        text+=doc.page_content

    # Split the text into chunks
    documents = text_splitter.split_documents(documents)

    # Create a Chroma vector store
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

    # Create a Chroma vector store
    embeddings = OpenAIEmbeddings()
    docsearch = Qdrant.from_documents(
        documents, embeddings, location=":memory:"
    )

    message_history = ChatMessageHistory()

    memory = ConversationBufferMemory(
        memory_key="chat_history",
        output_key="answer",
        chat_memory=message_history,
        return_messages=True,
    )

    # Create a chain that uses the Chroma vector store
    chain = ConversationalRetrievalChain.from_llm(
        ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, streaming=True),
        chain_type="stuff",
        retriever=docsearch.as_retriever(),
        memory=memory,
        return_source_documents=True,
    )
    return chain

chain = init()



  warn_deprecated(
  warn_deprecated(


In [4]:
res = chain.invoke("Who are Meta's 'Directors' (i.e., members of the Board of Directors) ?")
print(res["answer"])

The Directors of Meta as of February 1, 2024, are Robert M. Kimmitt, Sheryl K. Sandberg, Tracey T. Travis, and Tony Xu.


In [5]:
import pandas as pd

# load testset from csv
testset = pd.read_csv("testset1.csv")
testset.head()

Unnamed: 0.1,Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,0,What was the total value of 'Cash and cash equ...,,$41.86B,simple,"[{'source': 'data/meta10k.pdf', 'file_path': '...",True
1,1,"Who are Meta's 'Directors' (i.e., members of t...",,"The Directors of Meta Platforms, Inc. listed o...",simple,"[{'source': 'data/meta10k.pdf', 'file_path': '...",True
2,2,How does the use of a common identifier help i...,['mobile browser on a given day. We do not req...,,simple,"[{'source': 'data/meta10k.pdf', 'file_path': '...",True
3,3,How is net income used in the computation of d...,['The numerators and denominators of the basic...,The net income is used as the numerator in the...,simple,"[{'source': 'data/meta10k.pdf', 'file_path': '...",True
4,4,How does tax effects from share-based compensa...,['Effective Tax Rate Items. Our effective tax ...,The tax effects from share-based compensation ...,simple,"[{'source': 'data/meta10k.pdf', 'file_path': '...",True


In [12]:
test_questions = testset["question"].values.tolist()
test_groundtruths = testset["ground_truth"].values.tolist()

In [7]:
answers = []
contexts = []

for question in test_questions:
  response = chain.invoke({"question" : question})
  answers.append(response["answer"])
  contexts.append([context.page_content for context in response["source_documents"]])

In [15]:
print(answers)

["The total value of 'Cash and cash equivalents' as of December 31, 2023, was $41.862 billion.", "The total value of 'Cash and cash equivalents' as of December 31, 2023, was $41.862 billion.", "The use of a common identifier can help in attributing multiple user accounts within and across products by providing a unique link or connection between different accounts that belong to the same individual. This common identifier allows for easier tracking and matching of accounts, making it more accurate to estimate the number of unique people using the products. However, in the context provided, it's mentioned that they do not require people to use a common identifier, so they rely on complex techniques, algorithms, and machine learning models to attribute multiple user accounts to individual people.", 'In the computation of diluted EPS, net income is adjusted for the effect of dilutive securities, such as restricted stock units (RSUs) awards under the Equity Incentive Plan. This adjustment 

In [25]:
from datasets import Dataset
# convert test_questions to list of strings
# using list comprehension convert answers to list of strings


dict = {
    "question" : [str(t) for t in test_questions],
    "answer" : [str(answer) for answer in answers],
    "contexts" :contexts,
    "ground_truth" : [str(g) for g in test_groundtruths]
}
response_dataset = Dataset.from_dict(dict)
response_dataset[0]

{'question': "What was the total value of 'Cash and cash equivalents' as of December 31, 2023?",
 'answer': "The total value of 'Cash and cash equivalents' as of December 31, 2023, was $41.862 billion.",
 'contexts': ['Inputs\n(Level 3)\nCash\n$\n6,265\xa0\nCash equivalents:\nMoney market funds\n32,910\xa0\n$\n32,910\xa0\n$\n—\xa0\n$\n—\xa0\nU.S. government and agency securities\n2,206\xa0\n2,206\xa0\n—\xa0\n—\xa0\nTime deposits\n261\xa0\n—\xa0\n261\xa0\n—\xa0\nCorporate debt securities\n220\xa0\n—\xa0\n220\xa0\n—\xa0\nTotal cash and cash equivalents\n41,862\xa0\n35,116\xa0\n481\xa0\n—\xa0\nMarketable securities:\nU.S. government securities\n8,439\xa0\n8,439\xa0\n—\xa0\n—\xa0\nU.S. government agency securities\n3,498\xa0\n3,498\xa0\n—\xa0\n—\xa0\nCorporate debt securities\n11,604\xa0\n—\xa0\n11,604\xa0\n—\xa0\nTotal marketable securities\n23,541\xa0\n11,937\xa0\n11,604\xa0\n—\xa0\nRestricted cash equivalents\n857\xa0\n857\xa0\n—\xa0\n—\xa0\nOther assets\n101\xa0\n—\xa0\n—\xa0\n101\xa0\

In [26]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)

metrics = [
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_correctness,
]

results = evaluate(response_dataset, metrics)

Evaluating:  10%|█         | 7/70 [00:05<00:44,  1.40it/s]No statements were generated from the answer.
Evaluating: 100%|██████████| 70/70 [00:26<00:00,  2.66it/s]


In [27]:
results_df = results.to_pandas()
results_df

Unnamed: 0,question,answer,contexts,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,What was the total value of 'Cash and cash equ...,The total value of 'Cash and cash equivalents'...,"[Inputs\n(Level 3)\nCash\n$\n6,265 \nCash equi...",$41.86B,1.0,1.0,1.0,0.833333,0.712338
1,"Who are Meta's 'Directors' (i.e., members of t...",The total value of 'Cash and cash equivalents'...,"[Inputs\n(Level 3)\nCash\n$\n6,265 \nCash equi...","The Directors of Meta Platforms, Inc. listed o...",,0.730223,1.0,0.75,0.172698
2,How does the use of a common identifier help i...,The use of a common identifier can help in att...,[mobile browser on a given day. We do not requ...,,1.0,0.956422,1.0,1.0,0.177512
3,How is net income used in the computation of d...,"In the computation of diluted EPS, net income ...","[stock, while the diluted EPS of Class B commo...",The net income is used as the numerator in the...,1.0,0.917511,1.0,1.0,0.864918
4,How does tax effects from share-based compensa...,Tax effects from share-based compensation can ...,"[that period, our effective tax rate may be in...",The tax effects from share-based compensation ...,1.0,0.96596,1.0,1.0,0.983733
5,How is earnings per share computed using the t...,Earnings per share (EPS) using the two-class m...,[Table of Contents\nNote 4. Earnings per Share...,Earnings per share (EPS) using the two-class m...,1.0,0.985632,1.0,1.0,0.456826
6,How does a decrease in user engagement affect ...,A decrease in user engagement can adversely af...,[Our advertising revenue can also be adversely...,A decrease in user engagement can adversely af...,1.0,0.991471,1.0,1.0,0.486427
7,What is the role of a PCAOB-registered public ...,A PCAOB-registered public accounting firm play...,"[We also have audited, in accordance with the ...",The role of a PCAOB-registered public accounti...,1.0,0.99439,1.0,1.0,0.842494
8,How are foreign currency transaction gains and...,Foreign currency transaction gains and losses ...,"[cumulative translation losses, net of tax of ...",Foreign currency transaction gains and losses ...,1.0,0.990724,1.0,0.75,0.749497
9,How does acting in the best interests of stock...,Acting in the best interests of stockholders i...,[jurisdictions remain ongoing and could subjec...,"Mr. Zuckerberg, as a stockholder, is entitled ...",1.0,0.885007,1.0,0.5,0.884206


In [28]:
results_df.to_csv("baseline.csv", index=False)

### Now let's improve by chaining MultiQueryRetriever

In [56]:
from langchain.retrievers import MultiQueryRetriever
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

def init2():
    loader = PyMuPDFLoader(
        "data/meta10k.pdf",
    )

    documents = loader.load()
    text=''
    for doc in documents:
        text+=doc.page_content
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=50)

    # Split the text into chunks
    documents = text_splitter.split_documents(documents)

    # Create a Chroma vector store
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

    # Create a Chroma vector store
    docsearch = Qdrant.from_documents(
        documents, embeddings, location=":memory:"
    )

    primary_qa_llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, streaming=True)

    template = """Answer the question based only on the following context. 
    When answering about financial information, look for content only in table form.

    Context:
    {context}

    Question:
    {input}
    """

    prompt = ChatPromptTemplate.from_template(template)

    advanced_retriever = MultiQueryRetriever.from_llm(retriever=docsearch.as_retriever(), llm=primary_qa_llm)
    document_chain = create_stuff_documents_chain(primary_qa_llm, prompt)
    retrieval_chain = create_retrieval_chain(advanced_retriever, document_chain)

    return retrieval_chain

chain2 = init2()



In [63]:
res = chain2.invoke({"input" : "Who are Meta's 'Directors' (i.e., members of the Board of Directors) on the signature page?"})

print(res["answer"])

The Directors of Meta on the signature page are:
- Aaron Anderson
- Peggy Alford
- Marc L. Andreessen
- Andrew W. Houston
- Nancy Killefer
- Robert M. Kimmitt
- Sheryl K. Sandberg
- Tracey T. Travis
- Tony Xu


In [58]:
for source in res["context"]:
    print(source.page_content)
    print('------------------')


Table of Contents
SIGNATURES
Pursuant to the requirements of Section 13 or 15(d) of the Securities Exchange Act of 1934, the Registrant has duly caused this Annual Report on Form
10-K to be signed on its behalf by the undersigned, thereunto duly authorized, in the City of Menlo Park, State of California, on this 1st day of February 2024.
META PLATFORMS, INC.
Date:
February 1, 2024
/s/ Susan Li 
Susan Li
Chief Financial Officer
 
131
------------------
Table of Contents
PART III
Item 10. Directors, Executive Officers and Corporate Governance
The information required by this item is incorporated by reference to our Proxy Statement for the 2024 Annual Meeting of Stockholders to be filed with
the SEC within 120 days of the fiscal year ended December 31, 2023.
Our board of directors has adopted a Code of Conduct applicable to all officers, directors, and employees, which is available on our website
------------------
on the Nasdaq Global Select Market under the symbol "META." Our principal 

In [59]:
res = chain2.invoke({"input" : "What was the total value of 'Cash and cash equivalents' as of December 31, 2023?"})

print(res["answer"])

The total value of 'Cash and cash equivalents' as of December 31, 2023 was $41,381 million.


In [65]:
answers = []
contexts = []

for question in test_questions:
  response = chain2.invoke({"input" : question})
  print(response["answer"])
  answers.append(response["answer"])
  contexts.append([context.page_content for context in response["context"]])

The total value of 'Cash and cash equivalents' as of December 31, 2023 was $41,381 million.
The information required to answer this question is not present in the provided context.
Based on the context provided, the use of a common identifier is not required to attribute multiple user accounts within and across products in the Family metrics. The calculations rely on complex techniques, algorithms, and machine learning models to estimate the number of unique people using the products, including matching user accounts without the need for a common identifier.
Net income for the computation of diluted EPS is adjusted by the effect of dilutive securities, including restricted stock units (RSUs) awards under the Equity Incentive Plan. Additionally, the net income is reallocated as a result of the conversion of Class B to Class A common stock.
Based on the context provided, the tax effects from share-based compensation can impact the effective tax rate significantly, sometimes from period t

In [20]:
test_groundtruths[0]=''

In [66]:
from datasets import Dataset

dict = {
    "question" : [str(t) for t in test_questions],
    "answer" : [str(answer) for answer in answers],
    "contexts" :contexts,
    "ground_truth" : [str(g) for g in test_groundtruths]
}
response_dataset = Dataset.from_dict(dict)
response_dataset[0]

{'question': "What was the total value of 'Cash and cash equivalents' as of December 31, 2023?",
 'answer': "The total value of 'Cash and cash equivalents' as of December 31, 2023 was $41,381 million.",
 'contexts': ['Table of Contents\n\xa0\n\xa0\nFair Value Measurement at Reporting Date Using\nDescription\nDecember 31, 2022\nQuoted\xa0Prices in Active\nMarkets for\nIdentical\xa0Assets\n(Level 1)\nSignificant Other\nObservable Inputs\n(Level 2)\nSignificant Unobservable\nInputs\n(Level 3)\nCash\n$\n6,176\xa0\nCash equivalents:\nMoney market funds\n8,305\xa0\n$\n8,305\xa0\n$\n—\xa0\n$\n—\xa0\nU.S. government and agency securities\n16\xa0\n16\xa0\n—\xa0\n—\xa0\nTime deposits\n156\xa0\n—\xa0\n156\xa0\n—\xa0\nCorporate debt securities\n28\xa0\n—\xa0\n28\xa0\n—\xa0\nTotal cash and cash equivalents\n14,681\xa0\n8,321\xa0\n184\xa0\n—\xa0\nMarketable securities:\nU.S. government securities\n8,708\xa0\n8,708\xa0\n—\xa0\n—',
  '$\n42,827\xa0\n$\n15,596\xa0\n$\n16,865\xa0\nReconciliation of cash

In [67]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)

metrics = [
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_correctness,
]

results = evaluate(response_dataset, metrics)
results_df = results.to_pandas()
results_df

Evaluating:   0%|          | 0/70 [00:00<?, ?it/s]No statements were generated from the answer.
Evaluating:  39%|███▊      | 27/70 [00:11<00:11,  3.65it/s]No statements were generated from the answer.
Evaluating: 100%|██████████| 70/70 [00:32<00:00,  2.18it/s]


Unnamed: 0,question,answer,contexts,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,What was the total value of 'Cash and cash equ...,The total value of 'Cash and cash equivalents'...,[Table of Contents\n \n \nFair Value Measureme...,$41.86B,0.0,1.0,1.0,0.866667,0.710426
1,"Who are Meta's 'Directors' (i.e., members of t...",The information required to answer this questi...,[on the Nasdaq Global Select Market under the ...,"The Directors of Meta Platforms, Inc. listed o...",,0.0,1.0,0.926667,0.174213
2,How does the use of a common identifier help i...,"Based on the context provided, the use of a co...",[therefore must seek to attribute multiple use...,,1.0,0.834123,1.0,0.0,0.178565
3,How is net income used in the computation of d...,Net income for the computation of diluted EPS ...,"[the calculation of diluted EPS, net income fo...",The net income is used as the numerator in the...,1.0,0.947057,1.0,1.0,0.823252
4,How does tax effects from share-based compensa...,"Based on the context provided, the tax effects...",[the tax rate without such benefits. In future...,The tax effects from share-based compensation ...,1.0,0.994948,1.0,0.833333,0.535367
5,How is earnings per share computed using the t...,Earnings per share is computed using the two-c...,[Table of Contents\nNote 4. Earnings per Share...,Earnings per share (EPS) using the two-class m...,1.0,0.92742,1.0,1.0,0.617972
6,How does a decrease in user engagement affect ...,A decrease in user engagement could render our...,"[reduce our ability to effectively target ads,...",A decrease in user engagement can adversely af...,1.0,0.94374,1.0,0.966667,0.738696
7,What is the role of a PCAOB-registered public ...,"Based on the context provided, the role of a P...",[to express an opinion on the Company's intern...,The role of a PCAOB-registered public accounti...,1.0,1.0,1.0,1.0,0.619468
8,How are foreign currency transaction gains and...,Foreign currency transaction gains and losses ...,[losses are recorded in accumulated other comp...,Foreign currency transaction gains and losses ...,1.0,0.990729,1.0,0.97619,0.749497
9,How does acting in the best interests of stock...,This question cannot be answered based on the ...,[fiduciary duty to our stockholders and must a...,"Mr. Zuckerberg, as a stockholder, is entitled ...",,0.0,1.0,1.0,0.185185


In [68]:
results_df.to_csv("advanced.csv", index=False)

In [69]:
baseline = pd.read_csv("baseline.csv")
advanced = pd.read_csv("advanced.csv")
# merge the two dataframes
merged = pd.merge(baseline, advanced, on="question", suffixes=("_baseline", "_advanced"))

# from the baseline, get columns named "faithfulness", "answer_relevancy", "context_recall", "context_precision", "answer_correctness"
metrics = ["faithfulness", "answer_relevancy", "context_recall", "context_precision", "answer_correctness"]
baseline_columns = [f"{metric}_baseline" for metric in metrics]
# from the advanced, get columns named "faithfulness", "answer_relevancy", "context_recall", "context_precision", "answer_correctness"
advanced_columns = [f"{metric}_advanced" for metric in metrics]

# convert columns to rows in baseline_columns
delta = pd.DataFrame()
for metric in metrics:
    delta[metric] = merged[f"{metric}_advanced"] - merged[f"{metric}_baseline"]
delta.mean()

# calculate the difference between the advanced and baseline





faithfulness         -0.083333
answer_relevancy     -0.122468
context_recall        0.000000
context_precision    -0.018844
answer_correctness   -0.084354
dtype: float64