In [None]:
!pip install pypdf pypdf[full] pymupdf llama-index-core llama-parse llama-index-readers-file langchain-huggingface marker-pdf

### Pdf Parsing

In [9]:
from pypdf import PdfReader
reader = PdfReader("report.pdf")
page = reader.pages[3]
print(page.extract_text())

Table of Contents
PART I
Forward-Looking Statements
This Annual Report on Form 10-K contains forward-looking statements within the meaning of the federal securities laws. These forward-looking
statements include, but are not limited to, statements regarding: our core strategy; our ability to improve our content offerings and service; our future financial
performance, including expectations regarding revenues, deferred revenue, operating income and margin, net income, expenses, and profitability; liquidity,
including the sufficiency of our capital resources, net cash provided by (used in) operating activities, access to financing sources, and free cash flows; capital
allocation strategies, including any stock repurchases or repurchase programs; seasonality; stock price volatility; impact of foreign exchange rate fluctuations,
including on net income, revenues and average revenues per paying member; impact of interest rate fluctuations; adequacy of existing facilities; future
regulatory 

In [11]:
documents_naive = []
for i in range(len(reader.pages)):
    doc = reader.pages[i].extract_text(extraction_mode="layout")
    documents_naive.append(doc)

In [16]:
for i in range(len(documents_naive)): print(documents_naive[i]) #documents



Table of Contents



                                                                                  NETFLIX, INC.
                                                                             TABLE OF CONTENTS


                                                                                                                                                                                 Page
PART I
Item 1.          Business                                                                                                                                                              1
Item 1A.         Risk Factors                                                                                                                                                          4
Item 1B.         Unresolved Staff Comments                                                                                                                                            16
Item 1C.         Cybersecurity          

In [None]:
# extract text in a fixed width format that closely adheres to the rendered
# layout in the source pdf
print(page.extract_text(extraction_mode="layout"))

In [None]:
import fitz 
import io
from PIL import Image

# Open the PDF file
pdf_document = "attention.pdf"
pdf = fitz.open(pdf_document)

# Iterate through each page
for page_number in range(len(pdf)):
    page = pdf.load_page(page_number)
    images = page.get_images(full=True)

    # Iterate through each image on the page
    for img_index, img in enumerate(images):
        xref = img[0]
        base_image = pdf.extract_image(xref)
        image_bytes = base_image["image"]
        image_ext = base_image["ext"]
        image = Image.open(io.BytesIO(image_bytes))

        # Save the image
        image_filename = f"page_{page_number + 1}_image_{img_index + 1}.{image_ext}"
        image.save(open(image_filename, "wb"))

        print(f"Saved image: {image_filename}")

pdf.close()


In [1]:
# llama parser using llamacloud api to extract content from pdf
# 1000 pages free usage per day
# llama index
from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader
# bring in our LLAMA_CLOUD_API_KEY
from dotenv import load_dotenv
import asyncio
import os
import nest_asyncio
nest_asyncio.apply()
load_dotenv()

# set up parser
parser = LlamaParse(
    result_type="markdown"  # "markdown" and "text" are available
)
# use SimpleDirectoryReader to parse our file
file_extractor = {".pdf": parser}
documents = SimpleDirectoryReader(input_files=['report.pdf'], file_extractor=file_extractor).load_data()
print(documents)


Started parsing the file under job_id 673cbe05-8fb1-42ff-881b-ac2146c8147d
[Document(id_='15c19e2f-f5a3-4999-867b-b9fe205d0b5f', embedding=None, metadata={'file_path': 'report.pdf', 'file_name': 'report.pdf', 'file_type': 'application/pdf', 'file_size': 1073470, 'creation_date': '2024-12-19', 'last_modified_date': '2024-12-19'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='# UNITED STATES SECURITIES AND EXCHANGE COMMISSION\n\n# Washington, D.C. 20549\n\n# FORM 10-K\n\n# (Mark One)\n\n☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\n\nFor the fiscal year ended December 31, 2023\n\nOR\n\n☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\n\nFor the transition period from     

In [7]:
for i in range(len(documents)):
    print(documents[i].text)
    print("----")
    print(documents[i].metadata)
    print("----")
    print(f"page number: {i+1}")

# UNITED STATES SECURITIES AND EXCHANGE COMMISSION

# Washington, D.C. 20549

# FORM 10-K

# (Mark One)

☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934

For the fiscal year ended December 31, 2023

OR

☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934

For the transition period from                     to

# Commission File Number: 001-35727

# Netflix, Inc.

(Exact name of registrant as specified in its charter)

# Delaware

(State or other jurisdiction of incorporation or organization)

77-0467272

(I.R.S. Employer Identification No.)

# Title of each class

Common stock, par value $0.001 per share

121 Albright Way, Los Gatos, California 95032

(Address and zip code of principal executive offices)

(408) 540-3700

(Registrant’s telephone number, including area code)

# Securities registered pursuant to Section 12(b) of the Act:

|Trading Symbol(s)|Name of each exchange on which registered|
|---|---|
|NF

In [32]:
# marker downloads llm and uses it to extract content from pdf
# it is free to use for non-commercial purpose
from marker.converters.pdf import PdfConverter
from marker.renderers.markdown import MarkdownOutput
from marker.models import create_model_dict
from marker.output import text_from_rendered
from marker.config.parser import ConfigParser

config = {
    "output_format":"markdown",
}
config_parser = ConfigParser(config)

converter = PdfConverter(
    config= config_parser.generate_config_dict(),
    artifact_dict = create_model_dict(),
    processor_list = config_parser.get_processors(),
    renderer = config_parser.get_renderer()
)

rendered = converter("attention.pdf")
text,_,images = text_from_rendered(rendered)


Loaded layout model datalab-to/surya_layout0 on device cuda with dtype torch.float16
Loaded texify model to cuda with torch.float16 dtype
Loaded recognition model vikp/surya_rec2 on device cuda with dtype torch.float16
Loaded table recognition model vikp/surya_tablerec on device cuda with dtype torch.float16
Loaded detection model vikp/surya_det3 on device cuda with dtype torch.float16


Recognizing layout: 100%|██████████| 3/3 [00:28<00:00,  9.40s/it]
Detecting bboxes: 0it [00:00, ?it/s]
Recognizing equations: 100%|██████████| 1/1 [00:03<00:00,  3.91s/it]
Recognizing tables: 100%|██████████| 1/1 [00:02<00:00,  2.77s/it]


In [None]:
print(text)

In [None]:
rendered.metadata

In [None]:
rendered.metadata['table_of_contents'][0]['title']

In [None]:
import pdfplumber
import pandas as pd
pdf = pdfplumber.open("report.pdf")
for i in range(len(pdf.pages)):
    page = pdf.pages[i]
    table = page.extract_table()
    if table is not None:
        text = page.extract_text(keep_blank_chars=True)
        print(table)
        print("----")
        print(text)


In [None]:
p0 = pdf.pages[43]
im = p0.to_image()
im

In [None]:
im.reset().debug_tablefinder()

In [None]:
text = p0.extract_text(keep_blank_chars=True,)
print(text)

### Retrieval with fixed size chunking - pypdf

In [17]:
from langchain_openai import ChatOpenAI
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings


# Initialize OpenAI models
llm = ChatOpenAI(model="gpt-4o")
embedding_model = HuggingFaceEmbeddings()
# embedding_model = OpenAIEmbeddings()
# Load and preprocess documents

# Split documents into chunks for vector storage
text_splitter = CharacterTextSplitter(chunk_size=1024, chunk_overlap=20)
docs = text_splitter.create_documents(documents_naive)

# Create a vector database using FAISS
vector_db = FAISS.from_documents(docs, embedding_model)

# Create a conversation chain with retrieval capabilities
retrieval_chain = ConversationalRetrievalChain.from_llm(llm, vector_db.as_retriever())

  embedding_model = HuggingFaceEmbeddings()
Created a chunk of size 1484, which is longer than the specified 1024
Created a chunk of size 1672, which is longer than the specified 1024
Created a chunk of size 2495, which is longer than the specified 1024
Created a chunk of size 1163, which is longer than the specified 1024
Created a chunk of size 1406, which is longer than the specified 1024
Created a chunk of size 3323, which is longer than the specified 1024
Created a chunk of size 1529, which is longer than the specified 1024
Created a chunk of size 1426, which is longer than the specified 1024
Created a chunk of size 2624, which is longer than the specified 1024
Created a chunk of size 1957, which is longer than the specified 1024
Created a chunk of size 1834, which is longer than the specified 1024
Created a chunk of size 2976, which is longer than the specified 1024
Created a chunk of size 1794, which is longer than the specified 1024
Created a chunk of size 1672, which is longer 

In [18]:
from pprint import pprint
response = retrieval_chain.invoke({"question": "how many Cash and cash equivalents in Decemeber 31 2023", "chat_history": []})
pprint(response['answer'])

("I don't have the specific amount for cash and cash equivalents as of "
 'December 31, 2023.')


In [None]:
response = retrieval_chain.invoke({"question": "Property and equipment and accumulated depreciation in Decemeber 31 2023", "chat_history": []})
pprint(response['answer'])

### Retrieval with page chunking - llama parser

In [19]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import TextLoader
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from langchain.docstore.document import Document
from pprint import pprint
load_dotenv()

llm = ChatOpenAI(model_name="gpt-4o")

embedding_model = HuggingFaceEmbeddings()

# Create a vector database using FAISS
docs = [Document(page_content=documents[i].text,metadata=documents[i].metadata) for i in range(len(documents))]
vector_db = FAISS.from_documents(docs, embedding_model)

# Create a conversation chain with retrieval capabilities
retrieval_chain_llama = ConversationalRetrievalChain.from_llm(llm, vector_db.as_retriever())

  embedding_model = HuggingFaceEmbeddings()


In [6]:
response = retrieval_chain_llama.invoke({"question": "how many Cash and cash equivalents in Decemeber 31 2023", "chat_history": []})
pprint(response['answer'])

'As of December 31, 2023, the cash and cash equivalents amounted to $5,986,629.'


In [None]:
response = retrieval_chain_llama.invoke({"question": "What is English constituency parsing Results for Zhu et al. (2013)", "chat_history": []})
pprint(response['answer'])

### Retrieval with markdown - marker

In [27]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import TextLoader
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from langchain.docstore.document import Document
from pprint import pprint
load_dotenv()

llm = ChatOpenAI(model_name="gpt-4o")
embedding_model = HuggingFaceEmbeddings()
# Create a vector database using FAISS
splited_text = text.split("**")
docs = [Document(page_content=splited_text[i]) for i in range(len(splited_text))]
vector_db = FAISS.from_documents(docs, embedding_model)

# Create a conversation chain with retrieval capabilities
retrieval_chain_maker = ConversationalRetrievalChain.from_llm(llm, vector_db.as_retriever())

  embedding_model = HuggingFaceEmbeddings()


In [28]:
splited_text

['### ',
 'UNITED STATES SECURITIES',
 ' AND ',
 'EXCHANGE COMMISSION Washington, D.C. 20549 _____________________________________________________________________',
 '\n\n### ',
 'FORM 10-K _____________________________________________________________________',
 '\n\n#### ',
 '(Mark One)',
 '\n\n☒ ',
 'ANNUAL REPORT PURSUANT',
 ' TO ',
 'SECTION',
 ' 13 OR ',
 '15(d)',
 ' OF THE ',
 'SECURITIES EXCHANGE',
 ' ACT OF ',
 '1934',
 '\n\nFor the ',
 'fiscal year ended December',
 ' 31, ',
 '2023',
 ' OR\n\n### ☐ ',
 'TRANSITION REPORT PURSUANT',
 ' TO ',
 'SECTION',
 ' 13 OR ',
 '15(d)',
 ' OF THE ',
 'SECURITIES EXCHANGE',
 ' ACT OF ',
 '1934',
 '\n\nFor the ',
 'transition period from',
 ' to\n\n',
 'Commission File Number: 001-35727 _____________________________________________________________________',
 '\n\n# ',
 'Netflix, Inc.',
 '\n\n(Exact name of registrant as specified in its charter) ',
 '_____________________________________________________________________',
 '\n\n(State or othe

In [29]:
response = retrieval_chain_maker.invoke({"question": "how many Cash and cash equivalents in Decemeber 31 2023", "chat_history": []})
pprint(response['answer'])

('As of December 31, 2023, the cash, cash equivalents, and restricted cash '
 'amounted to $7,118,515,000.')


In [None]:
response = retrieval_chain_maker.invoke({"question": "What is English constituency parsing Results for Zhu et al. (2013)", "chat_history": []})
pprint(response['answer'])

### Retrieval with map reduce

In [33]:
from langchain.chains import (
    StuffDocumentsChain, LLMChain, ReduceDocumentsChain
)
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

# This controls how each document will be formatted. Specifically,
# it will be passed to `format_document` - see that function for more
# details.
document_prompt = PromptTemplate(
    input_variables=["page_content"],
     template="{page_content}"
)
document_variable_name = "context"
llm = ChatOpenAI(model_name="gpt-4o")
# The prompt here should take as an input variable the
# `document_variable_name`
prompt = PromptTemplate.from_template(
    "Summarize this content: {context}"
)
llm_chain = LLMChain(llm=llm, prompt=prompt)
combine_documents_chain = StuffDocumentsChain(
    llm_chain=llm_chain,
    document_prompt=document_prompt,
    document_variable_name=document_variable_name
)
chain = ReduceDocumentsChain(
    combine_documents_chain=combine_documents_chain,
)
# If we wanted to, we could also pass in collapse_documents_chain
# which is specifically aimed at collapsing documents BEFORE
# the final call.
prompt = PromptTemplate.from_template(
    "Collapse this content: {context}"
)
llm_chain = LLMChain(llm=llm, prompt=prompt)
collapse_documents_chain = StuffDocumentsChain(
    llm_chain=llm_chain,
    document_prompt=document_prompt,
    document_variable_name=document_variable_name
)
chain = ReduceDocumentsChain(
    combine_documents_chain=combine_documents_chain,
    collapse_documents_chain=collapse_documents_chain,
)

In [34]:
result = chain.run(question="What is English constituency parsing Results for Zhu et al. (2013)", input_documents=docs)
pprint(result)

("The Netflix 2023 Annual Report Summary highlights the company's strong "
 'financial performance and strategic focus. Netflix, a global entertainment '
 'service with over 260 million subscribers across 190 countries, continues to '
 'prioritize global expansion and user experience enhancement through quality '
 'content and improved interfaces. Despite facing intense competition from '
 'other video providers, gaming, and social media, Netflix increased its '
 'revenue by 7% to $33.7 billion, with an operating income rise of 23% to $7.0 '
 'billion and a 21% operating margin. Membership growth contributed to a 7% '
 'increase in streaming revenues.\n'
 '\n'
 'The company faces several key risks, such as competition, regulatory '
 'changes, intellectual property issues, cybersecurity threats, and financial '
 'risks. Its strategy involves investing in original content and strategic '
 'initiatives while expanding its share repurchase program. Netflix reported a '
 'significant increa

In [None]:
result = chain.run(question="What is the score for Transformer (base model)", input_documents=docs)
pprint(result)