In [None]:
import asyncio
import os
import time

import langchain
import nest_asyncio
# bring in our LLAMA_CLOUD_API_KEY
from dotenv import load_dotenv
from langchain import PromptTemplate
from langchain.cache import InMemoryCache
from langchain.callbacks import StdOutCallbackHandler
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.embeddings import CacheBackedEmbeddings,HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.retrievers import BM25Retriever,EnsembleRetriever
from langchain.storage import LocalFileStore
from langchain.vectorstores import FAISS
from llama_index.core import SimpleDirectoryReader
from llama_parse import LlamaParse
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# Load variables from a local .env file into the process environment
# (presumably where LLAMA_CLOUD_API_KEY lives — see the import comment above).
load_dotenv()
# Patch asyncio via nest_asyncio; presumably required because the notebook
# kernel already runs an event loop — TODO confirm.
nest_asyncio.apply()


In [None]:
# set up llama parser
parser = LlamaParse(
    result_type="markdown"  # "markdown" and "text" are available
)
# use SimpleDirectoryReader to parse our file; every .pdf is routed through LlamaParse
file_extractor = {".pdf": parser}
documents = SimpleDirectoryReader(input_files=['files/report.pdf'], file_extractor=file_extractor).load_data()

# Fail here with a clear message instead of later inside the vector-store build.
if not documents:
    raise ValueError("No documents were parsed from 'files/report.pdf'")

# Show a compact summary rather than dumping every parsed document into the output.
print(f"Parsed {len(documents)} document(s)")
print(documents[0].text[:500])

In [None]:
# create vector db
# Convert each parsed document into a LangChain Document, carrying its text and metadata over.
docs = [
    Document(page_content=parsed.text, metadata=parsed.metadata)
    for parsed in documents
]

# Embedding model wrapped with a file-backed byte store under ./cache/,
# namespaced by the model id.
embed_model_id = 'BAAI/bge-small-en-v1.5'
store = LocalFileStore("./cache/")
core_embeddings_model = HuggingFaceEmbeddings(model_name=embed_model_id)
embedder = CacheBackedEmbeddings.from_bytes_store(
    core_embeddings_model,
    store,
    namespace=embed_model_id,
)

# Create VectorStore
vectorstore = FAISS.from_documents(docs, embedder)

In [None]:
# Embed a sample question and look up its 5 nearest entries in the FAISS store.
query = "what are risk factors?"
embedding_vector = core_embeddings_model.embed_query(query)
print(len(embedding_vector))  # length of the query embedding

docs_resp = vectorstore.similarity_search_by_vector(embedding_vector, k=5)

# Print each hit followed by blank lines as a visual separator.
for hit in docs_resp:
    print(hit.page_content, end="\n\n\n")

In [5]:
# Check to see how much time CacheBackedEmbeddings pattern saves
# NOTE(review): the query is embedded with core_embeddings_model rather than the
# cache-backed `embedder`, so this timing covers the un-wrapped model — confirm
# that this is the comparison you intend.
query = "what are risk factors?"
#
# Measure the wall-clock time of the embedding + search so the cell actually
# reports a duration.
started_at = time.perf_counter()
embedding_vector = core_embeddings_model.embed_query(query)
docs_resp = vectorstore.similarity_search_by_vector(embedding_vector,k=5)
elapsed_s = time.perf_counter() - started_at
print(f"Query embedding + similarity search took {elapsed_s:.3f} s")

In [6]:
# Setup Ensemble Retriever (Hybrid Search)
# Vector-store retriever over the FAISS index, returning 5 hits per query.
faiss_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

# BM25 retriever built from the same documents, also limited to 5 hits.
bm25_retriever = BM25Retriever.from_documents(docs)
bm25_retriever.k = 5

# Combine both retrievers with equal weights.
ensemble_retriever = EnsembleRetriever(
    weights=[0.5, 0.5],
    retrievers=[bm25_retriever, faiss_retriever],
)

In [None]:
# Download the quantized GPTQ Model
model_name_or_path = "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"
# To use a different branch, change revision
# For example: revision="gptq-8bit-32g-actorder_True"
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    device_map="auto",
    # Do not execute Python code shipped inside the model repository.
    # NOTE(review): assumes the installed transformers release supports this
    # architecture natively, so no remote code is required — confirm.
    trust_remote_code=False,
    revision="gptq-4bit-32g-actorder_True",
)
#
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

In [None]:
# Install LangChain's in-memory LLM cache (it is cleared in the notebook's last cell).
langchain.llm_cache = InMemoryCache()

# Text-generation pipeline around the already-loaded model and tokenizer.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=1024,
    do_sample=True,
    temperature=0.1,
    top_p=0.95,
    top_k=40,
    repetition_penalty=1.1
)

# HuggingFacePipeline is already imported in the first cell, so no re-import is needed here.
llm = HuggingFacePipeline(pipeline=pipe)

In [17]:
# prompt template
# The text ends with the 'Helpful answer:' marker that process_output() later splits on.
PROMPT_TEMPLATE = """
You are my financial advisor. You are great at providing analyze on financial reports with your knowledge in finances.
With the information being provided try to answer the question. 
If you cant answer the question based on the information either say you cant find an answer or unable to find an answer.
So try to understand in depth about the context and answer only based on the information provided. Dont generate irrelevant answers

Context: {context}
Question: {question}
Do provide only helpful answers

Helpful answer:
"""

# Placeholders that are filled in when the prompt is rendered.
input_variables = ["context", "question"]

custom_prompt = PromptTemplate(
    input_variables=input_variables,
    template=PROMPT_TEMPLATE,
)

In [18]:
# Retrieval chain — without Hybrid Search
# Uses only the FAISS vector-store retriever (k=5) together with the custom prompt.
handler = StdOutCallbackHandler()
qa_with_sources_chain = RetrievalQA.from_chain_type(
    retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
    llm=llm,
    chain_type="stuff",
    chain_type_kwargs={"prompt": custom_prompt},
    return_source_documents=True,
    callbacks=[handler],
    verbose=True,
)

In [45]:
def process_output(text):
    """Return the text that follows the last 'Helpful answer:' marker.

    Surrounding whitespace is stripped from the result. When the marker does
    not occur in ``text``, the whole string is returned, stripped.
    """
    _, _, answer = text.rpartition('Helpful answer:')
    return answer.strip()

In [52]:
# Ask the FAISS-only chain, then print the cleaned answer and the first returned source chunk.
query = "what is the Revenue for 2023?"
response = qa_with_sources_chain(dict(query=query))

answer = process_output(response["result"])
print(f"Response generated : \n {answer}")
print("####################################################")
first_source = response["source_documents"][0]
print(f"Source Documents : \n {first_source.page_content}")



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Response generated : 
 The revenue for 2023 was $13.8 billion.
####################################################
Source Documents : 
 # Table of Contents

# Revenue Recognition

presented net of the taxes that are collected from members and remitted to governmental authorities. The Company is the principal in all its relationships where partners, including consumer electronics ("CE") manufacturers, multichannel video programming distributors ("MVPDs"), mobile operators and internet service providers ("ISPs"), provide access to the service as the Company retains control over service delivery to its members. In circumstances in which the price that the member pays is established by a partner and there is no standalone price for the Netflix service (for instance, in a bundle), the net amount collected from the partner is recognized as revenue.

The Company also earns revenue from advertisements presented on its st

In [48]:
#Setup Retrieval chain — with Hybrid Search
# Same chain type and prompt as the FAISS-only chain, but retrieval goes through the ensemble retriever.
handler = StdOutCallbackHandler()
qa_with_sources_chain_hb = RetrievalQA.from_chain_type(
    retriever=ensemble_retriever,
    llm=llm,
    chain_type="stuff",
    chain_type_kwargs={"prompt": custom_prompt},
    return_source_documents=True,
    callbacks=[handler],
)

In [49]:
%%time
query = "what is the Revenue for 2022?"
response = qa_with_sources_chain_hb({"query":query})
print(f"Response generated : \n {process_output(response['result'])}")
print('####################################################')
print(f"Source Documents : \n {response['source_documents'][0].page_content}")



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Response generated : 
 The revenue for 2022 was $11.0 billion.
####################################################
Source Documents : 
 # Table of Contents

# Revenue Recognition

presented net of the taxes that are collected from members and remitted to governmental authorities. The Company is the principal in all its relationships where partners, including consumer electronics ("CE") manufacturers, multichannel video programming distributors ("MVPDs"), mobile operators and internet service providers ("ISPs"), provide access to the service as the Company retains control over service delivery to its members. In circumstances in which the price that the member pays is established by a partner and there is no standalone price for the Netflix service (for instance, in a bundle), the net amount collected from the partner is recognized as revenue.

The Company also earns revenue from advertisements presented on its st

In [50]:
%%time
query = "what is the Revenue on the financial year 2022 from UCAN?"
response = qa_with_sources_chain_hb({"query":query})
print(f"Response generated : \n {process_output(response['result'])}")
print('####################################################')
print(f"Source Documents : \n {response['source_documents'][0].page_content}")



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Response generated : 
 The table shows that the revenue generated from UCAN in 2022 was $1,000,000.
####################################################
Source Documents : 
 # Table of Contents

# Interest expense for the year ended December 31, 2023

Interest expense for the year ended December 31, 2023 consisted primarily of $698 million of interest on our Notes. Interest expense for the year ended December 31, 2023 as compared to the year ended December 31, 2022 remained relatively flat.

# Interest and Other Income (Expense)

Interest and other income (expense) consists primarily of foreign exchange gains and losses on foreign currency denominated balances and interest earned on cash, cash equivalents and short-term investments.

|Year Ended December 31,|2023|2022|2021|2023 vs. 2022|
|---|---|---|---|---|
|Interest and other income (expense)|$ (48,772)|$ 337,310|$ 411,214|$ (386,082) (114%)|
|As a percentage o

In [55]:
# Open-ended question for the hybrid-search chain; prints the cleaned answer
# and the first returned source chunk.
query = "tell me about NETFLIX"
response = qa_with_sources_chain_hb(dict(query=query))

answer = process_output(response["result"])
print(f"Response generated : \n {answer}")
print("####################################################")
first_source = response["source_documents"][0]
print(f"Source Documents : \n {first_source.page_content}")

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.




[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Response generated : 
 Netflix is a subscription-based streaming service that offers a wide range of movies, television shows, documentaries, and other original content. The company was founded in 2002 by Reed Hastings and has since grown to become one of the largest streaming platforms in the world, with millions of subscribers in more than 190 countries. Netflix produces and distributes its own original content, as well as licensing content from other studios and networks. The company generates revenue primarily through monthly subscription fees, with additional revenue coming from advertising and merchandise sales.
####################################################
Source Documents : 
 # Table of Contents

teams. We believe this focus helps our employees grow as leaders and well-rounded individuals, and better positions Netflix to operate our global business of providing compelling content to entertain the wo

In [54]:
# Clear the LLM cache object assigned to langchain.llm_cache earlier in the notebook.
langchain.llm_cache.clear()