In [1]:
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(),override=True)
from langchain.chains import LLMChain, HypotheticalDocumentEmbedder
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [2]:
# loading PDF, DOCX and TXT files as LangChain Documents
def load_document(file):
    """Load a PDF, DOCX or TXT file through the matching LangChain loader.

    The loader is chosen from the file extension. The extension is compared
    case-insensitively, so ``REPORT.PDF`` is handled the same as ``report.pdf``.

    Args:
        file: Path of the file to load.

    Returns:
        Whatever the selected loader's ``load()`` returns, or ``None`` when
        the extension is not one of ``.pdf``, ``.docx`` or ``.txt``.
    """
    import os

    # Normalise the case so upper-case extensions are not rejected as unsupported.
    extension = os.path.splitext(file)[1].lower()

    # Each loader class is imported inside the branch that uses it.
    if extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader
        loader_cls = PyPDFLoader
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader
        loader_cls = Docx2txtLoader
    elif extension == '.txt':
        from langchain.document_loaders import TextLoader
        loader_cls = TextLoader
    else:
        print('Document format is not supported!')
        return None

    # Announce every supported format, including .txt, for consistent feedback.
    print(f'Loading {file}')
    loader = loader_cls(file)
    return loader.load()


# wikipedia
def load_from_wikipedia(query, lang='en', load_max_docs=2):
    """Load Wikipedia content for a query via LangChain's ``WikipediaLoader``.

    Args:
        query: Search text handed to the loader.
        lang: Language code handed to the loader (default ``'en'``).
        load_max_docs: Value forwarded as the loader's ``load_max_docs``
            argument (default 2).

    Returns:
        Whatever ``WikipediaLoader.load()`` returns.
    """
    from langchain.document_loaders import WikipediaLoader

    return WikipediaLoader(
        query=query,
        lang=lang,
        load_max_docs=load_max_docs,
    ).load()

In [3]:
def chunk_data(data, chunk_size=7500, chunk_overlap=100):
    """Split documents into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        data: Documents passed to the splitter's ``split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size`` (default 7500).
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``
            (default 100).

    Returns:
        The result of ``split_documents(data)``.
    """
    splitter_options = {'chunk_size': chunk_size, 'chunk_overlap': chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(data)

In [4]:
def print_embedding_cost(texts):
    """Print the token count of ``texts`` and the matching embedding cost.

    Tokens are counted with the tiktoken encoding for
    ``text-embedding-3-small``; the cost is computed at 0.00002 USD per
    1000 tokens.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.

    Returns:
        None. The two summary lines are written to stdout.
    """
    import tiktoken

    encoder = tiktoken.encoding_for_model('text-embedding-3-small')

    total_tokens = 0
    for page in texts:
        total_tokens += len(encoder.encode(page.page_content))

    cost_usd = total_tokens / 1000 * 0.00002
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {cost_usd:.6f}')

In [20]:
def insert_or_fetch_embeddings(index_name, chunks):
    """Return a Pinecone-backed vector store for ``index_name``.

    If the index already exists it is opened as-is and ``chunks`` is not used.
    Otherwise a serverless index (AWS, us-east-1, cosine metric) is created
    and ``chunks`` are embedded with ``text-embedding-3-small`` and inserted.

    The Pinecone API key is read from the ``PINECONE_API_KEY`` environment
    variable and ``PINECONE_ENV`` is forwarded as ``environment``.

    Args:
        index_name: Name of the Pinecone index to open or create.
        chunks: Documents to embed when the index has to be created.

    Returns:
        The ``PineconeVectorStore`` built by ``from_existing_index`` or
        ``from_documents``.
    """
    from langchain_openai import OpenAIEmbeddings
    from langchain_pinecone import PineconeVectorStore
    from pinecone import Pinecone, ServerlessSpec

    # One value drives both the embedding size and the index dimension, so the
    # two cannot drift apart.
    embedding_dimensions = 1536  # 512 works as well

    pc = Pinecone(
        api_key=os.environ.get("PINECONE_API_KEY"),
        environment=os.environ.get('PINECONE_ENV'),
    )
    embeddings = OpenAIEmbeddings(
        model='text-embedding-3-small',
        dimensions=embedding_dimensions,
    )

    if index_name in pc.list_indexes().names():
        # Existing index: only attach to it, nothing is re-embedded.
        print(f'Index {index_name} already exists. Loading embeddings ... ', end='')
        vector_store = PineconeVectorStore.from_existing_index(index_name, embeddings)
        print('Ok')
    else:
        print(f'Creating index {index_name} and embeddings ...', end='')

        pc.create_index(
            name=index_name,
            dimension=embedding_dimensions,
            metric='cosine',
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        )

        # Embed the chunks and insert them into the freshly created index.
        vector_store = PineconeVectorStore.from_documents(chunks, embeddings, index_name=index_name)
        print('Ok')

    return vector_store

In [10]:
def delete_pinecone_index(index_name='all'):
    """Delete one Pinecone index, or every index when ``index_name`` is ``'all'``.

    A ``pinecone.Pinecone()`` client is created with no explicit arguments.

    Args:
        index_name: Name of the index to delete. The default ``'all'``
            deletes every index returned by ``list_indexes().names()``.

    Returns:
        None. Progress messages are written to stdout.
    """
    import pinecone

    client = pinecone.Pinecone()

    # Guard clause: a specific index was requested.
    if index_name != 'all':
        print(f'Deleting index {index_name} ...', end='')
        client.delete_index(index_name)
        print('Ok')
        return

    targets = client.list_indexes().names()
    print('Deleting all indexes ... ')
    for target in targets:
        client.delete_index(target)
    print('Ok')
    

In [23]:
def ask_and_get_answer(vector_store, q, k=3):
    """Answer question ``q`` with a ``RetrievalQA`` chain over ``vector_store``.

    The chain uses ``ChatOpenAI`` (model ``gpt-4``, temperature 0), the
    ``"stuff"`` chain type, and a similarity retriever configured with
    ``search_kwargs={'k': k}``.

    Args:
        vector_store: Object exposing ``as_retriever``.
        q: The question passed to the chain's ``invoke``.
        k: Value placed under ``'k'`` in the retriever's ``search_kwargs``
            (default 3).

    Returns:
        Whatever ``chain.invoke(q)`` returns; in this notebook's runs that is
        a dict with ``'query'`` and ``'result'`` keys.
    """
    from langchain.chains import RetrievalQA
    from langchain_openai import ChatOpenAI

    chat_model = ChatOpenAI(model='gpt-4', temperature=0)
    top_k_retriever = vector_store.as_retriever(
        search_type='similarity',
        search_kwargs=dict(k=k),
    )
    qa_chain = RetrievalQA.from_chain_type(
        llm=chat_model,
        chain_type="stuff",
        retriever=top_k_retriever,
    )
    return qa_chain.invoke(q)

In [22]:
# Load the PDF into LangChain documents
data = load_document('sample.pdf')

# Split the loaded documents into chunks (default chunk_size and chunk_overlap)
chunks = chunk_data(data)

# Open the Pinecone index named 'sample', or create it and embed the chunks into it
# when it does not exist yet (embedding model: text-embedding-3-small)
vector_store = insert_or_fetch_embeddings('sample',chunks)

Loading sample.pdf
Creating index sample and embeddings ...Ok


In [24]:
# Ask a general question about the indexed document and print the full response
q ='what is the document all about'
answer = ask_and_get_answer(vector_store, q)
print(answer)

{'query': 'what is the document all about', 'result': "The document is a detailed financial report of Infosys Limited and its subsidiaries. It includes the Condensed Consolidated Financial Statements under Indian Accounting Standards (Ind AS) for the three months and year ended March 31, 2024. The report contains various financial statements such as the balance sheet, statement of profit and loss, statement of changes in equity, and statement of cash flows. It also includes an overview of the company and notes to the interim condensed consolidated financial statements, which provide additional details and explanations about the financial statements. The document also discusses the company's accounting policies, basis of consolidation, use of estimates and judgments, and other financial information such as business combinations, investments, loans, assets, liabilities, income taxes, revenue, expenses, and leases. It also includes information about legal proceedings and commitments."}


In [25]:
# Print only the answer text stored under the 'result' key of the response
print(answer['result'])


The document is a detailed financial report of Infosys Limited and its subsidiaries. It includes the Condensed Consolidated Financial Statements under Indian Accounting Standards (Ind AS) for the three months and year ended March 31, 2024. The report contains various financial statements such as the balance sheet, statement of profit and loss, statement of changes in equity, and statement of cash flows. It also includes an overview of the company and notes to the interim condensed consolidated financial statements, which provide additional details and explanations about the financial statements. The document also discusses the company's accounting policies, basis of consolidation, use of estimates and judgments, and other financial information such as business combinations, investments, loans, assets, liabilities, income taxes, revenue, expenses, and leases. It also includes information about legal proceedings and commitments.


In [28]:
# Ask a follow-up question against the same Pinecone-backed vector store.
# NOTE(review): an earlier draft loaded a Chroma store here (db = load_embeddings_chroma());
# that helper is not defined in the cells shown, so the call stays disabled.
q = 'give total Total equity and liabilities of all year'
answer = ask_and_get_answer(vector_store, q)
print(answer)

{'query': 'give total Total equity and liabilities of all year', 'result': 'The document provided does not contain information on the total equity and liabilities for any year.'}


In [28]:
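# NOTE: disabled helper, kept for reference only. It relies on PyPDF2, which is
# not imported in the cells shown here; import it before re-enabling this code.
# def extract_pdf_text(pdf_path):
#     text = ""
#     with open(pdf_path, 'rb') as file:
#         pdf_reader = PyPDF2.PdfReader(file)
#         for page in pdf_reader.pages:
#             text += page.extract_text()
#     return text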

Enter the path to the PDF file:  tsla-20231231-gen.pdf


Extracting text from PDF...
Splitting text into chunks...
Creating embeddings and vector store...


  embeddings = OpenAIEmbeddings(openai_api_key=api_key)
  llm=ChatOpenAI(model="gpt-4", temperature= 0),



PDF QnA system is ready! Type your questions below.



Q:  what is the document all about


A: The document appears to be an Annual Report on Form 10-K, which is a comprehensive report of a company's performance that must be submitted annually to the U.S. Securities and Exchange Commission. It includes information about the company's mission, products, services, and financial performance. The document also discusses the company's governance, including anti-takeover provisions, the rights and powers of the board of directors, and the terms of the company's convertible senior notes. It also includes sections on executive compensation, security ownership, and relationships and transactions.

[Source]: No source...

[Source]: No source...

[Source]: No source...



Q:  give annual report


A: I'm sorry, but as an AI, I don't have the ability to provide the full annual report. The information provided only includes some parts of the report, such as the responsibility of the registrant's certifying officer and the overview and highlights for 2023. For the full annual report, you would need to refer to the original document or source.

[Source]: No source...

[Source]: No source...

[Source]: No source...



Q:  give the financial report of 2022


A: The text provides some information about the financial report of 2022, but it doesn't provide a comprehensive overview. Here's what we know:

- An impairment loss of $204 million was recorded, as well as realized gains of $64 million in connection with converting holdings of digital assets into fiat currency.
- Other expenses of $36 million were recorded during the second quarter of the year ended December 31, 2022, related to employee terminations.
- Interest income for the year 2022 was $297 million.

For a complete financial report of 2022, you should refer to the Annual Report on Form 10-K for fiscal year 2022, which was filed with the Securities and Exchange Commission on January 31, 2023.

[Source]: No source...

[Source]: No source...

[Source]: No source...



Q:  exit


Exiting the QnA system. Goodbye!
