In [1]:
import chromadb
import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from Template import css, bot_template, user_template
from langchain_community.chat_models import ChatOllama
from langchain_chroma import Chroma

In [14]:
import glob
import os

# Folder holding the source PDFs, relative to the notebook's working directory.
path = "./documents"

# Discover the PDFs WITHOUT os.chdir(): changing the kernel's working directory
# would silently redirect every later relative path in the notebook and would
# make this cell fail when it is run a second time.
# Entries in pdf_docs are paths that include `path`; sorted() makes the
# processing order deterministic.
pdf_docs = sorted(glob.glob(os.path.join(path, "*.pdf")))
for file in pdf_docs:
    print(os.path.basename(file))

# Extract the text of every page of every PDF into one string.
# Page texts are collected in a list and joined once instead of growing a
# string with +=; `or ""` guards against a page that yields no text.
page_texts = []
for pdf in pdf_docs:
    pdf_reader = PdfReader(pdf)
    for page in pdf_reader.pages:
        page_texts.append(page.extract_text() or "")
text = "".join(page_texts)

goog-10-k-2023 (1).pdf
tsla-20231231-gen.pdf
uber-10-k-2023.pdf


In [2]:
import os
from langchain_community.document_loaders import PyPDFLoader

# Directory containing the PDF files.
# Set the PDF_DIRECTORY environment variable to point at another folder; the
# original machine-specific location is kept only as the fallback so existing
# runs keep working.
pdf_directory = os.environ.get(
    "PDF_DIRECTORY",
    r"C:\Users\Aman Yunus Badure\Desktop\alemeno\documents",
)

# Get a list of all PDF files in the directory.
# - the extension check is case-insensitive so "REPORT.PDF" is not skipped
# - directories that merely end in ".pdf" are ignored
# - sorted() gives a stable order, so indices into all_pages are reproducible
pdf_files = sorted(
    f
    for f in os.listdir(pdf_directory)
    if f.lower().endswith('.pdf')
    and os.path.isfile(os.path.join(pdf_directory, f))
)

# Accumulates the documents loaded from all PDFs
all_pages = []

# Iterate through the list of PDF files and load them
for pdf_file in pdf_files:
    # Create the full file path
    pdf_path = os.path.join(pdf_directory, pdf_file)

    # Load and split the PDF.
    # NOTE(review): load_and_split() runs the loader's default text splitter,
    # so an entry may be a piece of a page rather than a whole page - confirm
    # whether loader.load() was intended.
    loader = PyPDFLoader(pdf_path)
    pages = loader.load_and_split()

    # Append this file's documents to the combined list
    all_pages.extend(pages)

# Now all_pages contains the documents from all the PDFs
print(f"Loaded {len(all_pages)} pages from {len(pdf_files)} PDFs.")

Loaded 562 pages from 3 PDFs.


In [17]:
# Spot-check: display the extracted text of the entry at index 20 of all_pages.
all_pages[20].page_content

'that are seen to be, substantially similar to or better than ours. These technologies could reduce usage of our products \nand services, and force us to compete in different ways and expend significant resources to develop and operate equal \nor better products and services. Competitors’ success in providing compelling products and services or in attracting \nand retaining users, advertisers, publishers, customers, and content providers could harm our financial condition and \noperating results. \nOur ongoing investment in new businesses, products, services, and technologies  is inherently risky, and \ncould divert management attention and harm our business, financial condition, and operating results. \nWe have invested and expect to continue to invest in new businesses, products, services,  and technologies in a \nwide range of industries beyond online advertising. The investments that we are making across our businesses, such \nas building AI capabilities into new and existing produ

In [3]:
# Settings for the character-based splitter: break on newlines, build chunks
# of up to 1000 characters (measured with len) that overlap by 200 characters.
splitter_options = {
    "separator": "\n",
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "length_function": len,
}
text_splitter = CharacterTextSplitter(**splitter_options)

# Split every loaded document into chunks for embedding.
text_chunks = text_splitter.split_documents(all_pages)

In [13]:
from sentence_transformers import SentenceTransformer
import os

# Local directory where the embedding model is cached.
modelPath = "./embedding_model"

# Download the model only when no local copy exists yet; otherwise reuse the
# saved copy so re-running the notebook neither re-downloads nor re-saves it.
# NOTE(review): only the directory's existence is checked - delete the folder
# to force a fresh download if a previous save was interrupted.
if os.path.isdir(modelPath):
    model = SentenceTransformer(modelPath)
else:
    model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
    model.save(modelPath)

# Reuse the already-loaded model instead of loading the same weights a second
# time from disk.
# NOTE(review): this is a bare SentenceTransformer; the SentenceEmbeddings
# wrapper defined elsewhere in this notebook rebinds `embeddings` later.
embeddings = model

  from tqdm.autonotebook import tqdm, trange


In [20]:
from sentence_transformers import SentenceTransformer
from typing import List


class SentenceEmbeddings:
    """Adapter exposing a SentenceTransformer via ``embed_documents`` / ``embed_query``.

    Both methods return plain Python lists of floats rather than arrays.
    """

    def __init__(self, modelPath):
        """Load the SentenceTransformer model located at ``modelPath``.

        Args:
            modelPath: Path or model identifier passed to ``SentenceTransformer``.
        """
        self.modelPath = modelPath
        self.model = SentenceTransformer(self.modelPath)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed several texts in one batched ``encode`` call.

        Args:
            texts: The strings to embed.

        Returns:
            One embedding (list of floats) per input text, in input order.
            An empty input yields an empty list.
        """
        texts = list(texts)
        if not texts:
            # Nothing to encode; skip the model call entirely.
            return []
        # Passing the whole list lets the model batch the work instead of
        # running one forward pass per text.
        return self.model.encode(texts).tolist()

    def embed_query(self, texts: str) -> List[float]:
        """Embed a single query string.

        Args:
            texts: The query text (one string, despite the plural name, which
                is kept for backward compatibility with keyword callers).

        Returns:
            The embedding of the query as a list of floats.
        """
        return self.embed_documents([texts])[0]

# Location of the embedding model; resolved relative to the kernel's current
# working directory.
modelPath = "./embedding_model"
# Wrap the model in SentenceEmbeddings, which provides the embed_documents /
# embed_query methods.
embeddings = SentenceEmbeddings(modelPath)

In [9]:
# Open (or create) the on-disk Chroma database and the collection for the PDFs.
client = chromadb.PersistentClient(path="./vectorDB")
collection = client.get_or_create_collection(name="PDF_Docs_2")

# Attach the LangChain vector store to the SAME client instead of letting it
# open a second client on the same directory.
db = Chroma(
    client=client,
    collection_name="PDF_Docs_2",
    embedding_function=embeddings,
)

# Index the chunks only when the collection is empty. The store is persistent,
# so adding unconditionally would append a full duplicate set of chunks every
# time this cell is re-run.
# NOTE(review): if the source PDFs or the chunking change, delete the
# collection (or ./vectorDB) to force a rebuild.
if collection.count() == 0:
    db.add_documents(text_chunks)


In [10]:
# Conversation memory: keeps the running history under the 'chat_history' key
# as message objects.
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# Chat model served by Ollama.
llm = ChatOllama(model="llama3")

# Retriever over the vector store, used to fetch context for each question.
retriever = db.as_retriever()

# Combine model, retriever and memory into a conversational retrieval chain.
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

In [11]:
# Query through invoke(): calling the chain object directly is deprecated in
# current LangChain releases. Only "question" is passed; the chain's memory
# supplies the chat history.
conversation_chain.invoke({"question": "What is the total revenue for Google Search??"})

{'question': 'What is the total revenue for Google Search??',
 'chat_history': [HumanMessage(content='What is the total revenue for Google Search??'),
  AIMessage(content='According to the provided information, the total revenue for Google Search & other properties is:\n\nYear Ended December 31,\n2022: $162,450 million\n2023: $175,033 million\n\nSo, the total revenue increase from 2022 to 2023 is $12.6 billion ($175,033 - $162,450).')],
 'answer': 'According to the provided information, the total revenue for Google Search & other properties is:\n\nYear Ended December 31,\n2022: $162,450 million\n2023: $175,033 million\n\nSo, the total revenue increase from 2022 to 2023 is $12.6 billion ($175,033 - $162,450).'}

In [12]:
# Query through invoke(): calling the chain object directly is deprecated in
# current LangChain releases. Only "question" is passed; the chain's memory
# supplies the chat history.
conversation_chain.invoke({"question": "What are the risk factors associated with Google and Tesla?"})

{'question': 'What are the risk factors associated with Google and Tesla?',
 'chat_history': [HumanMessage(content='What is the total revenue for Google Search??'),
  AIMessage(content='According to the provided information, the total revenue for Google Search & other properties is:\n\nYear Ended December 31,\n2022: $162,450 million\n2023: $175,033 million\n\nSo, the total revenue increase from 2022 to 2023 is $12.6 billion ($175,033 - $162,450).'),
  HumanMessage(content='What are the risk factors associated with Google and Tesla?'),
  AIMessage(content="I don't know the answer because the provided text is not related to Google, but rather it appears to be a risk factor disclosure for Tesla's securities filings. If you're looking for information on risk factors associated with Google (Alphabet Inc.), I can try to find that information for you!")],
 'answer': "I don't know the answer because the provided text is not related to Google, but rather it appears to be a risk factor disclosur

In [14]:
# Query through invoke(): calling the chain object directly is deprecated in
# current LangChain releases. Only "question" is passed; the chain's memory
# supplies the chat history.
conversation_chain.invoke({"question": "What are the risk factors associated with Tesla?"})

{'question': 'What are the risk factors associated with Tesla?',
 'chat_history': [HumanMessage(content='What is the total revenue for Google Search??'),
  AIMessage(content='According to the provided information, the total revenue for Google Search & other properties is:\n\nYear Ended December 31,\n2022: $162,450 million\n2023: $175,033 million\n\nSo, the total revenue increase from 2022 to 2023 is $12.6 billion ($175,033 - $162,450).'),
  HumanMessage(content='What are the risk factors associated with Google and Tesla?'),
  AIMessage(content="I don't know the answer because the provided text is not related to Google, but rather it appears to be a risk factor disclosure for Tesla's securities filings. If you're looking for information on risk factors associated with Google (Alphabet Inc.), I can try to find that information for you!"),
  HumanMessage(content='Yes I am looking for Google (Alphabet Inc.) and Tesla, Now answer the question :What are the risk factors associated with Googl

In [15]:
# Query through invoke(): calling the chain object directly is deprecated in
# current LangChain releases. Only "question" is passed; the chain's memory
# supplies the chat history.
conversation_chain.invoke({"question": "What are the risk factors associated with Google that is Alphabet Inc. ?"})

{'question': 'What are the risk factors associated with Google that is Alphabet Inc. ?',
 'chat_history': [HumanMessage(content='What is the total revenue for Google Search??'),
  AIMessage(content='According to the provided information, the total revenue for Google Search & other properties is:\n\nYear Ended December 31,\n2022: $162,450 million\n2023: $175,033 million\n\nSo, the total revenue increase from 2022 to 2023 is $12.6 billion ($175,033 - $162,450).'),
  HumanMessage(content='What are the risk factors associated with Google and Tesla?'),
  AIMessage(content="I don't know the answer because the provided text is not related to Google, but rather it appears to be a risk factor disclosure for Tesla's securities filings. If you're looking for information on risk factors associated with Google (Alphabet Inc.), I can try to find that information for you!"),
  HumanMessage(content='Yes I am looking for Google (Alphabet Inc.) and Tesla, Now answer the question :What are the risk facto

In [18]:
# Query through invoke(): calling the chain object directly is deprecated in
# current LangChain releases. Only "question" is passed; the chain's memory
# supplies the chat history.
conversation_chain.invoke({"question": "What are the differences in the business of Tesla and Uber?"})

{'question': 'What are the differences in the business of Tesla and Uber?',
 'chat_history': [HumanMessage(content='What is the total revenue for Google Search??'),
  AIMessage(content='According to the provided information, the total revenue for Google Search & other properties is:\n\nYear Ended December 31,\n2022: $162,450 million\n2023: $175,033 million\n\nSo, the total revenue increase from 2022 to 2023 is $12.6 billion ($175,033 - $162,450).'),
  HumanMessage(content='What are the risk factors associated with Google and Tesla?'),
  AIMessage(content="I don't know the answer because the provided text is not related to Google, but rather it appears to be a risk factor disclosure for Tesla's securities filings. If you're looking for information on risk factors associated with Google (Alphabet Inc.), I can try to find that information for you!"),
  HumanMessage(content='Yes I am looking for Google (Alphabet Inc.) and Tesla, Now answer the question :What are the risk factors associated