In [18]:
import os
import chromadb
import pickle
import pytesseract as pt
from dotenv import load_dotenv
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.question_answering import load_qa_chain
from langchain.chains.summarize import load_summarize_chain

# Load variables from a local .env file into the process environment so the
# API key does not have to be hard-coded in the notebook.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Shared OpenAI clients; ChatDoc reads these module-level objects directly.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
# temperature=0.7 means completions are sampled, so re-running a query cell
# can produce a different answer than the recorded output.
llm = OpenAI(temperature=0.7, openai_api_key=OPENAI_API_KEY)

# Location of the Tesseract executable (presumably needed for OCR while
# parsing PDFs -- confirm). Set TESSERACT_CMD to override it on machines where
# Tesseract is installed elsewhere; the default is the original Windows path.
pt.pytesseract.tesseract_cmd = os.getenv(
    "TESSERACT_CMD", "C:\\Program Files\\Tesseract-OCR\\tesseract.exe"
)

In [20]:
class ChatDoc:
    """Question answering over a single PDF backed by a persisted Chroma index.

    On construction the instance either reopens an existing Chroma collection
    stored under ``<data_dir>/chroma/<fileName>`` or builds one from
    ``<data_dir>/<fileName>.pdf``, caching the parsed documents in
    ``<data_dir>/<fileName>.pkl`` along the way.

    The class relies on the module-level ``embeddings`` and ``llm`` objects.

    Args:
        fileName: Base name of the document, without directory or extension.
            It is also used as the Chroma collection name.
        data_dir: Directory that holds the PDF, the pickle cache and the
            ``chroma`` sub-directory. Defaults to ``"../data"``, the location
            that used to be hard-coded.
    """

    def __init__(self, fileName: str, data_dir: str = "../data"):
        self.fileName = fileName
        self.data_dir = data_dir
        # These are only populated when the index is (re)built; they stay
        # None when an already persisted index is reopened by init().
        self.data = None
        self.text_splitter = None
        self.texts = None
        self.init()

    def _pdf_path(self) -> str:
        """Return the path of the source PDF."""
        return f"{self.data_dir}/{self.fileName}.pdf"

    def _pkl_path(self) -> str:
        """Return the path of the pickle cache holding the parsed documents."""
        return f"{self.data_dir}/{self.fileName}.pkl"

    def _chroma_dir(self) -> str:
        """Return the directory in which the Chroma collection is persisted."""
        return f"{self.data_dir}/chroma/{self.fileName}"

    def load_from_pdf(self):
        """Parse the PDF into ``self.data`` and cache the result as a pickle."""
        loader = UnstructuredPDFLoader(self._pdf_path())
        self.data = loader.load()
        self.save_to_pkl()

    def load_from_pkl(self):
        """Load previously parsed documents from the pickle cache into ``self.data``.

        SECURITY: ``pickle.load`` can execute arbitrary code. Only use this
        with cache files written by ``save_to_pkl`` on a trusted machine.
        """
        with open(self._pkl_path(), "rb") as f:
            self.data = pickle.load(f)

    def save_to_pkl(self):
        """Write ``self.data`` to the pickle cache, overwriting any existing file."""
        with open(self._pkl_path(), "wb") as f:
            pickle.dump(self.data, f)

    def init(self):
        """Create ``self.docsearch``, reusing a persisted index when one exists.

        The only check for an existing index is that its directory exists; the
        contents of that directory are not validated here.
        """
        chroma_dir = self._chroma_dir()

        if os.path.exists(chroma_dir):
            # Fast path: reopen the persisted collection without re-reading
            # the document or re-embedding anything.
            self.docsearch = Chroma(
                persist_directory=chroma_dir,
                embedding_function=embeddings,
                collection_name=self.fileName,
            )
            return

        # Prefer the pickle cache over re-parsing the PDF.
        if os.path.exists(self._pkl_path()):
            self.load_from_pkl()
        else:
            self.load_from_pdf()

        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=0
        )
        self.texts = self.text_splitter.split_documents(self.data)

        # Only the chunk text is indexed; document metadata is not stored.
        self.docsearch = Chroma.from_texts(
            [t.page_content for t in self.texts],
            embeddings,
            collection_name=self.fileName,
            persist_directory=chroma_dir,
        )

        self.docsearch.persist()

    def query(self, q: str):
        """Answer ``q`` using the chunks most similar to it.

        The LLM is given only the chunks returned by the similarity search
        plus the question, so broad questions (e.g. "summarize the document")
        are answered from that subset rather than from the whole document.

        Args:
            q: The question to ask.

        Returns:
            The chain's answer as a string with surrounding whitespace removed.
        """
        # NOTE(review): include_metadata is forwarded as an extra keyword
        # argument; confirm the Chroma store actually makes use of it.
        docs = self.docsearch.similarity_search(q, include_metadata=True)
        chain = load_qa_chain(llm, chain_type="stuff")
        return str(chain.run(input_documents=docs, question=q)).strip()

    # TODO: add a summarize() method built on load_summarize_chain
    # (chain_type="map_reduce"); the earlier commented-out draft was incomplete.


In [9]:
# Build (or reopen) the index for "deepfake": ChatDoc reuses
# ../data/chroma/deepfake when that directory exists, otherwise it reads
# ../data/deepfake.pkl or ../data/deepfake.pdf and embeds the chunks.
deepfake = ChatDoc("deepfake")

Using embedded DuckDB with persistence: data will be stored in: ../data/chroma/deepfake


In [12]:
# query() only hands the LLM the chunks returned by the similarity search, so
# this "summary" reflects those chunks, not necessarily the whole PDF.
deepfake.query("Summarize the document?")

'This document discusses potential solutions to the problem of deepfakes, which are AI-generated images and videos that can be used to spread false information. One solution is C2PA, which cryptographically signs any content created by a device and documents who captured the image, where, and when. Other options include fingerprinting and watermarking images and videos. AI tools have been proposed to spot deepfakes, but this technology is not reliable enough to keep up with the constantly changing generative AI tools. Another potential solution is to develop an instant fact-checker for social media users, which would quickly inform users of the veracity of a piece of content.'

In [21]:
# Build (or reopen) the index for "asml": ChatDoc reuses ../data/chroma/asml
# when that directory exists, otherwise it reads ../data/asml.pkl or
# ../data/asml.pdf and embeds the chunks.
asml = ChatDoc("asml")

Using embedded DuckDB with persistence: data will be stored in: ../data/chroma/asml


In [4]:
# Spot check: a single-fact question whose answer is easy to verify by hand.
asml.query("Who is the author of the book?")

'The author of the book is René Raaijmakers.'

In [7]:
# Spot check: founding year (the milestone questions further down use the
# same year as the start of their range).
asml.query("When was ASML founded")

'ASML was founded in 1984.'

In [8]:
# Baseline phrasing of the technology question; later cells re-ask it for
# simpler audiences to compare how the answer changes.
asml.query("What is the technology used by ASML?")

'ASML uses lithography technology, electric tables, superior alignment technology, Zeiss lenses, and digital controllers.'

In [9]:
# Name lookup; the recorded answer lists three names without further context.
asml.query("Who were the first engineers at ASML?")

'The first engineers at ASML were Werf, Willekens, and Wittekoek.'

In [11]:
# NOTE(review): the recorded answer (PAS 5500) conflicts with the milestones
# output further down, which mentions PAS 2000 steppers in 1984 and the
# PAS 5500 only in 1991 -- treat this answer as unreliable.
asml.query("What is the name of the first machine developed by ASML?")

'The first machine developed by ASML is the PAS 5500.'

In [12]:
# Spot check: another single-fact question that is easy to verify by hand.
asml.query("Where is the headquarter of ASML?")

'The headquarters of ASML is located in Veldhoven, Netherlands.'

In [14]:
# The recorded answer is a bare name rather than a sentence: query() returns
# the chain output as-is (stripped), so the answer format is not constrained.
asml.query("Who is the first CEO of ASML?")

'Gjalt Smit'

In [15]:
# Open-ended question: the LLM is only given the chunks retrieved for this
# wording, so the list is presumably not exhaustive for the whole book.
asml.query("What were the challenges faced by ASML?")

'ASML faced challenges with their dependence on Zeiss, their production struggles, their operational expenses, and their difficulty in finding potential customers.'

In [17]:
# Same topic as the earlier technology question, asking for simpler wording.
asml.query("Explain the technology used by ASML with simple words.")

"ASML uses a combination of a research team from Philips, a dedicated product development team, and a precision alignment system to create machines that can print very small structures on wafers at a very high speed. The electric table, superior alignment technology, and Zeiss lenses are the technological foundation of ASML's machines."

In [23]:
# Same topic again, with the target audience stated in the question itself;
# the style change comes purely from the prompt wording.
asml.query("Explain the technology used by ASML as if I was 6 years old.")

'ASML makes machines that help make tiny parts for computers. The machines use a special electric table and lenses from a company called Zeiss to make the parts quickly and accurately.'

In [24]:
# NOTE(review): the recorded answer includes a 1996 item, which is outside the
# requested 1984-1995 range; the date filter lives only in the prompt text.
asml.query("What are the key milestone of ASML between 1984 and 1995?")

'In 1984, ASML launched a marketing campaign for their PAS 2000 steppers. In 1985, they launched a second campaign in the business press and started selling PAS 2500 steppers. They also set up a demo lab in Phoenix and a sales office in 1985. In 1991, the PAS 5500 was introduced and the machine started running reliably a year later. In 1992, their annual revenue jumped from $81 million to $119 million. In 1993, their annual revenue increased by 50%, and they turned a profit of $11 million. In 1996, they introduced their DUV version which can image quarter-micron details.'

In [25]:
# NOTE(review): the recorded output uses bare integer keys ({1984: ...}),
# which is not valid JSON, and again contains a 1996 entry. Validate with
# json.loads before relying on the result.
asml.query("What are the key milestone of ASML between 1984 and 1995? Print the result as a JSON file")

'{1984: "ASML\'s first business plan is a political document.", 1986: "ASML and IBM start talking about step-and-scan.", 1992: "In 1992 the company’s annual revenue jumped from $81 million to $119 million.", 1993: "In early 1993 everything’s largely under control; ASML has a reasonably reliable machine.", 1996: "At the start of 1996 ASML is financially independent at last."}'

In [26]:
# The result is a plain string, so the cell shows its repr with literal "\n";
# wrap the call in print(...) to see the bullets on separate lines.
# NOTE(review): the milestones listed differ from the two previous cells even
# though the underlying question is the same (the llm uses temperature=0.7 and
# the retrieval query text differs).
asml.query("What are the key milestone of ASML between 1984 and 1995? Print the result as bullet list in markdown format")

'- In 1984, ASML sets up a demo lab in Phoenix\n- In 1985, ASML launches two marketing campaigns and sets up a sales office\n- In 1986, ASML turns a profit and sets up a production facility in Silicon Valley\n- In 1988, ASML sets up a production facility and a second demo lab in Japan\n- In 1989, ASML pays off all its debt\n- By the end of 1988, ASML is among the top three global suppliers of optical wafer steppers'

In [27]:
# As with any query(), only the chunks returned by the similarity search reach
# the LLM, so this is a summary of those chunks -- the recorded answer focuses
# on two people rather than on the document as a whole.
asml.query("Provide a summary of the document in 3 sentences")

'This document details the experiences of Wim Troost and Gjalt Smit in their efforts to run their companies. It covers the struggles they faced and their successes, as well as the strategies they used to help their businesses succeed. It also talks about the impact of other people on their plans, such as the Hay consultant hired by Smit to take stock of the situation.'