In [3]:
import os
import chromadb
import pickle
import pytesseract as pt
from dotenv import load_dotenv
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.question_answering import load_qa_chain
from langchain.chains.summarize import load_summarize_chain

# Load variables from a local .env file into the process environment so the
# API key never has to be written into the notebook itself.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Module-level clients shared by every ChatDoc instance defined in this notebook.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)

# Location of the Tesseract binary. Resolution order:
#   1. the TESSERACT_CMD environment variable (can be set in .env),
#   2. the standard Windows install path, but only if it actually exists,
#   3. otherwise leave pytesseract's own default untouched so a `tesseract`
#      executable on PATH keeps working on other machines.
_WINDOWS_TESSERACT_CMD = "C:\\Program Files\\Tesseract-OCR\\tesseract.exe"
_tesseract_cmd = os.getenv("TESSERACT_CMD")
if not _tesseract_cmd and os.path.exists(_WINDOWS_TESSERACT_CMD):
    _tesseract_cmd = _WINDOWS_TESSERACT_CMD
if _tesseract_cmd:
    pt.pytesseract.tesseract_cmd = _tesseract_cmd

In [16]:
class ChatDoc:
    """Question answering and summarisation over one PDF document.

    On first use the PDF ``<data_dir>/<fileName>.pdf`` is loaded, cached as a
    pickle next to it, split into chunks and embedded into a Chroma collection
    persisted under ``<data_dir>/chroma/<fileName>``. On later uses the
    persisted collection is reopened and nothing is re-embedded.

    Args:
        fileName: Base name of the document, without directory or extension.
        data_dir: Directory holding the PDF, the pickle cache and the
            ``chroma`` sub-directory. Defaults to ``"../data"``.

    Attributes set by this class:
        docsearch: The Chroma vector store for this document.
        data: The loaded documents, or ``None`` until they are needed.
        texts: The split chunks, or ``None`` until they are needed.
        text_splitter: The splitter used to produce ``texts`` (set together
            with ``texts``).
    """

    def __init__(self, fileName: str, data_dir: str = "../data"):
        self.fileName = fileName
        self.data_dir = data_dir
        # Populated lazily: reopening a persisted index does not need them.
        self.data = None
        self.texts = None
        self.init()

    # -- path helpers (single source of truth for the on-disk layout) --------

    def _pdf_path(self) -> str:
        """Return the path of the source PDF."""
        return f"{self.data_dir}/{self.fileName}.pdf"

    def _pkl_path(self) -> str:
        """Return the path of the pickle cache of the loaded documents."""
        return f"{self.data_dir}/{self.fileName}.pkl"

    def _chroma_dir(self) -> str:
        """Return the persistence directory of this document's collection."""
        return f"{self.data_dir}/chroma/{self.fileName}"

    # -- document loading -----------------------------------------------------

    def load_from_pdf(self):
        """Load the PDF into ``self.data`` and write the pickle cache."""
        loader = UnstructuredPDFLoader(self._pdf_path())
        self.data = loader.load()
        self.save_to_pkl()

    def load_from_pkl(self):
        """Load ``self.data`` from the pickle cache.

        NOTE(review): ``pickle.load`` can execute arbitrary code. This is only
        safe because the file is one this class wrote itself; do not point
        ``data_dir`` at a directory containing pickles from untrusted sources.
        """
        with open(self._pkl_path(), "rb") as f:
            self.data = pickle.load(f)

    def save_to_pkl(self):
        """Write ``self.data`` to the pickle cache."""
        with open(self._pkl_path(), "wb") as f:
            pickle.dump(self.data, f)

    def _ensure_texts(self):
        """Return the document chunks, loading and splitting them if needed.

        Prefers the pickle cache and falls back to parsing the PDF. The result
        is kept on ``self.texts`` so the work happens at most once per
        instance.
        """
        if self.texts is None:
            if os.path.exists(self._pkl_path()):
                self.load_from_pkl()
            else:
                self.load_from_pdf()

            self.text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000, chunk_overlap=0
            )
            self.texts = self.text_splitter.split_documents(self.data)
        return self.texts

    # -- index ---------------------------------------------------------------

    def init(self):
        """Open the persisted Chroma collection, or build and persist it."""
        chroma_dir = self._chroma_dir()

        if os.path.exists(chroma_dir):
            # Fast path: reuse the stored embeddings. The chunks are NOT loaded
            # here; summarize() fetches them on demand via _ensure_texts().
            self.docsearch = Chroma(
                persist_directory=chroma_dir,
                embedding_function=embeddings,
                collection_name=self.fileName,
            )
            return

        chunks = self._ensure_texts()
        self.docsearch = Chroma.from_texts(
            [t.page_content for t in chunks],
            embeddings,
            collection_name=self.fileName,
            persist_directory=chroma_dir,
        )
        self.docsearch.persist()

    # -- public API ----------------------------------------------------------

    def query(self, q: str):
        """Answer ``q`` using the chunks most similar to it.

        The retrieved chunks are passed to a "stuff" QA chain together with
        the question. Returns the chain's answer as a stripped string.
        """
        # NOTE(review): `include_metadata` does not look like a Chroma
        # similarity_search parameter (possibly a leftover from another
        # vector store) — confirm it has any effect before relying on it.
        docs = self.docsearch.similarity_search(q, include_metadata=True)
        chain = load_qa_chain(llm, chain_type="stuff")
        return str(chain.run(input_documents=docs, question=q)).strip()

    def summarize(self, max_chunks: int = 5):
        """Summarise the beginning of the document with a map-reduce chain.

        Args:
            max_chunks: Number of leading chunks to summarise. Defaults to 5.

        Returns:
            The summary as a stripped string.

        Works whether the index was freshly built or reopened from disk; in
        the latter case the chunks are loaded on first call.
        """
        chunks = self._ensure_texts()
        chain = load_summarize_chain(llm, chain_type="map_reduce")
        return str(chain.run(chunks[:max_chunks])).strip()


In [21]:
# Build the index for ../data/deepfake.pdf, or reopen it if ../data/chroma/deepfake already exists.
deepfake = ChatDoc("deepfake")

Using embedded DuckDB with persistence: data will be stored in: ../data/chroma/deepfake


In [17]:
# Build the index for ../data/lecture_notes.pdf, or reopen it if ../data/chroma/lecture_notes already exists.
lecture_notes = ChatDoc("lecture_notes")

Using embedded DuckDB with persistence: data will be stored in: ../data/chroma/lecture_notes


In [13]:
# Map-reduce summary of the first five chunks only, so it reflects the opening of the document.
lecture_notes.summarize()

'Computer Networks and Security (2IC60) is a course taught by T. Ozcelebi and J.I. den Hartog. This document provides a collection of course material, including lecture slides, lecture notes, suggested reading, and homework exercises. It covers topics such as push behind networks, standards and regulations, physical infrastructure, the Internet, the Internet of Things, network security goals, threats, security engineering, network protocols, protocol layers, network protocol stacks, services of a protocol layer, and performance.'

In [31]:
# Ask a conceptual course question; only the chunks retrieved by similarity
# search for this text are passed to the LLM.
lecture_notes.query("""
How does the average transmission delay affect the average queuing delay?
Consider busy routers with a lot of traffic and explain. (Not asking for a
mathematical relation.)
""")


'The average transmission delay affects the average queuing delay by increasing the amount of time it takes for packets to be transmitted through the router. This can lead to an increase in the amount of time packets spend in the queue, as the router is unable to process them as quickly as they arrive. This can be especially noticeable in busy routers with a lot of traffic, as the transmission delay can cause the queue to become congested, leading to an increase in the average queuing delay.'

In [22]:
# Generic prompt: the answer is built from the chunks retrieved for this query string, via the QA chain.
deepfake.query("Give me a summary of this article")

"This article discusses how AI is becoming more accessible and how it could be used to distort reality and affect the 2024 presidential election. It mentions how AI-generated audio, images, and videos of candidates could become a major threat and blur the line between what's real and what's fake."