In [1]:
import os
import chromadb
import pickle
import pytesseract as pt
from dotenv import load_dotenv
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.question_answering import load_qa_chain
from langchain.chains.summarize import load_summarize_chain

# Read configuration from a local .env file (if present) so that secrets such
# as the OpenAI key never have to be written into the notebook itself.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Shared clients used by ChatDoc further down. temperature=0 is passed to the
# LLM wrapper so completions are as repeatable as the API allows.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)

# Location of the Tesseract executable used for OCR. It can be overridden with
# the TESSERACT_CMD environment variable (settable in .env) so the notebook is
# not tied to one machine; the default is the original Windows install path.
pt.pytesseract.tesseract_cmd = os.getenv(
    "TESSERACT_CMD", "C:\\Program Files\\Tesseract-OCR\\tesseract.exe"
)

In [2]:
class ChatDoc:
    """Question answering and summarisation over a single PDF document.

    The document ``<data_dir>/<fileName>.pdf`` is parsed once and the parsed
    result is cached as ``<data_dir>/<fileName>.pkl``. Its chunks are embedded
    into a Chroma collection persisted under ``<data_dir>/chroma/<fileName>``;
    when that directory already exists the collection is reopened instead of
    being rebuilt.

    Uses the module-level ``embeddings`` and ``llm`` objects.
    """

    def __init__(self, fileName: str, data_dir: str = "../data"):
        """Create the helper and build or reopen its vector index.

        Args:
            fileName: Base name of the document, without directory or extension.
            data_dir: Directory holding the PDF, the pickle cache and the
                ``chroma`` sub-directory. Defaults to the original ``../data``.
        """
        self.fileName = fileName
        self.data_dir = data_dir
        self.data = None
        self.texts = None
        self.init()

    # --- path helpers: a single place that knows the on-disk layout -------

    def _pdf_path(self) -> str:
        """Path of the source PDF."""
        return f"{self.data_dir}/{self.fileName}.pdf"

    def _pkl_path(self) -> str:
        """Path of the pickle cache holding the parsed document."""
        return f"{self.data_dir}/{self.fileName}.pkl"

    def _chroma_dir(self) -> str:
        """Directory in which this document's Chroma collection is persisted."""
        return f"{self.data_dir}/chroma/{self.fileName}"

    # --- loading ----------------------------------------------------------

    def load_from_pdf(self):
        """Parse the PDF into ``self.data`` and write the pickle cache."""
        loader = UnstructuredPDFLoader(self._pdf_path())
        self.data = loader.load()
        self.save_to_pkl()

    def load_from_pkl(self):
        """Load ``self.data`` from the pickle cache."""
        # SECURITY: pickle.load can execute arbitrary code. Only load cache
        # files that were produced by save_to_pkl() on a trusted machine.
        with open(self._pkl_path(), "rb") as f:
            self.data = pickle.load(f)

    def save_to_pkl(self):
        """Write ``self.data`` to the pickle cache."""
        with open(self._pkl_path(), "wb") as f:
            pickle.dump(self.data, f)

    def _ensure_texts(self):
        """Return the document chunks, loading and splitting them on first use.

        Needed because init() skips loading when the persisted index already
        exists, yet summarize() still requires the chunks.
        """
        if getattr(self, "texts", None) is None:
            if os.path.exists(self._pkl_path()):
                self.load_from_pkl()
            else:
                self.load_from_pdf()

            self.text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000, chunk_overlap=0
            )
            self.texts = self.text_splitter.split_documents(self.data)
        return self.texts

    # --- index ------------------------------------------------------------

    def init(self):
        """Open the persisted Chroma collection, or build and persist it."""
        chroma_dir = self._chroma_dir()

        if os.path.exists(chroma_dir):
            # Reuse the stored embeddings; the document text itself is loaded
            # lazily by _ensure_texts() only if summarize() is called.
            self.docsearch = Chroma(
                persist_directory=chroma_dir,
                embedding_function=embeddings,
                collection_name=self.fileName,
            )
            return

        texts = self._ensure_texts()
        self.docsearch = Chroma.from_texts(
            [t.page_content for t in texts],
            embeddings,
            collection_name=self.fileName,
            persist_directory=chroma_dir,
        )
        self.docsearch.persist()

    # --- public operations ------------------------------------------------

    def query(self, q: str):
        """Answer ``q`` from the chunks most similar to it.

        Returns:
            The chain's answer converted to ``str`` and stripped of
            surrounding whitespace.
        """
        # NOTE(review): include_metadata is forwarded as an extra keyword;
        # confirm the installed Chroma wrapper actually makes use of it.
        docs = self.docsearch.similarity_search(q, include_metadata=True)
        chain = load_qa_chain(llm, chain_type="stuff")
        return str(chain.run(input_documents=docs, question=q)).strip()

    def summarize(self, max_chunks: int = 5):
        """Summarise the first ``max_chunks`` chunks of the document.

        Works whether the index was freshly built or reopened from disk.

        Args:
            max_chunks: Number of leading chunks passed to the map-reduce
                summarisation chain. Defaults to the original value of 5.

        Returns:
            The summary converted to ``str`` and stripped of surrounding
            whitespace.
        """
        chain = load_summarize_chain(llm, chain_type="map_reduce")
        return str(chain.run(self._ensure_texts()[:max_chunks])).strip()


In [9]:
# Build (or reopen) the helper for the document whose base name is "deepfake".
deepfake = ChatDoc(fileName="deepfake")

Using embedded DuckDB with persistence: data will be stored in: ../data/chroma/deepfake


In [12]:
# Ask the question-answering chain about the document; the answer is the cell output.
deepfake.query(q="Summarize the document?")

'This document discusses potential solutions to the problem of deepfakes, which are AI-generated images and videos that can be used to spread false information. One solution is C2PA, which cryptographically signs any content created by a device and documents who captured the image, where, and when. Other options include fingerprinting and watermarking images and videos. AI tools have been proposed to spot deepfakes, but this technology is not reliable enough to keep up with the constantly changing generative AI tools. Another potential solution is to develop an instant fact-checker for social media users, which would quickly inform users of the veracity of a piece of content.'