In [None]:
#!pip install pypdf
#!pip install langchain
#!pip install -U sentence-transformers
#!pip install chromadb
#!pip install GPT4All

import os
import sys
import pandas as pd
from langchain_community.document_loaders import PyPDFLoader
import pypdf
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.llms import GPT4All
import logging
import sys

# Send INFO-and-above log records to stdout (so they appear in the cell output)
# using a "timestamp - level - message" layout.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


In [None]:
# Boolean flag read from the INIT_INDEX environment variable; True only when the
# value is "true" (case-insensitive).
# NOTE(review): no code visible in this notebook consults the flag -- confirm
# whether the index build is meant to be guarded by it.
INIT_INDEX = os.getenv('INIT_INDEX', 'false').lower() == 'true'

# Directory where the vector index is persisted.
INDEX_PERSIST_DIRECTORY = os.getenv('INDEX_PERSIST_DIRECTORY', "./data/chromadb")

# HTTP API port. os.getenv returns a str whenever the variable is set, so cast
# explicitly to get an int in both the default and the overridden case.
HTTP_PORT = int(os.getenv('HTTP_PORT', '7654'))

# MongoDB connection settings: host, port, username, password.
# NOTE(review): the user/password defaults are placeholders; supply real
# credentials through the environment rather than editing them here.
MONGO_HOST = os.getenv('MONGO_HOST', 'localhost')
MONGO_PORT = int(os.getenv('MONGO_PORT', '27017'))  # int for the same reason as HTTP_PORT
MONGO_USER = os.getenv('MONGO_USER', 'testuser')
MONGO_PASS = os.getenv('MONGO_PASS', 'testpass')

#global conversation
#conversation = None

# Build RAG Dataset

In [None]:
def recursive_PDF_loader(directory):
    """
    Build a PyPDFLoader for every PDF file found under a directory tree.

    Args:
        directory: Root directory to walk recursively.

    Returns:
        list: One ``PyPDFLoader`` per file whose name ends in ``.pdf``
        (matched case-insensitively, so ``.PDF`` is included), ordered by
        directory and then file name so repeated runs give the same order.
        The loaders are only constructed here; ``.load()`` is not called.
        The list is empty when no PDF file is found.
    """
    loaders = []
    for root, dirs, files in os.walk(directory):
        # Sorting ``dirs`` in place makes os.walk descend in a stable order
        # instead of whatever order the file system happens to report.
        dirs.sort()
        for file_name in sorted(files):
            if file_name.lower().endswith(".pdf"):
                loaders.append(PyPDFLoader(os.path.join(root, file_name)))
    return loaders
    

In [None]:
# Root folder that is searched recursively for PDF reference documents.
base_folder = "reference_content/Docs"
# One loader object per PDF found; recursive_PDF_loader only constructs the
# loaders and does not call .load() on them.
pdf_files = recursive_PDF_loader(base_folder)
# Bare expression: displays the list of loader objects as the cell output.
pdf_files 

[<langchain_community.document_loaders.pdf.PyPDFLoader at 0x1076e2800>,
 <langchain_community.document_loaders.pdf.PyPDFLoader at 0x1076e25c0>,
 <langchain_community.document_loaders.pdf.PyPDFLoader at 0x1076e3c70>,
 <langchain_community.document_loaders.pdf.PyPDFLoader at 0x10770ccd0>,
 <langchain_community.document_loaders.pdf.PyPDFLoader at 0x10770d870>,
 <langchain_community.document_loaders.pdf.PyPDFLoader at 0x12c3db190>,
 <langchain_community.document_loaders.pdf.PyPDFLoader at 0x12c41dc00>,
 <langchain_community.document_loaders.pdf.PyPDFLoader at 0x12c41d420>,
 <langchain_community.document_loaders.pdf.PyPDFLoader at 0x12c41c400>,
 <langchain_community.document_loaders.pdf.PyPDFLoader at 0x12c41c5e0>,
 <langchain_community.document_loaders.pdf.PyPDFLoader at 0x12c41c4c0>,
 <langchain_community.document_loaders.pdf.PyPDFLoader at 0x12c41cf10>,
 <langchain_community.document_loaders.pdf.PyPDFLoader at 0x12c41dab0>,
 <langchain_community.document_loaders.pdf.PyPDFLoader at 0x12c4

In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it into the
# import cell at the top so all dependencies are visible in one place.
from langchain_community.document_loaders.merge import MergedDataLoader

# Wrap all per-file loaders in one loader so they can be loaded with a single call.
loader_all = MergedDataLoader(loaders=pdf_files)
# Bare expression: displays the merged loader object as the cell output.
loader_all

<langchain_community.document_loaders.merge.MergedDataLoader at 0x10770dfc0>

In [None]:
# Load the documents from every merged PDF loader, then display how many were produced.
docs = loader_all.load()
len(docs)

5204

In [None]:
# Split the loaded documents into overlapping chunks.
# chunk_size and chunk_overlap affect the size of the prompt; exceeding the
# model's limit causes the error
# `prompt size exceeds the context window size and cannot be processed`.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = text_splitter.split_documents(docs)

# Text extracted from PDFs can contain lone UTF-16 surrogates (e.g. '\ud835'),
# which cannot be encoded as UTF-8 and make the vector store fail with
# "'utf-8' codec can't encode character ... surrogates not allowed".
# Round-tripping through UTF-8 with errors="ignore" drops only those
# un-encodable code points and leaves all other text unchanged.
for document in documents:
    document.page_content = document.page_content.encode("utf-8", errors="ignore").decode("utf-8")

# Create embeddings with the Hugging Face embedding model `all-MiniLM-L6-v2`,
# then persist the vector index in the vector db.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectordb = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
    persist_directory=INDEX_PERSIST_DIRECTORY
)
vectordb.persist()


  from .autonotebook import tqdm as notebook_tqdm
Exception occurred invoking consumer for subscription 360e4b8a8f0f465396296d8c1e4e51c7to topic persistent://default/default/b13a069f-3226-442d-84d0-2a1567b02a8c 'utf-8' codec can't encode character '\ud835' in position 834: surrogates not allowed
Exception occurred invoking consumer for subscription 360e4b8a8f0f465396296d8c1e4e51c7to topic persistent://default/default/b13a069f-3226-442d-84d0-2a1567b02a8c 'utf-8' codec can't encode character '\ud835' in position 36: surrogates not allowed
Exception occurred invoking consumer for subscription 360e4b8a8f0f465396296d8c1e4e51c7to topic persistent://default/default/b13a069f-3226-442d-84d0-2a1567b02a8c 'utf-8' codec can't encode character '\ud835' in position 530: surrogates not allowed
Exception occurred invoking consumer for subscription 360e4b8a8f0f465396296d8c1e4e51c7to topic persistent://default/default/b13a069f-3226-442d-84d0-2a1567b02a8c 'utf-8' codec can't encode character '\ud835' in 

In [None]:
# NOTE(review): the triple-quoted string below is an inert expression, not live
# code. It holds an earlier draft of init_conversation()/chat() that relied on a
# module-level `conversation` global; evaluating the cell only echoes the string.
'''
def init_conversation():
    global conversation

    # load index
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    vectordb = Chroma(persist_directory=INDEX_PERSIST_DIRECTORY,embedding_function=embeddings)

    # create conversation
    llm = GPT4All(
        model="nous-hermes-llama2-13b.Q4_0.gguf",
        verbose=True,
    )
    conversation = ConversationalRetrievalChain.from_llm(
        llm,
        retriever=vectordb.as_retriever(),
        return_source_documents=True,
        verbose=True,
    )


def chat(question, user_id):
    global conversation

    chat_history = []
    response = conversation({"question": question, "chat_history": chat_history})
    answer = response['answer']

    logging.info("got response from llm - %s", answer)

    # TODO save history

    return answer
'''

'\ndef init_conversation():\n    global conversation\n\n    # load index\n    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")\n    vectordb = Chroma(persist_directory=INDEX_PERSIST_DIRECTORY,embedding_function=embeddings)\n\n    # create conversation\n    llm = GPT4All(\n        model="nous-hermes-llama2-13b.Q4_0.gguf",\n        verbose=True,\n    )\n    conversation = ConversationalRetrievalChain.from_llm(\n        llm,\n        retriever=vectordb.as_retriever(),\n        return_source_documents=True,\n        verbose=True,\n    )\n\n\ndef chat(question, user_id):\n    global conversation\n\n    chat_history = []\n    response = conversation({"question": question, "chat_history": chat_history})\n    answer = response[\'answer\']\n\n    logging.info("got response from llm - %s", answer)\n\n    # TODO save history\n\n    return answer\n'

In [None]:
def init_conversation(model_name="nous-hermes-llama2-13b.Q2_K.gguf", persist_directory=None):
    """
    Build a conversational retrieval chain on top of the persisted vector index.

    Args:
        model_name: Model file passed to ``GPT4All``. Defaults to the
            Q2_K build that was previously hard-coded here.
        persist_directory: Directory of the persisted Chroma index. When
            ``None`` (the default) the module-level
            ``INDEX_PERSIST_DIRECTORY`` is used.

    Returns:
        The chain created by ``ConversationalRetrievalChain.from_llm``,
        configured to return source documents along with the answer.
    """
    if persist_directory is None:
        persist_directory = INDEX_PERSIST_DIRECTORY

    # Load the index. The embedding model name is the same one used in the
    # cell that builds the index; keep the two in sync.
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

    # Create the conversation chain.
    llm = GPT4All(
        model=model_name,
        verbose=True,
    )
    conversation = ConversationalRetrievalChain.from_llm(
        llm,
        retriever=vectordb.as_retriever(),
        return_source_documents=True,
        verbose=True,
    )
    return conversation


def chat(question, user_id, conversation, chat_history=None):
    """
    Ask the conversation chain a question and return its answer text.

    Args:
        question: The question to send to the chain.
        user_id: Identifier of the asking user. Currently unused; kept for
            interface compatibility (history is not stored per user yet).
        conversation: Callable chain, e.g. the object returned by
            ``init_conversation()``. It is called with a dict holding the
            ``"question"`` and ``"chat_history"`` keys and must return a
            mapping with an ``"answer"`` key.
        chat_history: Optional prior turns to pass to the chain. Defaults
            to an empty history, which matches the previous behaviour.

    Returns:
        The value stored under ``"answer"`` in the chain's response.
    """
    # Copy so the chain never receives (or mutates) the caller's own list.
    history = list(chat_history) if chat_history is not None else []
    response = conversation({"question": question, "chat_history": history})
    answer = response['answer']

    logging.info("got response from llm - %s", answer)

    # TODO save history

    return answer


In [None]:
#import os
#import wget
#mod_path = '/Users/akeem/.cache/gpt4all/nous-hermes-llama2-13b.Q4_0.gguf'
#if not os.path.exists(mod_path):
#    wget.download('https://huggingface.co/TheBloke/Nous-Hermes-Llama2-GGUF/blob/main/nous-hermes-llama2-13b.Q4_0.gguf', mod_path)


In [None]:
# Build the retrieval chain once; pass it to chat() to ask questions.
conversation = init_conversation()
# Example usage (left disabled). Store the answer in its own variable so the
# chain held in `conversation` is not overwritten:
#resp = chat("What is an embedding?", "user1", conversation)
#print(resp)

: 