## This notebook contains a Streamlit implementation of a chatbot over private documents.
### The key differences between a simple Q&A over private docs and this project are:
### 1. We refine the user's query: incomplete queries are completed based on the conversation context and the document.
### 2. We use chat_history to remember previous answers and store them in a conversation memory buffer for later turns.
## To run this app, activate the venv and run "streamlit run [file_path_of_main.py]" in a terminal.

## Prerequisites:
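### Install langchain, chromadb, streamlit, streamlit-chat and PyPDF2 (plus the openai or cohere client package for the LLM you choose)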

### This is page 1 of streamlit app (main.py)

In [3]:
# import and install the libraries
from PyPDF2 import PdfReader
from langchain import OpenAI
from langchain.document_loaders import TextLoader, PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings, CohereEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
import streamlit as st
from streamlit_chat import message
from langchain import Cohere
import os
from langchain.prompts import PromptTemplate

AttributeError: module 'lib' has no attribute 'X509_V_FLAG_CB_ISSUER_CHECK'

In [None]:
# this function creates or overwrites a file
#overwrites in the following manner
# 1. llm_name
# 2. llm_key
# 3. text of the file uploaded

def append_to_file(file_path, llm, api, text):
    """Store the LLM name, API key and document text in ``file_path``.

    The file is opened in write mode, so any previous content is replaced.
    The three fields are written separated by '^' and followed by one
    trailing '^'. An ``IOError`` is reported on stdout instead of being
    raised.
    """
    try:
        with open(file_path, 'w', encoding="utf-8") as handle:
            fields = (str(llm), api, text)
            handle.write('^'.join(fields) + '^')
        print("Parameters appended to the file successfully.")
    except IOError:
        print("An error occurred while appending parameters to the file.")

In [None]:
# this function checks if the given input file is of pdf or text format
# and extracts text based on type of document

def read_and_print_file(file):
    """Extract the text content of an uploaded PDF or plain-text file.

    Args:
        file: Uploaded file object exposing a ``type`` attribute holding its
            MIME type. For text files it must also provide ``read()``
            returning UTF-8 encoded bytes; for PDFs it is handed to
            ``PdfReader``.

    Returns:
        The extracted text, or an explanatory message when the MIME type is
        neither PDF nor plain text.
    """
    # Inspect the argument itself rather than a module-level variable, so the
    # function works for whichever file object the caller passes in.
    mime_type = file.type

    # if file is pdf
    if mime_type == 'application/pdf':
        pdf_reader = PdfReader(file)
        # Concatenate the text of every page; "or ''" guards against a page
        # that yields no text.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)

    # if file is text
    if mime_type == 'text/plain':
        return file.read().decode('utf-8')

    return 'Unsupported file format. Only PDF and text files are accepted.'

In [None]:
# This code is the main function in Page 1 (main.py)
# Page title
st.set_page_config(page_title='Chatbot - Private Docs')
st.title('Chatbot - Private Docs')

with st.form('story_form', clear_on_submit=False):
    # uploads a file here
    uploaded_file = st.file_uploader("Choose a Text/PDF file", type=['pdf', 'txt'], accept_multiple_files=False)

    with st.sidebar:
        # select type of llm and key
        llm_name = st.radio('Select LLM type: ', ('OpenAI', 'Cohere'))
        api_key = st.text_input('LLM API Key', type='password')
        submitted = st.form_submit_button('Submit')

# Validate the submission, store the settings and document text in the shared
# file, and show a link to page 2.
# st.file_uploader returns None (never '') when nothing has been uploaded, so
# the file must be checked with "is not None" -- and for both key prefixes,
# otherwise text extraction fails on a missing upload.
key_looks_valid = api_key.startswith(('4a', 'sk-'))  # same prefixes the form has always accepted
if submitted and uploaded_file is not None and key_looks_valid:
    with st.spinner('Calculating...'):
        append_to_file('pages/file.txt', llm_name, api_key, read_and_print_file(uploaded_file))
        st.markdown('<a href="/QA_Page" target="_self">Go to Chat -></a>', unsafe_allow_html=True)


## The below code is for Page 2 (2_QA_Page.py)

In [None]:
# this function writes to a file text to file_path
def append_to_file(file_path, text):
    """Replace the content of ``file_path`` with ``str(text)``.

    The file is opened in write mode (UTF-8), so earlier content is
    discarded. An ``IOError`` is reported on stdout instead of being raised.
    """
    try:
        with open(file_path, 'w', encoding="utf-8") as handle:
            content = str(text)
            handle.write(content)
        print("Parameters appended to the file successfully.")
    except IOError:
        print("An error occurred while appending parameters to the file.")


In [None]:
#this function reads from the file and returns the string as response
def read_file(file_path):
    """Return the full content of ``file_path``, decoded as UTF-8 text."""
    with open(file_path, 'r', encoding="utf-8") as source:
        return source.read()

In [None]:
# this method is used to refine the query based on previous context
def query_refiner(conversation, query, llm, api):
    """Ask the LLM to rewrite ``query`` using the previous conversation.

    Args:
        conversation: Transcript of the earlier turns (may be empty).
        query: The user's current, possibly incomplete, question.
        llm: ``'Cohere'`` selects the Cohere model; any other value selects
            OpenAI.
        api: API key for the selected provider.

    Returns:
        The model's reformulated query with surrounding whitespace stripped.
    """
    # set llm
    if llm == 'Cohere':
        llm_main = Cohere(cohere_api_key=api)
    else:
        # Pass the key explicitly, as the Cohere branch does, instead of
        # relying on OPENAI_API_KEY having been exported by earlier code.
        llm_main = OpenAI(temperature=0, openai_api_key=api)

    # prompt: the conversation and the query are packed into one text block
    file = '\nCONVERSATION:\n' + conversation + '\nQUERY:\n' + query
    template = """The following text contains 2 parts, that is the previous conversations between \
    human and bot and current query. The query might be incomplete as it is a conversation with context. \
    Your task is to reconstruct the query into a more meaningful question that is related to the previous context. \
    If the conversation is blank or the query is already good , just keep the input query same. \
    Text: {file}"""
    prompt = PromptTemplate(input_variables=["file"],
                            template=template)
    prompt_temp = prompt.format(file=file)
    query_new = llm_main(prompt_temp)
    return query_new.strip()

In [None]:
# this function reads the file and splits it based on '^' symbol
def splitter(response):
    """Split a stored ``llm^api^text^`` record into its three fields.

    Only the first two '^' characters are treated as delimiters, so a
    document text that itself contains '^' is returned in full instead of
    being cut at its first '^'. The single trailing '^' terminator, when
    present, is removed from the text.

    Args:
        response: The record string, e.g. ``"OpenAI^sk-...^document text^"``.

    Returns:
        A ``(llm, api, file)`` tuple of strings.

    Raises:
        IndexError: If ``response`` contains fewer than two '^' delimiters.
    """
    parts = response.split('^', 2)
    llm = parts[0]
    api = parts[1]
    file = parts[2]
    # Drop only the terminator added when the record was written; any other
    # '^' belongs to the document text.
    if file.endswith('^'):
        file = file[:-1]
    return llm, api, file

In [None]:
# this function creates embeddings from the given documents and stores them in chromadb
# it also attaches a ConversationBufferMemory to the chain as its chat history

def get_embeddings(documents, llm, api):
    """Build a conversational retrieval chain over ``documents``.

    The documents are split into 1000-character chunks without overlap,
    embedded with the selected provider, stored in Chroma, and wrapped in a
    ``ConversationalRetrievalChain`` whose memory key is ``chat_history``.

    Args:
        documents: Loaded documents to index; the empty string means
            "nothing to index".
        llm: ``'OpenAI'`` selects OpenAI; any other value selects Cohere.
        api: API key for the selected provider.

    Returns:
        The configured chain, or ``None`` when ``documents`` is ``''``.
    """
    if documents == '':
        return None

    use_openai = llm == 'OpenAI'
    if use_openai:
        # The OpenAI wrappers below are created without an explicit key, so
        # the key is exported through the environment first.
        os.environ["OPENAI_API_KEY"] = api

    chunker = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    chunks = chunker.split_documents(documents)

    # create embeddings with the chosen provider
    embedder = OpenAIEmbeddings() if use_openai else CohereEmbeddings(cohere_api_key=api)

    # store in chromadb
    store = Chroma.from_documents(chunks, embedder)

    # set memory of chain
    history = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

    chat_model = OpenAI(temperature=0.6) if use_openai else Cohere(cohere_api_key=api)
    return ConversationalRetrievalChain.from_llm(chat_model, store.as_retriever(),
                                                 memory=history)

In [None]:
# this function loads the file and loads the text
def file_loader(filename):
    """Load a text file into documents.

    Args:
        filename: Path of the file to load.

    Returns:
        The documents produced by ``TextLoader`` when ``filename`` contains
        ``'.txt'``; otherwise the empty string.
    """
    if '.txt' not in filename:
        return ''
    # Pass the path held in the variable; a quoted "filename" would always
    # try to open a file literally named "filename".
    loader = TextLoader(filename, encoding="utf-8")
    return loader.load()

In [None]:
# this function gets the query from user
def get_text():
    """Render the "Query: " text box and return its current content."""
    return st.text_input("Query: ", "", key="input")

In [None]:
#this is the main function of Page 2

# Page title
st.set_page_config(page_title="Chatbot - Private Docs", page_icon=":robot:")
st.header("Chatbot - Private Docs")

# extract llm, api and text
llm, api, text = splitter(read_file('pages/file.txt'))

# Streamlit re-executes this script on every interaction. The chain (and the
# conversation memory attached to it) is therefore kept in session_state and
# rebuilt only when the LLM, key or document written by page 1 changes;
# rebuilding it on each run would discard the chat history and re-embed the
# whole document for every query.
source_key = (llm, api, text)
if "qa_source" not in st.session_state or st.session_state["qa_source"] != source_key:
    append_to_file('pages/tempfile.txt', text)

    # load text
    docs = TextLoader(r"pages/tempfile.txt", encoding="utf-8").load()

    # get embeddings
    st.session_state["qa_chain"] = get_embeddings(docs, llm, api)
    st.session_state["qa_source"] = source_key

    # a different document or model starts a fresh conversation
    st.session_state["generated"] = []
    st.session_state["past"] = []

qa = st.session_state["qa_chain"]

if "generated" not in st.session_state:
    st.session_state["generated"] = []

if "past" not in st.session_state:
    st.session_state["past"] = []

# Rebuild the transcript of earlier turns from session_state; a plain local
# string would be empty again on every rerun and the refiner would never see
# any context.
conversation_str = "".join(
    '\nHuman: ' + question + '\nBot: ' + answer
    for question, answer in zip(st.session_state["past"], st.session_state["generated"])
)

query = get_text()

# Only call the LLM when the user actually entered something.
if query:
    # refine the query (query_refiner takes conversation, query, llm, api)
    refined_query = query_refiner(conversation_str, query, llm, api)
    result = qa({"question": refined_query})
    st.session_state.past.append(result['question'])
    st.session_state.generated.append(result['answer'])

if st.session_state["generated"]:
    # display in streamlit chat interface, newest exchange first
    for i in range(len(st.session_state["generated"]) - 1, -1, -1):
        message(st.session_state["generated"][i], key=str(i))
        message(st.session_state["past"][i], is_user=True, key=str(i) + "_user")
