In [None]:
import pandas as pd
from langchain import OpenAI, FAISS, PromptTemplate
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
import os


In [None]:
# Pull OPENAI_API_KEY from a local .env file (or the already-exported
# environment) so the secret never has to be typed into the notebook.
load_dotenv()
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to a .env file next to this notebook "
        "or export it in the environment before running."
    )

# Load PDFs and build a vector store from multiple files

Credit: the `pdf_loader` and `embed_index` functions are adapted from https://github.com/insightbuilder/python_de_learners_data/blob/main/code_script_notebooks/projects/LLM_practical_appln/multiFileEmbedFaiss.ipynb


In [None]:


def pdf_loader(file, chunk_size=100, chunk_overlap=10):
    """Load a PDF and return its text as a flat list of string chunks.

    Args:
        file: Location of the PDF, passed unchanged to ``PyPDFLoader``.
        chunk_size: ``chunk_size`` handed to ``RecursiveCharacterTextSplitter``
            (sizes are measured with ``len``). Defaults to 100, the value
            that used to be hard-coded here.
        chunk_overlap: ``chunk_overlap`` handed to the splitter. Defaults to 10.

    Returns:
        list: The chunks produced for every page, in the order the loader
        returned the pages. Only each page's ``page_content`` is split, so
        page metadata is not carried over.
    """
    pages = PyPDFLoader(file).load_and_split()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )

    # Flatten the per-page chunk lists into one list for the whole file.
    return [
        chunk
        for page in pages
        for chunk in text_splitter.split_text(page.page_content)
    ]

def embed_index(doc_list, embed_fn, index_store):
    """Embed ``doc_list`` and persist it in the FAISS index at ``index_store``.

    If ``index_store`` already exists on disk, the new embeddings are merged
    into that index and the result is saved back to the same location.
    Otherwise a new index is saved there.

    Args:
        doc_list: Non-empty list of items to embed. A list made only of
            strings is sent to ``FAISS.from_texts``; anything else is sent
            to ``FAISS.from_documents``.
        embed_fn: Embedding object passed to the FAISS constructors and to
            ``FAISS.load_local``.
        index_store: Path of the on-disk index.

    Raises:
        ValueError: If ``doc_list`` is empty.
    """
    if not doc_list:
        raise ValueError("doc_list is empty; there is nothing to embed")

    # Pick the constructor from the input type. Catching every exception from
    # from_documents and retrying with from_texts would also swallow genuine
    # embedding failures and replace them with a misleading second error.
    if all(isinstance(item, str) for item in doc_list):
        faiss_db = FAISS.from_texts(doc_list, embed_fn)
    else:
        faiss_db = FAISS.from_documents(doc_list, embed_fn)

    if os.path.exists(index_store):
        local_db = FAISS.load_local(index_store, embed_fn)
        # Merge the new embeddings into the existing index store.
        local_db.merge_from(faiss_db)
        print("Merge completed")
        local_db.save_local(index_store)
        print("Updated index saved")
    else:
        faiss_db.save_local(folder_path=index_store)
        print("New store created...")


## Create embed index

In [None]:
# Embedding model: one OpenAIEmbeddings instance (default settings) that is
# passed as `embed_fn` to each embed_index call.
embedding_model = OpenAIEmbeddings()


In [None]:
# Chunk each source PDF; swap every placeholder for the location of a real file.
doc_1 = pdf_loader(file="YOUR LINK HERE")
doc_2 = pdf_loader(file="YOUR LINK HERE")

In [None]:
# Embed the first document's chunks into the on-disk index 'new_index'.
embed_index(doc_1, embedding_model, "new_index")

In [None]:
# Embed the second document's chunks into the same on-disk index 'new_index'.
embed_index(doc_2, embedding_model, "new_index")

In [None]:
# Reload the persisted index and expose it as a similarity retriever that
# requests k=6 results per query.
vector_index = FAISS.load_local("new_index", OpenAIEmbeddings())
# The method is spelled `as_retriever`. The variable keeps its original
# spelling because the chain-building cell refers to `retreiver`.
retreiver = vector_index.as_retriever(search_type="similarity", search_kwargs={"k": 6})

In [None]:
# Prompt for the comparison chain. It has two placeholders: {context} and
# {question}.
prompt_template = """You are a document comparison assistant. You are tasked with finding discrepancies between documents.
	You are provided two documents. Identify and list all differences between the document. 

    Be polite with your responses.
    
    Context:{context}

    QUERY: {question}
    """
# Declare the placeholders explicitly: PromptTemplate validates `template`
# against `input_variables`, and older LangChain releases require the field.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

In [None]:

# Retrieval chain inputs: a deterministic (temperature 0) completion model,
# the retriever, and the comparison prompt.
model_name = "gpt-3.5-turbo-instruct"
llm = OpenAI(model=model_name, temperature=0)

qa = RetrievalQA.from_llm(
    llm=llm,
    retriever=retreiver,
    prompt=prompt,
    return_source_documents=True,
)

In [None]:
def ask_question_with_context(qa, question, context):
    """Ask the retrieval chain a question and print its answer.

    Args:
        qa: Callable chain that accepts ``{"query": <str>}`` and returns a
            mapping with the answer under ``"result"`` (the RetrievalQA
            input/output keys).
        question: The question to ask about the documents.
        context: Free-text guidance for the comparison, e.g. whether the
            documents are identical copies (word-level differences) or not
            (topic-level comparison). Pass ``""`` or ``None`` to omit it.

    Returns:
        list: A one-element list ``[(label, answer)]`` where ``label`` is the
        fixed string "A comparison between the two provided docs".
    """
    query = "A comparison between the two provided docs"

    # The chain only reads the "query" key and fills the prompt's {context}
    # slot from retrieved chunks, so the caller's guidance has to travel
    # inside the query text to reach the model.
    if context:
        full_question = f"{question}\n\nComparison guidance: {context}"
    else:
        full_question = question

    result = qa({"query": full_question})
    answer = result["result"]
    print("Answer:", answer)
    return [(query, answer)]
               