## Install the required libraries

In [78]:
!pip install openai
!pip install PyPDF2
!pip install tiktoken
!pip install -U sentence-transformers



# Import the required libraries

In [91]:
import getpass
import json
import os
import re

import openai
import pandas as pd
import PyPDF2
import tiktoken
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [92]:
# Print the installed openai package version so the run can be reproduced.
print(openai.__version__)

1.35.8


# Load the pre-trained transformer model

This model is used to compute the embeddings of each page and of the given question.

In [93]:
# Load the pretrained "all-MiniLM-L6-v2" Sentence Transformer model.
# `model` is a module-level object: search_docs and
# get_answers_from_given_pdf_file call model.encode(...) on it directly.
model = SentenceTransformer("all-MiniLM-L6-v2")



# Define the secret key

In [94]:
# Provide the secret key to use the OpenAI API.
# The key is read from the OPENAI_API_KEY environment variable so that it is
# never stored in the notebook; if the variable is unset or empty, the user is
# prompted for it without echoing the input.
openai_secret_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")
client = openai.OpenAI(api_key=openai_secret_key)

In [95]:
def clean_text(text):
    '''Return `text` with every newline character replaced by a single space.'''
    # Splitting on "\n" and re-joining with " " swaps each newline for one
    # space while leaving every other character (including "\r") untouched.
    pieces = text.split("\n")
    return " ".join(pieces)

In [96]:
def text_extraction_from_pdf(pdf_file_path):
    '''Extract the text of every page of a PDF into a DataFrame.

    Args:
        pdf_file_path: Path (or file object) handed to PyPDF2.PdfReader.

    Returns:
        A DataFrame with one row per page and two columns:
        "Page_Number" (1-based page index) and "Segments" (the page text with
        newlines replaced by spaces via clean_text).
    '''
    reader = PyPDF2.PdfReader(pdf_file_path)

    # `or ""` keeps clean_text from failing if a page yields no text object.
    segments = [clean_text(page.extract_text() or "") for page in reader.pages]

    # Build the frame in one step instead of filling "Page_Number" cell by cell
    # with chained indexing (df["Page_Number"][j] = ...), which raises
    # SettingWithCopyWarning and does not write through under pandas
    # Copy-on-Write.
    return pd.DataFrame(
        {
            "Page_Number": range(1, len(segments) + 1),
            "Segments": segments,
        }
    )

In [97]:
def num_tokens(text, encoding_name= "cl100k_base"):
    """Count the tokens that `text` encodes to under the named tiktoken encoding."""
    tokenizer = tiktoken.get_encoding(encoding_name)
    token_ids = tokenizer.encode(text)
    return len(token_ids)

In [98]:
def search_docs(df, quest, top_n):
    '''Rank the rows of `df` by cosine similarity to the question.

    Args:
        df: DataFrame with an "Embeddings" column holding one embedding array
            per row.
        quest: Question text; it is embedded with the module-level `model`.
        top_n: Number of highest-scoring rows to return.

    Returns:
        The `top_n` rows of `df` with the highest "similarities" value, in
        descending order of similarity.

    Note:
        A "similarities" column is added to (or overwritten on) `df` itself.
    '''
    quest_embedding = model.encode(quest).reshape(1, -1)

    # cosine_similarity returns a 1x1 matrix for a single pair of vectors;
    # keep only the scalar so the column holds plain floats that sort reliably
    # instead of one-element arrays.
    df["similarities"] = df["Embeddings"].apply(
        lambda emb: float(cosine_similarity(emb.reshape(1, -1), quest_embedding)[0, 0])
    )

    return df.sort_values("similarities", ascending=False).head(top_n)

In [99]:
def query_message(quest, df, token_budget):
    '''Build the user prompt: as many page texts as fit, followed by the question.

    Pages are taken from df["Segments"] in row order and appended until adding
    the next one would push the whole prompt (pages + question) over
    `token_budget`, as measured by num_tokens.

    Args:
        quest: Question text appended at the end of the prompt.
        df: DataFrame with a "Segments" column of page texts.
        token_budget: Maximum number of tokens allowed for the returned prompt.

    Returns:
        The selected page texts separated by blank lines, followed by
        "\\n\\nQuestion: <quest>".
    '''
    question = f"\n\nQuestion: {quest}"
    separator = "\n\n"
    selected = []
    for text in df["Segments"]:
        # Measure the prompt exactly as it would be sent, separators included.
        candidate = separator.join(selected + [text]) + question
        if num_tokens(candidate) > token_budget:
            break
        selected.append(text)
    # Joining with a separator keeps the last word of one page from fusing
    # with the first word of the next.
    return separator.join(selected) + question

In [100]:
def ask(quest, df, token_budget=16385 - 500, number_relevant_pages=2,
        chat_model="gpt-3.5-turbo-0125"):
    '''Answer a question using only the pages of `df` most similar to it.

    Args:
        quest: The question to answer.
        df: DataFrame with "Segments" and "Embeddings" columns (one row per page).
        token_budget: Maximum number of tokens for the user prompt built by
            query_message. NOTE(review): the default looks like the chat
            model's context size minus headroom for the system prompt and the
            reply - confirm before changing `chat_model`.
        number_relevant_pages: How many of the most similar pages are offered
            to the chat model as context.
        chat_model: Name of the OpenAI chat-completions model to query.

    Returns:
        The content of the first choice of the chat completion.
    '''
    relevant_pages = search_docs(df, quest, number_relevant_pages)

    # Constrain the answer length; this instruction is appended to the question
    # only after the similarity search so it does not influence the ranking.
    quest = quest + " " + "Give the answer of the above question in 1-2 sentence only. Please don't provide extra information which is not mentioned in the question."
    message = query_message(quest, relevant_pages, token_budget=token_budget)
    introduction = 'Use the following text only to answer the subsequent question. If the answer cannot be found in the provided text, write "Data Not Available."'

    messages = [
        {"role": "system", "content": introduction},
        {"role": "user", "content": message},
    ]
    response = client.chat.completions.create(
        model=chat_model,
        messages=messages,
        temperature=0,
    )

    return response.choices[0].message.content

In [101]:
def get_answers_from_given_pdf_file(pdf_file_path, question):
    '''Answer `question` from the contents of the PDF at `pdf_file_path`.

    The PDF is split into pages, each page is embedded with the module-level
    `model`, and the question is answered by ask() from the best-matching pages.

    Args:
        pdf_file_path: Path of the PDF to read.
        question: The question to answer.

    Returns:
        The answer text returned by ask().
    '''
    df = text_extraction_from_pdf(pdf_file_path)

    # Encode all pages in one batched call rather than one call per page;
    # list(...) splits the resulting 2-D array into one 1-D embedding per row.
    df["Embeddings"] = list(model.encode(df["Segments"].tolist()))

    return ask(question, df)

In [102]:
ques1 = "What is the name of the company?"
ques2 = "Who is the CEO of the company?"
ques3 = "What is their vacation policy?"
ques4 = "What is the termination policy?"

pdf_file_path = "/content/handbook.pdf"
all_ques=[ques1, ques2, ques3, ques4]

# Extract and embed the PDF once and reuse the result for every question;
# re-reading and re-embedding the whole document per question is
# loop-invariant work.
pages_df = text_extraction_from_pdf(pdf_file_path)
pages_df["Embeddings"] = pages_df["Segments"].apply(lambda x : model.encode(x))

# Map each question to the answer produced from its most relevant pages.
all_answers = {}
for ques in all_ques:
    answer = ask(ques, pages_df)
    all_answers[ques] = answer
answer_in_json_form = json.dumps(all_answers)


In [103]:
# Display the JSON string that maps each question to its answer.
answer_in_json_form

