In [3]:
!pip install openai pymupdf

Collecting openai
  Downloading openai-1.30.3-py3-none-any.whl (320 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.6/320.6 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pymupdf
  Downloading PyMuPDF-1.24.4-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting PyMuPDFb==1.24.3 (from pymupdf)
  Downloading PyMuPDFb-1.24.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.8/15.8 MB[0m [31m45.0 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2

In [26]:
!pip install tiktoken slack_sdk

Collecting slack_sdk
  Downloading slack_sdk-3.27.2-py2.py3-none-any.whl (286 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m286.5/286.5 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: slack_sdk
Successfully installed slack_sdk-3.27.2


In [37]:
import openai
import os
import pandas as pd
import tiktoken
import json
import fitz
import re
import ast
from scipy import spatial
from slack_sdk import WebClient

# Chat model used to answer questions and embedding model used for retrieval.
GPT_MODEL = "gpt-3.5-turbo"
EMBEDDING_MODEL = "text-embedding-3-small"

# Credentials are read from the environment so real secrets never have to be
# saved in the notebook; the placeholder strings remain as fallbacks when the
# variables are unset.
client1 = WebClient(token=os.environ.get("SLACK_BOT_TOKEN", "your_slack_token"))

client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY", 'your_open_ai_api_key'))

def send_to_slack(message, channel='C04HA4VAMEU'):
    """Post `message` as text to a Slack channel via the module-level `client1`.

    Args:
        message: Text of the message to post.
        channel: ID of the destination channel. Defaults to the channel ID
            this notebook has always posted to.

    Returns:
        None. The response from `chat_postMessage` is not returned.
    """
    client1.chat_postMessage(channel=channel, text=message)


def split_text(text):
    """Split `text` into sentence-like chunks.

    A chunk boundary is any run of whitespace that directly follows '.', '!'
    or '?'. Leading and trailing whitespace is stripped before splitting and
    empty chunks are dropped, so blank input yields [] rather than [''].

    Args:
        text: The string to split.

    Returns:
        A list of non-empty chunks in their original order. Whitespace inside
        a chunk (for example a newline in the middle of a sentence) is kept.
    """
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    return [sentence for sentence in sentences if sentence]

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed to `fitz.open`.

    Returns:
        A single string made of each page's `get_text()` output joined with
        no separator.
    """
    # The context manager closes the document even if reading a page fails.
    with fitz.open(pdf_path) as doc:
        return ''.join(
            doc.load_page(page_num).get_text() for page_num in range(len(doc))
        )

def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 100
) -> tuple[tuple[str, ...], tuple[float, ...]]:
    """Rank the rows of `df` by how related their embedding is to `query`.

    Args:
        query: Text to embed with EMBEDDING_MODEL and compare against.
        df: Table with a "text" column and an "embedding" column.
        relatedness_fn: Called as relatedness_fn(query_embedding, row_embedding);
            larger results rank first. The default is 1 minus the cosine distance.
        top_n: Maximum number of results to return.

    Returns:
        A pair (strings, relatednesses) of equally long tuples, sorted from
        most to least related. Both are empty when `df` has no rows.
    """
    # Nothing to rank: skip the embeddings request and avoid unpacking an
    # empty zip(), which would raise ValueError.
    if len(df) == 0:
        return (), ()

    query_embedding_response = client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response.data[0].embedding

    # Walk the two columns directly instead of materialising every row with iterrows().
    strings_and_relatednesses = [
        (text, relatedness_fn(query_embedding, list(embedding)))
        for text, embedding in zip(df["text"], df["embedding"])
    ]
    strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
    strings, relatednesses = zip(*strings_and_relatednesses)
    return strings[:top_n], relatednesses[:top_n]



def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Count the tokens `text` encodes to under the tiktoken encoding for `model`."""
    tokenizer = tiktoken.encoding_for_model(model)
    token_ids = tokenizer.encode(text)
    return len(token_ids)


def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int
) -> str:
    """Build the prompt for `query` from the most related strings in `df`.

    The prompt is a fixed instruction, then one quoted "Text Sentence" section
    per string in ranked order, then the question. Sections are added until
    the next one would make the whole prompt exceed `token_budget` tokens
    (counted with `num_tokens` for `model`); the first section that does not
    fit ends the loop.
    """
    ranked_strings, _ = strings_ranked_by_relatedness(query, df)
    header = 'Use the below text to answer the subsequent question. If the answer cannot be found in the text, write "I could not find an answer."'
    trailer = f"\n\nQuestion: {query}"

    parts = [header]
    for candidate in ranked_strings:
        section = f'\n\nText Sentence\n"""\n{candidate}\n"""'
        prospective = "".join(parts) + section + trailer
        if num_tokens(prospective, model=model) > token_budget:
            break
        parts.append(section)
    return "".join(parts) + trailer


def ask(
    query: str,
    df: pd.DataFrame,
    model: str = GPT_MODEL,
    token_budget: int = 4096 - 500,
    print_message: bool = False,
) -> str:
    """Answer `query` with a chat completion grounded in the text stored in `df`.

    The prompt comes from `query_message`, limited to `token_budget` tokens,
    and is sent as the user message after a fixed system message, with
    temperature 0. When `print_message` is true the prompt is printed first.
    Returns the content of the first choice in the response.
    """
    prompt = query_message(query, df, model=model, token_budget=token_budget)
    if print_message:
        print(prompt)
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You answer questions about the text"},
            {"role": "user", "content": prompt},
        ],
        temperature=0
    )
    return completion.choices[0].message.content









def answering(pdf_doc, questions, batch_size=1000):
    """Answer each question in `questions` from the text of the PDF at `pdf_doc`.

    The PDF text is split into sentences, each distinct sentence is embedded
    with EMBEDDING_MODEL, and every question is answered by `ask` against the
    resulting text/embedding table.

    Args:
        pdf_doc: Path of the PDF, passed to `extract_text_from_pdf`.
        questions: Sequence of question strings.
        batch_size: Number of sentences sent per embeddings request.

    Returns:
        A JSON string mapping each question to its answer.

    Raises:
        ValueError: If the PDF yields no non-blank text.

    Side effects:
        Prints batch progress and a preview of the table, and writes the
        table to 'handbook_pdf_embeddings.csv' in the working directory.
    """
    pdf_text = extract_text_from_pdf(pdf_doc)
    # Drop blank fragments (the embeddings endpoint does not accept empty
    # strings) and repeated sentences, keeping first occurrences in document
    # order, before any embedding request is made.
    pdf_text_list = list(dict.fromkeys(
        sentence for sentence in split_text(pdf_text) if sentence.strip()
    ))
    if not pdf_text_list:
        raise ValueError(f"No text could be extracted from {pdf_doc!r}")

    embeddings = []
    for batch_start in range(0, len(pdf_text_list), batch_size):
        batch_end = batch_start + batch_size
        batch = pdf_text_list[batch_start:batch_end]
        print(f"Batch {batch_start} to {batch_end-1}")
        response = client.embeddings.create(model=EMBEDDING_MODEL, input=batch)
        # Order by the returned index so each embedding lines up with its sentence.
        ordered = sorted(response.data, key=lambda item: item.index)
        embeddings.extend(item.embedding for item in ordered)

    df = pd.DataFrame({"text": pdf_text_list, "embedding": embeddings})
    print(df.head())
    # Written as an on-disk artifact only. Answers use the in-memory table:
    # re-reading the CSV would turn sentences such as "NA" into NaN and would
    # require parsing every embedding back from text.
    df.to_csv('handbook_pdf_embeddings.csv')

    answers = [ask(question, df) for question in questions]
    return json.dumps(dict(zip(questions, answers)))




In [43]:
# Questions to answer from the handbook PDF.
questions = [
    "What is the name of the company?",
    "Who is the CEO of the company?",
    "What is their vacation policy?",
    "What is the termination policy?",
]

# answering() returns a JSON string mapping each question to its answer.
output = answering("handbook.pdf", questions)
output

Batch 0 to 999
                                                text  \
0                                        Zania, Inc.   
1  Zania Employee Handbook\nSeptember 07, 2023\nT...   
2     You have just joined a dedicated organization.   
3      We hope that your employment with Zania, Inc.   
4                will be rewarding\nand challenging.   

                                           embedding  
0  [0.017726914957165718, 0.046172428876161575, 0...  
1  [0.02425610087811947, 0.010685230605304241, 0....  
2  [0.011331990361213684, 0.000718834693543613, 0...  
3  [0.02359427511692047, 0.04291785508394241, -0....  
4  [0.013710535131394863, -0.006862619426101446, ...  


'{"What is the name of the company?": "The name of the company is Zania, Inc.", "Who is the CEO of the company?": "Shruti Gupta is the CEO of the company.", "What is their vacation policy?": "The vacation policy allows full-time regular employees to receive vacation time immediately upon hire, with the amount of vacation received each year based on length of service and granted in a lump sum at the beginning of each year. Employees may accrue a certain number of hours/days/weeks of vacation for every period worked, up to a maximum accrual amount. Unused vacation can be carried over to the following year or the company may offer payment for the unused time. Employees are encouraged to use their vacation time, and requests for vacation must be made in advance. Seniority may determine priority in scheduling vacation times, and unused vacation may be forfeited upon separation of employment unless state law dictates otherwise.", "What is the termination policy?": "The termination policy inc

In [45]:
# Post the JSON string of answers to Slack.
send_to_slack(output)









In [44]:
# Display the JSON string returned by answering().
output

'{"What is the name of the company?": "The name of the company is Zania, Inc.", "Who is the CEO of the company?": "Shruti Gupta is the CEO of the company.", "What is their vacation policy?": "The vacation policy allows full-time regular employees to receive vacation time immediately upon hire, with the amount of vacation received each year based on length of service and granted in a lump sum at the beginning of each year. Employees may accrue a certain number of hours/days/weeks of vacation for every period worked, up to a maximum accrual amount. Unused vacation can be carried over to the following year or the company may offer payment for the unused time. Employees are encouraged to use their vacation time, and requests for vacation must be made in advance. Seniority may determine priority in scheduling vacation times, and unused vacation may be forfeited upon separation of employment unless state law dictates otherwise.", "What is the termination policy?": "The termination policy inc