In [None]:
%load_ext dotenv
%dotenv
import openai
import re
import requests
import sys
from num2words import num2words
import os
import pandas as pd
import numpy as np
from openai.embeddings_utils import get_embedding, cosine_similarity
from transformers import GPT2TokenizerFast
from pprint import pprint

# Azure OpenAI credentials are read from environment variables
# (the %dotenv magic above loads them from a local .env file when present).
API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")
RESOURCE_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")

# Fail fast with a clear message instead of a TypeError on `None + str` below.
if not API_KEY or not RESOURCE_ENDPOINT:
    raise RuntimeError(
        "AZURE_OPENAI_API_KEY and AZURE_OPENAI_ENDPOINT must be set "
        "(for example in a .env file) before running this notebook."
    )

openai.api_type = "azure"
openai.api_key = API_KEY
openai.api_base = RESOURCE_ENDPOINT
openai.api_version = "2022-12-01"
print(openai.api_base)

# List the deployments available on the resource. The API version is taken from
# the single setting above, and a trailing slash on the endpoint is tolerated.
url = RESOURCE_ENDPOINT.rstrip("/") + "/openai/deployments?api-version=" + openai.api_version

# A timeout keeps the cell from hanging forever if the endpoint is unreachable.
r = requests.get(url, headers={"api-key": API_KEY}, timeout=30)

print(r.text)

In [None]:
def normalize_text(s, sep_token = " \n "):
    """Collapse whitespace and tidy stray punctuation in a block of text.

    Parameters
    ----------
    s : str
        The raw text to clean.
    sep_token : str, optional
        Not used by this function; kept so existing callers that pass it
        continue to work.

    Returns
    -------
    str
        ``s`` with every whitespace run (including newlines) collapsed to a
        single space, literal ". ," sequences removed, ".." and ". ." reduced
        to ".", and leading/trailing whitespace stripped.
    """
    # Collapse all whitespace runs, newlines included, into single spaces.
    s = re.sub(r'\s+', ' ', s).strip()
    # The dot is escaped so only a literal ". ," is removed; unescaped, it
    # matched ANY character and ate the last letter of a word before " ,".
    s = re.sub(r"\. ,", "", s)
    s = s.replace("..", ".")
    s = s.replace(". .", ".")
    # No separate newline removal is needed: none survive the collapse above.
    return s.strip()


In [None]:
from glob import glob
print(glob("data/*.txt"))
# FAQ Example
# NOTE(review): the file name is spelled "openning" here; presumably it matches
# the file on disk, so it is left untouched.
with open("data/openning bank account - FAQ.txt", "r") as file:
    doc = file.read()

# Ask the model to turn the FAQ text into a JSON array of single-pair objects.
# In an f-string a literal brace is written by doubling it ({{ and }}); the
# previous "\}}" emitted a stray backslash into the prompt. The example objects
# are comma-separated so the format shown to the model is itself valid JSON.
prompt = f"""
            {normalize_text(doc)}
            Create a list of questions and answer based on the FAQ document above. 
            Create the output as a readable json format with the following format:
            
            [
                {{"<the question1>":"<the answer1>"}},
                {{"<the question2>":"<the answer2>"}}
            ]
            """    
# `engine` is passed through as-is; presumably it is the Azure deployment name.
response = openai.Completion.create(engine="davinchi-003", prompt=prompt, max_tokens=1024)
text = response['choices'][0]['text']
    

In [None]:
import json
print(text)

# Parse the completion as JSON. If the raw text is not valid JSON (for example
# the model added words around the array), retry on the outermost [...] span.
# The fallback only runs when the direct parse fails, so valid output is
# handled exactly as before.
try:
    new_text = json.loads(text)
except json.JSONDecodeError:
    array_start, array_end = text.find("["), text.rfind("]")
    if not 0 <= array_start < array_end:
        # No bracketed span to salvage: surface the original parse error.
        raise
    new_text = json.loads(text[array_start:array_end + 1])

pprint(new_text)

In [None]:
import pandas as pd

# Flatten the parsed list of {question: answer} objects into [question, answer]
# rows. Every pair of every object is kept (previously only the first pair of
# each object was used, and an empty object raised IndexError).
json_list = [
    [question, answer]
    for qa_pairs in new_text
    for question, answer in qa_pairs.items()
]

# Passing the column names to the constructor also works when there are no
# rows, where assigning `df.columns` afterwards would raise a length mismatch.
df = pd.DataFrame(json_list, columns=['questions', 'answers'])

# Embed every answer and every question with the 'embedding-model' engine.
df['curie_search_answers'] = df["answers"].apply(lambda x : get_embedding(x, engine = 'embedding-model'))
df['curie_search_questions'] = df["questions"].apply(lambda x : get_embedding(x, engine = 'embedding-model'))
print(df)



In [None]:
def search_docs(df, user_query, top_n=3, to_print=True):
    """Rank the rows of ``df`` by similarity of their answers to ``user_query``.

    The query is embedded with the "embedding-model" engine and compared, via
    cosine similarity, with the vectors stored in the ``curie_search_answers``
    and ``curie_search_questions`` columns.

    Side effect: the columns ``similarities_answers`` and
    ``similarities_questions`` are written onto ``df`` itself (overwritten on
    every call).

    Parameters
    ----------
    df : DataFrame with ``curie_search_answers`` / ``curie_search_questions``.
    user_query : text to search for.
    top_n : number of best-matching rows to return.
    to_print : accepted but not used by this function.

    Returns
    -------
    The ``top_n`` rows with the highest ``similarities_answers``, best first.
    """
    query_vector = get_embedding(user_query, engine="embedding-model")

    def similarity_to_query(vector):
        # Cosine similarity between a stored vector and the query vector.
        return cosine_similarity(vector, query_vector)

    df["similarities_answers"] = df.curie_search_answers.apply(similarity_to_query)
    df["similarities_questions"] = df.curie_search_questions.apply(similarity_to_query)

    ranked = df.sort_values("similarities_answers", ascending=False)
    return ranked.head(top_n)

def rephrase_answer(question, res, th_answers=0.8, th_questions=0.8):
    """Produce a reply to ``question`` from the search results in ``res``.

    Parameters
    ----------
    question : str
        The user's question; interpolated into the prompt.
    res : mapping
        Column-oriented results with the keys ``"answers"``, ``"questions"``,
        ``"similarities_answers"`` and ``"similarities_questions"``; each value
        maps a row key to that row's value (the callers in this notebook pass
        ``DataFrame.to_dict()``).
    th_answers : float
        An answer is used only if its similarity is strictly above this.
    th_questions : float
        In the fallback, a stored question matches only if its similarity is
        strictly above this.

    Returns
    -------
    str
        * If at least one answer clears ``th_answers``: the completion text
          returned by the model when asked to rephrase those answers.
        * Otherwise, if a stored question clears ``th_questions``: the stored
          answer of the MOST similar such question, returned verbatim.
        * Otherwise: a fixed "don't know" message.
    """
    answers = res["answers"]
    questions = res["questions"]
    similarities_answers = res["similarities_answers"]
    similarities_questions = res["similarities_questions"]

    prompt = f"""
            The user asked: {question}, and the answers are:
            """
    relevant_answers = [
        v for k, v in answers.items() if similarities_answers[k] > th_answers
    ]
    for v in relevant_answers:
        prompt += f"""
            {v} \n
            """

    if not relevant_answers:
        print("no answer found to the question, looking for similar questions")
        matching_keys = [
            k for k, _ in questions.items() if similarities_questions[k] > th_questions
        ]
        if not matching_keys:
            return "Sorry, I don't know the answer to that question."
        # Pick the most similar question rather than the first one over the
        # threshold; on ties max() keeps the earliest, i.e. the original order.
        best_key = max(matching_keys, key=lambda k: similarities_questions[k])
        print(questions[best_key])
        return answers[best_key]

    prompt += "rephrase the answer to the question above."
    print(f"prompt={prompt}")
    response = openai.Completion.create(engine="davinchi-003", prompt=prompt, max_tokens=400)
    return response['choices'][0]['text']

In [None]:
# Look up the two closest FAQ rows for the question, then build the reply.
user_question = "will it affect my credit"
res = search_docs(df, user_question, top_n=2)
answer = rephrase_answer(user_question, res.to_dict())

# One print call; sep="\n" puts each argument on its own line, giving the
# same blank-line-separated layout as printing them one at a time.
print("", f"question: {user_question}", "", "answer:", "", answer, sep="\n")

In [None]:
# Look up the two closest FAQ rows for the question, then build the reply.
user_question = "Do you accept Quebec health card?"
res = search_docs(df, user_question, top_n=2)
answer = rephrase_answer(user_question, res.to_dict())

# One print call; sep="\n" puts each argument on its own line, giving the
# same blank-line-separated layout as printing them one at a time.
print("", f"question: {user_question}", "", "answer:", "", answer, sep="\n")

In [None]:
# Look up the two closest FAQ rows for the question, then build the reply.
user_question = "Do you accept an israeli Driver licsnce?"
res = search_docs(df, user_question, top_n=2)
answer = rephrase_answer(user_question, res.to_dict())

# One print call; sep="\n" puts each argument on its own line, giving the
# same blank-line-separated layout as printing them one at a time.
print("", f"question: {user_question}", "", "answer:", "", answer, sep="\n")

In [None]:
# Look up the two closest FAQ rows for the question, then build the reply.
user_question = "When Ben Gurion were born?"
res = search_docs(df, user_question, top_n=2)
answer = rephrase_answer(user_question, res.to_dict())

# One print call; sep="\n" puts each argument on its own line, giving the
# same blank-line-separated layout as printing them one at a time.
print("", f"question: {user_question}", "", "answer:", "", answer, sep="\n")