In [13]:
%load_ext dotenv
%dotenv
import openai
import re
import requests
import sys
from num2words import num2words
import os
import pandas as pd
import numpy as np
from openai.embeddings_utils import get_embedding, cosine_similarity
from transformers import GPT2TokenizerFast
from pprint import pprint

API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")
RESOURCE_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")

openai.api_type = "azure"
openai.api_key = API_KEY
openai.api_base = RESOURCE_ENDPOINT
openai.api_version = "2022-12-01"
print(openai.api_base)
url = openai.api_base + "/openai/deployments?api-version=2022-12-01"

r = requests.get(url, headers={"api-key": API_KEY})

print(r.text)

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv
https://amit-sandbox.openai.azure.com/
{
  "data": [
    {
      "scale_settings": {
        "scale_type": "standard"
      },
      "model": "text-ada-001",
      "owner": "organization-owner",
      "id": "Ada1",
      "status": "succeeded",
      "created_at": 1676559783,
      "updated_at": 1676559783,
      "object": "deployment"
    },
    {
      "scale_settings": {
        "scale_type": "standard"
      },
      "model": "text-embedding-ada-002",
      "owner": "organization-owner",
      "id": "embedding-model",
      "status": "succeeded",
      "created_at": 1676563276,
      "updated_at": 1676563276,
      "object": "deployment"
    },
    {
      "scale_settings": {
        "scale_type": "standard"
      },
      "model": "text-davinci-003",
      "owner": "organization-owner",
      "id": "davinchi-003",
      "status": "succeeded",
      "created_at": 1676563605,
      "updated_at": 167656360

In [14]:
def normalize_text(s, sep_token = " \n "):
    print(s)
    s = re.sub(r'\s+',  ' ', s).strip()
    s = re.sub(r". ,","",s)
    # remove all instances of multiple spaces
    s = s.replace("..",".")
    s = s.replace(". .",".")
    s = s.replace("\n", "")
    s = s.strip()
    
    return s


In [15]:
from glob import glob
print(glob("data/*.txt"))
# FAQ Example
with open("data/openning bank account - FAQ.txt", "r") as file:
    doc = file.read()

prompt = f"""
            {normalize_text(doc)}
            Create a list of questions and answer based on the FAQ document above. 
            Create the output as a readable json format with the following format:
            
            [
                {{"<the question1>":"<the answer1>"\}}
                {{"<the question2>":"<the answer2>"\}}
            ]
            """    
response = openai.Completion.create(engine="davinchi-003", prompt=prompt, max_tokens=1024)
text = response['choices'][0]['text']
    

['data/openning bank account - FAQ.txt']
Q: What documents do I need to bring with me to open a bank account at this bank in the US?
A: You will need to bring a valid government-issued photo ID, such as a driver's license or passport, and proof of your current address, such as a utility bill or lease agreement.

Q: Can I open a bank account if I don't have a Social Security Number?
A: Yes, you can open a bank account without a Social Security Number, but you will need to provide an Individual Taxpayer Identification Number (ITIN) instead.

Q: What types of bank accounts are available at this bank?
A: This bank offers a variety of bank accounts, including checking accounts, savings accounts, money market accounts, and CDs.

Q: Is there a minimum balance requirement to open and maintain a bank account?
A: Yes, there is a minimum balance requirement for each type of account. The amount varies depending on the type of account you choose.

Q: Are there any monthly fees associated with maint

In [18]:
import json
print(text)
new_text = json.loads(text)

pprint(new_text)


            [
                {"What documents do I need to bring with me to open a bank account at this bank in the US?":"You will need to bring a valid government-issued photo ID, such as a driver's license or passport, and proof of your current address, such as a utility bill or lease agreement."},
                {"Can I open a bank account if I don't have a Social Security Number?": "Yes, you can open a bank account without a Social Security Number, but you will need to provide an Individual Taxpayer Identification Number (ITIN) instead."},
                {"What types of bank accounts are available at this bank?":"This bank offers a variety of bank accounts, including checking accounts, savings accounts, money market accounts, and CDs."},
                {"Is there a minimum balance requirement to open and maintain a bank account?": "Yes, there is a minimum balance requirement for each type of account. The amount varies depending on the type of account you choose."},
           

JSONDecodeError: Unterminated string starting at: line 10 column 109 (char 1875)

In [None]:
import pandas as pd

# Define the input list of JSON objects
json_list = []

for l in new_text:
    json_list.append([list(l.keys())[0], list(l.values())[0]])
# Convert the list of JSON objects to a pandas dataframe
df = pd.DataFrame(json_list)
# Rename the columns of the dataframe
df.columns = ['questions', 'answers']

# Print the resulting dataframe

df['curie_search_answers'] = df["answers"].apply(lambda x : get_embedding(x, engine = 'embedding-model'))
df['curie_search_questions'] = df["questions"].apply(lambda x : get_embedding(x, engine = 'embedding-model'))
print(df)



                                           questions  \
0               Can applying impact my credit score?   
1  Can I apply in-store for the Hudson's Bay Mast...   
2                   Why is my application in review?   
3        What forms of identification do you accept?   

                                             answers  \
0  When you apply for a credit card, it may affec...   
1  Yes, you can! You can apply in-store at a Huds...   
2  To approve your application, we need to verify...   
3  To apply for any of the financial services we ...   

                                curie_search_answers  \
0  [0.0033241556957364082, -0.031376685947179794,...   
1  [-0.0011646063067018986, -0.013958116993308067...   
2  [0.01236796099692583, -0.026605697348713875, 0...   
3  [0.007008134387433529, -0.019945211708545685, ...   

                              curie_search_questions  
0  [-0.012194481678307056, -0.03544021025300026, ...  
1  [0.009202724322676659, -0.017308557406067848

In [11]:
def search_docs(df, user_query, top_n=3, to_print=True):
    embedding = get_embedding(
        user_query,
        engine="embedding-model"
    )
    # print(embedding)
    df["similarities_answers"] = df.curie_search_answers.apply(lambda x: cosine_similarity(x, embedding))
    df["similarities_questions"] = df.curie_search_questions.apply(lambda x: cosine_similarity(x, embedding))
    
    res = (
        df.sort_values("similarities_answers", ascending=False)
        .head(top_n)
    )
    return res

def rephrase_answer(question, res, th_answers=0.8, th_questions=0.8):
    answers = res["answers"]
    similarities_answers = res["similarities_answers"]
    similarities_questions = res["similarities_questions"]
    prompt = f"""
            The user asked: {question}, and the answers are:
            """
    answer_counter = 0
    for k,v in answers.items():
        if similarities_answers[k] > th_answers:
            prompt += f"""
            {v} \n
            """
            answer_counter += 1
    
    if answer_counter == 0:
        print("no answer found to the question, looking for similar questions")
        for k,v in res["questions"].items():
            if similarities_questions[k] > th_questions:
                prompt += f"""
                {v}
                """
                print(res["questions"][k])
                return res["answers"][k]
        return "Sorry, I don't know the answer to that question."
    
    prompt += "rephrase the answer to the question above."
    print(f"prompt={prompt}")
    response = openai.Completion.create(engine="davinchi-003", prompt=prompt, max_tokens=400)
    text = response['choices'][0]['text']
    return(text)

In [12]:
user_question = "will it affect my credit"
res = search_docs(df, user_question, top_n=2)
answer = rephrase_answer(user_question, res.to_dict())
print("")
print(f"question: {user_question}")
print("")
print("answer:")
print("")
print(answer)

NameError: name 'df' is not defined

In [18]:
user_question = "Do you accept Quebec health card?"
res = search_docs(df, user_question, top_n=2)
answer = rephrase_answer(user_question, res.to_dict())
print("")
print(f"question: {user_question}")
print("")
print("answer:")
print("")
print(answer)

prompt=
            The user asked: Do you accept Quebec health card?, and the answers are:
            
            To apply for any of the financial services we offer, you must submit identification as part of the application process. We want to always make sure that you really are you! Note: The photos submitted for your application must be taken in real-time during your application process. These photos will be deleted after 90 days. We accept the following forms of ID:  * Provincial driver's license * Provincial identification card * Quebec health card * British Columbia Services Card * British Columbia Health card * Permanent resident card * Canadian passport * Canadian citizenship card issued prior to 2012 * Indian status card We don’t accept the following forms of ID: * Identification issued outside Canada * Canadian identification without a photo * Identification that has been photocopied, scanned, or photographed at a previous date * Expired identification card * Ontario heal

In [19]:
user_question = "Do you accept an israeli Driver licsnce?"
res = search_docs(df, user_question, top_n=2)
answer = rephrase_answer(user_question, res.to_dict())
print("")
print(f"question: {user_question}")
print("")
print("answer:")
print("")
print(answer)

no answer found to the question, looking for similar questions

question: Do you accept an israeli Driver licsnce?

answer:

To apply for any of the financial services we offer, you must submit identification as part of the application process. We want to always make sure that you really are you! Note: The photos submitted for your application must be taken in real-time during your application process. These photos will be deleted after 90 days. We accept the following forms of ID:  * Provincial driver's license * Provincial identification card * Quebec health card * British Columbia Services Card * British Columbia Health card * Permanent resident card * Canadian passport * Canadian citizenship card issued prior to 2012 * Indian status card We don’t accept the following forms of ID: * Identification issued outside Canada * Canadian identification without a photo * Identification that has been photocopied, scanned, or photographed at a previous date * Expired identification card * Onta

In [None]:
user_question = "When Ben Gurion were born?"
res = search_docs(df, user_question, top_n=2)
answer = rephrase_answer(user_question, res.to_dict())
print("")
print(f"question: {user_question}")
print("")
print("answer:")
print("")
print(answer)


question: When Ben Gurion were born?

answer:

Sorry, I don't know the answer to that question.
