In [None]:
%load_ext dotenv
%dotenv
import openai
import re
import requests
import sys
from num2words import num2words
import os
import pandas as pd
import numpy as np
from openai.embeddings_utils import get_embedding, cosine_similarity
from transformers import GPT2TokenizerFast

# Credentials and endpoint are read from the environment (never hard-coded).
API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")
RESOURCE_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")

# Fail early with a readable message instead of an opaque TypeError when the
# endpoint is concatenated into the URL below.
missing_settings = [
    name
    for name, value in (
        ("AZURE_OPENAI_API_KEY", API_KEY),
        ("AZURE_OPENAI_ENDPOINT", RESOURCE_ENDPOINT),
    )
    if not value
]
if missing_settings:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(missing_settings)
    )

# Single source of truth for the API version used by both the SDK and the raw request.
API_VERSION = "2022-12-01"

openai.api_type = "azure"
openai.api_key = API_KEY
openai.api_base = RESOURCE_ENDPOINT
openai.api_version = API_VERSION

# Build the deployments URL; strip a trailing slash so the path never contains "//".
url = f"{RESOURCE_ENDPOINT.rstrip('/')}/openai/deployments?api-version={API_VERSION}"

# A timeout keeps the cell from hanging indefinitely on an unreachable endpoint.
r = requests.get(url, headers={"api-key": API_KEY}, timeout=30)

print(r.text)

In [None]:
# FAQ content sourced from https://azure.microsoft.com/en-us/free/free-account-faq/
df_faq = pd.read_csv(filepath_or_buffer="data/azure-free-account-faq.csv")
df_faq

In [None]:
def normalize_text(s, sep_token = " \n "):
    """Collapse whitespace and tidy stray punctuation in a piece of text.

    Args:
        s: Input text to clean.
        sep_token: Unused; kept so existing callers that pass it keep working.

    Returns:
        The cleaned string: whitespace runs (including newlines) become a
        single space, literal ". ," sequences are removed, ".." and ". ."
        are reduced to ".", and leading/trailing whitespace is stripped.
    """
    # Collapse every whitespace run (spaces, tabs, newlines) into one space.
    s = re.sub(r'\s+', ' ', s).strip()
    # Remove a literal ". ," sequence. The dot is escaped so the pattern no
    # longer matches (and deletes) an arbitrary character before " ,".
    s = re.sub(r"\. ,", "", s)
    # Reduce doubled periods left behind in the source text.
    s = s.replace("..", ".")
    s = s.replace(". .", ".")
    return s.strip()

# Clean every FAQ answer in place of the raw text.
df_faq["answer"] = df_faq["answer"].apply(normalize_text)

In [None]:
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
# Number of GPT-2 tokens in each answer.
df_faq['n_tokens'] = df_faq["answer"].apply(lambda x: len(tokenizer.encode(x)))

# Remove any answers that are too long for the token limit (~2000 tokens).
MAX_ANSWER_TOKENS = 2000
# .copy() detaches the filtered frame from the original so that adding columns
# to it later does not raise SettingWithCopyWarning.
df_faq = df_faq.loc[df_faq["n_tokens"] < MAX_ANSWER_TOKENS].copy()
df_faq

In [None]:
# understanding tokenization
# https://platform.openai.com/tokenizer

# Use positional access: the row labelled 0 is not guaranteed to exist once
# rows have been filtered out of the frame.
first_answer = df_faq["answer"].iloc[0]
print(first_answer)

understand_tokenization = tokenizer.tokenize(first_answer)
# Only the last expression of a cell is rendered, so print the tokens explicitly.
print(understand_tokenization)
len(understand_tokenization)

In [None]:
# Compute one embedding per FAQ answer via the "embedding-model" engine and
# store it in the `curie_search` column used by the similarity search.
df_faq["curie_search"] = df_faq["answer"].apply(
    lambda answer_text: get_embedding(answer_text, engine="embedding-model")
)
df_faq

In [None]:
def search_docs(df, user_query, top_n=3, to_print=True):
    """Return the ``top_n`` rows of ``df`` most similar to ``user_query``.

    Args:
        df: DataFrame with a ``curie_search`` column holding one embedding per row.
        user_query: Free-text question to embed and compare against each row.
        top_n: Number of best-matching rows to return.
        to_print: When true, render the result with ``display`` before returning.

    Returns:
        A new DataFrame containing the ``top_n`` rows sorted by descending
        ``similarities``. ``df`` itself is left unmodified.
    """
    # Embed the query once, with the same engine used for the stored embeddings.
    query_embedding = get_embedding(user_query, engine="embedding-model")
    # Cosine similarity between the query and every stored embedding.
    similarities = df["curie_search"].apply(
        lambda row_embedding: cosine_similarity(row_embedding, query_embedding)
    )
    # assign() works on a copy, so the caller's frame gains no hidden column and
    # no SettingWithCopyWarning is raised when `df` is a filtered slice.
    res = (
        df.assign(similarities=similarities)
        .sort_values("similarities", ascending=False)
        .head(top_n)
    )
    if to_print:
        display(res)
    return res



In [None]:
# Question to answer from the FAQ; keep the two closest answers.
user_question = "how much the service cost?"
res = search_docs(df=df_faq, user_query=user_question, top_n=2)

In [None]:
# Show the two retrieved answers, best match first.
top_answers = res["answer"]
print(f"best match answer: \n {top_answers.iloc[0]}")
print(f"answer 2: \n {top_answers.iloc[1]}")


In [None]:
# Build a prompt that lists every retrieved answer and asks the model to merge them.
answers = list(res["answer"].values)

prompt = f"The user asked: {user_question}, and the answers are:\n\n"
# Number the answers from 1 so the instruction below can refer to them.
prompt += "".join(
    f"answer {number}: {answer}\n\n" for number, answer in enumerate(answers, start=1)
)
# join() puts " and " only between items, so no trailing text has to be sliced
# off afterwards (the slice left doubled spaces and cut into the instruction
# itself when there were no answers).
answer_refs = " and ".join(f"answer {number}" for number in range(1, len(answers) + 1))
prompt += f"\n combine the information in {answer_refs} and rephrase into a single answer."
print("The prompt is:\n")
print(prompt)


In [None]:
# Send the assembled prompt to the completion engine.
response = openai.Completion.create(engine="davinchi-003", prompt=prompt, max_tokens=1024)
raw_text = response['choices'][0]['text']
# Turn line breaks into single spaces instead of deleting them, so words and
# sentences on adjacent lines are not fused together.
text = re.sub(r"\s*\n\s*", " ", raw_text).replace(' .', '.').strip()
print(text)

In [None]:
# A question sent on its own, with no FAQ answers included in the prompt.
prompt = """
How do I get popular services free for 12 months?

"""

response = openai.Completion.create(engine="davinchi-003", prompt=prompt, max_tokens=1024)
raw_text = response['choices'][0]['text']
# Turn line breaks into single spaces instead of deleting them, so words and
# sentences on adjacent lines are not fused together.
text = re.sub(r"\s*\n\s*", " ", raw_text).replace(' .', '.').strip()
print(text)
