In [1]:
import sys
sys.path.append('../backend')

In [2]:
import modules.document.utils.policyqa_parser as policyqa_parser

policy_doc = policyqa_parser.parse_policy_text_by_title("ticketmaster.com", "../data/dev.json")
print(policy_doc.text)


Live Nation Entertainment Privacy Policy - Your Privacy Rights Effective July 20, 2012 (last updated October 08, 2013)
This Privacy Policy applies to the sites and apps where it appears. This Policy describes how we treat personal information we collect both online and offline. This includes on our websites or in our apps. It also includes at our box offices or in phone or email interactions you have with us. If you live in Canada, please read our Canadian Privacy Policy.
We collect information from and about you. Contact information. For example, we might collect your name and street address. We might also collect your phone number or email.
Payment and billing information. For example, we collect your credit card number and zip code when you buy a ticket.
Information you post. For example, we collect information you post in a public space on our website or on a third-party social media site.
Demographic information. We may collect information about events you like or products you bu

In [3]:
policyqa_parser.get_all_policy_titles("../data/dev.json")

['ticketmaster.com',
 'theatlantic.com',
 'sci-news.com',
 'yahoo.com',
 'style.com',
 'adweek.com',
 'cariboucoffee.com',
 'kaleidahealth.org',
 'fool.com',
 'buffalowildwings.com',
 'post-gazette.com',
 'internetbrands.com',
 'mlb.mlb.com',
 'eatchicken.com',
 'ted.com',
 'naturalnews.com',
 'cbsinteractive.com',
 'washingtonian.com',
 'dogbreedinfo.com',
 'walmart.com']

In [4]:
from config import config

config.add_title

True

In [5]:
from modules.document.service import create as create_document

async def save_policy_to_db(policy_doc):
    policy_obj = await create_document(policy_doc)
    return policy_obj

async def save_all_policies_to_db(file_path: str="../data/dev.json"):
    policy_titles = policyqa_parser.get_all_policy_titles(file_path)
    for title in policy_titles:
        policy_doc = policyqa_parser.parse_policy_text_by_title(title, file_path)
        policy_obj = await save_policy_to_db(policy_doc)
        print(f"saved policy id:{policy_obj.id}, title:{policy_obj.title}, doc_type:{policy_obj.doc_type} to database")

# policy_doc = policyqa_parser.parse_policy_text_by_title("rockstargames.com", "../data/test.json")
# policy_obj = await save_policy_to_db(policy_doc)
# print(f"saved policy id:{policy_obj.id}, title:{policy_obj.title}, doc_type:{policy_obj.doc_type} to database")

# Only run this to save all policies to database
# await save_all_policies_to_db("../data/dev.json")

In [6]:
from modules.answer.schemas import AnswerCreate
from modules.llm.llm_infos import Model

request = AnswerCreate(
    question="in ticketmaster.com, Does the company collect user's financial information?",
    model=Model.Gpt4,
    prompt="Answer the question by copying exactly a portion of the contexts, "
           "The portion you copy to answer the question should be as short and concise as possible, "
           "and if the answer is not contained within the text in the contexts, say `I don't know.`"
)


In [7]:
from modules.answer.service import create as create_answer
from collections import deque
import openai

openai.api_key = config.api_key
#question_embedding, answer_embeddings, answer_generator, num_tokens = await create_answer(request)
#answer_text = deque(answer_generator, maxlen=1)[0]

#print(answer_text)
#print(f'num_tokens: {num_tokens}')

In [8]:
from modules.llm.utils import vector_similarity
import pprint
    
def get_relevant_embeddings(answer_embeddings, question_embedding):
    relevant_embeddings = {"data": []}
    for i, embedding in enumerate(answer_embeddings):
        relevant_embedding = {}
        score = vector_similarity(embedding.values, question_embedding)
        relevant_embedding["embedding_id"] = str(embedding.id)
        relevant_embedding["rank"] = i+1
        relevant_embedding["title"] = embedding.document.title
        relevant_embedding["offset"] = embedding.offset
        relevant_embedding["score"] = score
        relevant_embedding["text"] = embedding.text
        relevant_embeddings["data"].append(relevant_embedding)
    return relevant_embeddings

#pprint.pprint(get_relevant_embeddings(answer_embeddings, question_embedding))
#print(get_relevant_embeddings(answer_embeddings, question_embedding))

In [9]:
print(f'The average context size is: {policyqa_parser.calculate_avg_context_size("../data/dev.json")}')
print(f'The average answer size is: {policyqa_parser.calculate_avg_answer_size("../data/dev.json")}')

The average context size is: 430.5452961672474
The average answer size is: 70.21095301125082


In [10]:
file_path = "../data/test.json"
file_name = file_path.split("/")[-1].split(".")[0]
file_name

'test'

In [16]:
import json
from tqdm import tqdm
import time

def count_qas(file_path: str="../data/dev.json"):
    count = 0
    with open(file_path, "r") as f:
        json_data = json.load(f)
        if "data" in json_data:
            for policy in tqdm(json_data["data"]):
                for paragraph in tqdm(policy["paragraphs"]):
                    for qa in paragraph["qas"]:
                        count += 1
                        time.sleep(0.001)
    return count


qa_count = count_qas("../data/dev.json")
print(f'total number of qas is: {qa_count}')

  0%|          | 0/20 [00:00<?, ?it/s]
  0%|          | 0/39 [00:00<?, ?it/s][A
100%|██████████| 39/39 [00:00<00:00, 218.78it/s][A
  5%|▌         | 1/20 [00:00<00:03,  5.44it/s]
  0%|          | 0/33 [00:00<?, ?it/s][A
 27%|██▋       | 9/33 [00:00<00:00, 86.22it/s][A
100%|██████████| 33/33 [00:00<00:00, 120.78it/s][A
 10%|█         | 2/20 [00:00<00:04,  4.13it/s]
100%|██████████| 12/12 [00:00<00:00, 172.53it/s]

  0%|          | 0/29 [00:00<?, ?it/s][A
100%|██████████| 29/29 [00:00<00:00, 186.28it/s][A
 20%|██        | 4/20 [00:00<00:02,  6.16it/s]
  0%|          | 0/29 [00:00<?, ?it/s][A
 24%|██▍       | 7/29 [00:00<00:00, 67.08it/s][A
100%|██████████| 29/29 [00:00<00:00, 122.21it/s][A
 25%|██▌       | 5/20 [00:00<00:02,  5.36it/s]
  0%|          | 0/16 [00:00<?, ?it/s][A
100%|██████████| 16/16 [00:00<00:00, 97.44it/s][A
 30%|███       | 6/20 [00:01<00:02,  5.54it/s]
  0%|          | 0/29 [00:00<?, ?it/s][A
 41%|████▏     | 12/29 [00:00<00:00, 115.63it/s][A
100%|██████████

total number of qas is: 3809





In [18]:
import json
from tqdm import tqdm
import time
import logging
import os

rate_limit_per_minute = 500
tokens_per_minute = 10000
delay = 60.0 / rate_limit_per_minute

def get_saved_qas(file_path: str):
    saved_qas = []
    if os.path.exists(file_path):
        with open(file_path, "r") as f:
            json_data = json.load(f)
            if "data" in json_data:
                for qa in json_data["data"]:
                    saved_qas.append(qa["id"])
    return saved_qas

async def run_all_policy_qas(file_path: str="../data/dev.json", model: Model=Model.Gpt4):
    running_tokens = 0
    start_time = time.time()
    results = {"data": []}
    file_name = file_path.split("/")[-1].split(".")[0]
    
    # check if there are saved results and load them
    saved_qas = get_saved_qas(f"../results/{file_name}_results.json")
    if len(saved_qas) > 0:
        with open(f"../results/{file_name}_results.json", "r") as f:
            results = json.load(f)
        
    with open(file_path, "r") as f:
        json_data = json.load(f)
        if "data" in json_data:
            for policy in tqdm(json_data["data"]):
                title = policy["title"]
                logging.info(f"processing policy: {title}")
                for i, paragraph in tqdm(enumerate(policy["paragraphs"])):
                    for qa in paragraph["qas"]:
                        q_id = qa["id"]
                        if q_id in saved_qas:
                            print(f"skipping question {q_id} because it is already saved")
                            continue
                        question = qa["question"]
                        answers = qa["answers"]
                        request = AnswerCreate(
                            question=f'in {title}, {question}',
                            model=model,
                            prompt="Answer the question by copying exactly a portion of the contexts, "
                                   "The portion you copy to answer the question should be as short and concise as possible, "
                                   "and if the answer is not contained within the text in the contexts, say `I don't know.`"
                        )

                        if running_tokens > tokens_per_minute*0.8 and time.time() - start_time < 60.0:
                            time.sleep((time.time() - start_time)+15)
                            start_time = time.time()
                            running_tokens = 0
                        
                        question_embedding, answer_embeddings, answer_generator, num_tokens = await create_answer(request)
                        
                        running_tokens += num_tokens
                        
                        time.sleep(delay)
                        answer_text = deque(answer_generator, maxlen=1)[0]
                        relevant_embeddings = get_relevant_embeddings(answer_embeddings, question_embedding)
                        results["data"].append({
                            "title": title,
                            "id": q_id,
                            "question": question,
                            "answers": answers,
                            "pred_answer": answer_text,
                            "relevant_embeddings": relevant_embeddings["data"]
                        })
                    # save results to file after each paragraph
                    with open(f"../results/{file_name}_results.json", "w") as f:
                        json.dump(results, f, indent=4)
                        print(f"saved results to ../results/{file_name}_results.json for title {title} paragraph {i}")
                        
        
        
async def run_all_policy_qas_dev():
    await run_all_policy_qas("../data/dev.json")
    
async def run_all_policy_qas_test():
    await run_all_policy_qas("../data/test.json")

In [19]:
#test GPT-4 responses with small dataset

#await run_all_policy_qas("../data/dev-small.json")

In [20]:
#get all responses for the dev dataset

#await run_all_policy_qas_dev()

In [21]:
from backend.evals.torchmetrics_eval_script import eval_squad_metrics

eval_squad_metrics("../results/dev_results.json")

100%|██████████| 3809/3809 [00:00<00:00, 720845.73it/s]


{'exact_match': tensor(1.9690), 'f1': tensor(17.8223)}

In [22]:
from backend.evals.squad_eval_script import Evaluator
import json

evaluator = Evaluator("../data/dev.json")
em, f1, precision, recall = evaluator.evaluate('../results/dev_results.json')
print(f"Exact Match: {em}\n F1-measure: {f1}\n Precision: {precision}\n Recall: {recall}")

Exact Match: 1.9690207403517983
 F1-measure: 17.992978969749082
 Precision: 15.164048089709462
 Recall: 50.71725889962931


In [7]:
from backend.evals.torchmetrics_eval_script import prepare_retrieval_data
top_k=2
_, indexes, preds, labels = prepare_retrieval_data("../results/dev_results.json", top_k=top_k)
print(len(indexes))
print(len(preds))
print(len(labels))

INFO:root:Preparing data for retrieval metrics
3809it [00:00, 271873.52it/s]

7618
7618
7618





In [8]:
print(indexes[0:15])
print(preds[0:15])
print(labels[0:100])

tensor([0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7])
tensor([0.8960, 0.8597, 0.8611, 0.8609, 0.8517, 0.8396, 0.8565, 0.8545, 0.8570,
        0.8496, 0.8649, 0.8593, 0.8574, 0.8493, 0.8716])
tensor([False, False,  True, False, False, False,  True, False, False,  True,
         True, False,  True,  True, False,  True, False, False, False,  True,
        False, False,  True,  True, False,  True, False, False, False,  True,
        False, False,  True, False, False,  True,  True,  True, False,  True,
        False, False,  True,  True, False, False, False, False, False, False,
        False, False, False, False, False, False,  True, False, False, False,
         True, False,  True, False,  True, False,  True, False, False,  True,
         True, False,  True, False,  True,  True,  True, False, False, False,
         True, False,  True, False,  True, False, False, False,  True, False,
        False,  True,  True, False, False,  True,  True, False,  True, False])


In [1]:
from backend.evals.torchmetrics_eval_script import eval_retrieval_metrics
from tqdm import tqdm
import logging

# logging.disable(logging.INFO)

retrieval_metrics = []
top_k=5
for i in tqdm(range(1, top_k+1)):
    retrieval_metrics.append(eval_retrieval_metrics("../results/dev_results.json", top_k=i))

for retrieval_metrics in retrieval_metrics:
    print(f'{retrieval_metrics}')

  0%|          | 0/5 [00:00<?, ?it/s]INFO:root:Preparing data for retrieval metrics

3809it [00:00, 761020.53it/s]

  0%|          | 0/3809 [00:00<?, ?it/s][AINFO:root:question_id:ctnk84gy3t6wdijv MAP: 0.0, MRR: 0.0
INFO:root:question_id:jmti4stsogolp49z MAP: 1.0, MRR: 1.0
INFO:root:question_id:el20m9l5wj8fq2om MAP: 0.0, MRR: 0.0
INFO:root:question_id:x3elyri8l6ol3zqr MAP: 1.0, MRR: 1.0
INFO:root:question_id:hezwq0unnhmezwlw MAP: 0.0, MRR: 0.0
INFO:root:question_id:i1matwosph3owqln MAP: 1.0, MRR: 1.0
INFO:root:question_id:hp3z0slx1qkejssd MAP: 1.0, MRR: 1.0
INFO:root:question_id:v883988zklkj8bfe MAP: 0.0, MRR: 0.0
INFO:root:question_id:8vv9g4kcjv90vbw7 MAP: 0.0, MRR: 0.0
INFO:root:question_id:wxoo1ii57ap6xerx MAP: 0.0, MRR: 0.0
INFO:root:question_id:jpfv8krobtqfpoox MAP: 0.0, MRR: 0.0
INFO:root:question_id:eplxx3whxemxs8oc MAP: 1.0, MRR: 1.0
INFO:root:question_id:hok8glgmaxuss2hq MAP: 0.0, MRR: 0.0
INFO:root:question_id:kem34fmbzuggp2h9 MAP: 0.0, MRR: 0.0
INFO:root:question_id:vtnxyjw

{'map@1': tensor(0.3437), 'mrr@1': tensor(0.3437), '%Answers found@1': tensor(0.3437), '%Relevant embeddings@1': tensor(0.3437)}
{'map@2': tensor(0.4056), 'mrr@2': tensor(0.4056), '%Answers found@2': tensor(0.4676), '%Relevant embeddings@2': tensor(0.3066)}
{'map@3': tensor(0.4267), 'mrr@3': tensor(0.4320), '%Answers found@3': tensor(0.5466), '%Relevant embeddings@3': tensor(0.2854)}
{'map@4': tensor(0.4363), 'mrr@4': tensor(0.4468), '%Answers found@4': tensor(0.6059), '%Relevant embeddings@4': tensor(0.2707)}
{'map@5': tensor(0.4416), 'mrr@5': tensor(0.4563), '%Answers found@5': tensor(0.6535), '%Relevant embeddings@5': tensor(0.2587)}





In [2]:
from backend.evals.torchmetrics_eval_script import eval_retrieval_metrics
top_k=5
retrieval_metrics = eval_retrieval_metrics("../results/dev_results.json", top_k=top_k)
print(f'The retrieval metrics @{top_k} are {retrieval_metrics}')

INFO:root:Preparing data for retrieval metrics
3809it [00:00, 207624.78it/s]
  0%|          | 0/3809 [00:00<?, ?it/s]INFO:root:question_id:ctnk84gy3t6wdijv MAP: 0.25, MRR: 0.25
INFO:root:question_id:jmti4stsogolp49z MAP: 1.0, MRR: 1.0
INFO:root:question_id:el20m9l5wj8fq2om MAP: 0.3333333432674408, MRR: 0.3333333432674408
INFO:root:question_id:x3elyri8l6ol3zqr MAP: 1.0, MRR: 1.0
INFO:root:question_id:hezwq0unnhmezwlw MAP: 0.5, MRR: 0.5
INFO:root:question_id:i1matwosph3owqln MAP: 1.0, MRR: 1.0
INFO:root:question_id:hp3z0slx1qkejssd MAP: 1.0, MRR: 1.0
INFO:root:question_id:v883988zklkj8bfe MAP: 0.5, MRR: 0.5
INFO:root:question_id:8vv9g4kcjv90vbw7 MAP: 0.3333333432674408, MRR: 0.3333333432674408
INFO:root:question_id:wxoo1ii57ap6xerx MAP: 0.5, MRR: 0.5
INFO:root:question_id:jpfv8krobtqfpoox MAP: 0.0, MRR: 0.0
INFO:root:question_id:eplxx3whxemxs8oc MAP: 0.949999988079071, MRR: 1.0
INFO:root:question_id:hok8glgmaxuss2hq MAP: 0.5833333730697632, MRR: 0.5
INFO:root:question_id:kem34fmbzuggp2h9

The retrieval metrics @5 are {'map@5': tensor(0.4416), 'mrr@5': tensor(0.4563), '%Answers found@5': tensor(0.6535)}


In [4]:
import modules.document.utils.policyqa_parser as policyqa_parser

num_answers = policyqa_parser.count_answers("../data/dev.json")
num_answers

6044

In [1]:
from backend.modules.document.utils.policyqa_parser import count_questions

print(count_questions("../data/dev.json"))

3809
