In [1]:
import datetime
import json
import requests
import os
from tqdm import tqdm

In [3]:
def get_ai_snippets_for_query(query, num_web_results=20, timeout=30):
    """Run one web search against the ydc-index search endpoint.

    Args:
        query: Free-text search query.
        num_web_results: Value sent as the ``num_web_results`` request parameter.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang a long batch run indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        KeyError: If the ``YOU_API_KEY`` environment variable is not set.
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    headers = {"X-API-Key": os.environ["YOU_API_KEY"]}
    params = {"query": query, "num_web_results": num_web_results}
    response = requests.get(
        "https://api.ydc-index.io/search",
        params=params,
        headers=headers,
        timeout=timeout,
    )
    # Fail at the request site instead of storing an error payload that only
    # breaks much later, when downstream cells index into the result.
    response.raise_for_status()
    return response.json()

In [4]:
# Smoke-test the search helper with a single throwaway query before the bulk runs.
trial_run = get_ai_snippets_for_query("reasons to smile")

In [8]:
# Inspect the first hit to see the response layout (e.g. 'description' and 'snippets').
trial_run['hits'][0]

{'description': 'Smiling has benefits, even when you’re faking it. Learn more about why and how smiling can change your mood, stress levels, and social interactions for the better.',
 'snippets': ['Such smiles have been shown to increase cortisol (stress hormone) levels in people they are directed toward. ... Perhaps the most compelling reason to smile is that it may lengthen your overall lifespan. One study found that genuine, intense smiling is associated with longer life.',
  'What does psychology have to say about smiling? Whether your smile is genuine or not, it can act on your body and mind in a variety of positive ways, offering benefits for your health, your mood, and even the moods of people around you.',
  'Smiling can also boost your overall health by helping your immune system to function more effectively. It is thought that when you smile, immune function improves because you are more relaxed (thanks to the release of certain neurotransmitters).',
  "We are naturally drawn

## Run search over FActScore queries

In [13]:
def _read_prompt_entities(path):
    """Return every line of ``path`` with surrounding whitespace stripped."""
    # Entity names include non-ASCII characters, so read the file as UTF-8
    # explicitly rather than with the platform's locale encoding.
    # NOTE(review): assumes the prompt files are UTF-8 encoded - confirm.
    with open(path, encoding="utf-8") as fin:
        return [line.strip() for line in fin]


labeled_prompts = _read_prompt_entities("grounded-decoding/data/FActScore/labeled/prompt_entities.txt")
unlabeled_prompts = _read_prompt_entities("grounded-decoding/data/FActScore/unlabeled/prompt_entities.txt")

In [14]:
# Sanity check: number of entities read from each prompt file.
len(labeled_prompts), len(unlabeled_prompts)

(183, 500)

In [16]:
# Peek at the first ten labeled entities.
labeled_prompts[:10]

['Kang Ji-hwan',
 'Kalki Koechlin',
 'William Post',
 'William Waldegrave, Baron Waldegrave of North Hill',
 'Darrius Heyward-Bey',
 'Andrew Shue',
 'Wahida Prism Khan',
 'Waka Flocka Flame',
 'Focus...',
 'Sara Paxton']

In [17]:
# Peek at the first ten unlabeled entities.
unlabeled_prompts[:10]

['Suthida',
 'Miguel Ángel Félix Gallardo',
 'Iggy Azalea',
 'Fernando da Costa Novaes',
 'Jan Zamoyski',
 'Radhika Apte',
 'David Galloway (writer)',
 'Cheyenne Brando',
 'Mihai Eminescu',
 'John Atkinson Grimshaw']

In [18]:
# Prompt wording applied to every entity; the formatted prompt is also what gets sent as the search query.
prompt_template = "Tell me a bio of {}."

In [20]:
# Preview the query text built for the first labeled entity.
prompt_template.format(labeled_prompts[0])

'Tell me a bio of Kang Ji-hwan.'

In [24]:
# Accumulates one raw search response per labeled prompt, in prompt order.
labeled_search_results = []

In [25]:
# Trial batch: fetch search results for the first ten labeled prompts only.
# Each run APPENDS to labeled_search_results, so re-running this cell without
# resetting that list first stores duplicate entries.
for one_query in tqdm(labeled_prompts[:10]):
    ex_you_search_results = get_ai_snippets_for_query(prompt_template.format(one_query))
    # Record when the search was issued alongside the raw response.
    ex_you_search_results["datetime"] = datetime.datetime.now().isoformat()
    labeled_search_results.append(ex_you_search_results)

100%|██████████| 10/10 [00:12<00:00,  1.22s/it]


In [29]:
# Confirm how many responses have been stored so far.
len(labeled_search_results)

10

In [32]:
# Fetch the remaining labeled prompts (everything after the ten-prompt trial batch).
# Results are appended after the trial-batch results, so this cell must run after
# that batch, and only once, for the list to stay aligned with labeled_prompts.
for one_query in tqdm(labeled_prompts[10:]):
    ex_you_search_results = get_ai_snippets_for_query(prompt_template.format(one_query))
    # Record when the search was issued alongside the raw response.
    ex_you_search_results["datetime"] = datetime.datetime.now().isoformat()
    labeled_search_results.append(ex_you_search_results)

100%|██████████| 173/173 [03:31<00:00,  1.22s/it]


In [33]:
# Should now equal the number of labeled prompts.
len(labeled_search_results)

183

In [38]:
# exist_ok=True makes the call idempotent without a separate check-then-create step.
os.makedirs("grounded-decoding/data/FActScore/search_results/", exist_ok=True)

In [42]:
# Write every labeled search response as one JSON object per line, tagged with
# the query text it was fetched for and a running id.
with open("grounded-decoding/data/FActScore/search_results/labeled_you_search.jsonl", 'w') as fout:
    for ex_id, (ex_prompt, ex_you_search_results) in enumerate(zip(labeled_prompts, labeled_search_results)):
        # Shallow copy plus the extra fields; the in-memory response keeps its original keys.
        ex_you_search_results_out = {
            **ex_you_search_results,
            'query': prompt_template.format(ex_prompt),
            'id': ex_id,
        }
        n_snippets = sum(len(one_hit["snippets"]) for one_hit in ex_you_search_results_out['hits'])
        print('n_snippets:', n_snippets)
        fout.write(json.dumps(ex_you_search_results_out))
        fout.write('\n')

n_snippets: 63
n_snippets: 69
n_snippets: 69
n_snippets: 39
n_snippets: 28
n_snippets: 67
n_snippets: 53
n_snippets: 53
n_snippets: 61
n_snippets: 58
n_snippets: 38
n_snippets: 26
n_snippets: 57
n_snippets: 55
n_snippets: 52
n_snippets: 63
n_snippets: 19
n_snippets: 65
n_snippets: 28
n_snippets: 19
n_snippets: 52
n_snippets: 65
n_snippets: 26
n_snippets: 54
n_snippets: 26
n_snippets: 40
n_snippets: 57
n_snippets: 28
n_snippets: 29
n_snippets: 62
n_snippets: 29
n_snippets: 23
n_snippets: 66
n_snippets: 56
n_snippets: 58
n_snippets: 60
n_snippets: 28
n_snippets: 75
n_snippets: 64
n_snippets: 69
n_snippets: 35
n_snippets: 63
n_snippets: 42
n_snippets: 74
n_snippets: 25
n_snippets: 42
n_snippets: 59
n_snippets: 15
n_snippets: 36
n_snippets: 60
n_snippets: 50
n_snippets: 31
n_snippets: 26
n_snippets: 63
n_snippets: 29
n_snippets: 54
n_snippets: 65
n_snippets: 31
n_snippets: 17
n_snippets: 56
n_snippets: 48
n_snippets: 32
n_snippets: 38
n_snippets: 47
n_snippets: 50
n_snippets: 43
n_snippets

In [43]:
# Accumulates one raw search response per unlabeled prompt, in prompt order.
unlabeled_search_results = []

In [44]:
# Fetch search results for every unlabeled prompt.
# Each run APPENDS to unlabeled_search_results, so reset that list before re-running.
for one_query in tqdm(unlabeled_prompts):
    ex_you_search_results = get_ai_snippets_for_query(prompt_template.format(one_query))
    # Record when the search was issued alongside the raw response.
    ex_you_search_results["datetime"] = datetime.datetime.now().isoformat()
    unlabeled_search_results.append(ex_you_search_results)

100%|██████████| 500/500 [10:07<00:00,  1.21s/it]


In [45]:
# Should equal the number of unlabeled prompts.
len(unlabeled_search_results)

500

In [47]:
# Write every unlabeled search response as one JSON object per line, tagged with
# the query text it was fetched for and a running id.
with open("grounded-decoding/data/FActScore/search_results/unlabeled_you_search.jsonl", 'w') as fout:
    for ex_id, (ex_prompt, ex_you_search_results) in enumerate(zip(unlabeled_prompts, unlabeled_search_results)):
        # Shallow copy plus the extra fields; the in-memory response keeps its original keys.
        ex_you_search_results_out = {
            **ex_you_search_results,
            'query': prompt_template.format(ex_prompt),
            'id': ex_id,
        }
        n_snippets = sum(len(one_hit["snippets"]) for one_hit in ex_you_search_results_out['hits'])
        print('n_snippets:', n_snippets)
        fout.write(json.dumps(ex_you_search_results_out))
        fout.write('\n')

n_snippets: 55
n_snippets: 57
n_snippets: 63
n_snippets: 28
n_snippets: 33
n_snippets: 68
n_snippets: 46
n_snippets: 60
n_snippets: 46
n_snippets: 59
n_snippets: 24
n_snippets: 31
n_snippets: 38
n_snippets: 69
n_snippets: 74
n_snippets: 38
n_snippets: 61
n_snippets: 38
n_snippets: 56
n_snippets: 38
n_snippets: 66
n_snippets: 48
n_snippets: 75
n_snippets: 40
n_snippets: 39
n_snippets: 54
n_snippets: 35
n_snippets: 29
n_snippets: 61
n_snippets: 61
n_snippets: 53
n_snippets: 49
n_snippets: 27
n_snippets: 59
n_snippets: 51
n_snippets: 64
n_snippets: 60
n_snippets: 47
n_snippets: 55
n_snippets: 34
n_snippets: 55
n_snippets: 22
n_snippets: 42
n_snippets: 31
n_snippets: 66
n_snippets: 44
n_snippets: 62
n_snippets: 54
n_snippets: 52
n_snippets: 58
n_snippets: 42
n_snippets: 73
n_snippets: 51
n_snippets: 34
n_snippets: 71
n_snippets: 52
n_snippets: 47
n_snippets: 62
n_snippets: 41
n_snippets: 48
n_snippets: 70
n_snippets: 69
n_snippets: 64
n_snippets: 57
n_snippets: 35
n_snippets: 55
n_snippets

## Explore search results

In [2]:
# JSONL search-result files to explore, keyed by dataset split.
data_files = {'labeled': "grounded-decoding/data/FActScore/search_results/labeled_you_search.jsonl",
              'unlabeled': "grounded-decoding/data/FActScore/search_results/unlabeled_you_search.jsonl"
              }

In [3]:
# Parse the labeled split: every line of the JSONL file is one search response.
viz_examples = []
with open(data_files['labeled']) as fin:
    viz_examples.extend(json.loads(line) for line in fin)

In [4]:
import numpy as np
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Never commit access tokens to the notebook. Read the Hugging Face token from
# the environment and only prompt for it interactively when it is missing.
# NOTE: a literal token used to be hard-coded in this cell; treat that token as
# leaked and revoke it.
if not os.environ.get("HF_API_KEY"):
    from getpass import getpass

    os.environ["HF_API_KEY"] = getpass("Hugging Face access token: ")

In [6]:
# Load the llama-7b tokenizer, authenticating with the token stored in HF_API_KEY.
# It is used further down to measure snippet lengths in tokens.
# NOTE(review): newer transformers releases appear to prefer `token=` over
# `use_auth_token=` - confirm against the installed version before changing.
tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b", use_auth_token=os.environ["HF_API_KEY"])

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [25]:
from urllib.parse import urlparse

# HITS_PER_QUERY is otherwise only assigned in the context-size cell further
# down, so on a fresh kernel this cell failed with NameError. Define it here as
# well; keep the value in sync with that later cell.
HITS_PER_QUERY = 10

retained_domains = set()
for viz_id, viz_example in enumerate(viz_examples):
    # Domains of the hits that survive the Wikipedia-family URL filter.
    kept_netlocs = [
        urlparse(one_hit['url']).netloc
        for one_hit in viz_example['hits']
        if not any(restricted_dom in one_hit['url']
                   for restricted_dom in ("wikipedia", "wikidata", "wikiwand"))
    ]
    # Only the first HITS_PER_QUERY surviving hits per query are counted.
    retained_domains.update(kept_netlocs[:HITS_PER_QUERY])

In [26]:
# List the retained domains that still contain 'wiki', to see what the URL filter lets through.
[one_url for one_url in sorted(retained_domains) if 'wiki' in one_url]

['aidwiki.com',
 'asianwiki.com',
 'biowikis.com',
 'birthdaywiki.com',
 'bornwiki.com',
 'commons.wikimedia.org',
 'famousfacewiki.com',
 'handwiki.org',
 'idolwiki.com',
 'playerswiki.com',
 'sportzwiki.com',
 'thereaderwiki.com',
 'wikibio.in',
 'wikibio.us',
 'wikibiography.in',
 'wikibionet.com',
 'wikifamouspeople.com',
 'wikipura.com',
 'wikispro.com',
 'wikivisually.com',
 'www.celebsagewiki.com',
 'www.wikifame.org',
 'www.wikilogy.com',
 'www.wikistaar.com',
 'www.wikitree.com',
 'www.wikizero.com']

In [39]:
# Context budget per query: keep at most HITS_PER_QUERY hits that pass the
# Wikipedia-family URL filter, take up to SNIPPETS_PER_HIT snippets from each,
# and measure the total tokenized length.
SNIPPETS_PER_HIT = 1
HITS_PER_QUERY = 10
sufficient_hits, context_sizes = [], []

for viz_id, viz_example in enumerate(viz_examples):
    batch = []
    # Drop every hit whose URL contains one of the restricted names.
    filtered_hits = [one_hit for one_hit in viz_example['hits']
                     if not any(restricted_dom in one_hit['url']
                                for restricted_dom in ("wikipedia", "wikidata", "wikiwand"))
                    ]
    # Track whether enough hits survived the filter to fill the budget.
    sufficient_hits.append(len(filtered_hits) >= HITS_PER_QUERY)
    for one_hit in filtered_hits[:HITS_PER_QUERY]:
        if len(one_hit['snippets']) == 0:
            # No snippets for this hit: fall back to its description.
            batch.append("snippet: " + one_hit['description'] + '\n')
        else:
            for one_snippet in one_hit['snippets'][:SNIPPETS_PER_HIT]:
                batch.append("snippet: " + one_snippet + '\n')
    if len(batch) < min(len(filtered_hits), HITS_PER_QUERY)*SNIPPETS_PER_HIT:
        # Fewer snippets than the budget allows: dump the offending example
        # and stop so it can be inspected.
        print(viz_example['query'])
        print([len(one_hit['snippets']) for one_hit in filtered_hits[:HITS_PER_QUERY]])
        print(json.dumps(filtered_hits, indent=2))
        break
    try:
        tokenized_batch = tokenizer(batch)
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt/SystemExit still stop the loop.
        # Presumably this triggers when every hit was filtered out and `batch`
        # is empty - TODO confirm. The query is reported and skipped, so it
        # contributes nothing to context_sizes.
        print(viz_example['query'])
        print([(one_hit['title'], one_hit['url']) for one_hit in viz_example['hits']], batch)
        print(json.dumps(filtered_hits[:HITS_PER_QUERY], indent=2))
        continue
    batch_lens = [len(one_tok_ids) for one_tok_ids in tokenized_batch['input_ids']]
    # Total number of tokens across all snippets kept for this query.
    context_sizes.append(np.sum(batch_lens))

Tell me a bio of Fernando (footballer, born 1984).
[('Fernando (footballer, born 1987) - Wikipedia', 'https://en.wikipedia.org/wiki/Fernando_(footballer,_born_1987)'), ('Fernando (footballer, born 1992) - Wikipedia', 'https://en.wikipedia.org/wiki/Fernando_(footballer,_born_1992)'), ('Fernando (footballer, born 1984) - Wikipedia', 'https://en.wikipedia.org/wiki/Fernando_(footballer,_born_1984)'), ('Fernando (footballer, born September 1999) - Wikipedia', 'https://en.wikipedia.org/wiki/Fernando_(footballer,_born_September_1999)'), ('Wikiwand - Fernando (footballer, born 1992)', 'https://www.wikiwand.com/en/Fernando_(footballer,_born_1992)'), ('Fernandão (footballer, born 1987) - Wikipedia', 'https://en.wikipedia.org/wiki/Fernand%C3%A3o_(footballer,_born_1987)'), ('Wikiwand - Fernando (footballer, born 1987)', 'https://www.wikiwand.com/en/Fernando_(footballer,_born_1987)'), ('Fernando - Wikidata', 'https://www.wikidata.org/wiki/Q1254357'), ('Fernandão (footballer, born 1978) - Wikipedia'

In [40]:
# Average per-query context size in tokens, over the queries that were tokenized successfully.
np.mean(context_sizes)

824.2032967032967