In [1]:
import yaml, os, random

In [2]:
# YAML filenames for the Indic / non-Indic sample splits.
# NOTE(review): neither constant is referenced by the cells visible here —
# presumably consumed elsewhere; confirm before removing.
INDIC_FILENAME = 'indic_samples.yml'
NON_INDIC_FILENAME = 'non_indic_samples.yml'

In [3]:
# Registries keyed by sample filename. `all_samples` receives every parsed
# sample record; the annotation pass then copies each record into either
# `indic_samples` or `non_indic_samples`.
indic_samples = {}
non_indic_samples = {}
all_samples = {}

In [None]:
# Directory holding the per-question `.yml` sample files.
PATH_TO_DATA = ""
# Directory holding the matching `.yml` annotation files (each has an 'indic' field).
PATH_TO_INDIC_ANNOTATIONS = ""
# NOTE(review): both paths are blank here and must be filled in before the
# loading cells below can list these directories.

In [None]:
# Parse every `.yml` file under PATH_TO_DATA into `all_samples`, keyed by
# filename, with each question flattened onto a single line.
# Filenames are sorted because os.listdir() returns them in arbitrary order,
# and downstream result lists are only aligned with the samples by position.
for file in sorted(os.listdir(PATH_TO_DATA)):
    if not file.endswith('.yml'):
        continue
    # Explicit UTF-8: the platform default encoding is not UTF-8 everywhere,
    # and decoding with the wrong codec would corrupt non-ASCII question text.
    with open(os.path.join(PATH_TO_DATA, file), encoding='utf-8') as f:
        # NOTE(review): FullLoader is acceptable for trusted files; prefer
        # yaml.safe_load if these files can come from an untrusted source.
        data = yaml.load(f, Loader=yaml.FullLoader)
    # Replace newlines with spaces and strip backslashes from the question.
    data['question'] = data['question'].replace('\n', ' ').replace('\\', '')
    all_samples[file] = data

In [5]:
# Display the loaded samples; the whole mapping is rendered as the cell output.
all_samples

{'KViz Show S2E1 Q1.yml': {'metadata': '',
  'question': 'The phrase X (a 4 word pharse) is believed to have originated in the 18th century in England, where it was used to describe the most beautiful and popular young woman at a dance. The first word is a French word meaning "beautiful," and it was commonly used in English to describe an attractive and charming woman. The phrase gained popularity in the 19th century and was commonly used in Victorian society to refer to a woman who was the center of attention and admiration at a social gathering. Today, it is still used to describe a woman who is the most beautiful and admired person at an event or gathering.',
  'question_source': 'https://www.youtube.com/watch?v=xpFEALbxHwQ&t=55s',
  'question_title': 'KViz Show S2E1 Q1',
  'themes': ['Fashion and Style'],
  'variable_specific_rationale': {'X': 'The phrase "Belle of the Ball" is said to have originated in 18th century in England, where it was used to describe the most beautiful and 

In [None]:
# Split `all_samples` into the Indic / non-Indic registries according to the
# 'indic' field of each annotation file. An annotation file and the sample it
# describes share the same filename.
# NOTE(review): `indic_annotation` is never populated in the visible cells;
# it is kept in case other cells reference it.
indic_annotation = {}

# Sorted so the registries (and every list derived from them) have a
# reproducible order; os.listdir() alone returns names in arbitrary order.
for file in sorted(os.listdir(PATH_TO_INDIC_ANNOTATIONS)):
    if not file.endswith('.yml'):
        continue
    # Explicit UTF-8 so parsing does not depend on the platform default codec.
    with open(os.path.join(PATH_TO_INDIC_ANNOTATIONS, file), encoding='utf-8') as f:
        # NOTE(review): FullLoader is acceptable for trusted files; prefer
        # yaml.safe_load if annotations can come from an untrusted source.
        data = yaml.load(f, Loader=yaml.FullLoader)
    # Only an explicit 'No' marks a sample as non-Indic; any other value
    # places it in the Indic registry.
    bucket = non_indic_samples if data['indic'] == 'No' else indic_samples
    # Raises KeyError if an annotation has no sample of the same filename.
    bucket[file] = all_samples[file]

In [7]:
# Snapshot the filenames in each registry, in the dicts' insertion order.
indic_files = [*indic_samples]
non_indic_files = [*non_indic_samples]

In [8]:
def construct_word_ngrams(text, n):
    """Return every run of `n` consecutive whitespace-separated words in `text`.

    Words are obtained with `str.split()`, so any amount of whitespace acts as
    one separator. Each n-gram is re-joined with single spaces, and n-grams
    are listed in the order of their first word. The result is empty when
    `text` contains fewer than `n` words.
    """
    tokens = text.split()
    window_count = len(tokens) - n + 1
    return [' '.join(tokens[start:start + n]) for start in range(window_count)]

In [9]:
def _question_entry(question):
    """Bundle a question string with its word-level 5-grams and 10-grams."""
    return {
        'question': question,
        '5_grams': construct_word_ngrams(question, 5),
        '10_grams': construct_word_ngrams(question, 10),
    }


# Per-file lookup tables: question text plus its 5- and 10-word n-grams,
# built the same way for the Indic and the non-Indic split.
indic_questions = {}
non_indic_questions = {}

for file in indic_files:
    indic_questions[file] = _question_entry(indic_samples[file]['question'])

for file in non_indic_files:
    non_indic_questions[file] = _question_entry(non_indic_samples[file]['question'])

In [59]:
import requests
from concurrent.futures import ThreadPoolExecutor

def get_counts(phrase, timeout=60):
    """Ask the infini-gram API how often `phrase` occurs in each index.

    One 'count' query per index is POSTed to https://api.infini-gram.io/,
    and the requests for the different indexes run concurrently.

    Args:
        phrase: Text sent as the `query` field of every request.
        timeout: Seconds to wait on each HTTP request. Without a timeout a
            stalled connection would block its worker thread (and therefore
            the whole batch) indefinitely.

    Returns:
        A list holding the decoded JSON body returned for each index, in the
        order the indexes are declared below.

    Raises:
        Whatever `requests` raises when a request fails or times out, or
        when a response body is not valid JSON.
    """
    indexes = {
        'Dolma-v1.7 (Llama-2)': 'v4_dolma-v1_7_llama',
        'RedPajama (Llama-2)': 'v4_rpj_llama_s4',
        'Pile-train (Llama-2)': 'v4_piletrain_llama',
        'C4-train (Llama-2)': 'v4_c4train_llama',
        # 'Dolma-v1.6-sample (Llama-2)': 'v4_dolma-v1_6-sample_llama',
        # 'Dolma-v1.6-sample (OLMo)': 'v4_dolmasample_olmo',
        'Pile-val (Llama-2)': 'v4_pileval_llama',
        # 'Pile-val (GPT-2)': 'v4_pileval_gpt2'
    }

    def send_request(index):
        """POST a single count query for `phrase` against `index`."""
        print(f'Sending request to index: {index}')
        payload = {
            'index': index,
            'query_type': 'count',
            'query': phrase
        }
        response = requests.post(
            'https://api.infini-gram.io/', json=payload, timeout=timeout
        )
        return response.json()

    # executor.map yields results in input order, so the returned list lines
    # up with `indexes` regardless of which request finishes first.
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(send_request, indexes.values()))

    return results


In [60]:
def parallel_get_counts(ls_ques):
    """Run `get_counts` on the 'question' text of every entry, concurrently.

    Args:
        ls_ques: Mapping whose values are dicts containing a 'question' key.
            The keys themselves are not used.

    Returns:
        A list of `get_counts` results, one per entry, in the iteration
        order of `ls_ques`.
    """
    with ThreadPoolExecutor() as pool:
        pending = []
        for entry in ls_ques.values():
            pending.append(pool.submit(get_counts, entry['question']))
        # Collect in submission order so results stay aligned with the input.
        return [job.result() for job in pending]

In [61]:
# Fetch counts for the full text of every Indic question (network-bound).
# `res` is a list in the iteration order of `indic_questions`.
res = parallel_get_counts(indic_questions)

Sending request to index: v4_dolma-v1_7_llama
Sending request to index: v4_rpj_llama_s4
Sending request to index: v4_piletrain_llama
Sending request to index: v4_c4train_llama
Sending request to index: v4_pileval_llama
Sending request to index: v4_dolma-v1_7_llama
Sending request to index: v4_rpj_llama_s4
Sending request to index: v4_piletrain_llama
Sending request to index: v4_c4train_llama
Sending request to index: v4_pileval_llama
Sending request to index: v4_dolma-v1_7_llama
Sending request to index: v4_rpj_llama_s4
Sending request to index: v4_piletrain_llama
Sending request to index: v4_c4train_llama
Sending request to index: v4_dolma-v1_7_llama
Sending request to index: v4_rpj_llama_s4
Sending request to index: v4_piletrain_llama
Sending request to index: v4_c4train_llama
Sending request to index: v4_pileval_llama
Sending request to index: v4_dolma-v1_7_llama
Sending request to index: v4_rpj_llama_s4
Sending request to index: v4_piletrain_llama
Sending request to index: v4_c4tra

In [62]:
# Persist the Indic count results to disk as a pickle.
# NOTE(review): `res` is a bare list, so the file does not record which
# question each entry belongs to.
import pickle
with open('indic_questions_counts.pkl', 'wb') as f:
    pickle.dump(res, f)

In [63]:
# Fetch counts for the full text of every non-Indic question (network-bound).
# The result is a list in the iteration order of `non_indic_questions`.
res_non_indic = parallel_get_counts(non_indic_questions)

Sending request to index: v4_dolma-v1_7_llama
Sending request to index: v4_rpj_llama_s4
Sending request to index: v4_piletrain_llama
Sending request to index: v4_c4train_llama
Sending request to index: v4_pileval_llama
Sending request to index: v4_dolma-v1_7_llama
Sending request to index: v4_rpj_llama_s4
Sending request to index: v4_dolma-v1_7_llama
Sending request to index: v4_rpj_llama_s4
Sending request to index: v4_piletrain_llama
Sending request to index: v4_c4train_llama
Sending request to index: v4_pileval_llama
Sending request to index: v4_dolma-v1_7_llama
Sending request to index: v4_rpj_llama_s4
Sending request to index: v4_piletrain_llama
Sending request to index: v4_c4train_llama
Sending request to index: v4_pileval_llama
Sending request to index: v4_dolma-v1_7_llama
Sending request to index: v4_rpj_llama_s4
Sending request to index: v4_piletrain_llama
Sending request to index: v4_c4train_llama
Sending request to index: v4_pileval_llama
Sending request to index: v4_dolma-v

In [64]:
# Persist the non-Indic count results to disk as a pickle.
# NOTE(review): `res_non_indic` is a bare list, so the file does not record
# which question each entry belongs to.
with open('non_indic_questions_counts.pkl', 'wb') as f:
    pickle.dump(res_non_indic, f)