# Prepare JSONL files with combined data (label + description + aliases + ...) for entities and predicates, used to build a BM25 index

In [None]:
import json
import re
import os
from tqdm.auto import tqdm

In [None]:
# Root folder of the processed Wikidata dump; the cells below read its
# labels/, descriptions/, aliases/ and entity_rels/ subfolders.
WIKIDATA_FILES_PATH = 'data/wikidata_dump/processed_dump'

### 1. Filter entities with fewer than 10 relations and/or no descriptions/aliases.

In [None]:
def _collect_qids(subfolder):
    """Return the set of `qid` values found in all files of one dump subfolder.

    Every line of every file in the subfolder is parsed as a JSON object
    and its 'qid' field is collected.
    """
    folder = f'{WIKIDATA_FILES_PATH}/{subfolder}'
    found = set()
    for name in tqdm(os.listdir(folder)):
        with open(f'{folder}/{name}', 'r', encoding='utf-8') as handle:
            found.update(json.loads(row)['qid'] for row in handle)
    return found


# IDs of all entities that have a label / a description / at least one alias record
labels_ids = _collect_qids('labels')
descriptions_ids = _collect_qids('descriptions')
aliases_ids = _collect_qids('aliases')

In [None]:
# labelled entities that have no description / no alias records

description_nans = labels_ids - descriptions_ids
aliases_nans = labels_ids - aliases_ids

In [None]:
# Show how many labelled entities lack a description and how many lack aliases.
len(description_nans), len(aliases_nans)

In [None]:
# count relations for all entities: one line of entity_rels == one relation of its 'qid'

path = f'{WIKIDATA_FILES_PATH}/entity_rels'
relations_counts = {}

for rels_file in tqdm(os.listdir(path)):
    with open(f'{path}/{rels_file}', 'r', encoding='utf-8') as handle:
        for record in map(json.loads, handle):
            subject = record['qid']
            if subject in relations_counts:
                relations_counts[subject] += 1
            else:
                relations_counts[subject] = 1

In [None]:
# Show the smallest number of relation records counted for any entity.
min(relations_counts.values())

In [None]:
# we will keep ids with 10 or more relations (restricted to ids that have a label)

popular = [entity_id for entity_id, n_rels in relations_counts.items() if n_rels >= 10]
popular_ids = labels_ids & set(popular)

In [None]:
# we will also keep all entities from datasets:
# every id listed under the 'entities' key of each preprocessed dataset file

datasets_files = ['data/preprocessed/lcquad_2.0/lcquad_2.0_test.json',
                  'data/preprocessed/lcquad_2.0/lcquad_2.0_train.json',
                  'data/preprocessed/pat/pat_test.json',
                  'data/preprocessed/pat/pat_train.json',
                  'data/preprocessed/qald/qald_test.json',
                  'data/preprocessed/qald/qald_train.json',
                  'data/preprocessed/rubq/rubq_test.json',
                  'data/preprocessed/rubq/rubq_train.json']

dataset_ids = set()

for dataset_path in tqdm(datasets_files):
    with open(dataset_path, 'r', encoding='utf-8') as handle:
        dataset_ids.update(json.load(handle)['entities'])

len(dataset_ids)

In [None]:
# keep only popular ids with descriptions or ids from datasets
# (dataset ids are kept unconditionally, even without a label or description)

ids_to_keep = (labels_ids & descriptions_ids & popular_ids) | dataset_ids

In [None]:
# Replace the in-memory `description_nans` set with the IDs stored in
# description_nan_ids.txt (read as one ID per line).
# NOTE(review): no cell visible in this notebook writes description_nan_ids.txt —
# confirm where it comes from; if it is missing this cell raises FileNotFoundError
# and leaves `description_nans` set to None.
description_nans = None

with open('description_nan_ids.txt', 'r', encoding='utf-8') as f:
    description_nans = set(f.read().splitlines())

# Show how many IDs were loaded.
len(description_nans)

In [None]:
def create_folder_iterator(folder_path):
    """Yield the parsed JSON records of every file in a folder, in numeric file order.

    File names must start with an integer followed by a dot (e.g. ``12.jsonl``);
    files are visited in increasing order of that integer, not in string order.
    Each non-blank line of each file is parsed as one JSON document.

    Args:
        folder_path: path of the folder holding the numbered files.

    Yields:
        The object decoded from each non-blank line, file by file.

    Raises:
        ValueError: if a file name does not start with an integer.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    files = sorted(os.listdir(folder_path), key=lambda x: int(x.split('.')[0]))
    for file in files:
        # Read as UTF-8 explicitly, like every other reader in this notebook;
        # the platform default encoding is not UTF-8 everywhere.
        with open(f'{folder_path}/{file}', 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # tolerate blank lines (e.g. a trailing newline at end of file)
                    continue
                yield json.loads(line)

In [None]:
# Iterate through all files and find descriptions and labels for all entities from ids_to_keep
# Save all to jsonl files to build bm25 index using pyserini
# Here, we take advantage of the fact that all IDs in the files appear in the same order.
# The same IDs always appear in a row, and they only occur once for all files.
# Because of that the description and alias iterators are only ever advanced, never rewound:
# this is a single forward merge over the three folders.
# This code can be modified to add any other information about entities to combined data


def _write_jsonl(entries, output_path):
    """Write every dict of `entries` to `output_path`, one JSON object per line."""
    with open(output_path, 'w', encoding='utf-8') as jsonl_file:
        for entry in entries:
            jsonl_file.write(json.dumps(entry) + '\n')


# number of documents stored in each output file
DOCS_PER_FILE = 10000

iterator_aliases = create_folder_iterator(f'{WIKIDATA_FILES_PATH}/aliases')
iterator_descriptions = create_folder_iterator(f'{WIKIDATA_FILES_PATH}/descriptions')

path = f'{WIKIDATA_FILES_PATH}/labels'
labels_files = sorted(os.listdir(path), key=lambda x: int(x.split('.')[0]))

save_folder_path = 'data/combined_data'
# exist_ok: re-running the cell must not fail because the folder is already there
os.makedirs(save_folder_path, exist_ok=True)

# next alias record that has been read but not consumed yet (None: nothing pending)
current_aliases_data = None

file_num = 0
ids_data = []

for file in tqdm(labels_files):
    with open(f'{path}/{file}', 'r', encoding='utf-8') as f:
        for line in f:
            data = json.loads(line.strip())
            qid, label = data['qid'], data['label']

            if qid not in ids_to_keep:
                continue

            # descriptions: entities known to have none get an empty description,
            # otherwise advance the description stream up to this entity's record
            if qid in description_nans:
                description = ''
            else:
                description = None
                for current_descriptions_data in iterator_descriptions:
                    if current_descriptions_data['qid'] == qid:
                        description = current_descriptions_data['description']
                        break
                if description is None:
                    raise RuntimeError(f'Description for id {qid} not found')

            # aliases: skip the records of entities that were filtered out ...
            if current_aliases_data is None:
                current_aliases_data = next(iterator_aliases, None)
            while current_aliases_data is not None and current_aliases_data['qid'] != qid:
                current_aliases_data = next(iterator_aliases, None)

            # ... then take the consecutive run of records belonging to this entity;
            # the first record of the following entity stays pending for the next turn
            aliases = []
            while current_aliases_data is not None and current_aliases_data['qid'] == qid:
                aliases.append(current_aliases_data['alias'])
                current_aliases_data = next(iterator_aliases, None)

            # NOTE(review): ids_to_keep is not filtered on aliases, so an entity without
            # any alias record ends up here after the scan above has consumed the rest of
            # the alias stream — confirm that every kept entity has at least one alias.
            if not aliases:
                raise RuntimeError(f'Aliases for id {qid} not found')

            id_result = ' '.join([f'{label} {description}', *map(str, aliases)])
            ids_data.append({'id': qid, 'contents': id_result})

            if len(ids_data) == DOCS_PER_FILE:
                _write_jsonl(ids_data, f'{save_folder_path}/{file_num}.jsonl')
                file_num += 1
                ids_data = []

# write the last, partially filled chunk
if ids_data:
    _write_jsonl(ids_data, f'{save_folder_path}/{file_num}.jsonl')
    file_num += 1
    ids_data = []

In [None]:
# descriptions of the kept entities, keyed by entity id
descriptions_data = {}
path = f'{WIKIDATA_FILES_PATH}/descriptions'
for descriptions_file in tqdm(os.listdir(path)):
    with open(f'{path}/{descriptions_file}', 'r', encoding='utf-8') as handle:
        for record in map(json.loads, handle):
            entity_id = record['qid']
            if entity_id in ids_to_keep:
                descriptions_data[entity_id] = record['description']

In [None]:
# predicate info; read below as relations_data[pid]['label'] / ['description']
with open('data/wikidata_relations_info.json', 'r', encoding='utf-8') as relations_file:
    relations_data = json.loads(relations_file.read())

In [None]:
# add information about relations and related entities descriptions
#
# Every document of data/combined_data is extended with the labels and descriptions of
# the predicates of its entity and with the descriptions of the related entities.
# As in the previous step this is a single forward merge: the entity_rels records are
# expected to follow the same entity order as the combined_data documents, with all
# records of one entity adjacent.

path = 'data/combined_data'
data_files = sorted(os.listdir(path), key=lambda x: int(x.split('.')[0]))

save_folder_path = 'data/combined_data_with_rels'
# exist_ok: re-running the cell must not fail because the folder is already there
os.makedirs(save_folder_path, exist_ok=True)

# Kept entities without any entity_rels record: they are copied through unchanged,
# since searching the relation stream for them would consume it to the end.
# NOTE(review): `nan_ids` was used but never defined in the visible notebook; it is
# derived here from `relations_counts` (ids that appear in entity_rels) — confirm that
# this matches the original intent.
nan_ids = ids_to_keep.difference(relations_counts)

iterator_entity_rels = create_folder_iterator(f'{WIKIDATA_FILES_PATH}/entity_rels')

# next relation record that has been read but not consumed yet (None: nothing pending)
current_rels_data = None

file_num = 0

for file in tqdm(data_files):
    ids_data = []
    with open(f'{path}/{file}', 'r', encoding='utf-8') as f:
        for line in f:
            data = json.loads(line.strip())
            qid, content = data['id'], data['contents']
            if qid in nan_ids:
                ids_data.append({'id': qid, 'contents': content})
                continue

            # skip the records of entities that are not part of combined_data ...
            if current_rels_data is None:
                current_rels_data = next(iterator_entity_rels, None)
            while current_rels_data is not None and current_rels_data['qid'] != qid:
                current_rels_data = next(iterator_entity_rels, None)

            # ... then take the consecutive run of records belonging to this entity;
            # the first record of the following entity stays pending for the next turn
            predicates = []
            entities = []
            while current_rels_data is not None and current_rels_data['qid'] == qid:
                predicates.append(current_rels_data['pid'])
                entities.append(current_rels_data['value'])
                current_rels_data = next(iterator_entity_rels, None)

            if not predicates:
                raise ValueError(f'Error {qid}')

            # labels and descriptions of the known predicates (either may be None)
            p_text = []
            for p in predicates:
                if p in relations_data:
                    p_label = relations_data[p]['label']
                    p_desc = relations_data[p]['description']
                    if p_label is not None:
                        p_text.append(p_label)
                    if p_desc is not None:
                        p_text.append(p_desc)

            # descriptions of the related entities that were kept
            q_text = [descriptions_data[q] for q in entities if q in descriptions_data]

            # Joined outside of any f-string: reusing the same quote character inside
            # an f-string expression is a SyntaxError before Python 3.12.
            content = ' '.join([content, *p_text, *q_text])

            ids_data.append({'id': qid, 'contents': content})

    # one output file per input file; it holds one JSON object per line
    # even though the original '.json' suffix is kept
    output_path = f'{save_folder_path}/{file_num}.json'
    file_num += 1
    with open(output_path, 'w', encoding='utf-8') as jsonl_file:
        for entry in ids_data:
            jsonl_file.write(json.dumps(entry) + '\n')

## Build combined data for Predicates

For each predicate, we save its label, description, aliases, and questions from the datasets in which it appears.

In [None]:
def _read_json(json_path):
    """Return the object decoded from the UTF-8 JSON file at `json_path`."""
    with open(json_path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


# training splits of the four question-answering datasets
rubq_data = _read_json('data/preprocessed/rubq/rubq_train.json')
qald_data = _read_json('data/preprocessed/qald/qald_train.json')
pat_data = _read_json('data/preprocessed/pat/pat_train.json')
lcquad_data = _read_json('data/preprocessed/lcquad_2.0/lcquad_2.0_train.json')

In [None]:
# keep only the list of samples stored under the 'dataset' key of each file
rubq_data, qald_data, pat_data, lcquad_data = (
    loaded['dataset'] for loaded in (rubq_data, qald_data, pat_data, lcquad_data)
)

In [None]:
# Maps a predicate id to the set of English questions it occurs in (filled below).
predicates_questions = {}

In [None]:
# For every training sample, register its English question under each relation
# listed for the question or for the query. All four datasets share the same layout,
# so they are processed by one loop, in the order rubq, qald, pat, lcquad.
for samples in (rubq_data, qald_data, pat_data, lcquad_data):
    for sample in samples:
        linked = sample['relations']
        # an empty or missing (None) relation list contributes nothing
        sample_relations = [*(linked['question'] or ()), *(linked['query'] or ())]
        for relation in sample_relations:
            predicates_questions.setdefault(relation, set()).add(sample['en_question'])

In [None]:
# predicate info; read below as predicates_data[pid]['label'] / ['description'] / ['aliases']
with open('data/wikidata_relations_info.json', 'r', encoding='utf-8') as predicates_file:
    predicates_data = json.loads(predicates_file.read())

In [None]:
# Maps a predicate id to the full text that will be indexed for it (filled below).
predicates_full = {}

In [None]:
# Build the text indexed for every predicate: label, description, aliases and,
# when the predicate occurs in the training data, the questions it occurs in.
cnt = 0  # number of predicates that occur in no training question
for p, p_info in predicates_data.items():
    parts = [p_info['label'], p_info['description'], *(p_info['aliases'] or [])]
    if p in predicates_questions:
        # sorted: a set has no stable iteration order, sorting keeps the output
        # identical from one run to the next
        parts.extend(sorted(predicates_questions[p]))
    else:
        cnt += 1
    # Missing (None) fields are dropped instead of being indexed as the text 'None'.
    # The pieces are joined outside of any f-string: reusing the same quote character
    # inside an f-string expression is a SyntaxError before Python 3.12.
    predicates_full[p] = ' '.join(str(part) for part in parts if part is not None)

In [None]:
# one pyserini-style document ({'id', 'contents'}) per predicate
predicates_jsonl = [
    {'id': pid, 'contents': predicates_full[pid]} for pid in tqdm(predicates_full)
]

In [None]:
# Write all predicate documents to a single JSONL file.
# The output folder is not created anywhere else in the notebook, so create it here.
output_dir = 'data/combined_predicates_data'
os.makedirs(output_dir, exist_ok=True)

with open(f'{output_dir}/0.jsonl', 'w', encoding='utf-8') as jsonl_file:
    for entry in predicates_jsonl:
        jsonl_file.write(json.dumps(entry) + '\n')