# Preprocessing Records

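Takes the JSON files provided by ADS and prepares them for insertion into a Chroma database as three parallel lists:
* documents (title + abstract + body, joined by newlines) [open question: should we also include authors/affiliations?]
* metadatas (the original JSON record, with any list or dict fields serialized to JSON strings)
* ids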

In [5]:
import os
import json

# Directory containing the ADS JSON files to load.
PATH_TO_DATA = 'data/json/'
FILENAMES = os.listdir(PATH_TO_DATA)

# Map each file's base name (without extension) to its parsed JSON content.
data = dict()
for filename in FILENAMES:
    # The directory listing can contain non-JSON entries (e.g. .DS_Store or
    # .ipynb_checkpoints) that would make open()/json.load() fail.
    if not filename.lower().endswith('.json'):
        continue
    # Read explicitly as UTF-8 rather than relying on the platform default.
    with open(os.path.join(PATH_TO_DATA, filename), 'r', encoding='utf-8') as file:
        data[os.path.splitext(os.path.basename(filename))[0]] = json.load(file)

print("Found files:")
for filename in data:
    print(f"  {filename}")


def preprocess_papers(papers):
    """
    Convert each paper's title from a list to a string, in place

    The first element of a list-valued title becomes the title; an empty list
    becomes an empty string. Papers without a 'title' key, and papers whose
    title is not a list, are left untouched, so calling this more than once on
    the same records is safe.

    Returns the same `papers` list that was passed in.
    """
    for paper in papers:
        title = paper.get('title')
        # Only unwrap lists: indexing an already-converted string would
        # silently truncate the title to its first character.
        if isinstance(title, list):
            paper['title'] = title[0] if title else ''
    return papers


def construct_document(record, fields):
    """
    Build a single document string from selected fields of a record

    The values of `record` are looked up in the order given by `fields` and
    joined with newline characters.
    """
    selected_values = map(record.__getitem__, fields)
    return "\n".join(selected_values)


def prep_metadata(record):
    """
    Return a copy of the record with list and dict values serialized to JSON strings

    Chroma requires all metadata to be primitive, so container values are
    encoded with json.dumps; every other value is passed through unchanged.
    """
    flattened = {}
    for field, raw in record.items():
        if isinstance(raw, (list, dict)):
            flattened[field] = json.dumps(raw)
        else:
            flattened[field] = raw
    return flattened


# Convert every paper's title from a list to a string before building documents.
data = {key: preprocess_papers(value) for key, value in data.items()}

# A record is only written out when it carries all of these fields.
required_fields = {'title', 'abstract', 'body', 'reference'}

for journal, papers in data.items():
    # Three parallel lists: entry i of each list describes the same paper.
    documents, metadatas, ids = [], [], []
    for paper in papers:
        if not required_fields.issubset(paper.keys()):
            print(f"Skipping {paper['id']} as it does not have all required fields")
            continue
        documents.append(construct_document(paper, ['title', 'abstract', 'body']))
        metadatas.append(prep_metadata(paper))
        ids.append(paper['id'])

    assert len(documents) == len(metadatas) == len(ids)
    # Write out to json file
    output_dir = 'reviews' if 'Reviews' in journal else 'research'
    out_path = f'data/processed_for_chroma/{output_dir}/{journal}.json'
    # open(..., 'w') does not create missing parent directories, so make sure
    # the destination exists instead of failing on a fresh checkout.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, 'w') as file:
        json.dump({'documents': documents, 'metadatas': metadatas, 'ids': ids}, file)
    print(f"Processed {journal} ({len(documents)} complete records)")


Found files:
  Earth_Science_Reviews
  Earth_Science_Research
  Planetary_Research
  Planetary_Reviews
  Astro_Reviews
  Astro_Research
Skipping 1476030 as it does not have all required fields
Skipping 1419360 as it does not have all required fields
Skipping 15624655 as it does not have all required fields
Skipping 17550234 as it does not have all required fields
Skipping 15821200 as it does not have all required fields
Processed Earth_Science_Reviews (995 complete records)
Processed Earth_Science_Research (1000 complete records)
Skipping 8399444 as it does not have all required fields
Skipping 9533180 as it does not have all required fields
Skipping 5928982 as it does not have all required fields
Skipping 8399536 as it does not have all required fields
Skipping 2675138 as it does not have all required fields
Skipping 8153960 as it does not have all required fields
Skipping 3950526 as it does not have all required fields
Skipping 441779 as it does not have all required fields
Skipping 