# Preprocessing Records

Takes the `json` files provided by ADS and processes them for Chroma database insertion as three lists:
* documents (title + abstract + body) [should we include authors/affiliations?]
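* metadatas (the original `json` record, with the title flattened to a string and any list or dict fields serialized to JSON strings)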
* ids

In [None]:
import os
import json

PATH_TO_DATA = 'data/json/'
FILENAMES = os.listdir(PATH_TO_DATA)

# Load every JSON file found in PATH_TO_DATA into `data`, keyed by the file
# name without its extension.
data = dict()
for filename in FILENAMES:
    # os.path.join avoids the doubled separator that plain string
    # interpolation produces with PATH_TO_DATA's trailing slash.
    filepath = os.path.join(PATH_TO_DATA, filename)
    # os.listdir returns every directory entry; skip anything that is not a
    # regular .json file (e.g. .DS_Store or .ipynb_checkpoints) instead of
    # crashing in open() / json.load().
    if not (filename.lower().endswith('.json') and os.path.isfile(filepath)):
        continue
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(filepath, 'r', encoding='utf-8') as file:
        data[os.path.splitext(os.path.basename(filename))[0]] = json.load(file)

print("Found files:")
for filename in data:
    print(f"  {filename}")


def preprocess_papers(papers):
    """
    Convert each paper's title from a list to a plain string

    The records are modified in place and the same `papers` list is returned.
    Only list (or tuple) titles are converted, using their first element; an
    empty list becomes an empty string. A title that is already a string is
    left untouched, so calling this function again on processed records does
    not truncate titles to their first character.

    Raises KeyError if a paper has no 'title' key.
    """
    for paper in papers:
        title = paper['title']
        if isinstance(title, (list, tuple)):
            # Keep only the first entry; fall back to '' when there is none.
            paper['title'] = title[0] if title else ''
    return papers


def construct_document(record, fields):
    """
    Build the document text for a record

    Looks up each name in `fields` on `record`, in the given order, and
    returns the values joined by newlines.
    """
    pieces = []
    for name in fields:
        pieces.append(record[name])
    separator = "\n"
    return separator.join(pieces)


def prep_metadata(record):
    """
    Return a copy of the record with list and dict values encoded as JSON strings

    Chroma requires all metadata to be primitive, so nested containers are
    serialized with json.dumps; every other value is passed through unchanged.
    """
    flattened = {}
    for field, content in record.items():
        if isinstance(content, (list, dict)):
            content = json.dumps(content)
        flattened[field] = content
    return flattened


# Flatten the title of every paper in every journal before building documents.
data = {key: preprocess_papers(value) for key, value in data.items()}

# Make sure the output directory exists; open(..., 'w') does not create
# missing parent directories and would fail with FileNotFoundError.
os.makedirs('data/processed_for_chroma', exist_ok=True)

for journal, papers in data.items():
    # Three parallel lists, one entry per paper.
    documents = [construct_document(paper, ['title', 'abstract', 'body']) for paper in papers]
    metadatas = [prep_metadata(paper) for paper in papers]
    ids = [paper['id'] for paper in papers]
    # Write out to json file
    with open(f'data/processed_for_chroma/{journal}.json', 'w', encoding='utf-8') as file:
        json.dump({'documents': documents, 'metadatas': metadatas, 'ids': ids}, file)
    print(f"Processed {journal}")



Found files:
  Earth_Science_Reviews
  Earth_Science_Research
  Planetary_Research
  Planetary_Reviews
  Astro_Reviews
  Astro_Research
Processed Earth_Science_Reviews
Processed Earth_Science_Research
Processed Planetary_Research
Processed Planetary_Reviews
Processed Astro_Reviews
Processed Astro_Research
