In [1]:
import zstd
import json
import pandas as pd
from tqdm import tqdm
from glob import glob

from datetime import datetime

In [2]:
def field_to(field:str, to:str, X:dict):
    """Flatten ``X[field]`` in place to the list of its entries' ``to`` values.

    Each entry of ``X[field]`` that contains the key ``to`` contributes
    ``entry[to]``; entries without that key are dropped. ``X[field]`` is
    always (re)assigned, so a missing field ends up as an empty list.

    Args:
        field: Key of ``X`` whose value is flattened.
        to: Key looked up in every entry of ``X[field]``.
        X: Record to update; modified in place.

    Returns:
        None. The result is stored in ``X[field]``.
    """
    # `or []` also covers a field that is present but None (JSON null); a
    # plain .get(field, []) default only applies when the key is absent and
    # would make the comprehension fail on None.
    entries = X.get(field) or []
    # NOTE(review): entries are assumed to be dicts — confirm against the data.
    X[field] = [entry[to] for entry in entries if to in entry]

def xfm(m:dict):
    """Return a shallow copy of record ``m`` with its 'dois' field flattened.

    The copy's 'dois' entry is rewritten by ``field_to('dois', 'value', ...)``;
    every other key is carried over unchanged. The copy is shallow, and only
    the copy's 'dois' key is reassigned, so ``m`` keeps its original value.
    """
    record = m.copy()
    field_to('dois', 'value', record)
    return record


# Fields kept by minimize(), in output order, each mapped to its type.
# The types double as zero-argument default factories: minimize() calls them
# for fields a record lacks (int() -> 0, set() -> set(), bool() -> False).
# 'earliest_date' is the exception: minimize() parses it from a date string
# and never calls datetime() without arguments.
schema = dict(
    control_number=int,
    report_numbers=set,
    journal_title_variants=set,
    citation_count=int,
    citation_count_without_self_citations=int,
    author_count=int,
    keywords=set,
    titles=set,
    dois=set,
    inspire_categories=set,
    document_type=set,
    earliest_date=datetime,
    published=bool,
    citeable=bool,
)

def minimize(m:dict):
    """Project record ``m`` onto the fields listed in ``schema``.

    Returns a new dict with one entry per ``schema`` key, in ``schema`` order:

    * 'earliest_date' is read as a string ('YYYY', 'YYYY-MM' or 'YYYY-MM-DD',
      defaulting to '0001-01-01' when absent), padded to a full date and
      parsed into a ``datetime``.
    * Every other field is taken from ``m``; when absent, the schema type is
      called with no arguments to produce the default. List values are
      converted to sets.

    Keys of ``m`` that are not in ``schema`` are ignored.
    """
    # Suffix completing a partial date, keyed by the length of the string:
    # 'YYYY' -> 'YYYY-01-01', 'YYYY-MM' -> 'YYYY-MM-01'.
    date_padding = {4: '-01-01', 7: '-01'}

    slim = {}
    for key, factory in schema.items():
        if key != 'earliest_date':
            field = m.get(key, factory())
            # Lists are stored as sets (duplicates collapse, order is dropped).
            slim[key] = set(field) if isinstance(field, list) else field
        else:
            raw = m.get('earliest_date', '0001-01-01')
            raw = raw + date_padding.get(len(raw), '')
            slim[key] = datetime.strptime(raw, '%Y-%m-%d')
    return slim

# Subject names, one per line. Not referenced by the other cells shown here;
# presumably the values that appear in 'inspire_categories' — TODO confirm.
subjects = [
    "Experiment-HEP",
    "Phenomenology-HEP",
    "Experiment-Nucl",
    "Instrumentation",
    "Theory-Nucl",
    "Astrophysics",
    "Lattice",
    "Theory-HEP",
    "Other",
    "General Physics",
    "Computing",
    "Accelerators",
    "Data Analysis and Statistics",
    "Gravitation and Cosmology",
    "Quantum Physics",
    "Condensed Matter",
    "Math and Math Physics",
]

In [3]:
# Build master_index: control_number -> minimized record, across all dump files.
#
# The file list is sorted because glob() returns matches in arbitrary order
# and the first record seen for a control number wins below; without a fixed
# order the resulting index could differ between runs or machines whenever
# the same control number appears in more than one file.
fnames = sorted(glob('../../../data/inspirehep/*/*.json.zst'))
master_index = {}
with tqdm(total=len(fnames)) as pbar:
    for fname in fnames:
        pbar.set_description(f'Loading {fname.split("/")[-1]}')
        # Each file is zstd-compressed UTF-8 JSON that decodes to an iterable
        # of record dicts.
        with open(fname, 'rb') as f:
            d = json.loads(zstd.loads(f.read()).decode('utf-8'))
        for m in d:
            mm = minimize(xfm(m))
            control_number = mm['control_number']
            # Keep only the first occurrence of each control number.
            master_index.setdefault(control_number, mm)
        pbar.update(1)


Loading 6.json.zst: 100%|██████████| 102/102 [01:11<00:00,  1.42it/s]


In [4]:
# Column names and their order come from the first record in master_index;
# 'control_number' is then used as the DataFrame index.
first_record = next(iter(master_index.values()))
keys = first_record.keys()
df = pd.DataFrame.from_records(
    list(master_index.values()),
    columns=keys,
    index='control_number',
)

In [None]:
from io import BytesIO

# Pickle and compress fully in memory *before* opening the output file:
# open(..., 'wb') truncates the file immediately, so doing this work inside
# the `with` block would leave an empty data.pkl.zst behind (replacing any
# earlier good copy) if pickling or compression failed.
buf = BytesIO()
df.to_pickle(buf)
payload = zstd.dumps(buf.getvalue())
with open('data.pkl.zst', 'wb') as f:
    f.write(payload)