In [None]:
import pandas as pd
import pandas_gbq
import glob
import ast
import copy
import json

In [None]:
# Exceptions raised when an optional JSON field is absent or has an unexpected shape.
_MISSING_FIELD = (KeyError, IndexError, TypeError)

# Identifier '@type' values that get their own column in the identifiers table.
_IDENTIFIER_TYPE_COLUMNS = (('URI', 'uri'), ('ISI', 'isi'), ('PMID', 'pmid'), ('DOI', 'doi'))

# Column order of each uploaded table (kept explicit so appends line up with the
# existing BigQuery tables).
_PUBLICATION_COLUMNS = ['id', 'genre', 'pub_date', 'lang', 'title', 'venue', 'duplicates']
_AUTHORSHIP_COLUMNS = ['pub_id', 'givenName', 'familyName', 'lifeSpan', 'rank', 'ORCID']
_SUBJECT_COLUMNS = ['pub_id', 'label', 'lang']
_IDENTIFIER_COLUMNS = ['pub_id', 'doi', 'isi', 'pmid', 'uri']


def _optional(getter, default=''):
    """Return ``getter()``, or ``default`` when the looked-up field is missing."""
    try:
        return getter()
    except _MISSING_FIELD:
        return default


def _publication_row(entry):
    """Build the single publications-table row for one decoded JSONL entry."""
    master = entry['master']
    pub_id = master['@id']
    work = master['instanceOf']
    return {
        'id': pub_id,
        # Required fields: a missing key/index propagates to the caller.
        'genre': work['genreForm'][0]['@id'],
        'pub_date': _optional(lambda: master['publication'][0]['date']),
        'lang': work['language'][0]['code'],
        'title': work['hasTitle'][0]['mainTitle'],
        'venue': _optional(lambda: master['partOf'][0]['hasTitle'][0]['mainTitle']),
        'duplicates': entry['publication_count'],
    }


def _first_orcid(contribution):
    """Return the value of the first 'ORCID' identifier of a contribution, else ''."""
    try:
        for identifier in contribution['agent']['identifiedBy']:
            if identifier['@type'] == 'ORCID':
                return identifier['value']
    except _MISSING_FIELD:
        pass
    return ''


def _authorship_rows(master):
    """Build one authorships-table row per entry of ``instanceOf.contribution``."""
    pub_id = master['@id']
    rows = []
    for rank, contribution in enumerate(master['instanceOf']['contribution']):
        rows.append({
            'pub_id': pub_id,
            'givenName': _optional(lambda: contribution['agent']['givenName']),
            'familyName': _optional(lambda: contribution['agent']['familyName']),
            'lifeSpan': _optional(lambda: contribution['agent']['lifeSpan']),
            'rank': rank,  # zero-based position in the contribution list
            'ORCID': _first_orcid(contribution),
        })
    return rows


def _subject_rows(master):
    """Build one subjects-table row per entry of ``instanceOf.subject``."""
    pub_id = master['@id']
    return [
        {
            'pub_id': pub_id,
            'label': subject['prefLabel'],
            'lang': _optional(lambda: subject['language']['code']),
        }
        for subject in master['instanceOf']['subject']
    ]


def _identifier_rows(master):
    """Build one identifiers-table row per well-formed entry of ``identifiedBy``.

    Each row carries the value in the column matching its '@type' and '' in the
    other columns (all '' for types without a dedicated column). A malformed
    identifier is skipped as a whole, so a row is never left half-built.
    """
    pub_id = master['@id']
    try:
        identifiers = list(master['identifiedBy'])
    except _MISSING_FIELD:
        return []  # publication has no identifiers

    rows = []
    for identifier in identifiers:
        row = {'pub_id': pub_id, 'doi': '', 'isi': '', 'pmid': '', 'uri': ''}
        try:
            id_type = identifier['@type']
            for type_name, column in _IDENTIFIER_TYPE_COLUMNS:
                if id_type == type_name:
                    row[column] = identifier['value']
        except _MISSING_FIELD:
            continue
        rows.append(row)
    return rows


def _extract_tables(entries):
    """Turn decoded JSONL entries into the four DataFrames, keyed by table name."""
    publication_rows = []
    authorship_rows = []
    subject_rows = []
    identifier_rows = []
    for entry in entries:
        master = entry['master']
        publication_rows.append(_publication_row(entry))
        authorship_rows.extend(_authorship_rows(master))
        subject_rows.extend(_subject_rows(master))
        identifier_rows.extend(_identifier_rows(master))

    return {
        'publications': pd.DataFrame(publication_rows, columns=_PUBLICATION_COLUMNS),
        'authorships': pd.DataFrame(authorship_rows, columns=_AUTHORSHIP_COLUMNS),
        'subjects': pd.DataFrame(subject_rows, columns=_SUBJECT_COLUMNS),
        'identifiers': pd.DataFrame(identifier_rows, columns=_IDENTIFIER_COLUMNS),
    }


def process_chunk(Ti):
    """Flatten one chunk of JSONL records and append it to four BigQuery tables.

    Parameters
    ----------
    Ti : pandas.DataFrame
        Chunk with an 'entry' column holding one JSON document per row. Each
        document must provide 'publication_count' and a 'master' object with
        '@id' and 'instanceOf' (genreForm, language, hasTitle, contribution,
        subject).

    Returns
    -------
    None
        The rows are appended to 'publicdb_swepub.publications',
        'publicdb_swepub.authorships', 'publicdb_swepub.subjects' and
        'publicdb_swepub.identifiers'.

    Raises
    ------
    KeyError, IndexError, TypeError
        If a required field is missing; nothing is uploaded for the chunk then.

    Notes
    -----
    Optional fields (venue, publication date, author names, life span, ORCID,
    subject language) fall back to ''. Malformed identifier entries are skipped
    instead of desynchronising the identifiers table.
    """
    entries = [json.loads(raw_entry) for raw_entry in Ti['entry']]
    tables = _extract_tables(entries)

    # pandas_gbq.to_gbq is the supported replacement for the deprecated
    # DataFrame.to_gbq wrapper.
    pandas_gbq.to_gbq(tables['publications'], 'publicdb_swepub.publications', if_exists='append')
    pandas_gbq.to_gbq(tables['authorships'], 'publicdb_swepub.authorships', if_exists='append')
    pandas_gbq.to_gbq(tables['subjects'], 'publicdb_swepub.subjects', if_exists='append')
    pandas_gbq.to_gbq(tables['identifiers'], 'publicdb_swepub.identifiers', if_exists='append')

In [None]:
# Read the JSONL dump lazily, 2000 lines at a time, with every line stored as a
# single string in the 'entry' column. A multi-character separator is handled
# by the 'python' parser engine; 'ÇÇÇÇ' is presumably a marker that never
# occurs in the data, so no line gets split -- TODO confirm.
jsonl_read_options = dict(
    names=['entry'],
    header=None,
    sep='ÇÇÇÇ',
    engine='python',
    encoding='utf-8',
    chunksize=2000,
)
T = pd.read_csv('swepub-deduplicated.jsonl', **jsonl_read_options)

# Flatten and upload each chunk in turn; this consumes the reader `T`.
for Ti in T:
    process_chunk(Ti)

100%|██████████| 1/1 [00:00<00:00, 11214.72it/s]
100%|██████████| 1/1 [00:00<00:00, 9822.73it/s]
100%|██████████| 1/1 [00:00<00:00, 12826.62it/s]
100%|██████████| 1/1 [00:00<00:00, 11066.77it/s]
100%|██████████| 1/1 [00:00<00:00, 14266.34it/s]
100%|██████████| 1/1 [00:00<00:00, 11066.77it/s]
100%|██████████| 1/1 [00:00<00:00, 12710.01it/s]
100%|██████████| 1/1 [00:00<00:00, 10485.76it/s]
100%|██████████| 1/1 [00:00<00:00, 11814.94it/s]
100%|██████████| 1/1 [00:00<00:00, 15033.35it/s]
100%|██████████| 1/1 [00:00<00:00, 15141.89it/s]
100%|██████████| 1/1 [00:00<00:00, 12336.19it/s]
100%|██████████| 1/1 [00:00<00:00, 8683.86it/s]
100%|██████████| 1/1 [00:00<00:00, 16320.25it/s]
100%|██████████| 1/1 [00:00<00:00, 14463.12it/s]
100%|██████████| 1/1 [00:00<00:00, 14979.66it/s]
100%|██████████| 1/1 [00:00<00:00, 14364.05it/s]
100%|██████████| 1/1 [00:00<00:00, 15827.56it/s]
100%|██████████| 1/1 [00:00<00:00, 9686.61it/s]
100%|██████████| 1/1 [00:00<00:00, 14926.35it/s]
100%|██████████| 1/1 [0

In [None]:
# Pull the next chunk from the reader `T` into `Ti` for ad-hoc inspection.
# NOTE(review): once the `for Ti in T` loop has run to completion, `T` is
# exhausted and this call presumably raises StopIteration -- re-create `T`
# before running this cell, or drop the cell; confirm.
Ti = T.get_chunk()


100%|██████████| 1/1 [00:00<00:00, 10180.35it/s]
100%|██████████| 1/1 [00:00<00:00, 11096.04it/s]
100%|██████████| 1/1 [00:00<00:00, 10672.53it/s]
100%|██████████| 1/1 [00:00<00:00, 15087.42it/s]


In [None]:
# Show how many identifier rows were collected.
# NOTE(review): in the visible code `i_pid` is only assigned as a local inside
# `process_chunk`, so on a fresh kernel this would raise NameError unless the
# name is defined elsewhere -- confirm or remove this debugging cell.
len(i_pid)

9955