In [25]:
import os
import json
import pandas as pd
from ast import literal_eval
from urllib.request import urlretrieve

from pathlib import Path
from grobid.models.form import Form, File
from grobid.models.response import Response
from grobid.client import Client, GrobidClientError
from grobid.tei import Parser

# Test Grobid

In [47]:
def grobid_parse(pdf_path, base_url="https://kermitt2-grobid.hf.space"):
    """Send one PDF to a Grobid server and return the parsed article.

    Args:
        pdf_path: Location of the PDF to process (anything ``Path`` accepts).
        base_url: Root URL of the Grobid service. Defaults to the hf.space
            instance this notebook was originally hard-wired to.

    Returns:
        Whatever ``Parser(xml).parse()`` yields for the TEI XML returned by
        the server, or ``None`` if the client raised ``GrobidClientError``
        (the error is printed, not re-raised). Callers must check for
        ``None`` before using the result.

    Raises:
        OSError: If the PDF cannot be read (e.g. ``FileNotFoundError``).
    """
    pdf_file = Path(pdf_path)
    # Read the bytes up front so the file handle is already closed while the
    # (potentially slow) HTTP request is in flight.
    payload = pdf_file.read_bytes()
    form = Form(
        file=File(
            payload=payload,
            file_name=pdf_file.name,
            mime_type="application/pdf",
        )
    )
    c = Client(base_url=base_url, form=form)
    try:
        xml_content = c.sync_request().content  # TEI XML file in bytes
        return Parser(xml_content).parse()
    except GrobidClientError as e:
        # Best-effort: report which file failed and let the caller decide.
        print(f"Grobid request failed for {pdf_file}: {e}")
        return None

# Smoke test: run the parser on a single PDF (path is relative to the
# notebook's working directory).
pdf_path = '../data/raw/pdfs/conf_ircdl_GemelliVM23.pdf'
# `article` is None when grobid_parse hits a GrobidClientError, in which case
# the inspection cells that follow will fail with AttributeError.
article = grobid_parse(pdf_path)

In [48]:
# Display the author records Grobid extracted for the sample paper.
article.bibliography.authors

[Author(person_name=PersonName(surname='Gemelli', first_name='Andrea'), affiliations=[Affiliation(department="Dipartimento di Ingegneria dell'Informazione (DINFO", institution='Università degli studi di Firenze', laboratory=None)], email='andrea.gemelli@unifi.it'),
 Author(person_name=PersonName(surname='Vivoli', first_name='Emanuele'), affiliations=[Affiliation(department="Dipartimento di Ingegneria dell'Informazione (DINFO", institution='Università degli studi di Firenze', laboratory=None)], email='emanuele.vivoli@unifi.it'),
 Author(person_name=PersonName(surname='Marinai', first_name='Simone'), affiliations=[Affiliation(department="Dipartimento di Ingegneria dell'Informazione (DINFO", institution='Università degli studi di Firenze', laboratory=None)], email='simone.marinai@unifi.it')]

In [49]:
# Print each author's full name next to the institutions of their affiliations.
# NOTE(review): assumes name parts and the affiliation list can be None, like
# other optional fields in the parsed output (e.g. `laboratory`) — confirm
# against the grobid models.
for a in article.bibliography.authors:
    # Join only the name parts that are present; plain `+` concatenation
    # raises TypeError as soon as one of them is None.
    name_parts = (a.person_name.first_name, a.person_name.surname)
    fullname = ' '.join(part for part in name_parts if part)
    # A missing affiliation list is treated as "no affiliations".
    orgs = [org.institution for org in (a.affiliations or [])]
    print(fullname, orgs)

Andrea Gemelli ['Università degli studi di Firenze']
Emanuele Vivoli ['Università degli studi di Firenze']
Simone Marinai ['Università degli studi di Firenze']


In [50]:
# Serialize the parsed article to a JSON string; the batch step below writes
# this same serialization to disk.
article.to_json()

'{"bibliography":{"title":"CTE: A Dataset for Contextualized Table Extraction","authors":[{"person_name":{"surname":"Gemelli","first_name":"Andrea"},"affiliations":[{"department":"Dipartimento di Ingegneria dell\'Informazione (DINFO","institution":"Università degli studi di Firenze","laboratory":null}],"email":"andrea.gemelli@unifi.it"},{"person_name":{"surname":"Vivoli","first_name":"Emanuele"},"affiliations":[{"department":"Dipartimento di Ingegneria dell\'Informazione (DINFO","institution":"Università degli studi di Firenze","laboratory":null}],"email":"emanuele.vivoli@unifi.it"},{"person_name":{"surname":"Marinai","first_name":"Simone"},"affiliations":[{"department":"Dipartimento di Ingegneria dell\'Informazione (DINFO","institution":"Università degli studi di Firenze","laboratory":null}],"email":"simone.marinai@unifi.it"}],"date":null,"ids":null,"target":null,"publisher":null,"journal":null,"series":null,"scope":null},"keywords":["Scientific paper analysis","Dataset","Benchmark","

In [28]:
# Round-trip the JSON string through json.loads to browse it as nested
# dicts/lists.
# NOTE(review): the saved output of this cell (execution count 28) shows a
# different paper than the cells above, so it predates the current `article`
# — re-run the notebook top to bottom before relying on it.
json.loads(article.to_json())

{'bibliography': {'title': 'Will Open Science Change Authorship for Good? Towards a Quantitative Analysis',
  'authors': [{'person_name': {'surname': 'Mannocci', 'first_name': 'Andrea'},
    'affiliations': [{'department': 'Institute of Information Science and Technologies',
      'institution': 'CNR-ISTI -National Research Council "Alessandro Faedo"',
      'laboratory': None}],
    'email': 'andrea.mannocci@isti.cnr.it'},
   {'person_name': {'surname': 'Irrera', 'first_name': 'Ornella'},
    'affiliations': [{'department': 'Institute of Information Science and Technologies',
      'institution': 'CNR-ISTI -National Research Council "Alessandro Faedo"',
      'laboratory': None},
     {'department': 'Department of Information Engineering',
      'institution': 'University of Padova',
      'laboratory': None}],
    'email': 'ornella.irrera@studenti.unipd.it'},
   {'person_name': {'surname': 'Manghi', 'first_name': 'Paolo'},
    'affiliations': [{'department': 'Institute of Information

# Apply in batch

In [20]:
# Load the table of papers. In the preview below, `key` is the identifier
# later used to build file names and `ee` appears to hold the PDF link.
# NOTE: read_pickle can execute arbitrary code embedded in the file — only
# load pickles produced by this project.
df = pd.read_pickle('../data/processed/papers.pkl')
df.head()

Unnamed: 0,key,doi,url,ee,year,authors,title,venue,pages,length,type,access
0,conf/ircdl/AmbrosinoAPS23,,https://dblp.org/rec/conf/ircdl/AmbrosinoAPS23,https://ceur-ws.org/Vol-3365/short16.pdf,2023,"[Maria Anna Ambrosino, Vanja Annunziata, Maria...",The Hetor project: a joint effort to co-create...,IRCDL,216-224,8.0,Conference and Workshop Papers,open
1,conf/ircdl/BagchiD23,,https://dblp.org/rec/conf/ircdl/BagchiD23,https://ceur-ws.org/Vol-3365/short1.pdf,2023,"[Mayukh Bagchi, Subhashis Das]",Disentangling Domain Ontologies.,IRCDL,17-28,11.0,Conference and Workshop Papers,open
2,conf/ircdl/BaglioniMPBM23,,https://dblp.org/rec/conf/ircdl/BaglioniMPBM23,https://ceur-ws.org/Vol-3365/paper2.pdf,2023,"[Miriam Baglioni, Andrea Mannocci, Gina Pavone...",(Semi)automated disambiguation of scholarly re...,IRCDL,47-59,12.0,Conference and Workshop Papers,open
3,conf/ircdl/BarbutiBRCP23,,https://dblp.org/rec/conf/ircdl/BarbutiBRCP23,https://ceur-ws.org/Vol-3365/short17.pdf,2023,"[Nicola Barbuti, Mauro De Bari, Stefania Riso,...",The Open Memory Apulia Project. How Open Data ...,IRCDL,225-232,7.0,Conference and Workshop Papers,open
4,conf/ircdl/BardiBM23,,https://dblp.org/rec/conf/ircdl/BardiBM23,https://ceur-ws.org/Vol-3365/short12.pdf,2023,"[Alessia Bardi, Margo Bargheer, Paolo Manghi]",A Discovery Hub for Diamond Open Access publis...,IRCDL,162-166,4.0,Conference and Workshop Papers,open


In [57]:
# Run Grobid on every paper that has an `ee` link and cache the parsed result
# as JSON, so re-running this cell only processes papers still missing.
os.makedirs('../data/processed/grobid/', exist_ok=True)
for index, row in df[df.ee.notna()].iterrows():
    stem = row['key'].replace('/', '_')
    pdf_path = '../data/raw/pdfs/' + stem + '.pdf'
    json_path = '../data/processed/grobid/' + stem + '.json'
    print(pdf_path)
    # A zero-byte JSON file is debris from a failed earlier run, not a result,
    # so it must not count as "done".
    if os.path.exists(json_path) and os.path.getsize(json_path) > 0:
        print('... done! skipping')
        continue

    # One missing download should not abort the whole batch.
    if not os.path.exists(pdf_path):
        print('... PDF not found! skipping')
        continue

    article = grobid_parse(pdf_path)
    # grobid_parse returns None when the Grobid client reports an error.
    if article is None:
        print('... Grobid failed! skipping')
        continue

    # Serialize BEFORE opening the output file: opening with 'w' creates the
    # file immediately, so a failure afterwards would leave an empty file that
    # later runs would mistake for a finished paper.
    json_text = article.to_json()
    # Explicit UTF-8: the JSON carries non-ASCII text (e.g. accented names).
    with open(json_path, 'w', encoding='utf-8') as f:
        f.write(json_text)
    print('...done!')

../data/raw/pdfs/conf_ircdl_AmbrosinoAPS23.pdf
... done! skipping
../data/raw/pdfs/conf_ircdl_BagchiD23.pdf
... done! skipping
../data/raw/pdfs/conf_ircdl_BaglioniMPBM23.pdf
... done! skipping
../data/raw/pdfs/conf_ircdl_BarbutiBRCP23.pdf
... done! skipping
../data/raw/pdfs/conf_ircdl_BardiBM23.pdf
... done! skipping
../data/raw/pdfs/conf_ircdl_BernasconiCM23.pdf
... done! skipping
../data/raw/pdfs/conf_ircdl_CastellanoSV23.pdf
... done! skipping
../data/raw/pdfs/conf_ircdl_CastellucciC23.pdf
... done! skipping
../data/raw/pdfs/conf_ircdl_DeglInnocentiCC23.pdf
... done! skipping
../data/raw/pdfs/conf_ircdl_FraisseBB23.pdf
... done! skipping
../data/raw/pdfs/conf_ircdl_GemelliVM23.pdf
... done! skipping
../data/raw/pdfs/conf_ircdl_LocaputoPC023.pdf
... done! skipping
../data/raw/pdfs/conf_ircdl_MartogliaBRVSV23.pdf
... done! skipping
../data/raw/pdfs/conf_ircdl_MatviichukHP23.pdf
... done! skipping
../data/raw/pdfs/conf_ircdl_McGillivrayCBPF23.pdf
... done! skipping
../data/raw/pdfs/con