In [193]:
import requests
def pdf_convert_xml(pdf_link, grobid_url='http://localhost:8080', timeout=60):
    """Download a PDF and ask a GROBID server to extract its header as TEI XML.

    Args:
        pdf_link: URL of the PDF to download.
        grobid_url: Base URL of the GROBID service. Defaults to the local
            instance this notebook was written against.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        The body of the ``processHeaderDocument`` response, as text.

    Raises:
        requests.HTTPError: If the PDF download or the GROBID call answers
            with an HTTP error status.
        requests.Timeout: If either request exceeds ``timeout``.
    """
    url = '%s/api/processHeaderDocument' % grobid_url.rstrip('/')

    pdf = requests.get(pdf_link, timeout=timeout)
    # Stop here on a failed download instead of posting an error page to
    # GROBID as if it were a PDF.
    pdf.raise_for_status()

    xml = requests.post(url, files={'input': pdf.content}, timeout=timeout)
    # Surface service errors instead of returning their body as "XML".
    xml.raise_for_status()
    return xml.text
# Convert a sample article; this performs two HTTP requests (the PDF download
# and the POST to the GROBID service) every time the cell runs.
xml = pdf_convert_xml('https://agritrop.cirad.fr/594895/1/Ahmed2020_Article_TheEffectOfCrossDirectionAndPl.pdf')


In [194]:
from lxml import etree
from six import text_type
# Prefix-to-URI map passed as `namespaces=` to the XPath queries in this cell,
# so expressions can address elements with the `tei:` prefix.
NS = {'tei': 'http://www.tei-c.org/ns/1.0'}


def tei_to_dict(tei):
    """Extract header metadata and references from a TEI XML document.

    Args:
        tei: The TEI document, either as text or as UTF-8 encoded bytes.

    Returns:
        dict: May contain ``abstract``, ``authors``, ``keywords``, ``title``
        and ``references``. A key is left out when nothing usable was found.
        Empty input, or input the recovering parser cannot turn into a
        root element, yields an empty dict.
    """
    if not tei:
        # Nothing to parse (e.g. an empty response body).
        return {}

    parser = etree.XMLParser(encoding='UTF-8', recover=True)
    tei = tei if not isinstance(tei, text_type) else tei.encode('utf-8')
    root = etree.fromstring(tei, parser)
    if root is None:
        # In recover mode the parser may return no root element at all
        # instead of raising; there is nothing to query in that case.
        return {}

    result = {}

    abstract = get_abstract(root)
    if len(abstract) == 1:
        result['abstract'] = abstract[0].text
    elif len(abstract) > 1:
        # Keep multi-paragraph abstracts instead of silently dropping them.
        paragraphs = [p.text for p in abstract if p.text]
        if paragraphs:
            result['abstract'] = '\n'.join(paragraphs)

    authors = get_authors(root)
    if authors:
        result['authors'] = [element_to_author(author) for author in authors]

    keywords = get_keywords(root)
    if len(keywords) == 1:
        result['keywords'] = extract_keywords(keywords[0])

    title = get_title(root)
    if len(title) == 1:
        result['title'] = title[0].text

    references = get_references(root)
    if references:
        result['references'] = [
            element_to_reference(reference) for reference in references
        ]

    return result


def element_to_author(el):
    """Build an author dict from a TEI ``author`` element.

    Args:
        el: Element whose descendants are searched for ``tei:persName``
            name parts and ``tei:affiliation`` institutions.

    Returns:
        dict: ``{'name': str, 'affiliations': [{'value': ...}, ...]}``.
        ``name`` joins first name, middle name (with a trailing ``'.'``)
        and surname with single spaces. A part is skipped unless exactly
        one matching element exists and it has text.
    """
    def single_text(path):
        # Text of the match when the query yields exactly one element;
        # None otherwise, and also when that element has no text.
        matches = el.xpath(path, namespaces=NS)
        if len(matches) == 1:
            return matches[0].text
        return None

    name = []

    first = single_text('.//tei:persName/tei:forename[@type="first"]')
    if first:
        name.append(first)

    middle = single_text('.//tei:persName/tei:forename[@type="middle"]')
    if middle:
        # Checked for None first: `None + '.'` would raise TypeError.
        name.append(middle + '.')

    surname = single_text('.//tei:persName/tei:surname')
    if surname:
        name.append(surname)

    result = {}
    # Every entry is a non-empty string, so join cannot hit a None.
    result['name'] = ' '.join(name)

    affiliations = []
    for aff in el.xpath('.//tei:affiliation', namespaces=NS):
        for institution in aff.xpath('.//tei:orgName[@type="institution"]',
                                     namespaces=NS):
            affiliations.append({
                'value': institution.text
            })

    result['affiliations'] = affiliations

    return result


def extract_keywords(el):
    """Collect the text of every ``tei:term`` found below ``el``.

    Returns:
        list: One ``{'value': <term text>}`` dict per matched term.
    """
    keywords = []
    for term in el.xpath('.//tei:term', namespaces=NS):
        keywords.append({'value': term.text})
    return keywords


def element_to_reference(el):
    """Turn one bibliography element into a reference dict.

    Returns:
        dict: ``ref_title`` (from ``extract_reference_title``), ``authors``
        (one ``element_to_author`` result per ``tei:author`` descendant)
        and ``journal_pubnote`` (from ``extract_reference_pubnote``).
    """
    ref_title = extract_reference_title(el)

    ref_authors = []
    for author_el in el.xpath('.//tei:author', namespaces=NS):
        ref_authors.append(element_to_author(author_el))

    pubnote = extract_reference_pubnote(el)

    return {
        'ref_title': ref_title,
        'authors': ref_authors,
        'journal_pubnote': pubnote,
    }


def extract_reference_title(el):
    """Return the text of the reference's main article-level title.

    Returns:
        The ``.text`` of the matching ``tei:title`` element, or ``None``
        unless exactly one element matches.
    """
    matches = el.xpath(
        './/tei:analytic/tei:title[@level="a" and @type="main"]',
        namespaces=NS
    )
    if len(matches) != 1:
        return None
    return matches[0].text


def extract_reference_pubnote(el):
    """Gather publication details from a reference's ``tei:monogr`` child.

    Returns:
        dict: Always contains ``page_range`` (the ``from`` and ``to`` page
        attributes joined with ``'-'``, possibly empty). ``journal_title``,
        ``journal_volume``, ``journal_issue`` and ``year`` are each added
        only when exactly one node matches.
    """
    pubnote = {}

    # Fields read from the text of a single matching element.
    text_fields = (
        ('journal_title',
         './tei:monogr/tei:title'),
        ('journal_volume',
         './tei:monogr/tei:imprint/tei:biblScope[@unit="volume"]'),
        ('journal_issue',
         './tei:monogr/tei:imprint/tei:biblScope[@unit="issue"]'),
    )
    for key, path in text_fields:
        found = el.xpath(path, namespaces=NS)
        if len(found) == 1:
            pubnote[key] = found[0].text

    # The `when` attribute value is stored as-is under 'year'.
    when = el.xpath(
        './tei:monogr/tei:imprint/tei:date[@type="published"]/@when',
        namespaces=NS
    )
    if len(when) == 1:
        pubnote['year'] = when[0]

    # Page bounds come from attributes; each is kept only if unambiguous.
    page_paths = (
        './tei:monogr/tei:imprint/tei:biblScope[@unit="page"]/@from',
        './tei:monogr/tei:imprint/tei:biblScope[@unit="page"]/@to',
    )
    bounds = []
    for path in page_paths:
        found = el.xpath(path, namespaces=NS)
        if len(found) == 1:
            bounds.append(found[0])

    pubnote['page_range'] = '-'.join(bounds)

    return pubnote


def get_abstract(root):
    """Return the paragraph elements of the document's abstract.

    Matches ``tei:p`` at any depth below ``tei:profileDesc/tei:abstract``,
    so paragraphs wrapped in an intermediate element (such as a ``tei:div``)
    are found as well as direct children.

    Returns:
        list: The matching ``tei:p`` elements (empty when there are none).
    """
    return root.xpath('//tei:profileDesc/tei:abstract//tei:p', namespaces=NS)


def get_authors(root):
    """Return every ``tei:author`` element found below ``tei:fileDesc``."""
    author_path = '//tei:fileDesc//tei:author'
    return root.xpath(author_path, namespaces=NS)


def get_keywords(root):
    """Return the ``tei:keywords`` elements under ``profileDesc/textClass``."""
    keywords_path = '//tei:profileDesc/tei:textClass/tei:keywords'
    return root.xpath(keywords_path, namespaces=NS)


def get_references(root):
    """Return the ``tei:biblStruct`` entries of any ``tei:listBibl`` in ``tei:text``."""
    references_path = '//tei:text//tei:listBibl/tei:biblStruct'
    return root.xpath(references_path, namespaces=NS)


def get_title(root):
    """Return the ``tei:title`` elements directly under ``tei:titleStmt``."""
    title_path = '//tei:titleStmt/tei:title'
    return root.xpath(title_path, namespaces=NS)



In [196]:
# NOTE(review): `dict` must have been bound in a cell that is not shown here
# (presumably `dict = tei_to_dict(xml)` in In [195]) -- confirm. The name
# shadows the builtin `dict`; on a fresh kernel this line would fail.
dict.get('abstract')

"New citrus fruit varieties with the right pomological and organoleptic characteristics are expected by consumers and the fresh citrus fruit market. Apart from a good balance between sugar content and acidity, seedlessness is particularly demanded. Triploidy is one of the best ways to obtain seedless cultivars, and, taking advantage of diploid gametes, research programs have succeeded in creating them. Triploid hybrids are sterile and, when associated with parthenocarpy, produce seedless fruits. However, no studies have compared the potential agronomic interest of diploid and triploid cultivars to date. The aims of this study were to investigate the effects of (i) cross direction between diploid and triploid reciprocal populations of mandarin hybrids and (ii) the increase in ploidy level from diploidy to triploidy phenotypic variation in quantitative agronomic traits. Reciprocal crosses between 'Fortune' mandarin and 'Ellendale' tangor generated two diploid and two triploid populations

In [197]:
# Keywords as built by extract_keywords: one {'value': ...} dict per term.
dict.get('keywords')

[{'value': 'Ploidy'},
 {'value': 'Mandarin'},
 {'value': 'Reciprocal crosses'},
 {'value': 'Dosage effect'},
 {'value': 'Phenotypic variation'}]

In [198]:
# Title text; tei_to_dict sets this key only when exactly one title element matched.
dict.get('title')

'The effect of cross direction and ploidy level on phenotypic variation of reciprocal diploid and triploid mandarin hybrids'

In [199]:
# Header authors, each a dict with 'name' and 'affiliations' (see element_to_author).
dict.get('authors')

[{'name': 'Dalel Ahmed',
  'affiliations': [{'value': 'INRA'}, {'value': 'University of Montpellier'}]},
 {'name': 'Jean-Charles Evrard', 'affiliations': []},
 {'name': 'Patrick Ollitrault', 'affiliations': []},
 {'name': 'Yann Froelicher', 'affiliations': []}]