In [1]:
import xml.etree.ElementTree as ET

In [25]:
def extract_full_text_from_element(element):
    """Return the text of every ``<p>`` under ``element`` as one string.

    Each paragraph's text is gathered with ``itertext()`` so text inside
    nested inline tags is included. Paragraphs are separated by a blank line.

    A ``<p>`` that is itself located inside another ``<p>`` (for example a
    paragraph within a list or quote that sits in an outer paragraph) is not
    emitted separately: its text is already part of the enclosing paragraph's
    ``itertext()`` output, so emitting it again would duplicate it.

    Args:
        element: An ``xml.etree.ElementTree.Element`` to search below.

    Returns:
        str: Paragraph texts joined by ``"\\n\\n"``; ``""`` if there are none.
    """
    paragraphs = element.findall(".//p")

    # Identify paragraphs that are descendants of another matched paragraph.
    # ElementTree has no parent pointers, so walk down from each paragraph.
    nested_ids = set()
    for para in paragraphs:
        for inner in para.iter("p"):
            if inner is not para:
                nested_ids.add(id(inner))

    # Use itertext() to extract all text, including that inside nested tags
    return "\n\n".join(
        "".join(para.itertext())
        for para in paragraphs
        if id(para) not in nested_ids
    )


def _element_text(element):
    """Return all text inside ``element`` (nested tags included), or None if absent or empty."""
    if element is None:
        return None
    return "".join(element.itertext()) or None


def extract_data_from_xml(xml_path):
    """Parse an article XML file and collect its metadata and text into a dict.

    Text is read with ``itertext()`` so that inline markup inside a title,
    keyword, or abstract paragraph does not truncate the value.

    Args:
        xml_path: File name or file object accepted by ``ET.parse``.

    Returns:
        dict: Keys, in insertion order:
            ``journal_title``, ``journal_id``, ``issn``,
            ``abbrev_journal_title``, ``publisher_name``, ``pub_date_month``,
            ``pub_date_day``, ``pub_date_year``, ``article_title``
            (each a str, or None when the tag is missing or empty);
            ``authors`` (list of "given-names surname" strings);
            ``keywords`` (list of str);
            ``abstract`` (str, paragraphs joined by a newline; ``""`` if none);
            ``full_text`` (str from ``extract_full_text_from_element``, or
            None when there is no ``<body>``).

    Raises:
        Any exception raised by ``ET.parse`` (e.g. for an unreadable or
        malformed file) propagates unchanged.
    """
    root = ET.parse(xml_path).getroot()

    article_data = {}

    # Single-valued fields as (output key, ElementPath query). Only the first
    # match of each query is used. The order here fixes the dict's key order.
    single_value_fields = (
        # Journal metadata
        ("journal_title", ".//journal-title"),
        ("journal_id", ".//journal-id"),
        ("issn", ".//issn"),
        ("abbrev_journal_title", ".//abbrev-journal-title"),
        ("publisher_name", ".//publisher-name"),
        ("pub_date_month", ".//pub-date/month"),
        ("pub_date_day", ".//pub-date/day"),
        ("pub_date_year", ".//pub-date/year"),
        # Article metadata
        ("article_title", ".//article-title"),
    )
    for key, path in single_value_fields:
        article_data[key] = _element_text(root.find(path))

    # Authors: join whichever name parts are present, so a missing or empty
    # <given-names>/<surname> yields neither a literal "None" nor a stray space.
    author_list = []
    for name in root.findall(".//contrib[@contrib-type='author']/name"):
        parts = (
            _element_text(name.find("given-names")),
            _element_text(name.find("surname")),
        )
        full_name = " ".join(part for part in parts if part)
        if full_name:
            author_list.append(full_name)
    article_data["authors"] = author_list

    # Keywords: every <kwd> with non-empty text.
    keyword_texts = (_element_text(kwd) for kwd in root.findall(".//kwd"))
    article_data["keywords"] = [text for text in keyword_texts if text]

    # Abstract: direct <p> children of any <abstract>, one per line.
    abstract_texts = (_element_text(para) for para in root.findall(".//abstract/p"))
    article_data["abstract"] = "\n".join(text for text in abstract_texts if text)

    # Full text: all paragraphs under the first <body>, if there is one.
    body_element = root.find(".//body")
    if body_element is not None:
        article_data["full_text"] = extract_full_text_from_element(body_element)
    else:
        article_data["full_text"] = None

    return article_data

In [26]:
def pretty_print_article_data(article_data):
    """Print the fields of ``article_data`` to stdout between two rule lines.

    Each key is shown capitalized. A list value is printed as a label line
    followed by one ``  - item`` line per entry; any other value is printed
    on a single ``Label: value`` line.

    Args:
        article_data: Mapping of field name to value, iterated in its own order.
    """
    print("---- ARTICLE DATA ----")
    for field, content in article_data.items():
        label = field.capitalize()
        if not isinstance(content, list):
            # Scalar field (string or None): one line.
            print(f"{label}: {content}")
            continue
        # List field (e.g. authors, keywords): label line, then bullets.
        print(f"<b>{label}:</b>")
        for entry in content:
            print(f"  - {entry}")
    print("----------------------")

In [27]:
# Example: parse one article XML file and print the extracted fields.
# NOTE(review): this is a hard-coded absolute path on one user's machine, so
# the cell fails anywhere else; consider a path relative to a configurable
# data directory.
file_path = r"C:/Users/ccovascosta/Downloads/1807-0337-soc-24-61-0198.xml"
# `data` is the dict returned by extract_data_from_xml for that file.
data = extract_data_from_xml(file_path)
pretty_print_article_data(data)

---- ARTICLE DATA ----
Journal_title: Sociologias
Journal_id: soc
Issn: 1517-4522
Abbrev_journal_title: Sociologias
Publisher_name: Programa de Pós-Graduação em Sociologia - UFRGS
Pub_date_month: 0
Pub_date_day: 0
Pub_date_year: 2023
Article_title: Dois casos na disputa paradigmática do trabalho de socioeducador
<b>Authors:</b>
  - Rosalvo Negreiros de Oliveira
  - Marcílio Dantas Brandão
<b>Keywords:</b>
  - sistema socioeducativo
  - socioeducadores
  - teoria dos afetos
  - sociologia configuracional
  - socio-educational system
  - social educators
  - theory of the affects
  - figurational sociology
Abstract: Inquietados com a experiência de Antônio Carlos Gomes da Costa que, no auge da ditadura militar, implementou uma inovadora ideia de atendimento educacional de adolescentes em privação de liberdade, resgatamos seu relato dessa experiência em Minas Gerais e o cotejamos ao processo de constituição recente da categoria de trabalho de socioeducadores no Ceará, para realizar uma an