In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
import os
from tqdm import tqdm

In [5]:
# Load the previously saved DHQ data table from CSV.
dhq_df = pd.read_csv("../datasets/private_data/dhq_data.csv")

In [9]:
# Number of distinct article ids in the loaded table.
dhq_df['DHQarticle-id'].nunique()

683

In [17]:
# Prefix -> URI map handed to ElementTree's find()/findall() calls
namespaces = dict(
    tei="http://www.tei-c.org/ns/1.0",
    dhq="http://www.digitalhumanities.org/ns/dhq",
    xml="http://www.w3.org/XML/1998/namespace",
)
directory_path = "../datasets/private_data/dhq/articles/"
all_data = []

# A file is dropped when its name contains any of these substrings
exclude = ['old', 'converted', 'dhq', 'sample', 'recovered', 'test', 'walsh']

# Walk the article tree and keep .xml files whose names pass the filters
xml_files = [
    os.path.join(dp, f)
    for dp, dn, filenames in os.walk(directory_path)
    for f in filenames
    if f.endswith('.xml')
    and all(ex_str not in f for ex_str in exclude)
    and not f.startswith('999')
]

len(xml_files)

723

In [21]:
def _find_text(root, path):
    """Return the text of the first element under ``root`` matching ``path``, or None if nothing matches."""
    element = root.find(path, namespaces=namespaces)
    return element.text if element is not None else None


def _extract_author(author_element):
    """Build a dict describing one ``dhq:authorInfo`` element.

    Possible keys are ``author_name``, ``affiliation``, ``email`` and ``bio``;
    a key is only present when the corresponding child element exists.
    """
    author_data = {}

    name_element = author_element.find("dhq:author_name", namespaces=namespaces)
    if name_element is not None:
        # The element's own text is used as the first name; the family name
        # is read from a child element.
        first_name = name_element.text
        family_element = name_element.find("dhq:family", namespaces=namespaces)
        if family_element is not None:
            # `or ''` keeps a missing part from being rendered as the string "None".
            full_name = f"{first_name or ''} {family_element.text or ''}".strip()
        else:
            full_name = first_name
        author_data['author_name'] = full_name

    affiliation_element = author_element.find("dhq:affiliation", namespaces=namespaces)
    if affiliation_element is not None:
        author_data['affiliation'] = affiliation_element.text

    # An unprefixed "email" only matches an element that is in no namespace,
    # so try the TEI namespace first and keep the original lookup as fallback.
    email_element = author_element.find("tei:email", namespaces=namespaces)
    if email_element is None:
        email_element = author_element.find("email", namespaces=namespaces)
    if email_element is not None:
        author_data['email'] = email_element.text

    bio_element = author_element.find("dhq:bio/tei:p", namespaces=namespaces)
    if bio_element is not None:
        author_data['bio'] = ''.join(bio_element.itertext()).strip()

    return author_data


def _extract_article(root, file_name):
    """Collect the metadata, authors and body text of one parsed article into a dict."""
    language_element = root.find(".//tei:profileDesc/tei:langUsage/tei:language", namespaces=namespaces)
    title_element = root.find(".//tei:titleStmt/tei:title", namespaces=namespaces)
    body_element = root.find(".//tei:text/tei:body", namespaces=namespaces)

    record = {
        'DHQarticle-id': _find_text(root, ".//tei:publicationStmt/tei:idno[@type='DHQarticle-id']"),
        'volume': _find_text(root, ".//tei:publicationStmt/tei:idno[@type='volume']"),
        'issue': _find_text(root, ".//tei:publicationStmt/tei:idno[@type='issue']"),
        'articleType': _find_text(root, ".//tei:publicationStmt/dhq:articleType"),
        'date_when': _find_text(root, ".//tei:publicationStmt/tei:date"),
        'dhq_keywords': _find_text(root, ".//tei:encodingDesc/tei:classDecl/tei:taxonomy[@xml:id='dhq_keywords']/tei:bibl"),
        # .get() instead of ['ident'] so a <language> without the attribute yields None
        'language_ident': language_element.get('ident') if language_element is not None else None,
        'dhq_abstract': _find_text(root, ".//tei:text/tei:front/dhq:abstract/tei:p"),
        'file_name': file_name,
        # itertext() walks nested markup in document order and skips empty text nodes
        'title': ''.join(title_element.itertext()) if title_element is not None else None,
    }

    author_elements = root.findall(".//tei:titleStmt/dhq:authorInfo", namespaces=namespaces)
    record['authors'] = [_extract_author(author_element) for author_element in author_elements]

    # A file without a <body> gets None instead of aborting the whole run
    record['body_text'] = ''.join(body_element.itertext()).strip() if body_element is not None else None
    return record


def process_xml_files(xml_files, output_path):
    """Parse article XML files into one DataFrame, cached as a CSV at ``output_path``.

    If ``output_path`` already exists it is read with ``pd.read_csv`` and
    returned as-is; no XML file is touched. Otherwise every file in
    ``xml_files`` is parsed, one row per file is collected, and the result is
    written to ``output_path`` (without the index) before being returned.

    Files that contain only the XML declaration, or that fail to parse, are
    skipped (the latter with a printed message).

    Relies on the module-level ``namespaces`` mapping for XPath prefixes.

    Parameters
    ----------
    xml_files : iterable of str
        Paths of the XML files to parse.
    output_path : str
        CSV path used both as cache and as output.

    Returns
    -------
    pandas.DataFrame
        Columns: DHQarticle-id, volume, issue, articleType, date_when,
        dhq_keywords, language_ident, dhq_abstract, file_name, title,
        authors (list of dicts), body_text. When no file yields a row an
        empty DataFrame is returned and no CSV is written.
    """
    if os.path.exists(output_path):
        return pd.read_csv(output_path)

    # Local accumulator: rows from an earlier call must not leak into this one.
    records = []
    for file_name in tqdm(xml_files, desc="Processing XML files"):
        with open(file_name, 'r', encoding='utf-8') as f:
            file_content = f.read().strip()

        if file_content == '<?xml version="1.0" encoding="UTF-8"?>':
            continue  # Nothing but the XML declaration: no article to extract

        try:
            root = ET.parse(file_name).getroot()
        except ET.ParseError:
            print(f"Error parsing {file_name}. Skipping...")
            continue

        records.append(_extract_article(root, file_name))

    # Build the frame once from the list of dicts instead of concatenating
    # one single-row frame per file.
    final_df = pd.DataFrame(records)
    if records:
        final_df.to_csv(output_path, index=False)
    return final_df

In [40]:

directory_path = "../datasets/private_data/dhq/articles/"
all_data = []

# A file is dropped when its name contains any of these substrings
exclude = ['old', 'converted', 'dhq', 'sample', 'recovered', 'test', 'walsh']

# Walk the article tree and keep .xml files whose names pass the filters
xml_files = [
    os.path.join(dp, f)
    for dp, dn, filenames in os.walk(directory_path)
    for f in filenames
    if f.endswith('.xml')
    and all(ex_str not in f for ex_str in exclude)
    and not f.startswith('999')
]

# Parse the listed files, or load the CSV if it already exists at this path
df = process_xml_files(
    xml_files,
    '../datasets/private_data/dhq/reprocessed_dhq_data.csv',
)

Processing XML files: 100%|██████████| 723/723 [00:01<00:00, 410.46it/s]


In [122]:
# Build (or load from cache) the table that the cleaning cells below operate on.
# NOTE(review): the recorded progress bar reports 683 files while the listing
# cells in this notebook report 723, so `xml_files` was presumably rebuilt
# from another directory in a cell that is not part of this notebook — confirm.
# NOTE(review): process_xml_files returns the existing CSV when this path
# exists, and the last cell of the notebook writes `merged_df` to this same path.
final_df = process_xml_files(xml_files, '../data/original_journal_datasets/dhq/dhq_data.csv')

# Save the dataframe to a CSV file
# final_df.to_csv('../data/original_journal_datasets/dhq/dhq_data.csv', index=False)


Processing XML files: 100%|██████████| 683/683 [00:01<00:00, 617.34it/s]


In [41]:
# Order the rows by article id (rebinds `df` to the sorted copy).
df = df.sort_values(by=['DHQarticle-id'])

In [42]:
# Article ids of the rows that have no volume value
df.loc[df.volume.isna(), 'DHQarticle-id'].unique()

array(['000432', '000488', '000564', '000573', '000580', '000650',
       '000657', '000664', '000677 ', '000709', '000712', '000714',
       '000715', '000716', None], dtype=object)

In [39]:
# Inspect the row(s) of a single article
df.loc[df['DHQarticle-id'] == "000424"]

Unnamed: 0,DHQarticle-id,volume,issue,articleType,date_when,dhq_keywords,language_ident,dhq_abstract,file_name,title,authors,body_text
0,424,,,article,,DHQ classification scheme; full list available...,en,Estudos recentes estão conseguindo demonstrar ...,../datasets/private_data/dhq/articles/000424/0...,SIG e história da arquitetura. Avances no\n ...,[{'author_name': 'Patricia  ...,Introdução\n Os Sistemas de Inf...


In [143]:
# Hand-entered volume/issue values, keyed by file path: (volume, issue)
manual_volume_issue = {
    "../data/dhq_data/000664.xml": ("016", "4"),
    "../data/dhq_data/000684.xml": ("017", "2"),
}
for fixed_path, (fixed_volume, fixed_issue) in manual_volume_issue.items():
    is_target = final_df.file_name == fixed_path
    final_df.loc[is_target, "volume"] = fixed_volume
    final_df.loc[is_target, "issue"] = fixed_issue

# Repair a misspelled month name inside the date strings
final_df.date_when = final_df.date_when.str.replace('Feburary', 'February')

In [144]:
# Cast both numbering columns to int
for numbering_column in ('volume', 'issue'):
    final_df[numbering_column] = final_df[numbering_column].astype(int)

In [145]:
# Missing-value count per column after the manual fixes and int casts.
final_df.isna().sum()

DHQarticle-id      0
volume             0
issue              0
articleType        0
date_when         12
dhq_keywords       0
language_ident     2
dhq_abstract      21
file_name          0
title              0
authors            0
body_text          0
dtype: int64

In [146]:
# Missing-value count per column of the older table, for comparison.
# NOTE(review): `older_df` is not defined in any cell of this notebook;
# presumably it was loaded in a cell that has since been removed — confirm
# and add the load so the notebook runs on a fresh kernel.
older_df.isna().sum()

DHQarticle-id      0
volume            15
issue             15
articleType        0
date              18
dhq_keywords       0
language_ident     2
dhq_abstract      24
file_name          0
title              0
authors            0
body_text          0
dtype: int64

In [147]:
# Rename the older table's date column to `date` so that, once merged with
# `final_df`, it sits next to `date_when` under a distinct name.
older_df = older_df.rename(columns={'date_when': 'date'})

In [148]:
# Attach the older table's `date` to each article, then keep only rows that have body text
merged_df = final_df.merge(
    older_df[['DHQarticle-id', 'date']],
    on='DHQarticle-id',
    how='outer',
)
merged_df = merged_df.loc[merged_df.body_text.notna()]

In [149]:
# Where `date_when` is missing, take the value from the older table's `date`
date_differs = merged_df.date_when != merged_df.date
date_when_absent = merged_df.date_when.isna()
merged_df.loc[date_differs & date_when_absent, 'date_when'] = merged_df.date

In [150]:
# Rows whose `date_when` is still missing after the backfill
still_differs = merged_df.date_when != merged_df.date
still_absent = merged_df.date_when.isna()
missing = merged_df.loc[still_differs & still_absent]

In [151]:
# Load the table of article XML links.
xml_links = pd.read_csv("../data/dhq_xml_links.csv")

In [152]:
# Build a local path from the last '/'-separated segment of each link so the
# rows can be matched against the `file_name` column of the article table.
xml_links['file_name'] = "../data/dhq_data/" + xml_links.xml_link.str.split('/').str[-1]

In [153]:
# Link rows that correspond to the articles still lacking a date
xml_links.loc[xml_links.file_name.isin(missing.file_name)]

Unnamed: 0,xml_link,volume,issue,DHQarticle-id,file_name
46,/dhq/vol/17/2/000673.xml,17,2,673,../data/dhq_data/000673.xml
47,/dhq/vol/17/2/000673.xml,17,2,673,../data/dhq_data/000673.xml
48,/dhq/vol/17/2/000680.xml,17,2,680,../data/dhq_data/000680.xml
49,/dhq/vol/17/2/000680.xml,17,2,680,../data/dhq_data/000680.xml
102,/dhq/vol/16/4/000664.xml,16,4,664,../data/dhq_data/000664.xml
103,/dhq/vol/16/4/000664.xml,16,4,664,../data/dhq_data/000664.xml
152,/dhq/vol/16/2/000646.xml,16,2,646,../data/dhq_data/000646.xml
153,/dhq/vol/16/2/000646.xml,16,2,646,../data/dhq_data/000646.xml


In [154]:
# Parse the date strings into datetimes; errors='coerce' turns values that
# cannot be parsed into NaT instead of raising.
merged_df['date_published'] = pd.to_datetime(merged_df['date_when'], errors='coerce')

In [155]:
# Show the rows that still have no date.
missing

Unnamed: 0,DHQarticle-id,volume,issue,articleType,date_when,dhq_keywords,language_ident,dhq_abstract,file_name,title,authors,body_text,date
184,680,17.0,2.0,article,,DHQ classification scheme; full list available...,en,Southern Italian digital humanist Domenico Fio...,../data/dhq_data/000680.xml,"Language,\n Materiality, and Dig...","[{'author_name': 'Cristina Migliaccio', 'affi...",I shall try . . . to say something useful abou...,
334,646,16.0,2.0,article,,DHQ classification scheme; full list available...,en,,../data/dhq_data/000646.xml,Introduction: The\n Questions of...,"[{'author_name': 'Roopika Risam', 'affiliatio...",Minimal computing is the answer. Minimal compu...,
459,664,16.0,4.0,article,,DHQ classification scheme; full list available...,en,"The annual, international Digital Humanities c...",../data/dhq_data/000664.xml,Response to\n\t\t\t\t\t\tThe circus we deserve...,[{'author_name': 'The Alliance of Digital Huma...,Executive Summary \n\t\t\t\t\tWe thank DHQ for...,
498,673,17.0,2.0,article,,DHQ classification scheme; full list available...,en,"A novel, rule-based, automatic framework for i...",../data/dhq_data/000673.xml,Automatic\n Identification of Rh...,"[{'author_name': 'Heyam Abd Alhadi', 'affilia...","1. Introduction\n Rhetoric, balāgha...",


In [156]:
# For each article without a date, reuse the latest parsed date found among
# the rows sharing its volume and issue.
for _, row in missing.iterrows():
    print(row.file_name)
    vol, issue = row['volume'], row['issue']
    print(vol, issue)

    in_same_issue = (merged_df.volume == vol) & (merged_df.issue == issue)
    # Descending sort puts the latest date first (missing dates sort last)
    newest_first = merged_df[in_same_issue].sort_values(by='date_published', ascending=False)
    date = newest_first.iloc[0].date_published
    print(date)

    is_this_article = merged_df.file_name == row.file_name
    merged_df.loc[is_this_article, 'date_published'] = date

../data/dhq_data/000680.xml
17.0 2.0
2023-07-20 00:00:00
../data/dhq_data/000646.xml
16.0 2.0
2022-06-25 00:00:00
../data/dhq_data/000664.xml
16.0 4.0
2022-10-14 00:00:00
../data/dhq_data/000673.xml
17.0 2.0
2023-07-20 00:00:00


In [157]:
# Missing-value count per column after imputing `date_published`.
merged_df.isna().sum()

DHQarticle-id      0
volume             0
issue              0
articleType        0
date_when          4
dhq_keywords       0
language_ident     2
dhq_abstract      21
file_name          0
title              0
authors            0
body_text          0
date               4
date_published     0
dtype: int64

In [158]:
# Write the merged table to disk (index not written).
# NOTE(review): this is the same path that is passed to process_xml_files
# earlier in the notebook, and that function returns the existing CSV when
# the path exists. After this write, a re-run will load this merged table
# (including the `date` and `date_published` columns) instead of re-parsing
# the XML files — consider a separate output path.
merged_df.to_csv('../data/original_journal_datasets/dhq/dhq_data.csv', index=False)