In [None]:
import os
import re
import glob
import pickle
from tqdm import tqdm
import pandas as pd
import xml.etree.ElementTree as ET

import multiprocessing

In [None]:
# Glob pattern selecting the PubMed XML files to process
# (presumably the 2023 release, judging by the "pubmed23n" prefix — confirm).
base_filename = "/dataset/pubmed/pubmed23n*.xml"
# Directory that receives the pickled DataFrames, one .pkl per input XML file.
output_path = '/dataset/pubmed_df_year'
# Alternative kept for reference: process the files in reverse-sorted name order.
# files = sorted(glob.glob(base_filename), reverse=True)
# Note: glob.glob makes no ordering guarantee, so the processing order may
# differ between machines/runs.
files = glob.glob(base_filename)

In [None]:
class Counter:
    """Integer counter stored in a ``multiprocessing.Value``.

    The underlying value lives in ``self.val``; increments are performed
    while holding the lock that ``multiprocessing.Value`` provides.
    """

    def __init__(self):
        # Typecode 'i' = signed C int; the count starts at zero.
        self.val = multiprocessing.Value('i', 0)

    def increment(self, n=1):
        """Add ``n`` (default 1) to the count under the value's lock."""
        lock = self.val.get_lock()
        with lock:
            self.val.value = self.val.value + n

    @property
    def value(self):
        """Current count (read without acquiring the lock)."""
        return self.val.value

In [None]:


def _element_text(elem):
    """Return all text inside ``elem``, including text nested in inline markup.

    ``elem.text`` alone stops at the first child element (e.g. ``<i>`` or
    ``<sub>``), which would truncate titles and abstracts.
    """
    return ''.join(elem.itertext())


def _article_to_record(article):
    """Convert one ``<Article>`` element into a record dict.

    Returns ``None`` when the article should be skipped: no ``ArticleTitle``,
    no ``AbstractText``, no ``Author``, or a first author without ``LastName``.
    """
    title_elem = article.find("ArticleTitle")
    abstract_elems = article.findall('.//AbstractText')
    if title_elem is None or not abstract_elems:
        return None

    authors = article.findall('.//Author')
    last_name_elem = authors[0].find('LastName') if authors else None
    if last_name_elem is None:
        return None

    # Prefer an explicit <Year>; fall back to the free-form <MedlineDate>.
    year_elem = article.find('.//PubDate/Year')
    if year_elem is None:
        year_elem = article.find('.//PubDate/MedlineDate')
    year = ''
    if year_elem is not None and year_elem.text:
        # Take the first 4-digit run rather than the first 4 characters so
        # that values such as "Spring 2001" still yield "2001".
        match = re.search(r'\d{4}', year_elem.text)
        if match:
            year = match.group(0)

    # Each abstract section contributes "<Label>: <text> " (label optional);
    # the trailing space after every section is kept for output compatibility.
    abstract_parts = []
    for section in abstract_elems:
        text = _element_text(section)
        if not text:
            continue
        if 'Label' in section.attrib:
            abstract_parts.append(section.attrib['Label'] + ": ")
        abstract_parts.append(text + ' ')

    # Several ELocationID entries may exist; the last one typed "doi" wins.
    # .get() tolerates entries that lack the EIdType attribute.
    doi = ''
    for location in article.findall('.//ELocationID'):
        if location.get('EIdType') == 'doi' and location.text:
            doi = location.text

    return {
        'title': _element_text(title_elem),
        'abstract': ''.join(abstract_parts),
        'doi': doi,
        'first_author': last_name_elem.text,
        'year': year,
    }


def parse_xml(filename, output_file):
    """Parse one PubMed XML file and pickle its articles as a DataFrame.

    Every ``<Article>`` that has a title, at least one ``AbstractText`` and a
    first author with a ``LastName`` becomes one row with the columns
    ``title``, ``abstract``, ``doi``, ``first_author`` and ``year``. Rows are
    collected in a list and the DataFrame is built once, so the frame has a
    regular 0..n-1 index and construction is linear in the number of articles.

    Parameters
    ----------
    filename : str
        Path of the XML file to read.
    output_file : str
        Path of the pickle file to write (its directory must already exist).

    Returns
    -------
    None
        The result is written to ``output_file``.

    Notes
    -----
    Progress is reported through the module-level ``counter`` and ``files``
    when they exist; if they are not defined in the current process, a plain
    "saved" message is printed instead of failing after the file was written.
    """
    # Let ElementTree open the file itself so the encoding declared in the
    # XML prolog (and any BOM) is handled by the parser.
    root = ET.parse(filename).getroot()

    records = []
    for article in root.findall(".//Article"):
        record = _article_to_record(article)
        if record is not None:
            records.append(record)

    df = pd.DataFrame(
        records,
        columns=['title', 'abstract', 'doi', 'first_author', 'year'],
    )
    df.to_pickle(output_file)

    try:
        counter.increment()
        print(f'{counter.value}/{len(files)} saved: {output_file}')
    except NameError:
        # Progress globals are unavailable in this process; the parsing
        # result has already been saved, so only degrade the message.
        print(f'saved: {output_file}')


In [None]:
# Parse every not-yet-converted XML file in parallel.

# Create the shared counter BEFORE the pool: parse_xml reads it as a module
# global, so it has to exist when the worker processes are started (with the
# fork start method the workers inherit it from this process).
counter = Counter()

# Writing the pickles fails if the destination directory is missing.
os.makedirs(output_path, exist_ok=True)

# NOTE: despite its name, `processes` holds (input_xml, output_pkl) argument
# tuples for parse_xml, not Process objects.
processes = []
for filename in tqdm(files):
    filenumber = os.path.basename(filename).split('.')[0]
    output_file = os.path.join(output_path, f'{filenumber}.pkl')
    # Skip inputs that already have a pickle so the cell can be re-run/resumed.
    if os.path.exists(output_file):
        continue
    processes.append((filename, output_file))

# The context manager terminates the workers if starmap raises; on the normal
# path the pool is closed and joined gracefully first.
with multiprocessing.Pool(processes=4) as pool:
    output_List = pool.starmap(parse_xml, processes)
    pool.close()
    pool.join()