In [None]:
import glob
import hashlib
import os
import shutil
import subprocess

import tqdm
from google.cloud import storage

In [None]:
 pip install xmltodict -q

In [None]:
# Export the service-account key path through GOOGLE_APPLICATION_CREDENTIALS
# before creating the storage client, then open the bucket used by the notebook.
os.environ.update(
    GOOGLE_APPLICATION_CREDENTIALS='/content/drive/MyDrive/D4G - ceebios/d4g-ceebios-a11dc912ca4a.json'
)
storage_client = storage.Client()
bucket = storage_client.get_bucket('d4g-ceebios-bdd')

In [None]:
# Colab-only user authentication. The output saved with this cell shows the call
# ending in an AuthorizationError -- NOTE(review): confirm this cell is still needed.
from google.colab import auth
auth.authenticate_user()

AuthorizationError: ignored

In [None]:
# Interactive gcloud setup: prompts for the configuration, account and project.
!gcloud init

Welcome! This command will take you through the configuration of gcloud.

Settings from your current configuration [default] are:
component_manager:
  disable_update_check: 'True'
core:
  account: barkova.anastasia@gmail.com

Pick configuration to use:
 [1] Re-initialize this configuration [default] with new settings 
 [2] Create a new configuration
Please enter your numeric choice:  1

Your current configuration has been set to: [default]

You can skip diagnostics next time by using the following flag:
  gcloud init --skip-diagnostics

Network diagnostic detects and fixes local network connection issues.
Reachability Check passed.
Network diagnostic passed (1/1 checks passed).

Choose the account you would like to use to perform operations for this 
configuration:
 [1] barkova.anastasia@gmail.com
 [2] Log in with a new account
Please enter your numeric choice:  1

You are logged in as: [barkova.anastasia@gmail.com].

Pick cloud project to use: 
 [1] d4g-ceebios
 [2] youtube-12-11
 [3]

### Import functions and classes

In [None]:
import re
import xmltodict
import json
import xml.etree.ElementTree as ET
from pprint import pprint

# This code was written by Lucas Le Corvec
def _unique(sequence):
    """Function to remove duplicates while keeping inserterion order.
    """
    seen = set()
    return [x for x in sequence if not (x in seen or seen.add(x))]

def authors_list(value):
    """Split a list of name pairs into the main author and the remaining authors.

    Each entry of ``value`` is indexed as ``entry[0]`` and ``entry[1]``; the two
    parts are joined with a single space. An empty ``value`` yields ``('', [])``.
    """
    if not value:  # No author at all
        return '', []

    lead = value[0]
    main_author = ' '.join((lead[0], lead[1]))
    co_authors = [' '.join((entry[0], entry[1])) for entry in value[1:]]
    return main_author, co_authors


def _has_numbers(string):
    """Function to check if string has numeric character.
    """
    return any(char.isdigit() for char in string)

def _numeric_caracter_splitting(string):
    """Function to split figure reference that have a numeric character inside it.
    """
    if 'fig' in string.lower() and _has_numbers(string):
        return re.findall('\d*\D+', string)
    else:
        return string


def figure_ref_detection(text):
    """Collect the figure references (e.g. ``'3'``, ``'4B'``) cited in a paragraph.

    The paragraph is split on single spaces; tokens such as ``'fig.4B'`` are
    further split by ``_numeric_caracter_splitting``. Every token that contains
    ``'fig'`` and is 3 to 8 characters long is treated as a figure keyword, and
    the tokens that follow it are read as references, following lists separated
    by ``','`` and/or ``'and'``.

    Args:
        text: the paragraph text.

    Returns:
        The cleaned references, without duplicates, in order of first
        appearance. Only non-empty references of at most 3 characters are kept.

    Unlike the previous implementation, a reference list that runs up to the
    end of the paragraph (trailing ``','`` or ``'and'``) no longer raises
    ``IndexError``; the references found so far are kept.
    """
    # Tokenise, flattening the tokens that had a figure number glued to them
    # (example: 'fig.4B' -> 'fig.', '4B').
    tokens = []
    for raw_token in text.split(' '):
        pieces = _numeric_caracter_splitting(raw_token)
        if isinstance(pieces, list):
            tokens.extend(pieces)
        else:
            tokens.append(pieces)
    token_count = len(tokens)

    figure_ref = []
    for index, element in enumerate(tokens):
        # All figure keywords "should" validate this condition ==> To verify
        if 'fig' not in element.lower() or not 3 <= len(element) <= 8:
            continue

        if index + 1 >= token_count:  # Keyword is the last token: nothing to read
            continue
        element_to_add = tokens[index + 1]  # The next token is the figure reference
        figure_ref.append(element_to_add)

        if index + 2 >= token_count:  # Case 0 : the reference ends the paragraph
            continue

        if element_to_add.endswith(','):  # Case 1 : another reference follows
            cursor = index + 2
            # Case 1.1 : several references separated by ','
            while cursor < token_count and tokens[cursor].endswith(','):
                figure_ref.append(tokens[cursor])
                cursor += 1
            if cursor >= token_count:
                # The comma list reached the end of the paragraph; this used
                # to raise IndexError in the loop condition.
                continue
            # Cases 1.2 - 1.4 : the token closing the comma list is a reference
            figure_ref.append(tokens[cursor])
            # Case 1.3 : one more reference after 'and', when there is one
            if cursor + 2 < token_count and tokens[cursor + 1].lower() == 'and':
                figure_ref.append(tokens[cursor + 2])

        elif tokens[index + 2].lower() == 'and' and index + 3 < token_count:
            # Case 2 : only two references, separated by 'and'
            figure_ref.append(tokens[index + 3])
        # Case 3 : only one reference, already added

    # Regex to clean figure references
    figure_ref_cleaned_re = [re.sub(r'[,()\'´:"”’.;]', '', x) for x in figure_ref]

    # A reference without any digit inherits the first character of the
    # reference before it. Exemple : fig. 3D, E and F -> 3D, 3E, 3F
    # NOTE(review): for the first reference `index - 1` is -1, so it borrows
    # from the LAST reference; kept as-is, confirm whether that is intended.
    for index, element in enumerate(figure_ref_cleaned_re):
        if not _has_numbers(element):
            previous = figure_ref_cleaned_re[index - 1]
            if previous:  # An empty neighbour used to raise IndexError
                figure_ref_cleaned_re[index] = previous[0] + element

    # Keep short references only; empty strings can only appear on the paths
    # that previously crashed, and are never valid references.
    figure_ref_cleaned_len = [x for x in figure_ref_cleaned_re if 0 < len(x) <= 3]

    return _unique(figure_ref_cleaned_len)  # In order to remove duplicates


def match_figure_ref(doi, figure_list, dict_figure):
    """Map paragraph figure references to the ``graphic_ref`` of matching figures.

    Both the references of ``figure_list`` and the keys of ``dict_figure[doi]``
    are stripped of ASCII letters, spaces, ``+`` and punctuation before being
    compared; every pair left with the same text contributes the figure's
    ``'graphic_ref'`` entry to the result.
    """
    def _strip_label(label):
        # Remove letters, blank spaces and punctuation, leaving the number part
        return re.sub(r'[A-Za-z,()\'´:"”’.; +]', '', label)

    matches = []
    for paragraph_ref in figure_list:
        wanted = _strip_label(paragraph_ref)
        article_figures = dict_figure[doi]
        for figure_key in article_figures:
            if _strip_label(figure_key) == wanted:
                matches.append(article_figures[figure_key]['graphic_ref'])
    return matches


## Code by Paul-Henri & Anastasia

def arborescence(filename, counter):
    """Pretty-print the nested key structure of the ``article`` element of an XML file.

    ``counter`` is the starting value handed to ``_dic_to_keys_dic``, which
    labels the non-dict entries with a depth.
    """
    key_tree = {}
    _dic_to_keys_dic(_file_to_dic(filename), key_tree, counter)
    pprint(key_tree)


def _dic_to_keys_dic(dic, keys_dic, counter):
    counter += 1
    keys = dic.keys()
    for key in keys:
        if isinstance(dic[key], dict):
            keys_dic[key] = {}
            nested = dic[key]
            nested_keys = nested.keys()
            _dic_to_keys_dic(dic[key], keys_dic[key], counter)
            counter -= 1
        else:
            keys_dic[key] = counter

def _file_to_dic(f):
    """Parse the XML file at path ``f`` and return its ``article`` element as a dict.
    """
    with open(f, "rb") as handle:
        parsed = xmltodict.parse(handle)
    # Round-trip through JSON before extracting the 'article' entry
    return json.loads(json.dumps(parsed))['article']


class Biorxiv_Plos_Parser:
    """Extract the DOI, article metadata, paragraphs and figures of an article XML file.

    This class can parse the following PLOS journals:
    PLOS One, Genetics, Biology, Computational Biology, Clinical Trials,
    Neglected Tropical Diseases, Pathogens.
    And it also parses bioRxiv articles.

    Args:
        xml: source handed to ``ET.parse`` (a file path in this notebook).
        journal_type: ``'plos'`` or ``'biorxiv'``; it selects the URL filter
            and the fallback used for figure ids.

    Every public method parses the file once. A missing DOI, title or journal
    element raises ``AttributeError`` (``find`` returns ``None``).
    """

    def __init__(self, xml, journal_type):
        self.xml = xml
        self.journal_type = journal_type

    # ------------------------------------------------------------------ helpers
    def _parse_root(self):
        """Parse ``self.xml`` and return the root element."""
        return ET.parse(self.xml).getroot()

    @staticmethod
    def _doi_from_root(root):
        """Return the text of the first element whose ``pub-id-type`` is ``doi``."""
        return root.find(".//*[@pub-id-type='doi']").text

    @staticmethod
    def _flat_text(element):
        """Return all the text under ``element`` with tabs and newlines removed."""
        return "".join(element.itertext()).replace('\t', '').replace('\n', '')

    @staticmethod
    def _stable_id(text):
        """Return a signed 64-bit integer id derived from ``text``.

        The built-in ``hash()`` of a string is salted per interpreter process,
        so ids computed with it changed at every kernel restart. A SHA-256
        digest gives the same id for the same text on every run.
        """
        digest = hashlib.sha256(text.encode('utf-8')).digest()
        return int.from_bytes(digest[:8], 'big', signed=True)

    def _figure_id(self, label):
        """Return the figure number found in a ``<label>`` text.

        The last run of digits is used, so ``'Fig 10'`` gives ``'10'`` (taking
        a single character gave ``'0'``). Labels without any digit fall back
        to the previous positional rule: last character for PLOS, second to
        last otherwise; slices are used so short labels give ``''`` instead of
        raising ``IndexError``.
        """
        numbers = re.findall(r'\d+', label)
        if numbers:
            return numbers[-1]
        if self.journal_type == 'plos':
            return label[-1:]
        return label[-2:-1]

    # --------------------------------------------------------------- public API
    def get_doi(self):
        """Return the DOI of the article."""
        return self._doi_from_root(self._parse_root())

    def get_paragraphs(self):
        """Return the body paragraphs that cite at least one figure.

        Returns:
            A list of dicts ``{'content': str, 'meta': {...}}`` where ``meta``
            holds ``document_id`` (stable id of raw paragraph text + DOI),
            ``figures_ids`` (output of ``figure_ref_detection``), ``type``
            (``'paragraph'``) and ``doi``. Paragraphs without any detected
            figure reference are skipped.
        """
        root = self._parse_root()
        doi = self._doi_from_root(root)

        paragraphs = []
        # Select all the paragraphs that sit directly under a <sec>
        for node in root.findall(".//sec/p"):
            paragraph = "".join(node.itertext())
            figure_list = figure_ref_detection(paragraph)
            if not figure_list:
                continue
            paragraphs.append({
                'content': paragraph.replace('\t', '').replace('\n', ''),
                'meta': {
                    'document_id': self._stable_id(paragraph + doi),
                    'figures_ids': figure_list,
                    'type': 'paragraph',
                    'doi': doi,
                },
            })

        return paragraphs

    def get_article(self):
        """Return the abstract and the metadata of the article.

        Returns:
            ``{'id': int, 'content': abstract, 'meta': {...}}`` where ``meta``
            holds ``title``, ``authors`` (surnames only), ``journal`` and
            ``doi``.
        """
        root = self._parse_root()
        doi = self._doi_from_root(root)

        # Get the Abstract (all abstract elements concatenated)
        abstract = ''.join(
            self._flat_text(node)
            for node in root.findall(".//front/article-meta/abstract")
        )

        # Get the Title. itertext() keeps the words wrapped in inline tags
        # (e.g. <italic>); reading `.text` stopped at the first child element.
        title = "".join(root.find(".//article-title").itertext())

        # Get the Authors (surnames)
        authors = [
            "".join(node.itertext())
            for node in root.findall(".//*[@contrib-type='author']/name/surname")
        ]

        # Get the journal
        journal = root.find(".//journal-title").text

        return {
            'id': self._stable_id(doi + abstract),
            'content': abstract,
            'meta': {
                'title': title,
                'authors': authors,
                'journal': journal,
                'doi': doi,
            },
        }

    def get_figures(self):
        """Return one entry per figure: id, caption text, URL and DOI.

        Returns:
            A list of dicts ``{'id': str, 'content': 'title; caption',
            'meta': {'url', 'type', 'fig_id', 'doi'}}``.

        URLs, labels and captions are collected in three independent passes and
        paired by position; ``zip`` stops at the shortest list.
        NOTE(review): a figure lacking a label or an object-id shifts the
        pairing of the figures after it -- confirm the inputs never do that.
        """
        root = self._parse_root()

        # Get the figures URLs; for bioRxiv only the object-ids that mention
        # 'biorxiv' are kept.
        figure_urls = ["".join(node.itertext()) for node in root.findall(".//fig/object-id")]
        if self.journal_type == 'biorxiv':
            figure_urls = [url for url in figure_urls if 'biorxiv' in url]

        # Get the figures IDs
        ids = [
            self._figure_id("".join(node.itertext()))
            for node in root.findall(".//fig/label")
        ]

        # Create a string of figure title + its caption. When the title or the
        # caption is absent, a "... not found" placeholder is used instead.
        captions = []
        for fig in root.findall(".//fig"):
            caption_node = fig.find('caption/p')
            title_node = fig.find('caption/title')
            caption = self._flat_text(caption_node) if caption_node is not None else 'Caption not found'
            title = self._flat_text(title_node) if title_node is not None else "Title not found"
            captions.append(title + '; ' + caption)

        entries = list(zip(figure_urls, captions, ids))
        if not entries:
            return []

        # Read the DOI once, from the tree that is already parsed, instead of
        # re-parsing the file twice per figure.
        doi = self._doi_from_root(root)

        # Build the figures dictionary
        figures = []
        for url, caption, id_ in entries:
            figures.append({
                'id': id_,
                'content': caption,
                'meta': {
                    'url': url,
                    'type': 'figure',
                    'fig_id': doi + f'_{id_}',
                    'doi': doi,
                },
            })

        return figures


### Test access to the bucket

In [None]:
# Display the bucket handle obtained above.
bucket

<Bucket: d4g-ceebios-bdd>

In [None]:
# Materialise the listing of every object under the raw_data/xml prefix.
blobs = list(bucket.list_blobs(prefix='raw_data/xml'))

In [None]:
# Number of objects returned for the prefix.
len(blobs)

9531

In [None]:
# Copy the items JSON file from the bucket to temp_file.json in the working directory.
!gsutil cp -r gs://d4g-ceebios-bdd/raw_data/items25-00-2022.json temp_file.json

Copying gs://d4g-ceebios-bdd/raw_data/items25-00-2022.json...
/ [1 files][ 30.4 MiB/ 30.4 MiB]                                                
Operation completed over 1 objects/30.4 MiB.                                     


In [None]:
# Local directory that receives the XML files copied from GCS.
# exist_ok keeps the cell re-runnable: os.mkdir raised FileExistsError when
# the directory was already there.
os.makedirs('xml', exist_ok=True)

In [None]:
def load(file_path):
    """Copy ``file_path`` from Google Cloud Storage into the local ``xml`` directory.

    Runs ``gsutil -m cp -r <file_path> xml``. The arguments are passed as a
    list, so the path is never interpreted by a shell.

    Args:
        file_path: source handed to ``gsutil`` (a ``gs://`` URL in this notebook).

    Raises:
        FileNotFoundError: if the ``gsutil`` executable cannot be found.
        subprocess.CalledProcessError: if ``gsutil`` exits with a non-zero
            status (the previous ``os.system`` call ignored failures).
    """
    subprocess.run(['gsutil', '-m', 'cp', '-r', file_path, 'xml'], check=True)

# Recursively copy the bucket's raw_data/xml folder into the local xml directory.
file = 'gs://d4g-ceebios-bdd/raw_data/xml'
load(file)

In [None]:
# Names of the entries found in xml/xml, and how many there are.
list_files = os.listdir('xml/xml')
print(len(list_files))

9530


In [None]:
# Load the JSON file downloaded earlier; the context manager closes the file
# handle that the bare open() call left open.
with open('/content/temp_file.json') as items_file:
    file = json.load(items_file)

### Parse the xml elements and save them

In [None]:
# Every XML article found in the download directory.
path_all = glob.glob(os.path.join('/content/xml/xml', '*.xml'))
journal_type = 'plos'

path_to_db_fig = '/content/xml/parsed_plos_fig'
path_to_db_para = '/content/xml/parsed_plos_para'

# Create the directories to save the parsed files if they don't exist
os.makedirs(path_to_db_fig, exist_ok=True)
os.makedirs(path_to_db_para, exist_ok=True)

not_parsed_figs = []
not_parsed_paras = []
for file in tqdm.tqdm(path_all):
    PLOS = Biorxiv_Plos_Parser(file, journal_type)
    doi = PLOS.get_doi()
    doi = doi.replace('/', '-')  # '/' cannot appear in a file name

    # If the xml cannot be parsed it is added to the list, and the iteration
    # continues with the next file (nothing is saved for this one).
    # `except Exception` rather than a bare `except:` so that interrupting the
    # kernel (KeyboardInterrupt) still stops the loop.
    try:
        figs = PLOS.get_figures()
    except Exception:
        print(f'Could not parse figures for {file}')
        not_parsed_figs.append(file)
        continue

    #article_meta = PLOS.get_article()
    try:
        paragraphs = PLOS.get_paragraphs()
    except Exception:
        print(f'Could not parse paragraphs for {file}')
        not_parsed_paras.append(file)
        continue

    # Save dicts as json
    with open(os.path.join(path_to_db_fig, f'{doi}_fig.json'), 'w') as f1:
        json.dump(figs, f1)

    with open(os.path.join(path_to_db_para, f'{doi}_paragraphs.json'), 'w') as f2:
        json.dump(paragraphs, f2)

  7%|▋         | 632/9530 [00:46<09:09, 16.20it/s]

Could not parse paragraphs for /content/xml/xml/10.1371-journal.pone.0001439.xml


 15%|█▍        | 1416/9530 [01:48<09:39, 14.01it/s]

Could not parse paragraphs for /content/xml/xml/10.1371-journal.pone.0017828.xml


 16%|█▌        | 1528/9530 [01:57<06:56, 19.20it/s]

Could not parse paragraphs for /content/xml/xml/10.1371-journal.pone.0018039.xml


 25%|██▌       | 2392/9530 [03:01<08:32, 13.93it/s]

Could not parse paragraphs for /content/xml/xml/10.1371-journal.pone.0018318.xml


 52%|█████▏    | 4963/9530 [06:15<04:31, 16.85it/s]

Could not parse paragraphs for /content/xml/xml/10.1371-journal.pone.0018277.xml


 53%|█████▎    | 5054/9530 [06:22<05:50, 12.77it/s]

Could not parse paragraphs for /content/xml/xml/10.1371-journal.pone.0018755.xml


 61%|██████▏   | 5853/9530 [07:28<03:18, 18.54it/s]

Could not parse paragraphs for /content/xml/xml/10.1371-journal.pone.0014799.xml


 62%|██████▏   | 5897/9530 [07:31<03:58, 15.20it/s]

Could not parse paragraphs for /content/xml/xml/10.1371-journal.pone.0017775.xml


 74%|███████▍  | 7084/9530 [09:16<02:50, 14.37it/s]

Could not parse paragraphs for /content/xml/xml/10.1371-journal.pone.0006577.xml


 79%|███████▊  | 7501/9530 [09:47<02:57, 11.43it/s]

Could not parse paragraphs for /content/xml/xml/10.1371-journal.pone.0018859.xml


 94%|█████████▎| 8920/9530 [11:41<02:44,  3.71it/s]

Could not parse paragraphs for /content/xml/xml/10.1371-journal.pone.0014631.xml


100%|██████████| 9530/9530 [12:30<00:00, 12.69it/s]


In [None]:
# How many files failed at the figure step, then at the paragraph step.
for failed in (not_parsed_figs, not_parsed_paras):
    print(len(failed))

0
11


In [None]:
# Save the lists of files that could not be parsed (only when non-empty).
# The context managers close the files, so the JSON is flushed to disk before
# the next cell reads or zips anything.
if not_parsed_figs:
    with open('/content/xml/2022-06-11_not_parsed_figs_plos.json', 'w') as failed_figs_file:
        json.dump(not_parsed_figs, failed_figs_file)
if not_parsed_paras:
    with open('/content/xml/2022-06-11_not_parsed_paras_plos.json', 'w') as failed_paras_file:
        json.dump(not_parsed_paras, failed_paras_file)

In [None]:
!zip -r /content/parsed_plos_fig.zip /content/xml/parsed_plos_fig

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
  adding: content/xml/parsed_plos_fig/10.1371-journal.pone.0221397_fig.json (deflated 67%)
  adding: content/xml/parsed_plos_fig/10.1371-journal.pone.0066370_fig.json (deflated 63%)
  adding: content/xml/parsed_plos_fig/10.1371-journal.pone.0053066_fig.json (stored 0%)
  adding: content/xml/parsed_plos_fig/10.1371-journal.pntd.0007311_fig.json (deflated 57%)
  adding: content/xml/parsed_plos_fig/10.1371-journal.pone.0098200_fig.json (deflated 66%)
  adding: content/xml/parsed_plos_fig/10.1371-journal.pone.0057192_fig.json (deflated 67%)
  adding: content/xml/parsed_plos_fig/10.1371-journal.pone.0180284_fig.json (deflated 51%)
  adding: content/xml/parsed_plos_fig/10.1371-journal.pone.0034023_fig.json (deflated 70%)
  adding: content/xml/parsed_plos_fig/10.1371-journal.pone.0055192_fig.json (deflated 65%)
  adding: content/xml/parsed_plos_fig/10.1371-journal.pone.0213268_fig.json (deflated 71%)
 

In [None]:
!zip -r /content/parsed_plos_para.zip /content/xml/parsed_plos_para

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
  adding: content/xml/parsed_plos_para/10.1371-journal.pone.0239847_paragraphs.json (deflated 60%)
  adding: content/xml/parsed_plos_para/10.1371-journal.pone.0114982_paragraphs.json (deflated 65%)
  adding: content/xml/parsed_plos_para/10.1371-journal.pone.0088945_paragraphs.json (deflated 63%)
  adding: content/xml/parsed_plos_para/10.1371-journal.pone.0025172_paragraphs.json (deflated 63%)
  adding: content/xml/parsed_plos_para/10.1371-journal.pone.0028920_paragraphs.json (deflated 50%)
  adding: content/xml/parsed_plos_para/10.1371-journal.pone.0126932_paragraphs.json (deflated 62%)
  adding: content/xml/parsed_plos_para/10.1371-journal.pone.0236556_paragraphs.json (deflated 72%)
  adding: content/xml/parsed_plos_para/10.1371-journal.pone.0194429_paragraphs.json (deflated 61%)
  adding: content/xml/parsed_plos_para/10.1371-journal.pone.0103389_paragraphs.json (deflated 65%)
  adding: content

In [None]:
# Parser for a single article, the one handled by the "not_working" cells below.
plos = Biorxiv_Plos_Parser('/content/xml/xml/10.1371-journal.pntd.0010019.xml', 'plos')

In [None]:
# Keep a copy of the problematic article under xml/not_working.
# The previous json.dump call wrote the *path string* (as JSON) into the .xml
# file rather than the document, and left the file handle open.
os.makedirs('xml/not_working', exist_ok=True)
shutil.copyfile('/content/xml/xml/10.1371-journal.pntd.0010019.xml',
                'xml/not_working/10.1371-journal.pntd.0010019.xml')

In [None]:
# Parse the article and write the parsed tree back out into the not_working folder.
path = '/content/xml/xml/10.1371-journal.pntd.0010019.xml'
not_working_copy = '/content/xml/not_working/10.1371-journal.pntd.0010019.xml'
tree = ET.parse(path)
with open(not_working_copy, 'wb') as xml_out:
    tree.write(xml_out)