This notebook parses a single article file (HTML or XML) into plain text and detects valid sentences.

Last modified: 07.06

In [1]:
import os
import re
import bs4
import numpy as np
import glob
import torch
import itertools
import copy
from chempp.article import (
    ArticleElementType,
    ArticleElement
)
from chempp.article_constr import search_html_doi_publisher
from chempp.table import (
    Table,
    TableCell,
    TableRow,
    set_table_style
)
from chempp.article_constr import ArticleFunctions, check_html_publisher, search_xml_doi_publisher
from chempp.section_extr import (
    html_table_extract_wiley,
    html_table_extract_rsc,
    html_table_extract_springer,
    html_table_extract_acs,
    html_table_extract_elsevier,
    xml_table_extract_acs,
    xml_table_extract_elsevier,
    get_html_table_rows,
    get_xml_text_iter,
    pop_xml_element_iter
)
from chempp.constants import *

from chemdataextractor.doc import Paragraph
from bs4 import BeautifulSoup
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
from tqdm.auto import tqdm


## Construct article

In [2]:
# Locate the article file to parse: `folder` is searched for `file_name`
# (a glob pattern built around the DOI) with the extension given by `suffix`.
folder = r'data/rop-database-v2'
# folder = r'data/download/gs'
# folder = r'.'
file_name = '*10.1007@s00289-011-0647-0*'
# file_name = '*10.1021@ma0114887*'  # ACS
# file_name = '*10.1016@j.polymer.2005.11.025*'  # ELSEVIER
# file_name = '*10.1007@s10965-013-0244-z*'  # springer
# file_name = '*10.1039@d0py00155d*'  # RSC
# file_name = '*10.1002@pola.22406*'  # wiley
# file_name = '*10.1021^slcr940282j*' # ACS HTML
# file_name = '*10.1016^slj.chempr.2020.06.003*'  # elsevier HTML
suffix = 'html'

# Fail early with a clear message; previously an unknown suffix left
# `file_path` undefined (or stale from an earlier run of the cell).
if suffix not in ('html', 'xml'):
    raise ValueError(f"Unsupported suffix {suffix!r}; expected 'html' or 'xml'.")

# One pattern serves both formats, so the two cases cannot drift apart.
pattern = os.path.join(folder, f'{file_name}.{suffix}')
path_list = glob.glob(pattern)
if not path_list:
    # An empty match used to surface as a bare IndexError on path_list[0].
    raise FileNotFoundError(f'No file matches {pattern!r}')

# Use the first match, with the path normalised for the current OS.
file_path = os.path.normpath(path_list[0])
print(file_path)

data\rop-database-v2\10.1007@s00289-011-0647-0.html


In [3]:
# Build `article` from the file at `file_path`: detect the DOI and publisher,
# then call the matching publisher-specific constructor on ArticleFunctions.
publisher_file_dict = {publisher: [] for publisher in SUPPORTED_HTML_PUBLISHERS}

if file_path.endswith('html'):
    file_path = os.path.normpath(file_path)

    with open(file_path, 'r', encoding='utf-8') as f:
        contents = f.read()
    # Parse with lxml first; the publisher is read from this tree.
    soup = BeautifulSoup(contents, 'lxml')

    # get publisher and doi
    doi, publisher = search_html_doi_publisher(soup)

    # Publishers whose markup has to be re-parsed with a different parser.
    html_parser_overrides = {
        'elsevier': 'html.parser',  # allow illegal nested <p>
        'rsc': 'html5lib',  # allow nested <span>
    }
    if publisher in html_parser_overrides:
        soup = BeautifulSoup(contents, html_parser_overrides[publisher])

    article_construct_func = getattr(ArticleFunctions, f'article_construct_html_{publisher}')
    article, component_check = article_construct_func(soup=soup, doi=doi)
elif file_path.endswith('xml'):
    tree = ET.parse(file_path)
    root = tree.getroot()

    doi, publisher = search_xml_doi_publisher(root)

    article_construct_func = getattr(ArticleFunctions, f'article_construct_xml_{publisher}')
    article, component_check = article_construct_func(root=root, doi=doi)
else:
    # Without this branch the print below would hit an undefined (or stale)
    # `publisher` and no article would have been built.
    raise ValueError(f"Cannot parse {file_path!r}: expected an 'html' or 'xml' file.")
print(publisher)

springer


---

## Or load pre-defined article

In [6]:
# Load an already-built article object from disk instead of constructing one
# from a raw HTML/XML file.
folder = r'results/IE-results-v2.2/rop-database-v2_processed'
file_name = '*10.1021&sl;acs.macromol.9b01777*'

pattern = os.path.join(folder, file_name)
path_list = glob.glob(pattern)
if not path_list:
    # An empty match used to surface as a bare IndexError on path_list[0].
    raise FileNotFoundError(f'No file matches {pattern!r}')
file_path = path_list[0]

# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code. Only load files that come from a trusted source.
# NOTE(review): recent PyTorch releases default to weights_only=True, which
# rejects arbitrary pickled objects; if this load fails there, pass
# weights_only=False — confirm against the installed torch version.
article = torch.load(file_path)

## Function test

In [8]:
# Select the paragraph that contains a known sentence fragment; `para` is
# left bound to that paragraph.
target_snippet = 'Limonene oxide displays a promising'
for para in article.paragraphs:
    if target_snippet in para.text:
        break
else:
    # Reached only when no paragraph matched. Previously `para` was silently
    # left pointing at the last paragraph (or unbound for an empty article).
    raise ValueError(f'No paragraph contains {target_snippet!r}')

In [11]:
# Run the property extraction on the article under the chosen criterion, then
# tag every returned record with the article DOI, a file path and the
# criterion that was used.
criterion = 'c3'
article_property_info = get_property_info(article, criterion)

for property_info_dict in article_property_info:
    property_info_dict.update({
        'doi': article.doi,
        # '???' is a placeholder file name.
        'file-path': os.path.join('html-files', '???'),
        'criterion': '' if criterion is None else criterion,
    })

In [12]:
# Bare expression: shows the tagged records as the cell output.
article_property_info

[{'sentence': 'Our group recently investigated the ceiling temperature of PLC and observed depolymerization at 60 °C at elevated conversions.',
  'sentence-id': 1,
  'material': 'PLC',
  'property': 'ceiling temperature',
  'value': '60 °C',
  'type': 'Tc',
  'reliability': 0.1,
  'doi': '10.1021/acs.macromol.9b01777',
  'file-path': 'html-files\\???',
  'criterion': 'c3'}]

--------------

## Stored functions; don't delete!

In [None]:
# IMPORTANT! Don't delete
# Reference snippets: how the table extraction functions are used, one
# snippet per publisher and file format. The HTML snippets read from `soup`,
# the XML snippets from `root`; every snippet rebinds `tables` and `tbl`.
# NOTE(review): presumably meant to be run one snippet at a time against a
# file from the matching publisher, not as a whole cell — confirm.

## rsc
# Table containers are <div class="rtable__wrapper"> elements.
tables = soup.find_all('div', {"class": "rtable__wrapper"})
table_div = tables[0]  # or 1, 2, ...

tbl = html_table_extract_rsc(table_div)

## wiley
# Table containers are <div class="article-table-content"> elements.
tables = soup.find_all('div', {"class": "article-table-content"})
table_div = tables[0]

tbl = html_table_extract_wiley(table_div)

## springer
# Table containers are <div class="Table"> elements.
tables = soup.find_all('div', {"class": "Table"})
table_div = tables[0]

tbl = html_table_extract_springer(table_div)

## acs---html
# Table containers are <div class="NLM_table-wrap"> elements.
tables = soup.find_all('div', {"class": "NLM_table-wrap"})
table_div = tables[0]

tbl = html_table_extract_acs(table_div)

## acs---xml
# Tables are the <table-wrap> elements found anywhere under the first <body>.
body = root.findall('body')[0]
tables = list(body.iter(tag=r'table-wrap'))
xml_table = tables[0]

tbl = xml_table_extract_acs(xml_table)

## elsevier---html
# Table containers are <div class="tables"> elements.
tables = soup.find_all('div', {"class": "tables"})
table_div = tables[0]

tbl = html_table_extract_elsevier(table_div)

## elsevier---xml
# Walk root -> originalText -> doc (namespace-qualified tags), then collect
# every namespace-qualified <table> element under doc.
ori_txt = root.findall(r'{http://www.elsevier.com/xml/svapi/article/dtd}originalText')[0]
doc = ori_txt.findall(r'{http://www.elsevier.com/xml/xocs/dtd}doc')[0]

tables = list(doc.iter(tag=r'{http://www.elsevier.com/xml/common/dtd}table'))
xml_table = tables[0]
tbl = xml_table_extract_elsevier(xml_table)


---

In [None]:
# IMPORTANT! Don't delete
# Save table into HTML: assemble a small document (head with the article
# title, body holding the table) and write it to test.html.
# NOTE(review): `table` and `write_table` are not defined in the visible
# cells — presumably provided elsewhere; confirm before running.

soup = BeautifulSoup()

# The document is empty at this point, so appending puts <head> first.
head = soup.new_tag('head', style="width: 85%; margin:auto auto;")
soup.append(head)
title = soup.new_tag('title')
head.append(title)
title.append(article.title)

# Called after the title is in place, as in the original ordering.
set_table_style(head)

# <body> goes after everything already in the document.
html_body = soup.new_tag('body', style="width: 85%; margin:auto auto;")
soup.append(html_body)

_ = write_table(table, html_body)

with open('test.html', 'w', encoding='utf-8') as outfile:
    outfile.write(soup.prettify())

---

In [None]:
# IMPORTANT! Don't delete
# For every paragraph element of the article, find the sentences accepted by
# the heuristic and display them highlighted inside their paragraph.
print("[INFO] extracting information...")

# index into article.sections -> ranges of the valid sentences, as returned
# by convert_cde_paragraph_to_sentences
interested_info = dict()

# Loop-invariant: the unit list passed to break_number_unit.
units = TEMPERATURE_UNITS + ENERGY_UNITS

for i, paragraph in enumerate(article.sections):
    if paragraph.element_type != ArticleElementType.PARAGRAPH:
        continue
    # Store the rewritten text back on the article so the sentence ranges
    # computed below refer to the same string that is displayed later.
    text = break_number_unit(paragraph.text, units)
    paragraph.text = text
    article.sections[i] = paragraph

    sents, sent_ranges = convert_cde_paragraph_to_sentences(Paragraph(text))
    valid_sent_ids, _ = locate_valid_sents_heuristic(sents)

    if valid_sent_ids:
        # Only the ranges are kept; the sentence objects are not needed.
        interested_info[i] = [sent_ranges[idx] for idx in valid_sent_ids]

# Replacing values (not keys) while iterating is safe: the dict size is fixed.
for para_id, sent_spans in interested_info.items():
    interested_info[para_id] = sort_tuple_by_first_element(sent_spans)

for para_id, sent_spans in interested_info.items():
    para_text = article.sections[para_id].text.strip()
    print(sent_spans)
    show_box_markup(para_text, sent_spans)
    print()