## Test on Properties

This notebook focuses on customizing parsers that can be used to extract property information.

Here we check data patching and the parsers at the same time. If the extracted properties cannot be matched to the extracted compounds, the project would be meaningless.


10/15/2019 meeting 

Challenge:

1. In the parser, a full name followed by an abbreviation in parentheses is not recognized.
2. Paragraphs with very messy data cannot be recognized efficiently.

In [1]:
import logging
import re
import pandas as pd
import urllib
import time

import chemdataextractor as cde
from chemdataextractor import Document
import chemdataextractor.model as model
from chemdataextractor.model import Compound, UvvisSpectrum, UvvisPeak, BaseModel, StringType, ListType, ModelType
from chemdataextractor.parse.common import hyphen
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first
from chemdataextractor.parse.actions import strip_stop
from chemdataextractor.parse.elements import W, I, T, R, Optional, ZeroOrMore, OneOrMore
from chemdataextractor.parse.cem import chemical_name
from chemdataextractor.doc import Paragraph, Sentence, Caption, Figure,Table, Heading
from chemdataextractor.doc.table import Table, Cell

In [2]:
from chemdataextractor.text.chem import SOLVENT_RE, INCHI_RE, SMILES_RE

In [3]:
# Check SOLVENT_RE on a 'λmax(<solvent>)/nm' style label; the recorded result is ['CH2Cl2'].
SOLVENT_RE.findall(u'λmax(CH2Cl2)/nm')

['CH2Cl2']

In [6]:
# Check SOLVENT_RE on running prose; the recorded result is an empty list.
# NOTE(review): 'metahne' looks like a misspelling, so the empty result may come
# from the typo rather than from a gap in SOLVENT_RE -- confirm with the corrected word.
SOLVENT_RE.findall(u"dissolved in metahne with DIO as the additive")

[]

In [4]:
# Load the eight test articles. Each ``docN`` is the Document parsed from one PDF.
# Each ``abstractN`` is a list of integers; presumably these are the indices of
# the elements holding that paper's abstract (doc7[11] is inspected that way
# further down) -- confirm.
#
# ``with`` closes every PDF handle as soon as the paper has been parsed; the
# original cell left all eight files open. The names f, f1, ... f7 stay bound
# (to the closed handles) so later cells that mention them still resolve.
with open('../test_articles/paper0.pdf', 'rb') as f:
    doc = Document.from_file(f)
abstract = [11]

with open('../test_articles/paper1.pdf', 'rb') as f1:
    doc1 = Document.from_file(f1)
abstract1 = [7, 8]

with open('../test_articles/paper2.pdf', 'rb') as f2:
    doc2 = Document.from_file(f2)
abstract2 = [7, 8]

with open('../test_articles/paper3.pdf', 'rb') as f3:
    doc3 = Document.from_file(f3)
abstract3 = [10]

with open('../test_articles/paper4.pdf', 'rb') as f4:
    doc4 = Document.from_file(f4)
abstract4 = [12]

with open('../test_articles/paper5.pdf', 'rb') as f5:
    doc5 = Document.from_file(f5)
abstract5 = [3, 4]

with open('../test_articles/paper6.pdf', 'rb') as f6:
    doc6 = Document.from_file(f6)
abstract6 = [5, 6, 7, 8]

with open('../test_articles/paper7.pdf', 'rb') as f7:
    doc7 = Document.from_file(f7)
abstract7 = [11]

In [5]:
# Pull the parsed pieces out of the first paper: its elements, its chemical
# entity mentions, and (as the cell's displayed value) its serialized records.
paras = doc.elements
cems = doc.cems
doc.records.serialize()

[{'names': ['Isoindigo-']},
 {'names': ['bislactam']},
 {'names': ['hydrogens']},
 {'names': ['phenyl']},
 {'names': ['oxygens']},
 {'names': ['oxindoles']},
 {'names': ['triphenylamine']},
 {'names': ['phenyl- carbazole']},
 {'names': ['D − A']},
 {'names': ['4,8-bis(5-(2-ethylhexyl)- thiophen-2-yl)benzo[1,2-b:4,5-b′]dithiophene ( 2D-BDT )']},
 {'names': ['2D-BDT-containing D − π − A']},
 {'names': ['Mn [ kg mol−1 ]']},
 {'names': ['411']},
 {'names': ['34']},
 {'names': ['76']},
 {'names': ['long, branched 2- octyldodecyl alkyl']},
 {'names': ['tris(dibenzylideneacetone)dipalladium']},
 {'names': ['Mn']},
 {'names': ['benzene']},
 {'names': ['ferrocene']},
 {'names': ['−[Eonset ferrocene + 4.8 ] V']},
 {'names': ['methyl substituted alkyl chains']},
 {'names': ['methyl']},
 {'names': ['I ds']},
 {'names': ['WC o L 2']},
 {'names': ['V t']},
 {'names': ['PBDT-TIIG-']},
 {'names': ['P3HT']},
 {'names': ['PCBM']},
 {'names': ['μ e']},
 {'names': ['f [V]']},
 {'names': ['alkyl']},
 {'nam

PCE and FF work fine, as do other quantities whose units end in %. Other units require further customization.

Most properties in the literature are reported in the same layout, so if one example works, the rest of them should work too.

Any unit with a simple, single-component expression is easy to extract. Otherwise a combination of components is needed.

In [28]:
class Jsc(BaseModel):
    """Record for one short-circuit current density (Jsc) mention."""
    # Both fields are declared as StringType; nothing here converts them to numbers.
    value = StringType()
    units = StringType()

# Attach a list of Jsc records to Compound under the ``jsc_pattern`` field.
Compound.jsc_pattern = ListType(ModelType(Jsc))

# Cue phrases that introduce a value. I() matches case-insensitively, so one
# spelling of the abbreviation already covers 'jsc', 'Jsc' and 'JSC'.
abbrv_prefix = I(u'jsc').hide()
words_pref = (I(u'short') + I(u'circuit') + I(u'current') + I(u'density')).hide()
hyphanated_pref = (I(u'short-circuit') + I(u'current') + I(u'density')).hide()

prefix = abbrv_prefix | words_pref | hyphanated_pref

# Filler tokens allowed between the cue phrase and the number. The patterns are
# raw strings so that \w, \D and \d reach the regex engine instead of being
# treated as (invalid) string escape sequences.
common_text = R(r'(\w+)?\D(\D+)+(\w+)?').hide()
# NOTE(review): the demo cell in this notebook records no jsc_pattern for
# 'Jsc 12 mA/cm2'; presumably the tokenizer does not yield 'mA', '/', 'cm', '2'
# as four separate tokens -- verify against the actual tokenizer output.
units = (W(u'mA') + W(u'/') + W(u'cm') + W('2'))(u'units')
value = R(r'\d+(\.\d+)?')(u'value')

# Two accepted layouts: cue phrase first, or value and units first.
jsc_first = (prefix + ZeroOrMore(common_text) + value + units)(u'jsc')
jsc_second = (value + units + prefix)(u'jsc')

jsc_pattern = jsc_first | jsc_second

class JscParser(BaseParser):
    """Parser that turns a ``jsc_pattern`` match into a Compound record."""

    root = jsc_pattern

    def interpret(self, result, start, end):
        """Yield one Compound carrying the matched value and units.

        ``result`` is queried with XPath for its ``value`` and ``units``
        children; ``start`` and ``end`` are accepted but not used here.
        """
        matched_value = first(result.xpath('./value/text()'))
        matched_units = first(result.xpath('./units/text()'))
        reading = Jsc(value=matched_value, units=matched_units)
        yield Compound(jsc_pattern=[reading])

def parse_jsc(list_of_sentences):
    """Parse sentences for short-circuit current density (Jsc) values.

    Args:
        list_of_sentences: iterable of sentence strings.

    Returns:
        A list with one entry per input sentence; each entry is the
        serialized records that ``Sentence`` produced for that sentence.
    """
    # Sentence.parsers is shared state, so register the parser only once;
    # appending on every call would stack up duplicate JscParser instances.
    if not any(isinstance(parser, JscParser) for parser in Sentence.parsers):
        Sentence.parsers.append(JscParser())

    return [Sentence(sent).records.serialize() for sent in list_of_sentences]

In [29]:
class Pce(BaseModel):
    """Record for one power conversion efficiency (PCE) mention."""
    # Both fields are declared as StringType; nothing here converts them to numbers.
    value = StringType()
    units = StringType()

# Attach a list of Pce records to Compound under the ``pce_pattern`` field.
Compound.pce_pattern = ListType(ModelType(Pce))

# Cue phrases that introduce a value. I() matches case-insensitively, so the
# separate lower-case 'pce' alternative was redundant with 'PCE'.
abbrv_prefix = (I(u'PCE') | I(u'PCEs')).hide()
words_pref = (I(u'power') + I(u'conversion') + I(u'efficiency')).hide()
hyphanated_pref = (I(u'power-conversion') + I(u'efficiency')).hide()
prefix = abbrv_prefix | words_pref | hyphanated_pref

# Filler tokens allowed between the cue phrase and the number. The patterns are
# raw strings so that \w, \D and \d reach the regex engine instead of being
# treated as (invalid) string escape sequences.
common_text = R(r'(\w+)?\D(\D+)+(\w+)?').hide()
units = (W(u'%') | I(u'percent'))(u'units')
value = R(r'\d+(\.\d+)?')(u'value')

# Two accepted layouts: cue phrase first, or value and units first.
pce_first = (prefix + ZeroOrMore(common_text) + value + units)(u'pce')
pce_second = (value + units + prefix)(u'pce')
pce_pattern = pce_first | pce_second

class PceParser(BaseParser):
    """Parser that turns a ``pce_pattern`` match into a Compound record."""

    root = pce_pattern

    def interpret(self, result, start, end):
        """Yield one Compound carrying the matched value and units.

        ``result`` is queried with XPath for its ``value`` and ``units``
        children; ``start`` and ``end`` are accepted but not used here.
        """
        matched_value = first(result.xpath('./value/text()'))
        matched_units = first(result.xpath('./units/text()'))
        reading = Pce(value=matched_value, units=matched_units)
        yield Compound(pce_pattern=[reading])


def parse_pce(list_of_sentences):
    """Parse sentences for power conversion efficiency (PCE) values.

    Args:
        list_of_sentences: iterable of sentence strings.

    Returns:
        A list with one entry per input sentence; each entry is the
        serialized records that ``Sentence`` produced for that sentence.
    """
    # Sentence.parsers is shared state, so register the parser only once;
    # appending on every call would stack up duplicate PceParser instances.
    if not any(isinstance(parser, PceParser) for parser in Sentence.parsers):
        Sentence.parsers.append(PceParser())

    return [Sentence(sent).records.serialize() for sent in list_of_sentences]


In [31]:
class Eqe(BaseModel):
    """Record for one external quantum efficiency (EQE) mention."""
    # Both fields are declared as StringType; nothing here converts them to numbers.
    value = StringType()
    units = StringType()

# Attach a list of Eqe records to Compound under the ``eqe_pattern`` field.
Compound.eqe_pattern = ListType(ModelType(Eqe))

# Cue phrases that introduce a value: the abbreviation or the spelled-out name.
abbrv_prefix = I(u'EQE').hide()
words_pref = (I(u'external') + I(u'quantum') + I(u'efficiency')).hide()
prefix = abbrv_prefix | words_pref

# Filler tokens allowed between the cue phrase and the number. The patterns are
# raw strings so that \w, \D and \d reach the regex engine instead of being
# treated as (invalid) string escape sequences.
common_text = R(r'(\w+)?\D(\D+)+(\w+)?').hide()
units = (W(u'%') | I(u'percent'))(u'units')
value = R(r'\d+(\.\d+)?')(u'value')

# Two accepted layouts: cue phrase first, or value and units first.
eqe_first = (prefix + ZeroOrMore(common_text) + value + units)(u'eqe')
eqe_second = (value + units + prefix)(u'eqe')
eqe_pattern = eqe_first | eqe_second

class EqeParser(BaseParser):
    """Parser that turns an ``eqe_pattern`` match into a Compound record."""

    root = eqe_pattern

    def interpret(self, result, start, end):
        """Yield one Compound carrying the matched value and units.

        ``result`` is queried with XPath for its ``value`` and ``units``
        children; ``start`` and ``end`` are accepted but not used here.
        """
        matched_value = first(result.xpath('./value/text()'))
        matched_units = first(result.xpath('./units/text()'))
        reading = Eqe(value=matched_value, units=matched_units)
        yield Compound(eqe_pattern=[reading])


def parse_eqe(list_of_sentences):
    """Parse sentences for external quantum efficiency (EQE) values.

    Args:
        list_of_sentences: iterable of sentence strings.

    Returns:
        A list with one entry per input sentence; each entry is the
        serialized records that ``Sentence`` produced for that sentence.
    """
    # Sentence.parsers is shared state, so register the parser only once;
    # appending on every call would stack up duplicate EqeParser instances.
    if not any(isinstance(parser, EqeParser) for parser in Sentence.parsers):
        Sentence.parsers.append(EqeParser())

    return [Sentence(sent).records.serialize() for sent in list_of_sentences]


In [33]:
class Voc(BaseModel):
    """Record for one open-circuit voltage (Voc) mention."""
    # Both fields are declared as StringType; nothing here converts them to numbers.
    value = StringType()
    units = StringType()

# Attach a list of Voc records to Compound under the ``voc_pattern`` field.
Compound.voc_pattern = ListType(ModelType(Voc))

# Cue phrases that introduce a value. I() matches case-insensitively, so one
# spelling of the abbreviation already covers 'VOC', 'voc' and 'Voc'.
abbrv_prefix = I(u'VOC').hide()
words_pref = (I(u'open') + I(u'circuit') + I(u'voltage')).hide()
hyphanated_pref = (I(u'open-circuit') + I(u'voltage')).hide()
prefix = abbrv_prefix | words_pref | hyphanated_pref

# Filler tokens allowed between the cue phrase and the number. The patterns are
# raw strings so that \w, \D and \d reach the regex engine instead of being
# treated as (invalid) string escape sequences.
# The filler regex needs two consecutive non-digit characters in a token.
# NOTE(review): if '(' and ')' are tokenized on their own they cannot count as
# filler, which may be why the demo paragraph 'open-circuit voltage (Voc) of
# 12 V' records no voc_pattern in this notebook -- confirm.
common_text = R(r'(\w+)?\D(\D+)+(\w+)?').hide()
units = (W(u'V') | I(u'volts') | I(u'volt'))(u'units')
value = R(r'\d+(\.\d+)?')(u'value')

# Two accepted layouts: cue phrase first, or value and units first.
voc_first = (prefix + ZeroOrMore(common_text) + value + units)(u'voc')
voc_second = (value + units + prefix)(u'voc')

voc_pattern = voc_first | voc_second

class VocParser(BaseParser):
    """Parser that turns a ``voc_pattern`` match into a Compound record."""

    root = voc_pattern

    def interpret(self, result, start, end):
        """Yield one Compound carrying the matched value and units.

        ``result`` is queried with XPath for its ``value`` and ``units``
        children; ``start`` and ``end`` are accepted but not used here.
        """
        matched_value = first(result.xpath('./value/text()'))
        matched_units = first(result.xpath('./units/text()'))
        reading = Voc(value=matched_value, units=matched_units)
        yield Compound(voc_pattern=[reading])

def parse_voc(list_of_sentences):
    """Parse sentences for open-circuit voltage (Voc) values.

    Args:
        list_of_sentences: iterable of sentence strings.

    Returns:
        A list with one entry per input sentence; each entry is the
        serialized records that ``Sentence`` produced for that sentence.
    """
    # Sentence.parsers is shared state, so register the parser only once;
    # appending on every call would stack up duplicate VocParser instances.
    if not any(isinstance(parser, VocParser) for parser in Sentence.parsers):
        Sentence.parsers.append(VocParser())

    return [Sentence(sent).records.serialize() for sent in list_of_sentences]

In [39]:
# Register every property parser on both Sentence and Paragraph. The order is
# the same as writing the calls out one by one: for each parser class in turn,
# a fresh instance goes to Sentence first and another fresh one to Paragraph.
for _parser_cls in (JscParser, MwParser, PceParser, VocParser, EqeParser, FfParser):
    Sentence.parsers.append(_parser_cls())
    Paragraph.parsers.append(_parser_cls())

In [80]:
# Small synthetic document: a compound heading followed by one short paragraph
# per property. Note that ``doc`` is rebound here.
demo_elements = (
    Heading('5,10,15,20-Tetra(4-carboxyphenyl)porphyrin (3).'),
    Paragraph('m.p. 90°C.'),
    Paragraph('pce 15 %'),
    Paragraph('fill factor (FF) of 20%'),
    Paragraph('open-circuit voltage (Voc) of 12 V'),
    Paragraph('EQE 57 %'),
    Paragraph('Mw 12 kg/mol'),
    Paragraph('Jsc 12 mA/cm2'),
)
doc = Document(*demo_elements)

# Serialized records extracted from the synthetic document.
rec = doc.records.serialize()

In [81]:
# Display the serialized records held in ``rec``.
rec

[{'names': ['5,10,15,20-Tetra(4-carboxyphenyl)porphyrin'],
  'labels': ['3'],
  'melting_points': [{'value': '90', 'units': '°C'}],
  'mw_pattern': [{'value': '12'}],
  'pce_pattern': [{'value': '15', 'units': '%'}],
  'eqe_pattern': [{'value': '57', 'units': '%'}]}]

Here we can test data patching on doc7.

In [57]:
# Display element 11 of doc7 (the index listed in abstract7).
doc7[11]

In [74]:
# Hand-built document (a heading plus five paragraphs) run through the
# registered parsers; presumably this is the doc7 abstract split by hand -- confirm.
# Note that ``doc`` is rebound here.
# NOTE(review): the pasted text contains typographic ligatures (e.g. 'eﬃciency',
# 'ﬁeld-eﬀect') and symbols such as '='; presumably these do not match the
# plain-ASCII cue words and the two-character filler regex used by the parsers,
# which would explain why only a Voc value shows up in the recorded output -- confirm.
doc = Document(
    Heading('Abstract:'),
    Paragraph('We report the synthesis, properties, and photo- voltaic applications of new π-conjugated polymers having thiophene, 3,4-dihexylthiophene, and 1,3,4-oxadiazole (OXD) or 1,3,4-thiadiazole (TD) units in the main chain, denoted as P1 and P2. They were synthesized by the Stille coupling reaction of 2,5- bis(trimethylstannyl)thiophene and the corresponding monomers of 2,5-bis(5′-bromo-3′,4′-dihexylthien-2′-yl)-1,3,4-oxadiazole or 2,5-bis(5′-bromo-3′,4′-dihexylthien-2′-yl)-1,3,4-thiadiazole, re- spectively. '),
    Paragraph('The experimental results indicated that the introduc- tion of an electron-accepting moiety of OXD or TD lowered the highest occupied molecular orbital (HOMO) energy levels, resulting in the higher the open-circuit voltage (Voc) values of polymer solar cells (PSCs). Indeed, the PSCs of P1 and P2 showed high Voc values in the range 0.80−0.90 V. The highest ﬁeld-eﬀect transistor (FET) mobilities of P1 and P2 with the OXD and TD moieties, respectively, were 1.41 × 10−3 and 8.81 × 10−2 cm2 V−1 s−1. '),
    Paragraph('The higher mobility of P2 was related to its orderly nanoﬁbrillar structure, as evidenced from the TEM images. Moreover, the higher absorption coeﬃcient and smaller band gap of P2 provided a more eﬃcient light-harvesting ability. '),
    Paragraph('The power conversion eﬃciency (PCE) of the PSC based on P2:PCBM = 1:1 (w/w) reached 3.04 % with a short-circuit current density (Jsc) value of 6.60 mA/cm2, a Voc value of 0.80 V, and a fill factor (as) value of 57.6% during the illumination of AM 1.5, 100 mW/cm2. '),
    Paragraph('In comparison, the electron-accepting moiety exhibited an inferior device performance (FET mobility = 2.10 × 10−4 cm2 V−1 s−1 and PCE = 1.91%). The experimental results demonstrated that incorporating the electron-acceptor moiety into the polythiophene backbone could enhance the device performance due to the low-lying HOMO levels, compact packing structure, and high charge carrier mobility. This is the ﬁrst report for the achievement of PCE > 3% using PSCs based on polythiophenes having TD units in the main chain.')
)

# Serialized records extracted from the hand-built document.
rec = doc.records.serialize()

In [75]:
# Display the serialized records held in ``rec``.
rec

[{'names': ['thiophene']},
 {'names': ['3,4-dihexylthiophene']},
 {'names': ['1,3,4-thiadiazole', 'TD']},
 {'names': ['2,5- bis(trimethylstannyl)thiophene']},
 {'names': ['2,5-bis(5′-bromo-3′,4′-dihexylthien-2′-yl)-1,3,4-oxadiazole']},
 {'names': ['2,5-bis(5′-bromo-3′,4′-dihexylthien-2′-yl)-1,3,4-thiadiazole']},
 {'names': ['PCBM']},
 {'voc_pattern': [{'value': '0.80', 'units': 'V'}]},
 {'names': ['polythiophene']},
 {'names': ['polythiophenes']},
 {'names': ['OXD', '1,3,4-oxadiazole']}]

In [53]:
# Serialize the records of doc7's element 11 taken straight from the PDF,
# for comparison with the records shown for the hand-built document.
doc7[11].records.serialize()

[{'names': ['thiophene']},
 {'names': ['3,4-dihexylthiophene']},
 {'names': ['1,3,4-oxadiazole', 'OXD']},
 {'names': ['1,3,4-thiadiazole', 'TD']},
 {'names': ['2,5- bis(trimethylstannyl)thiophene']},
 {'names': ['2,5-bis(5′-bromo-3′,4′-dihexylthien-2′-yl)-1,3,4-oxadiazole']},
 {'names': ['2,5-bis(5′-bromo-3′,4′-dihexylthien-2′-yl)-1,3,4-thiadiazole']},
 {'names': ['OXD']},
 {'names': ['OXD']},
 {'names': ['PCBM']},
 {'voc_pattern': [{'value': '0.80', 'units': 'V'}]},
 {'names': ['polythiophene']},
 {'names': ['polythiophenes']}]

In the abstract of doc7, it gives the following values:

1. Voc
2. PCE
3. Mobility
4. Blend ratio

and compounds 

1. P1
2. P2

From the output above we see no P1 or P2, only the two main-chain units OXD and TD.