## Specializing Parsers Here

Following the original code from CDE we can customize our parsers to great varieties. The 



In [1]:
import logging
import re
import pandas as pd
import urllib
import time

import chemdataextractor as cde
from chemdataextractor import Document
from chemdataextractor.model import Compound, UvvisSpectrum, UvvisPeak, BaseModel, StringType, ListType, ModelType
from chemdataextractor.parse.common import hyphen,lbrct, dt, rbrct
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first

from chemdataextractor.parse.actions import strip_stop, merge, join
from chemdataextractor.parse.elements import W, I, T, R, Optional, ZeroOrMore, OneOrMore, Or, And, Not, Any
from chemdataextractor.parse.cem import chemical_name,cem, chemical_label, lenient_chemical_label, solvent_name
from chemdataextractor.doc import Paragraph, Sentence, Caption, Figure,Table, Heading
from chemdataextractor.doc.table import Table, Cell

In [2]:
from chemdataextractor.text.chem import SOLVENT_RE, INCHI_RE, SMILES_RE

In [3]:
# open and read files
f = open('../test_articles/paper0.pdf', 'rb')
doc = Document.from_file(f)
abstract = [11]
paras = doc.elements
cems = doc.cems

f1 = open('../test_articles/paper1.pdf', 'rb')
doc1 = Document.from_file(f1)
abstract1 = [7,8]
paras1 = doc1.elements
cems1 = doc1.cems

f2 = open('../test_articles/paper2.pdf', 'rb')
doc2 = Document.from_file(f2)
abstract2 = [7,8]
paras2 = doc2.elements
cems2 = doc2.cems

f3 = open('../test_articles/paper3.pdf', 'rb')
doc3 = Document.from_file(f3)
abstract3 = [10]
paras3 = doc3.elements
cems3 = doc3.cems

f4 = open('../test_articles/paper4.pdf', 'rb')
doc4 = Document.from_file(f4)
abstract4 = [12]
paras4 = doc4.elements
cems4 = doc4.cems

f5 = open('../test_articles/paper5.pdf', 'rb')
doc5 = Document.from_file(f5)
abstract5 = [3,4]
paras5 = doc5.elements
cems5 = doc5.cems

f6 = open('../test_articles/paper6.pdf', 'rb')
doc6 = Document.from_file(f6)
abstract6 = [5,6,7,8]
paras6 = doc6.elements
cems6 = doc6.cems

f7 = open('../test_articles/paper7.pdf', 'rb')
doc7 = Document.from_file(f7)
abstract7 = [11]
paras7 = doc7.elements
cems7 = doc7.cems

## Spealize PCE parsers

In this case we customize the pce parser to test all different combinations of pce interpretations possible in the text / literature.

In [4]:
class Pce(BaseModel):
    value = StringType()
    units = StringType()

Compound.pce_pattern = ListType(ModelType(Pce))

# prefix = abbrv_prefix | words_pref | hyphanated_pref

common_text = R('(\w+)?\D(\D+)+(\w+)?').hide()
units = (W(u'%') | I(u'percent'))(u'units')
value = R(u'\d+(\.\d+)?')(u'value')

abbrv_prefix = (I(u'PCE') | I(u'PCEs') | I(u'pce')).hide()
words_pref = (I(u'power') + I(u'conversion') + I(u'efficiency')).hide()
hyphanated_pref = (I(u'power') + I(u'-') + I('conversion') + I(u'efficiency')).hide()
joined_range = R('^[\+\-–−]?\d+(\.\d+)?[\-–−~∼˜]\d+(\.\d+)?$')('value').add_action(merge)
spaced_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (R('^[\-–−~∼˜]$') +
                                                                        R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(merge)
to_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (I('to') +
                                                                    R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(join)

prefix = Optional(I('a')).hide() + (Optional(lbrct) + W('PCE') + Optional(rbrct) | I('power') + Optional(I('conversion')) + Optional((I('efficiency') | I('range'))) + Optional((I('temperature') | I('range')))
                                    ).hide() + Optional(lbrct + W('PCE') + rbrct) + Optional(W('=') | I('of') | I('was') | I('is') | I('at')).hide() + Optional(I('in') + I('the') + I('range') + Optional(I('of')) | I('about') | ('around') | I('%')).hide()

pce_first  = (words_pref + (Optional(lbrct) + abbrv_prefix + Optional(rbrct)) + ZeroOrMore(common_text) + value + units)(u'pce')
# pce_first = (prefix + ZeroOrMore(common_text) + value + units)(u'pce')
pce_second = (prefix + value + units)(u'pce')
pce_pattern = pce_first | pce_second

class PceParser(BaseParser):
    root = pce_pattern

    def interpret(self, result, start, end):
        compound = Compound(
            pce_pattern=[
                Pce(
                    value=first(result.xpath('./value/text()')),
                    units=first(result.xpath('./units/text()'))
                )
            ]
        )
        yield compound

In [5]:
Sentence.parsers.append(PceParser())
Paragraph.parsers.append(PceParser())

In [6]:
doc = Document(
    Paragraph("A pce of 10%"),
    Paragraph("A power-conversion efficiency (PCE) of 12%"), # good enough for parsing
    Paragraph("with the efficiency of 1%"),
    Paragraph("have the conversion efficiency of 10%"),
    Paragraph("power conversion efficiency [PCE] of 10^")
)

rec = doc.records.serialize()

In [7]:
rec

[{'pce_pattern': [{'value': '12', 'units': '%'}]}]

**Voc parsers**

In [8]:
class Voc(BaseModel):
    value = StringType()
    units = StringType()

Compound.voc_pattern = ListType(ModelType(Voc))

common_text = R('(\w+)?\D(\D+)+(\w+)?').hide()
units = (W(u'V') | I(u'v') | I(u'volt') | I(u'volts'))(u'units')
value = R(u'\d+(\.\d+)?')(u'value')

abbrv_prefix = (I(u'Voc') | I(u'voc')).hide()
words_pref = (I(u'open') + I(u'circuit') + I(u'voltage')).hide()
hyphanated_pref = (I(u'open') + I(u'-') + I('circuit') + I(u'voltage')).hide()
joined_range = R('^[\+\-–−]?\d+(\.\d+)?[\-–−~∼˜]\d+(\.\d+)?$')('value').add_action(merge)
spaced_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (R('^[\-–−~∼˜]$') +
                                                                        R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(merge)
to_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (I('to') +
                                                                    R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(join)

prefix = Optional(I('a')).hide() + (Optional(lbrct) + W('Voc') + Optional(rbrct) | I('open') + Optional(I('circuit')) + Optional((I('voltage')))
                                    ).hide() + Optional(lbrct + W('Voc') + rbrct) + Optional(W('=') | I('of') | I('was') | I('is') | I('at')).hide() + Optional(I('in') + I('the') + I('range') + Optional(I('of')) | I('about') | ('around') | I('V')).hide()

voc_first  = (words_pref + (Optional(lbrct) + abbrv_prefix + Optional(rbrct)) + ZeroOrMore(common_text) + value + units)(u'voc')
voc_second = (prefix + value + units)(u'voc')
voc_pattern = voc_first | voc_second

class VocParser(BaseParser):
    root = voc_pattern

    def interpret(self, result, start, end):
        compound = Compound(
            voc_pattern=[
                Voc(
                    value=first(result.xpath('./value/text()')),
                    units=first(result.xpath('./units/text()'))
                )
            ]
        )
        yield compound

In [9]:
Sentence.parsers.append(VocParser())
Paragraph.parsers.append(VocParser())

In [10]:
doc = Document(
    Paragraph("A open-circuit voltage of 12 volt"),
    Paragraph("A open-circuit voltage (Voc) of 144 V"),
    Paragraph("A Voc of 22 V"),
)

rec = doc.records.serialize()

In [11]:
rec

[{'voc_pattern': [{'value': '144', 'units': 'V'}]},
 {'voc_pattern': [{'value': '22', 'units': 'V'}]}]

**Jsc Parser**

In [40]:
class Jsc(BaseModel):
    value = StringType()
    units = StringType()

Compound.jsc_pattern = ListType(ModelType(Jsc))

common_text = R('(\w+)?\D(\D+)+(\w+)?').hide()

# delim = R('^[;:,\./]$').hide()

units = Optional(W('/')).hide() + (
    R('^\[?mAcm[-–−‒]?2\]?$') |
    W('mAcm') + R('^[-–−‒]$') + W('2') | 
    R('^\[?mA\/cm2\]?$') | 
    W('mA') + W('^\/?$') + W('cm') +  W('2')
)('units').add_action(merge)

value = R(u'\d+(\.\d+)?')(u'value')

abbrv_prefix = (I(u'Jsc') | I(u'jsc')).hide()
words_pref = (I(u'short') + I(u'circuit') + I(u'current') + I(u'density')).hide()
hyphanated_pref = (I(u'short') + I(u'-') + I('circuit') + I(u'current') + I(u'density')).hide()
joined_range = R('^[\+\-–−]?\d+(\.\d+)?[\-–−~∼˜]\d+(\.\d+)?$')('value').add_action(merge)
spaced_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (R('^[\-–−~∼˜]$') +
                                                                        R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(merge)
to_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (I('to') +
                                                                    R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(join)

prefix = Optional(I('a')).hide() + (Optional(lbrct) + W('Jsc') + Optional(rbrct) | I('short') + Optional(I('circuit')) + Optional((I('current'))) + Optional(I('density'))
                                    ).hide() + Optional(lbrct + W('Jsc') + rbrct) + Optional(W('=') | I('of') | I('was') | I('is') | I('at')).hide() + Optional(I('in') + I('the') + I('range') + Optional(I('of')) | I('about') | ('around') | I('mAcm-2') | I('mA/cm2')).hide()

jsc_first  = (words_pref + (Optional(lbrct) + abbrv_prefix + Optional(rbrct)) + ZeroOrMore(common_text) + value + units)(u'jsc')
jsc_second = (prefix + value + units)(u'jsc')
jsc_pattern = jsc_first | jsc_second

class JscParser(BaseParser):
    root = jsc_pattern

    def interpret(self, result, start, end):
        compound = Compound(
            jsc_pattern=[
                Jsc(
                    value=first(result.xpath('./value/text()')),
                    units=first(result.xpath('./units/text()'))
                )
            ]
        )
        yield compound

In [41]:
Sentence.parsers.append(JscParser())
Paragraph.parsers.append(JscParser())

In [42]:
doc = Document(
    Paragraph("A short-circuit current density (Jsc) of 12 mAcm-2"),
    Paragraph("A Jsc of 11 mA/cm2"),
    Paragraph("A short-circuit current density (Jsc) of 10 mAcm-2"),
)

rec = doc.records.serialize()

In [43]:
rec

[{'jsc_pattern': [{'value': '12', 'units': 'mAcm-2'}]},
 {'jsc_pattern': [{'value': '10', 'units': 'mAcm-2'}]}]

**Thickness Parser**

**Table Parser**

**Mol Parser**