## Specializing Parsers Here

Following the original code from CDE we can customize our parsers to great varieties. The 



In [2]:
import logging
import re
import pandas as pd
import urllib
import time

import chemdataextractor as cde
from chemdataextractor import Document
from chemdataextractor.reader import acs,base,cssp,HtmlReader,NlmXmlReader,PdfReader,RscHtmlReader,XmlReader
from chemdataextractor.model import Compound, UvvisSpectrum, UvvisPeak, BaseModel, StringType, ListType, ModelType
from chemdataextractor.parse.common import hyphen,lbrct, dt, rbrct
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first

from chemdataextractor.parse.actions import strip_stop, merge, join
from chemdataextractor.parse.elements import W, I, T, R, Optional, ZeroOrMore, OneOrMore, Or, And, Not, Any
from chemdataextractor.parse.cem import chemical_name,cem, chemical_label, lenient_chemical_label, solvent_name
from chemdataextractor.doc import Paragraph, Sentence, Caption, Figure,Table, Heading
from chemdataextractor.doc.table import Table, Cell

# import modules for scraping information from local xml files
from chemdataextractor.scrape import pub
from chemdataextractor.scrape.pub import rsc
from chemdataextractor.scrape.pub import springer
from chemdataextractor.scrape.pub import nlm

XMLReader not really working, needs to be fixed
HTMLReader and pdfReader work

In [4]:
# open and read files
f = open('../one_compound/36 ja200314m.pdf', 'rb')
doc = Document.from_file(f)
paras = doc.elements
cems = doc.cems

# open and read files
f1 = open('../one_compound/39 ja101888b.pdf', 'rb')
doc1 = Document.from_file(f1)
paras1 = doc.elements
cems1 = doc.cems

In [4]:
# import os
# directory = '../one_compound/'
# files = [item for item in os.listdir(directory) if os.path.isfile(os.path.join(directory, item))]

# doc = []
# para = []
# cem = []
# for file in files:
#     f = open("../one_compound/" + f, 'rb')
#     doc.append(Document.from_file(f))
#     para.append(doc.elements)
#     cem.append(doc.cems)

## Spealize PCE parsers

In this case we customize the pce parser to test all different combinations of pce interpretations possible in the text / literature.

In [5]:
class Pce(BaseModel):
    value = StringType()
    units = StringType()

Compound.pce_pattern = ListType(ModelType(Pce))

common_text = R('(\w+)?\D(\D+)+(\w+)?').hide()
units = (W(u'%') | I(u'percent'))(u'units')
value = R(u'\d+(\.\d+)?')(u'value')

abbrv_prefix = (I(u'PCE') | I(u'PCEs') | I(u'pce')).hide()
words_pref = (I(u'power') + I(u'conversion') + I(u'efficiency')).hide()
hyphanated_pref = (I(u'power') + I(u'-') + I('conversion') + I(u'efficiency')).hide()
joined_range = R('^[\+\-–−]?\d+(\.\d+)?[\-–−~∼˜]\d+(\.\d+)?$')('value').add_action(merge)
spaced_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (R('^[\-–−~∼˜]$') +
                                                                        R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(merge)
to_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (I('to') +
                                                                    R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(join)

prefix = Optional(I('a')).hide() + (Optional(lbrct) + W('PCEs') + Optional(rbrct) | I('power') + Optional(I('conversion')) + Optional((I('efficiency') | I('range'))) + Optional((I('temperature') | I('range')))
                                    ).hide() + Optional(lbrct + W('PCE') + rbrct) + Optional(W('=') | I('of') | I('was') | I('is') | I('at')).hide() + Optional(I('in') + I('the') + I('range') + Optional(I('of')) | I('about') | ('around') | I('%')).hide()

pce_first  = (words_pref + (Optional(lbrct) + abbrv_prefix + Optional(rbrct)) + ZeroOrMore(common_text) + value + units)(u'pce')
# pce_first = (prefix + ZeroOrMore(common_text) + value + units)(u'pce')
pce_second = (prefix + value + units)(u'pce')
pce_pattern = pce_first | pce_second

class PceParser(BaseParser):
    root = pce_pattern

    def interpret(self, result, start, end):
        compound = Compound(
            pce_pattern = [
                Pce(
                    value = first(result.xpath('./value/text()')),
                    units = first(result.xpath('./units/text()'))
                )
            ]
        )
        yield compound

In [6]:
Sentence.parsers.append(PceParser())
Paragraph.parsers.append(PceParser())

In [7]:
doc = Document(
    Paragraph("A pce of 10%"),
    Paragraph("materials with power-conversion efficiency (PCE) of 12% and 20%"), # good enough for parsing
    Paragraph("with the efficiency of 1%"),
    Paragraph("have the conversion efficiency of 10%"),
)

doc.records.serialize()

[]

**Voc parsers**

In [8]:
class Voc(BaseModel):
    value = StringType()
    units = StringType()

Compound.voc_pattern = ListType(ModelType(Voc))

common_text = R('(\w+)?\D(\D+)+(\w+)?').hide()
units = (W(u'V') | I(u'v') | I(u'volt') | I(u'volts'))(u'units')
value = R(u'\d+(\.\d+)?')(u'value')

abbrv_prefix = (I(u'Voc') | I(u'voc')).hide()
words_pref = (I(u'open') + I(u'circuit') + I(u'voltage')).hide()
hyphanated_pref = (I(u'open') + I(u'-') + I('circuit') + I(u'voltage')).hide()
joined_range = R('^[\+\-–−]?\d+(\.\d+)?[\-–−~∼˜]\d+(\.\d+)?$')('value').add_action(merge)
spaced_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (R('^[\-–−~∼˜]$') +
                                                                        R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(merge)
to_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (I('to') +
                                                                    R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(join)

prefix = Optional(I('a')).hide() + (Optional(lbrct) + W('Voc') + Optional(rbrct) | I('open') + Optional(I('circuit')) + Optional((I('voltage')))
                                    ).hide() + Optional(lbrct + W('Voc') + rbrct) + Optional(W('=') | I('of') | I('was') | I('is') | I('at')).hide() + Optional(I('in') + I('the') + I('range') + Optional(I('of')) | I('about') | ('around') | units).hide()

voc_first  = (words_pref + (Optional(lbrct) + abbrv_prefix + Optional(rbrct)) + ZeroOrMore(common_text) + value + units)(u'voc')
voc_second = (prefix + value + units)(u'voc')
voc_pattern = voc_first | voc_second

class VocParser(BaseParser):
    root = voc_pattern

    def interpret(self, result, start, end):
        compound = Compound(
            voc_pattern=[
                Voc(
                    value=first(result.xpath('./value/text()')),
                    units=first(result.xpath('./units/text()'))
                )
            ]
        )
        yield compound

In [9]:
Sentence.parsers.append(VocParser())
Paragraph.parsers.append(VocParser())

In [10]:
doc = Document(
    Paragraph("A open-circuit voltage of 12 volt"),
    Paragraph("A open-circuit voltage (Voc) of 144 V"),
    Paragraph("A Voc of 22 V"),
)

rec = doc.records.serialize()

In [11]:
rec

[{'voc_pattern': [{'value': '144', 'units': 'V'}]},
 {'voc_pattern': [{'value': '22', 'units': 'V'}]}]

**Jsc Parser**

In [12]:
class Jsc(BaseModel):
    value = StringType()
    units = StringType()

Compound.jsc_pattern = ListType(ModelType(Jsc))

common_text = R('(\w+)?\D(\D+)+(\w+)?').hide()

# delim = R('^[;:,\./]$').hide()

units = Optional(W('/')).hide() + (
    R('^\[?mAcm[-–−‒]?2\]?$') |
    W('mAcm') + R('^[-–−‒]$') + W('2') | 
    R('^\[?mA\/cm2\]?$') | 
    W('mA') + W('^\/?$') + W('cm') +  W('2')
)('units').add_action(merge)

value = R(u'\d+(\.\d+)?')(u'value')

abbrv_prefix = (I(u'Jsc') | I(u'jsc')).hide()
words_pref = (I(u'short') + I(u'circuit') + I(u'current') + I(u'density')).hide()
hyphanated_pref = (I(u'short') + I(u'-') + I('circuit') + I(u'current') + I(u'density')).hide()
joined_range = R('^[\+\-–−]?\d+(\.\d+)?[\-–−~∼˜]\d+(\.\d+)?$')('value').add_action(merge)
spaced_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (R('^[\-–−~∼˜]$') +
                                                                        R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(merge)
to_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (I('to') +
                                                                    R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(join)

prefix = Optional(I('a')).hide() + (Optional(lbrct) + W('Jsc') + Optional(rbrct) | I('short') + Optional(I('circuit')) + Optional((I('current'))) + Optional(I('density'))
                                    ).hide() + Optional(lbrct + W('Jsc') + rbrct) + Optional(W('=') | I('of') | I('was') | I('is') | I('at')).hide() + Optional(I('in') + I('the') + I('range') + Optional(I('of')) | I('about') | ('around') | I('mAcm-2') | I('mA/cm2')).hide()

jsc_first  = (words_pref + (Optional(lbrct) + abbrv_prefix + Optional(rbrct)) + ZeroOrMore(common_text) + value + units)(u'jsc')
jsc_second = (prefix + value + units)(u'jsc')
jsc_pattern = jsc_first | jsc_second

class JscParser(BaseParser):
    root = jsc_pattern

    def interpret(self, result, start, end):
        compound = Compound(
            jsc_pattern=[
                Jsc(
                    value=first(result.xpath('./value/text()')),
                    units=first(result.xpath('./units/text()'))
                )
            ]
        )
        yield compound

In [13]:
Sentence.parsers.append(JscParser())
Paragraph.parsers.append(JscParser())

In [14]:
doc = Document(
    Paragraph("A short-circuit current density (Jsc) of 12 mAcm-2"),
    Paragraph("A Jsc of 11 mA/cm2"),
    Paragraph("A short-circuit current density (Jsc) of 10 mAcm-2"),
)

rec = doc.records.serialize()

In [15]:
rec

[{'jsc_pattern': [{'value': '12', 'units': 'mAcm-2'}]},
 {'jsc_pattern': [{'value': '10', 'units': 'mAcm-2'}]}]

**Molecular Weight Parser**

In [16]:
class Mw(BaseModel):
    value = StringType()
    units = StringType()

Compound.mw_pattern = ListType(ModelType(Mw))

common_text = R('(\w+)?\D(\D+)+(\w+)?').hide()

units = Optional(W('/')).hide() + (
    R('^\[?kgmol[-–−‒]?1\]?$') |
    W('kgmol') + R('^[-–−‒]$') + W('1') | 
    R('\[?kg[\/]?mol\]?$') | 
    W('kg') + W('^\/?$') + W('mol')
)('units').add_action(merge)

value = R(u'\d+(\.\d+)?')(u'value')

abbrv_prefix = (I(u'Mw') | I(u'mw')).hide()
words_pref = (I(u'molecular') + I(u'weight')).hide()
joined_range = R('^[\+\-–−]?\d+(\.\d+)?[\-–−~∼˜]\d+(\.\d+)?$')('value').add_action(merge)
spaced_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (R('^[\-–−~∼˜]$') +
                                                                        R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(merge)
to_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (I('to') +
                                                                    R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(join)

prefix = Optional(I('a')).hide() + (Optional(lbrct) + W('Mw') + Optional(rbrct) | I('molecular') + Optional(I('weight'))
                                    ).hide() + Optional(lbrct + W('Mw') + rbrct) + Optional(W('=') | I('of') | I('was') | I('is') | I('at')).hide() + Optional(I('in') + I('the') + I('range') + Optional(I('of')) | I('about') | ('around') | I('kgmol-1') | I('kg/mol')).hide()

mw_first  = (words_pref + (Optional(lbrct) + abbrv_prefix + Optional(rbrct)) + ZeroOrMore(common_text) + value + units)(u'mw')
mw_second = (prefix + value + units)(u'mw')
mw_pattern = mw_first | mw_second

class MwParser(BaseParser):
    root = mw_pattern

    def interpret(self, result, start, end):
        compound = Compound(
            mw_pattern=[
                Mw(
                    value=first(result.xpath('./value/text()')),
                    units=first(result.xpath('./units/text()'))
                )
            ]
        )
        yield compound

In [17]:
Sentence.parsers.append(MwParser())
Paragraph.parsers.append(MwParser())

In [18]:
doc = Document(
    Paragraph("the polymer weighs about 100 kgmol-1"),
    Paragraph("PBDT has a molecular weight (Mw) of 110 kg/mol"),
    Paragraph("PSEHTT has a molecular weight (Mw) of 120 kgmol-1"),
    Paragraph("this PSEHTT has a molecular weight of 200 kgmol-1"),
)

rec = doc.records.serialize()

In [19]:
rec

[{'names': ['PBDT'],
  'mw_pattern': [{'value': '200', 'units': 'kgmol-1'},
   {'value': '120', 'units': 'kgmol-1'}]}]

**FF Parser**

In [20]:
class Ff(BaseModel):
    value = StringType()
    units = StringType()

Compound.ff_pattern = ListType(ModelType(Ff))

common_text = R('(\w+)?\D(\D+)+(\w+)?').hide()
units = (W(u'%') | I(u'percent'))(u'units')
value = R(u'\d+(\.\d+)?')(u'value')

abbrv_prefix = (I(u'FF') | I(u'ff')).hide()
words_pref = (I(u'fill') + I(u'factor')).hide()
hyphanated_pref = (I(u'fill') + I(u'-') + I('factor')).hide()
joined_range = R('^[\+\-–−]?\d+(\.\d+)?[\-–−~∼˜]\d+(\.\d+)?$')('value').add_action(merge)
spaced_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (R('^[\-–−~∼˜]$') +
                                                                        R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(merge)
to_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (I('to') +
                                                                    R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(join)

prefix = Optional(I('a')).hide() + (Optional(lbrct) + W('FF') + Optional(rbrct) | I('fill') + Optional(I('factor'))
                                    ).hide() + Optional(lbrct + W('FF') + rbrct) + Optional(W('=') | I('of') | I('was') | I('is') | I('at')).hide() + Optional(I('in') + I('the') + I('range') + Optional(I('of')) | I('about') | ('around') | I('%')).hide()

ff_first  = (words_pref + (Optional(lbrct) + abbrv_prefix + Optional(rbrct)) + ZeroOrMore(common_text) + value + units)(u'ff')
ff_second = (prefix + value + units)(u'ff')
ff_pattern = ff_first | ff_second

class FfParser(BaseParser):
    root = ff_pattern

    def interpret(self, result, start, end):
        compound = Compound(
            ff_pattern=[
                Ff(
                    value=first(result.xpath('./value/text()')),
                    units=first(result.xpath('./units/text()'))
                )
            ]
        )
        yield compound

def parse_ff(list_of_sentences):
    """ 
    Takes a list of sentences and parses for quantified PCE
    information and relationships to chemicals/chemical labels
    """

    Sentence.parsers.append(FfParser())

    cde_senteces = [Sentence(sent).records.serialize()
                    for sent in list_of_sentences]
    return cde_senteces


In [21]:
Sentence.parsers.append(FfParser())
Paragraph.parsers.append(FfParser())

In [22]:
doc = Document(
    Paragraph("the fill factor (FF) is about 20%"),
    Paragraph("FF is about 30%"),
)

rec = doc.records.serialize()

In [23]:
rec

[{'ff_pattern': [{'value': '20', 'units': '%'}]},
 {'ff_pattern': [{'value': '30', 'units': '%'}]}]

### Compare to the testing dataset

In this case we extract our information from abstracts from all 8 papers and then compare to what they have originally in the text. 

In [24]:
test = pd.read_csv('../test_articles/test.csv',sep='\t',delimiter='\\t')
test_pce = test['PCE_ave']
test_ff = test['FF']
test_voc = test['Voc (V)']

  """Entry point for launching an IPython kernel.


In [26]:
# abstract_doc = doc.elements[11]
# abstract1_doc = doc1.elements[7] + doc1.elements[8]
# abstract2_doc = doc2.elements[7] + doc2.elements[8]
# abstract3_doc = doc3.elements[10]
# abstract4_doc = doc4.elements[12]
# abstract5_doc = doc5.elements[3] + doc5.elements[4]
# abstract6_doc = doc6.elements[5] + doc6.elements[6] + doc6.elements[7] + doc6.elements[8]
# abstract7_doc = doc7.elements[11]

In [28]:
# goes back to the doc in paper0
# abstract_doc = doc.elements[11]
# abstract_doc

In [29]:
# doc = Document(
#     Heading("The followings are abstracts"),
#     abstract_doc,
#     abstract1_doc,
#     abstract2_doc,
#     abstract3_doc,
#     abstract4_doc,
#     abstract5_doc,
#     abstract6_doc,
#     abstract7_doc
# )

# rec = doc.records.serialize()

In [30]:
rec

[{'ff_pattern': [{'value': '20', 'units': '%'}]},
 {'ff_pattern': [{'value': '30', 'units': '%'}]}]

from the above we can see the following results, the tabulated 

| Paper          | PCE (real)   | PCE (found)  |
| :------------- | :----------: | -----------: |
| paper0         | 2.55,5.86    | 2.55, 5.86   |
| paper1         | 5.52, 7.04   | 5.52, 1.76 (not exp), 6.3|
| paper2         | 1.19, 0.79   | P1 and P2 problem|
| paper3         | N/A          | no PCE reported in abst|
| paper4         | 6.74, 4.44   | 6.74         |
| paper5         | 1.33, None   | P1 and P2 problem |
| paper6         | 0.22, 0.31,1.38 | 1.38      |
| paper7         | 3.04, 1.91   | 1.91         |

In [31]:
doc_form = Document(
    Heading("The followings are abstracts"),
    doc,
    doc1,
#     doc2,
#     doc3,
#     doc4,
#     doc5,
#     doc6,
#     doc7
)

rec = doc_form.records.serialize()

In [32]:
rec

[{'ff_pattern': [{'value': '20', 'units': '%'}]},
 {'ff_pattern': [{'value': '30', 'units': '%'}]},
 {'names': ['2,1,3-ben- zothiadiazole', 'BT']},
 {'names': ['4,7-dithien-2-yl-2,1,3-benzothiadiazole', 'DTBT']},
 {'names': ['ﬂuorenes,6a carbazoles,6b dibenzosiloles,2c dithienosiloles,6c ladder oligo-p- phenylenes,6d']},
 {'names': ['pyrrole']},
 {'names': ['PBDTTPDa']},
 {'names': ['P(o-tolyl)3']},
 {'names': ['toluene']},
 {'names': ['platinum']},
 {'names': ['Bu4NBF4']},
 {'names': ['CH3CN']},
 {'names': ['2,6-bis(trimethyltin)-4,8-di(2-ethylhexy- loxyl)benzo[1,2-b:4,5-b′]dithiophene7a and 1,3-dibromo-5-octylth- ieno-[3,4-c]pyrrole-4,6-dione8e']},
 {'names': ['o-dichlo- robenzene', 'ODCB']},
 {'names': ['BDT']},
 {'names': ['alkoxy']},
 {'names': ['ITO']},
 {'names': ['LiF']},
 {'names': ['Al']},
 {'names': ['poly(3,4-ethylenedioxy- thiophene )']},
 {'names': ['poly(styrenesulfonate)']},
 {'names': ['lithium ﬂuoride ( LiF )']},
 {'names': ['low-band-gap thieno[3,4-c]pyrrole-4,6- dio

In [33]:
import json

In [34]:
with open("data_file.json", "w") as write_file:
    json.dump(rec, write_file)

**Table Parser**

Try samples on some xml files extracted before. We can utilize CDE's table parser in the scraper section. They have already included the 

In [35]:
from chemdataextractor import scrape as scrape

**Mol Parser**