## Extracting Abstracts

The problem is that we need to manually identify abstract location in articles instead of automatic extraction. 

In [1]:
import logging
import re
import pandas as pd
import urllib
import time

import chemdataextractor as cde
from chemdataextractor import Document
from chemdataextractor.model import Compound, UvvisSpectrum, UvvisPeak, BaseModel, StringType, ListType, ModelType
from chemdataextractor.parse.common import hyphen,lbrct, dt, rbrct
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first

from chemdataextractor.parse.actions import strip_stop, merge, join
from chemdataextractor.parse.elements import W, I, T, R, Optional, ZeroOrMore, OneOrMore, Or, And
from chemdataextractor.parse.cem import chemical_name,cem, chemical_label, lenient_chemical_label, solvent_name
from chemdataextractor.doc import Paragraph, Sentence, Caption, Figure,Table, Heading
from chemdataextractor.doc.table import Table, Cell

In [2]:
from chemdataextractor.text.chem import SOLVENT_RE, INCHI_RE, SMILES_RE

In [3]:
# open and read files
f = open('../test_articles/paper0.pdf', 'rb')
doc = Document.from_file(f)
abstract = [11]
paras = doc.elements
cems = doc.cems

f1 = open('../test_articles/paper1.pdf', 'rb')
doc1 = Document.from_file(f1)
abstract1 = [7,8]
paras1 = doc1.elements
cems1 = doc1.cems

f2 = open('../test_articles/paper2.pdf', 'rb')
doc2 = Document.from_file(f2)
abstract2 = [7,8]
paras2 = doc2.elements
cems2 = doc2.cems

f3 = open('../test_articles/paper3.pdf', 'rb')
doc3 = Document.from_file(f3)
abstract3 = [10]
paras3 = doc3.elements
cems3 = doc3.cems

f4 = open('../test_articles/paper4.pdf', 'rb')
doc4 = Document.from_file(f4)
abstract4 = [12]
paras4 = doc4.elements
cems4 = doc4.cems

f5 = open('../test_articles/paper5.pdf', 'rb')
doc5 = Document.from_file(f5)
abstract5 = [3,4]
paras5 = doc5.elements
cems5 = doc5.cems

f6 = open('../test_articles/paper6.pdf', 'rb')
doc6 = Document.from_file(f6)
abstract6 = [5,6,7,8]
paras6 = doc6.elements
cems6 = doc6.cems

f7 = open('../test_articles/paper7.pdf', 'rb')
doc7 = Document.from_file(f7)
abstract7 = [11]
paras7 = doc7.elements
cems7 = doc7.cems

## Spealize PCE parsers

In [4]:
class Pce(BaseModel):
    value = StringType()
    units = StringType()

Compound.pce_pattern = ListType(ModelType(Pce))

abbrv_prefix = (I(u'PCE') | I(u'PCEs') | I(u'pce')).hide()
words_pref = (I(u'power') + I(u'conversion') + I(u'efficiency')).hide()
hyphanated_pref = (I(u'power') + I(u'-') + I('conversion') + I(u'efficiency')).hide()
joined_range = R('^[\+\-–−]?\d+(\.\d+)?[\-–−~∼˜]\d+(\.\d+)?$')('value').add_action(merge)
spaced_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (R('^[\-–−~∼˜]$') +
                                                                        R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(merge)
to_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (I('to') +
                                                                    R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(join)

prefix = abbrv_prefix | words_pref | hyphanated_pref

common_text = R('(\w+)?\D(\D+)+(\w+)?').hide()
units = (W(u'%') | I(u'percent'))(u'units')
value = R(u'\d+(\.\d+)?')(u'value')

# pce_first  = (words_pref + (Optional(lbrct) + abbrv_prefix + Optional(rbrct)) + ZeroOrMore(common_text) + value + units)(u'pce')

pce_first = (prefix + ZeroOrMore(common_text) + value + units)(u'pce')
pce_second = (value + units + prefix)(u'pce')
pce_pattern = pce_first | pce_second

class PceParser(BaseParser):
    root = pce_pattern

    def interpret(self, result, start, end):
        compound = Compound(
            pce_pattern=[
                Pce(
                    value=first(result.xpath('./value/text()')),
                    units=first(result.xpath('./units/text()'))
                )
            ]
        )
        yield compound

NameError: name 'units' is not defined

In [None]:
doc = Document(
#     Heading('5,10,15,20-Tetra(4-carboxyphenyl)porphyrin (3).'),
#     Paragraph('a glass-transition temperature (Tg) of 20°C'),
    Paragraph('open circuit voltage (voc) equal to 0.7 V'),
    Paragraph('Voc of 0.8 V'),
    Paragraph('open circuit voltage [voc] of 0.9 V'),
    Paragraph('power conversion efficiency (pce) of 10 %'),
    Paragraph('It has been found that PSHQ4 has a Tg of ca. 130°'),
#     Paragraph('with the short-circuit current density (Jsc) of 12 mAcm-2'),
#     Paragraph('material with a fill factor (ff) of 0.2'),
)

rec = doc.records.serialize()

In [None]:
rec