In [2]:
# Overkill on Imports, but better to have everything than miss something
import logging
import re
import pandas as pd
import urllib
import time

import chemdataextractor as cde
from chemdataextractor import Document
from chemdataextractor.reader import acs,base,cssp,HtmlReader,NlmXmlReader,PdfReader,RscHtmlReader,XmlReader
from chemdataextractor.model import Compound, UvvisSpectrum, UvvisPeak, BaseModel, StringType, ListType, ModelType
from chemdataextractor.parse.common import hyphen,lbrct, dt, rbrct
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first

from chemdataextractor.parse.actions import strip_stop, merge, join
from chemdataextractor.parse.elements import W, I, T, R, Optional, ZeroOrMore, OneOrMore, Or, And, Not, Any
from chemdataextractor.parse.cem import chemical_name,cem, chemical_label, lenient_chemical_label, solvent_name
from chemdataextractor.doc import Paragraph, Sentence, Caption, Figure,Table, Heading
from chemdataextractor.doc.table import Table, Cell

In [3]:
#FF Parser 
class Ff(BaseModel):
    value = StringType()
    units = StringType()

Compound.ff_pattern = ListType(ModelType(Ff))

common_text = R('(\w+)?\D(\D+)+(\w+)?').hide()
units = (W(u'%') | I(u'percent'))(u'units') 
#units = (W(u'%'|W(u' ')|I(u'percent')))('units').add_action(merge)

value = R(u'\d+(\.\d+)?')(u'value')

abbrv_prefix = (I(u'FF') | I(u'ff')).hide()
words_pref = (I(u'fill') | I(u'fill') + I(u'factor')).hide()
hyphanated_pref = (I(u'fill') | I(u'fill') + I(u'-') + I('factor')).hide()
joined_range = R('^[\+\-–−]?\d+(\.\d+)?[\-–−~∼˜]\d+(\.\d+)?$')('value').add_action(merge)
spaced_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (R('^[\-–−~∼˜]$') +
                                                                        R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(merge)
to_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (I('to') +
                                                                    R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(join)

prefix = Optional(I('a')).hide() + (Optional(lbrct) + W('FF') + Optional(rbrct) | I('fill') | I('ﬁll') + Optional(I('factor'))
                                    ).hide() + Optional(lbrct + W('FF') + rbrct) + Optional(W('=') | W('¼') | W(';') | W(',') | I('of') | I('was') | I('is') | I('at')).hide() + Optional(I('in') + I('the') + I('range') + Optional(I('of')) | I('about') | I('to') |I('around')| I ('%')).hide()

ff_first  = (words_pref + (Optional(lbrct) + abbrv_prefix + Optional(rbrct)) + ZeroOrMore(common_text) + value + Optional(units))(u'ff')
ff_second = (prefix + value + Optional(units))(u'ff')
ff_third = (abbrv_prefix + prefix + value)(u'ff')
ff_pattern = ff_first|ff_second|ff_third



class FfParser(BaseParser):
    root = ff_pattern

    def interpret(self, result, start, end):
        compound = Compound(
            ff_pattern=[
                Ff(
                    value=first(result.xpath('./value/text()')),
                    units=first(result.xpath('./units/text()'))
                )
            ]
        )
        yield compound

def parse_ff(list_of_sentences):
    
    #Takes a list of sentences and parses for quantified PCE
    #information and relationships to chemicals/chemical labels
    

    Sentence.parsers.append(FfParser())

    cde_senteces = [Sentence(sent).records.serialize()
                    for sent in list_of_sentences]
    return cde_senteces

Sentence.parsers.append(FfParser())
Paragraph.parsers.append(FfParser())

In [18]:

d1 = Paragraph('Plymer:fullerene blend was varied and had a fill factor (FF) of 0.48.')

d2 = Paragraph('for annealed device with 93% RR, JSC = 6.28 mA cm−2, VOC = 0.6 V, FF= 39.4%, PCE= 1.8%; for not-annealed device with 90.7% RR, JSC = 3.27 mA cm−2, VOC = 0.62 V, FF= 37.0%, PCE= 0.9%; for annealed device with 90.7% RR, JSC = 3.07 mA cm−2,VOC = 0.61 V, FF= 32.1%, PCE= 0.7%')

d3 = Paragraph('For the best-performing device, a PCE as high as 9.48% was achieved with a Voc of 0.80 V, a Jsc of 17.46 mA/cm2, and a fill factor (FF) of 67.9%')


d4 = Paragraph('Two novel naphtho[1,2-c:5,6-c]bis(1,2,5-thiadiazole) and alkoxylphenyl substituted benzodithiophene based copolymers were developed as the donor materials for polymer solar cells and the  best device performance was achieved by P1, with an open-circuit voltage of 0.85 V,a short-circuit current density of 8.65 mA•cm−2, a fill factor of 37.8%, and a power conversion efficiency of 2.78%.')

d5 = Paragraph('PSC devices with different D/A ratios (P1/PC71BM, w/w) were fabricated to optimize the D/A ratio of the blend.Table 1 showed the photovoltaic parameters of the resulting devices under the illumination of AM 1.5G (100 mW/cm2) with different D/A ratios (1∶1, 1∶2 and 1∶3). It is clear that the optimal D/A ratio of the blend is 1∶2, and a PCE of 1.30% was obtained with a Voc of 0.92 V, a Jsc of 4.81 mA/cm2, and an FF of 29.48%. Compared with PBDT-DTNT with the same device structure (Voc＝0.80 V), a much improved Voc was obtained in P1 based devices.')

d6 = Paragraph('A maximum PCE of 4.1%, an open-circuit voltage (Voc) of 0.84 V, a short-circuit current (Jsc) of 9.8 mA cm-2, and a fill factor (FF) of 49.5% could be achieved based on a PBDTTPD:PC71BM ratio of 1:2 with a thin active-layer thickness of 90 nm')

d7 = Paragraph(' Similar Jsc values of 9.0 and 9.1 mA cm-2 and fill factors (FF) of 49.1% and 53.8% were achieved for unmodified and C70-SAM modified devices')

d8 = Paragraph('After adding 1 vol% DIO, the efficiency increased over five times to 5.18% with a higher Jsc of 7.88 mA cm−2 and a higher FF of 68.3%.')

d9 = Paragraph('Devices with 3 vol% DIO gave the highest PCE of 5.53% with a Voc of 0.98 V, a Jsc of 8.12 mA cm−2, and a FF of 69.5%.')

d10 = Paragraph('We obtained the highest device performance by using TiOx as a multifunctional interlayer between the photoactive layer and the Al electrode. Figure 4 shows the J-V curves of an optimized device measured under AM 1.5G irradiation 100 mA/cm2 and dark conditions; its efficiency parameters are as follow: Voc=0.77 V, Jsc=9.10 mA/cm2, FF=0.55, and PCE=3.80%.')


print(d1.records.serialize()) #Recognizes both Compund and property
print(d2.records.serialize()) # No compund given recognizes all values
print(d3.records.serialize()) # Recognizes value, no compound given
print(d4.records.serialize())  # Recognizes Compounds but not value
print(d5.records.serialize())  # Recognized Value but wrong compound
print(d6.records.serialize())   # Recognized Compound (it is a ratio though)                                     # and Value                                 
print(d7.records.serialize())   # Recognized one FF value but not compounds
print(d8.records.serialize())    # Recognized FF value not compound
print(d9.records.serialize())     # Recognized FF value not compound
print(d10.records.serialize())   # Recognized Compounds and value


# CURRENTLY TRYING TO FIGURE OUT DIF BETWEEN DOCUMENT AND SENTENCE SEE FIRST EXAMPLE ABOVE ESPECIALLY FOR D.CEMS V.S S.CEMS. ALSO CHECK OUT VOC AND PCE EXTRACTORS. Also numerate each input (s1,s2,s3 etc.)



[{'names': ['fullerene']}, {'ff_pattern': [{'value': '0.48'}]}]
[{'ff_pattern': [{'value': '39.4', 'units': '%'}]}, {'ff_pattern': [{'value': '37.0', 'units': '%'}]}, {'ff_pattern': [{'value': '32.1', 'units': '%'}]}]
[{'ff_pattern': [{'value': '67.9', 'units': '%'}]}]
[{'names': ['naphtho[1,2-c:5,6-c]bis(1,2,5-thiadiazole)']}, {'names': ['alkoxylphenyl']}, {'names': ['benzodithiophene']}]
[{'ff_pattern': [{'value': '29.48', 'units': '%'}]}, {'names': ['PBDT']}]
[{'names': ['PBDTTPD']}, {'ff_pattern': [{'value': '49.5', 'units': '%'}]}]
[{'ff_pattern': [{'value': '49.1', 'units': '%'}]}]
[{'ff_pattern': [{'value': '68.3', 'units': '%'}]}]
[{'ff_pattern': [{'value': '69.5', 'units': '%'}]}]
[{'names': ['TiOx']}, {'names': ['Al']}, {'ff_pattern': [{'value': '0.55'}]}]
