In [31]:
import logging
import re
import pandas as pd
import urllib
import time

import chemdataextractor as cde
from chemdataextractor import Document
from chemdataextractor.reader import acs,base,cssp,HtmlReader,NlmXmlReader,PdfReader,RscHtmlReader,XmlReader
from chemdataextractor.model import Compound, UvvisSpectrum, UvvisPeak, BaseModel, StringType, ListType, ModelType
from chemdataextractor.parse.common import hyphen,lbrct, dt, rbrct
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first

from chemdataextractor.parse.actions import strip_stop, merge, join
from chemdataextractor.parse.elements import W, I, T, R, Optional, ZeroOrMore, OneOrMore, Or, And, Not, Any
from chemdataextractor.parse.cem import chemical_name,cem, chemical_label, lenient_chemical_label, solvent_name
from chemdataextractor.doc import Paragraph, Sentence, Caption, Figure,Table, Heading
from chemdataextractor.doc.table import Table, Cell

In [30]:
#PCE Parser

class Pce(BaseModel):
    """Record model for one extracted power conversion efficiency (PCE) reading."""
    # Numeric part of the reading, kept as text.
    value = StringType()
    # Unit token that followed the value.
    units = StringType()

# Attach a list of Pce readings to the Compound model.
Compound.pce_pattern = ListType(ModelType(Pce))

# --- Token-level building blocks ---------------------------------------------
# All regexes are raw strings so that '\d', '\w', '\.', '\+' ... reach the regex
# engine unchanged and Python does not warn about invalid escape sequences.

# Filler token allowed between the property name and its value
# (presumably meant to skip over ordinary, non-numeric words).
common_text = R(r'(\w+)?\D(\D+)+(\w+)?').hide()
units = (W(u'%') | I(u'percent'))(u'units')
# NOTE(review): unlike the range patterns below, this regex is not anchored with
# ^...$ - confirm whether partially numeric tokens should really be accepted.
value = R(r'\d+(\.\d+)?')(u'value')

# I(...) matches case-insensitively, so a separate lower-case 'pce' is not needed.
abbrv_prefix = (I(u'PCE') | I(u'PCEs')).hide()

# 'efﬁciency' (with the 'ﬁ' ligature) is how PDF text extraction often renders
# the word. The alternatives are parenthesised explicitly: '+' binds tighter than
# '|', so the previous form accepted a lone 'efﬁciency' token as the whole phrase.
words_pref = (I(u'power') + I(u'conversion') + (I(u'efficiency') | I(u'efﬁciency'))).hide()
# Name kept as-is (including its spelling) in case other cells refer to it.
hyphanated_pref = (I(u'power') + I(u'-') + I('conversion') + (I(u'efficiency') | I(u'efﬁciency'))).hide()

# Numeric ranges ("5.1-6.3", "5.1 - 6.3", "5.1 to 6.3"). Defined for later use;
# they are not part of pce_pattern yet.
joined_range = R(r'^[\+\-–−]?\d+(\.\d+)?[\-–−~∼˜]\d+(\.\d+)?$')('value').add_action(merge)
spaced_range = (
    R(r'^[\+\-–−]?\d+(\.\d+)?$')
    + Optional(units).hide()
    + (R(r'^[\-–−~∼˜]$') + R(r'^[\+\-–−]?\d+(\.\d+)?$') | R(r'^[\+\-–−]\d+(\.\d+)?$'))
)('value').add_action(merge)
to_range = (
    R(r'^[\+\-–−]?\d+(\.\d+)?$')
    + Optional(units).hide()
    + (I('to') + R(r'^[\+\-–−]?\d+(\.\d+)?$') | R(r'^[\+\-–−]\d+(\.\d+)?$'))
)('value').add_action(join)

# --- Phrase-level patterns -----------------------------------------------------
# Property name: either the (optionally bracketed) abbreviation, or a phrase that
# starts with "power".
_property_name = (
    Optional(lbrct) + abbrv_prefix + Optional(rbrct)
    | I('power') + Optional(I('conversion'))
    + Optional(I('efficiency') | I(u'efﬁciency') | I('range') | words_pref)
    + Optional(I('temperature') | I('range'))
).hide()

# Everything that may sit between the start of the phrase and the number, e.g.
# "a PCE of", "PCEs of up to", "PCE =", "power conversion efficiency (PCE) was about".
prefix = (
    Optional(I('a')).hide()
    + _property_name
    + Optional(lbrct + W('PCE') + rbrct)
    + Optional(W('thus'))
    + Optional(W('reached'))
    + Optional(W('result'))
    + Optional(W('up'))
    # '¼' is presumably how some PDFs render '=' after text extraction.
    + Optional(W('=') | W('¼') | I('of') | I('was') | I('is') | I('at') | I('to')).hide()
    # 'around' is now an explicit case-insensitive I(...) like its neighbours
    # (it used to be a bare string literal).
    + Optional(I('in') + I('the') + I('range') + Optional(I('of')) | I('about') | I('around') | I('%')).hide()
)

# "power conversion efficiency (PCE) <filler words> 6.3 %"
pce_first = (words_pref + (Optional(lbrct) + abbrv_prefix + Optional(rbrct)) + ZeroOrMore(common_text) + value + units)(u'pce')
# "<prefix> 6.3 %"
pce_second = (prefix + value + units)(u'pce')
pce_pattern = pce_first | pce_second

class PceParser(BaseParser):
    """Parser whose root grammar is ``pce_pattern``."""

    root = pce_pattern

    def interpret(self, result, start, end):
        """Yield one Compound carrying the value/units found in ``result``.

        ``result`` is queried with XPath for its direct ``value`` and ``units``
        children; ``start`` and ``end`` are accepted but not used here.
        """
        matched_value = first(result.xpath('./value/text()'))
        matched_units = first(result.xpath('./units/text()'))
        reading = Pce(value=matched_value, units=matched_units)
        yield Compound(pce_pattern=[reading])
    
# Register the parser on both element types.
# Any PceParser left over from an earlier run of this cell (or of a duplicate
# cell) is dropped first, so re-running swaps in the current grammar instead of
# stacking extra parser instances on the shared class-level lists. The match is
# by class name because re-running the cell creates a brand-new PceParser class.
for _element_type in (Sentence, Paragraph):
    _element_type.parsers[:] = [
        _parser for _parser in _element_type.parsers
        if type(_parser).__name__ != 'PceParser'
    ]
    _element_type.parsers.append(PceParser())

    

In [17]:
# Sample sentences for testing the PCE grammar.
# Each Paragraph below is a snippet of article text; the notes further down record
# what the parser extracted from each one when it was last tried.

d1 = Paragraph('for annealed device with 93% RR, JSC = 6.28 mA cm−2, VOC = 0.6 V, FF= 39.4%, PCE= 1.8%; for not-annealed device with 90.7% RR, JSC = 3.27 mA cm−2, VOC = 0.62 V, FF= 37.0%, PCE= 0.9%; for annealed device with 90.7% RR, JSC = 3.07 mA cm−2,VOC = 0.61 V, FF= 32.1%, PCE= 0.7%')

d2 = Paragraph('PBDT-DTNT exhibited a much improved PCE of 6.0%')

d3 = Paragraph('the PCE of PBDT-DTNT was further enhanced to 8.4%.')

d4 = Paragraph('P1 was investigated and the best device performance was obtained in an inverted device structure and a PCE value of 2.78% with a Voc of 0.85 V were achieved.')

d5 = Paragraph('Under optimal processing condition, the PCE of device based on P1 was 2.78% with a Voc of 0.85 V, a Jsc of 8.65 mA/cm2 and an FF of 37.8 %')

d6 = Paragraph('DIO showed a PCE of 2.78% ')

d7 = Paragraph('2 Here, by choosing the appropriate electron-rich unit terthiophene as the donor and isoindigo as the acceptor, we synthesized a new alternating polymer, P3TI, that shows PCEs of up to 6.3% in PSCs with PC71BM as the acceptor')

d8 = Paragraph('The PV performance of P3TI was investigated in PSCs in the standard configuration glass/ITO/PEDOT:PSS/P3TI:PC71BM/ LiF/Al. The P3TI:PC71BM weight ratio was optimized from 1:1 to 1:1.5 to 1:2, and the PV characteristics are summarized in Table 1. PSCs were first fabricated with DCB as the processing solvent for the active layer and showed a PCE of 4.8% for the optimized P3TI:PC71BM weight ratio of 1:1.5')

d9 = Paragraph('A PCE of 6.3% with Voc of 0.70 V, a shortcircuit current density (Jsc) of 13.1 mA cm-2, and a decent FF of 0.69 was obtained for devices containing the 1:1.5 blend as the photoactive layer (thickness of 90 nm) and processed with the additive, under AM 1.5G simulated solar light illumination(100 mW cm-2)')

d10 = Paragraph( 'In summary, an easily accessible alternating copolymer of terthiophene and isoindigo, P3TI, was designed and synthesized. The polymer presents a promising absorption spectrum and appropriate HOMO/LUMO positions, leading to PCEs of up to 6.3% in the resulting PSCs. ')

d11 = Paragraph('Hydrogen is literally Power Conversion Energy 5%')

# Observed results (uncomment a line to re-run that sample):
#print(d1.records.serialize())   # all percentages recognized, no compound given
#print(d2.records.serialize())   # compound and percentage recognized
#print(d3.records.serialize())   # compound name recognized, percentage missed
#print(d4.records.serialize())   # nothing recognized
#print(d5.records.serialize())   # nothing recognized
#print(d6.records.serialize())   # percentage recognized, compound missed
#print(d7.records.serialize())   # compounds recognized, plural property name ("PCEs") missed
#print(d8.records.serialize())   # cannot attribute a compound to a specific PCE value
#print(d9.records.serialize())   # percentage recognized, no compound given
#print(d10.records.serialize())  # compounds recognized, plural property name ("PCEs") missed
print(d11.records.serialize()) # Simple example of the order the regex looks for (compound, then value, then units)

[{'names': ['Hydrogen']}]


In [15]:
# Walk through the whole article element by element and keep only the elements
# whose extracted records include a PCE reading. Eventually this can be folded
# into the automated loop over the files in the 'One Compound' folder.

from chemdataextractor.doc import Paragraph

# Open the PDF in a context manager so the handle is released after parsing.
with open('2.pdf', 'rb') as f:  # PDF article taken from the One Compound folder
    doc = Document.from_file(f)

para = doc.elements  # the document's elements, in reading order

# One list of serialized record dicts per document element.
sentence_records = [p.records.serialize() for p in para]

# Keep an element's records only if at least one of them has a 'pce_pattern' key.
pce_sentences = [
    records for records in sentence_records
    if any('pce_pattern' in record for record in records)
]

for sentence in pce_sentences:
    for words in sentence:
        # Previously print(y); `y` was never defined, so the loop raised NameError.
        print(words)

# TODO (stopping point): associate each PCE value with a compound, as happens
# when the whole paper is extracted without sentence tokenization.


{'names': ['PBDT']}
{'names': ['ITO']}
{'names': ['PEDOT']}
{'names': ['Mg']}
{'names': ['Al']}
{'names': ['1,8-diiodooc- tane', 'DIO']}
{'names': ['methanol']}
{'names': ['PBDT']}
{'names': ['DIO']}
{'pce_pattern': [{'value': '9.19', 'units': '%'}]}
{'pce_pattern': [{'value': '9.12', 'units': '%'}]}
{'pce_pattern': [{'value': '8.60', 'units': '%'}]}
{'names': ['PBDT-TS1']}
{'names': ['PBDT-TS1']}
{'names': ['PBDT-TS1:PC71BM']}
{'pce_pattern': [{'value': '9.48', 'units': '%'}]}
{'names': ['PBDT-TS1'], 'roles': ['product']}
{'pce_pattern': [{'value': '10', 'units': '%'}]}


In [36]:
from chemdataextractor.nlp.tokenize import ChemWordTokenizer
from chemdataextractor.doc import Paragraph

# Split one paragraph of article text into sentences and show what the parsers
# extract from each sentence on its own.
para = Paragraph('In addition to grafting side chains, substitution of the carbon atom in selected locations also affects the energy levels of a polymer. In the recent work26, higher values of Voc are observed when ﬂuorine, an atom of high electron afﬁnity, is introduced to the thieno[3,4-b]thiophene unit, a PCE of 6.1% having been demonstrated26. To this end, PBDTTT–C was modiﬁed with a ﬂuorine atom to lower its HOMO level. The structure of the designed and synthesized PBDTTT–CF is shown in Fig. 1a. The HOMO and LUMO of PBDTTT–CF were measured and compared. J–V curves, a signiﬁcant increase in Voc is clearly observed from PBDTTT–E to PBDTTT–CF. AVoc as high as 0.76 V was observed indevicesbasedonPBDTTT–CF.CombinedwithitshighJsc andﬁll factor (FF), a high PCE of 7.38+0.4% (a 5% device variation), measured in more than 75 devices, was achieved in the PBDTTT– CF system, the highest measured PCE being 7.73%. Devices were then encapsulated and sent to the National Renewable Energy Laboratory (NREL) for certiﬁcation.')
ps = para.sentences

for x in ps:
    serialized_records = x.records.serialize()
    print(serialized_records)
    # Marker line so the per-sentence outputs can be told apart.
    print('end of sentence')



[{'names': ['carbon']}]
end of sentence
[{'names': ['ﬂuorine']}, {'names': ['thieno[3,4-b]thiophene']}, {'pce_pattern': [{'value': '6.1', 'units': '%'}]}]
end of sentence
[{'names': ['PBDTTT–C']}, {'names': ['ﬂuorine']}]
end of sentence
[{'names': ['PBDTTT']}]
end of sentence
[{'names': ['PBDTTT']}]
end of sentence
[{'names': ['PBDTTT–E']}, {'names': ['PBDTTT']}]
end of sentence
[{'names': ['indevicesbasedonPBDTTT–CF.CombinedwithitshighJsc']}, {'names': ['PBDTTT']}]
end of sentence
[]
end of sentence


In [22]:
# General method of extracting information from a full article
from chemdataextractor.nlp.tokenize import ChemWordTokenizer
from chemdataextractor.doc import Paragraph

# Open the PDF in a context manager so the handle is released once the
# document has been built (it used to be left open).
with open('2.pdf', 'rb') as f:
    doc = Document.from_file(f)

para = doc.elements  # the document's elements, in reading order
print(para)

# To get the extracted records for the whole article instead, use:
#   doc.records.serialize()

 

[Paragraph(id=None, references=[], text='Communication'), Paragraph(id=None, references=[], text='pubs.acs.org/cm'), Paragraph(id=None, references=[], text='†,‡'), Paragraph(id=None, references=[], text='†\nShaoqing Zhang,'), Paragraph(id=None, references=[], text='Highly Eﬃcient 2D-Conjugated Benzodithiophene-Based\nPhotovoltaic Polymer with Linear Alkylthio Side Chain\nand Jianhui Hou*,†\nLong Ye,\n†\nState Key Laboratory of Polymer Physics and Chemistry, Beijing National Laboratory for Molecular Sciences, Institute of Chemistry,\nChinese Academy of Sciences, Beijing 100190, China\n‡\nUniversity of Chinese Academy of Sciences, Beijing 100049, China\n*S Supporting Information'), Paragraph(id=None, references=[], text='†\nWenchao Zhao,'), Paragraph(id=None, references=[], text='Huifeng Yao,'), Paragraph(id=None, references=[], text='†,‡'), Paragraph(id=None, references=[], text='R ecently, polymer solar cells (PSCs) have been the subject'), Paragraph(id=None, references=[], text='of ex

In [44]:
# Automated method of looping through the 'One Compound' folder

import os
import pickle

# Single definition of the folder; it used to be repeated as a literal three times.
path = "/Users/walid/OneDrive/Documents/GitHub/AutoDataMining/Named Entity Recognition/one_compound"

# sorted() makes the selection reproducible (os.listdir order is arbitrary);
# sub-directories are skipped because only regular files can be parsed.
entries = sorted(
    name for name in os.listdir(path)
    if os.path.isfile(os.path.join(path, name))
)

# The original loop broke out once its counter exceeded 5, i.e. after 6 files.
MAX_DOCS = 6

counter = 0
docs = []
for filename in entries[:MAX_DOCS]:
    filepath = os.path.join(path, filename)
    # Open the file ourselves so the handle is closed as soon as parsing is done.
    with open(filepath, 'rb') as article_file:
        docs.append(Document.from_file(article_file))
    counter += 1
    print('a')  # progress marker: one line per parsed file

for doc in docs:
    print(doc.records.serialize())
    
    


a
a
a
a
a
a
[{'names': ['poly(2,5-bis(3-tetradecyllthiophen-2- yl)thieno[3,2,- ]thiophene )']}, {'names': ['1)PC(cid']}, {'names': ['4)BM(cid']}, {'names': ['1)SCLP(cid']}, {'names': ['poly-3-hexylthiophene']}, {'names': ['1)2,5-bis(cid:1)3- recombination.9 tetradecyllthiophen-2-yl(cid:2)thieno(cid:3)3,2-b(cid:4)thiophene(cid']}, {'names': ['di-']}, {'names': ['1)SCLC(cid']}, {'names': ['1)Jsc(cid']}, {'names': ['1)Voc(cid']}, {'names': ['1)FF(cid']}, {'names': ['Indium tin oxide']}, {'names': ['1)ITO(cid']}, {'names': ['acetone']}, {'names': ['isopropyl alcohol']}, {'names': ['1)3,4-ethylenedioxythiophene(cid']}, {'names': ['1)styrenesulfonate(cid']}, {'names': ['oxygen']}, {'names': ['1)Nano-C(cid']}, {'names': ['1)PCEs(cid']}, {'names': ['argon']}, {'names': ['1)PL(cid']}, {'names': ['PCBM, ITO']}, {'names': ['di- odes']}, {'names': ['palladium']}, {'names': ['1)HOMO(cid']}, {'names': ['cid : 1)J-V(cid']}, {'names': ['Xe']}, {'names': ['1)AM(cid']}, {'names': ['NREL']}, {'names': ['

In [None]:
# Element Tree method (unfinished sketch; might need to continue)
# NOTE(review): `fstring`, `XMLParser` and `get_encoding` are not defined or
# imported anywhere in the visible notebook, so this cell raises NameError as
# written. Presumably `fstring` is the raw content of an XML/HTML article,
# `XMLParser` is lxml.etree.XMLParser and `get_encoding` comes from
# chemdataextractor - confirm before relying on this cell.

from lxml import etree
root = etree.fromstring(fstring, parser=XMLParser(recover=True,encoding=get_encoding(fstring)))   



In [None]:
# Tuning Regex 
# Goal 1: PCE in plural tense
# Goal 2: Full name "Power Conversion Energy"
# Goal 3: Order of presentation (Compound, PCE value, units)

import logging
import re

from chemdataextractor.model import Compound, UvvisSpectrum, UvvisPeak, BaseModel, StringType, ListType, ModelType
from chemdataextractor.parse.common import hyphen
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first
from chemdataextractor.parse.actions import strip_stop
from chemdataextractor.parse.elements import W, I, T, R, Optional, ZeroOrMore, OneOrMore
from chemdataextractor.parse.cem import chemical_name
from chemdataextractor.doc import Paragraph, Sentence

class Pce(BaseModel):
    """Record model for one extracted power conversion efficiency (PCE) reading.

    NOTE(review): an identical ``Pce`` class is already defined in an earlier
    cell of this notebook; running this cell rebinds the name.
    """
    # Numeric part of the reading, kept as text.
    value = StringType()
    # Unit token that followed the value.
    units = StringType()

# This cell's own imports only bring in `hyphen`; import the remaining helpers
# here so the grammar does not depend on an earlier cell having been run.
from chemdataextractor.parse.actions import merge, join
from chemdataextractor.parse.common import lbrct, rbrct

# Attach a list of Pce readings to the Compound model.
Compound.pce_pattern = ListType(ModelType(Pce))

# --- Token-level building blocks ---------------------------------------------
# All regexes are raw strings so that '\d', '\w', '\.', '\+' ... reach the regex
# engine unchanged and Python does not warn about invalid escape sequences.

# Filler token allowed between the property name and its value
# (presumably meant to skip over ordinary, non-numeric words).
common_text = R(r'(\w+)?\D(\D+)+(\w+)?').hide()
units = (W(u'%') | I(u'percent'))(u'units')
# NOTE(review): unlike the range patterns below, this regex is not anchored with
# ^...$ - confirm whether partially numeric tokens should really be accepted.
value = R(r'\d+(\.\d+)?')(u'value')

# Goal 1 (plural form): I(...) matches case-insensitively, so 'PCE' and 'PCEs'
# cover every casing; a separate lower-case 'pce' is not needed.
abbrv_prefix = (I(u'PCE') | I(u'PCEs')).hide()

# Goal 2 (full name): 'efﬁciency' (with the 'ﬁ' ligature) is how PDF text
# extraction often renders the word. The alternatives are parenthesised
# explicitly: '+' binds tighter than '|', so the previous form accepted a lone
# 'efficiency' token as the whole phrase (and listed the same spelling twice).
words_pref = (I(u'power') + I(u'conversion') + (I(u'efficiency') | I(u'efﬁciency'))).hide()
# Name kept as-is (including its spelling) in case other cells refer to it.
hyphanated_pref = (I(u'power') + I(u'-') + I('conversion') + (I(u'efficiency') | I(u'efﬁciency'))).hide()

# Numeric ranges ("5.1-6.3", "5.1 - 6.3", "5.1 to 6.3"). Defined for later use;
# they are not part of pce_pattern yet.
joined_range = R(r'^[\+\-–−]?\d+(\.\d+)?[\-–−~∼˜]\d+(\.\d+)?$')('value').add_action(merge)
spaced_range = (
    R(r'^[\+\-–−]?\d+(\.\d+)?$')
    + Optional(units).hide()
    + (R(r'^[\-–−~∼˜]$') + R(r'^[\+\-–−]?\d+(\.\d+)?$') | R(r'^[\+\-–−]\d+(\.\d+)?$'))
)('value').add_action(merge)
to_range = (
    R(r'^[\+\-–−]?\d+(\.\d+)?$')
    + Optional(units).hide()
    + (I('to') + R(r'^[\+\-–−]?\d+(\.\d+)?$') | R(r'^[\+\-–−]\d+(\.\d+)?$'))
)('value').add_action(join)

# --- Phrase-level patterns -----------------------------------------------------
# Property name: either the (optionally bracketed) abbreviation, or a phrase that
# starts with "power".
_property_name = (
    Optional(lbrct) + abbrv_prefix + Optional(rbrct)
    | I('power') + Optional(I('conversion'))
    + Optional(I('efficiency') | I(u'efﬁciency') | I('range') | words_pref)
    + Optional(I('temperature') | I('range'))
).hide()

# Only the most recent `prefix` definition is kept; an older variant used to be
# assigned first and was immediately overwritten.
prefix = (
    Optional(I('a')).hide()
    + _property_name
    + Optional(lbrct + W('PCE') + rbrct)
    + Optional(W('thus'))
    + Optional(W('reached'))
    + Optional(W('result'))
    + Optional(W('up'))
    # '¼' is presumably how some PDFs render '=' after text extraction.
    + Optional(W('=') | W('¼') | I('of') | I('was') | I('is') | I('at') | I('to')).hide()
    # 'around' is now an explicit case-insensitive I(...) like its neighbours
    # (it used to be a bare string literal).
    + Optional(I('in') + I('the') + I('range') + Optional(I('of')) | I('about') | I('around') | I('%')).hide()
)

# "power conversion efficiency (PCE) <filler words> 6.3 %"
pce_first = (words_pref + (Optional(lbrct) + abbrv_prefix + Optional(rbrct)) + ZeroOrMore(common_text) + value + units)(u'pce')
# "<prefix> 6.3 %"
pce_second = (prefix + value + units)(u'pce')
pce_pattern = pce_first | pce_second

class PceParser(BaseParser):
    """Parser whose root grammar is ``pce_pattern``."""

    root = pce_pattern

    def interpret(self, result, start, end):
        """Yield one Compound carrying the value/units found in ``result``.

        ``result`` is queried with XPath for its direct ``value`` and ``units``
        children; ``start`` and ``end`` are accepted but not used here.
        """
        matched_value = first(result.xpath('./value/text()'))
        matched_units = first(result.xpath('./units/text()'))
        reading = Pce(value=matched_value, units=matched_units)
        yield Compound(pce_pattern=[reading])
    
# Register the parser on both element types.
# Any PceParser registered by an earlier cell (or an earlier run of this one) is
# dropped first, so the tuned grammar replaces the old one instead of stacking
# extra parser instances on the shared class-level lists. The match is by class
# name because every run of the cell creates a brand-new PceParser class.
for _element_type in (Sentence, Paragraph):
    _element_type.parsers[:] = [
        _parser for _parser in _element_type.parsers
        if type(_parser).__name__ != 'PceParser'
    ]
    _element_type.parsers.append(PceParser())



# Potential taggers to add
#: Regular expression patterns in (regex, tag) tuples.
# Kept at module level: the list used to be indented under the comment, which
# made the whole cell fail with "IndentationError: unexpected indent".
# Order matters if the first matching pattern wins - the catch-all stays last.
patterns = [
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers ('.' escaped so only a decimal point matches)
    (r'(The|the|A|a|An|an)$', 'AT'),   # articles
    (r'.*able$', 'JJ'),                # adjectives
    (r'.*ness$', 'NN'),                # nouns formed from adjectives
    (r'.*ly$', 'RB'),                  # adverbs
    (r'.*s$', 'PCEs or NNS'),          # plural nouns; NOTE(review): placeholder tag - pick 'NNS' or a dedicated PCE tag
    (r'.*ing$', 'VBG'),                # gerunds
    (r'.*ed$', 'VBD'),                 # past tense verbs
    (r'.*', 'NN')                      # nouns (default)
]

