## Test on Properties

This notebook focuses on customizing parsers that can be used to extract information

In [1]:
import logging
import re
import pandas as pd
import urllib
import time
# import feedparser
import chemdataextractor as cde
from chemdataextractor import Document
import chemdataextractor.model as model
from chemdataextractor.model import Compound, UvvisSpectrum, UvvisPeak, BaseModel, StringType, ListType, ModelType
from chemdataextractor.parse.common import hyphen
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first
from chemdataextractor.parse.actions import strip_stop
from chemdataextractor.parse.elements import W, I, T, R, Optional, ZeroOrMore, OneOrMore
from chemdataextractor.parse.cem import chemical_name
from chemdataextractor.doc import Paragraph, Sentence, Caption, Figure,Table, Heading
from chemdataextractor.doc.table import Table, Cell

In [2]:
# open and read files
f = open('../test_articles/paper0.pdf', 'rb')
doc = Document.from_file(f)
abstract = [11]

f1 = open('../test_articles/paper1.pdf', 'rb')
doc1 = Document.from_file(f1)
abstract1 = [7,8]

f2 = open('../test_articles/paper2.pdf', 'rb')
doc2 = Document.from_file(f2)
abstract2 = [7,8]

f3 = open('../test_articles/paper3.pdf', 'rb')
doc3 = Document.from_file(f3)
abstract3 = [10]

f4 = open('../test_articles/paper4.pdf', 'rb')
doc4 = Document.from_file(f4)
abstract4 = [12]

f5 = open('../test_articles/paper5.pdf', 'rb')
doc5 = Document.from_file(f5)
abstract5 = [3,4]

f6 = open('../test_articles/paper6.pdf', 'rb')
doc6 = Document.from_file(f6)
abstract6 = [5,6,7,8]

f7 = open('../test_articles/paper7.pdf', 'rb')
doc7 = Document.from_file(f7)
abstract7 = [11]

In [3]:
# split the paragraph into elements
paras = doc.elements
cems = doc.cems
doc.records.serialize()

[{'names': ['Isoindigo-']},
 {'names': ['bislactam']},
 {'names': ['hydrogens']},
 {'names': ['phenyl']},
 {'names': ['oxygens']},
 {'names': ['oxindoles']},
 {'names': ['triphenylamine']},
 {'names': ['phenyl- carbazole']},
 {'names': ['D − A']},
 {'names': ['4,8-bis(5-(2-ethylhexyl)- thiophen-2-yl)benzo[1,2-b:4,5-b′]dithiophene ( 2D-BDT )']},
 {'names': ['2D-BDT-containing D − π − A']},
 {'names': ['Mn [ kg mol−1 ]']},
 {'names': ['411']},
 {'names': ['34']},
 {'names': ['76']},
 {'names': ['long, branched 2- octyldodecyl alkyl']},
 {'names': ['tris(dibenzylideneacetone)dipalladium']},
 {'names': ['Mn']},
 {'names': ['benzene']},
 {'names': ['ferrocene']},
 {'names': ['−[Eonset ferrocene + 4.8 ] V']},
 {'names': ['methyl substituted alkyl chains']},
 {'names': ['methyl']},
 {'names': ['I ds']},
 {'names': ['WC o L 2']},
 {'names': ['V t']},
 {'names': ['PBDT-TIIG-']},
 {'names': ['P3HT']},
 {'names': ['PCBM']},
 {'names': ['μ e']},
 {'names': ['f [V]']},
 {'names': ['alkyl']},
 {'nam

PCE and FF works fine, as well as other quantities end in %. For other units, further customization required

Most of properties from literature have the same layout, so if one example works, the rest of them should work too.

Any unit with simple expression (1 component) is easy to extract. Otherwise a combination is needed.

In [12]:
class Jsc(BaseModel):
    value = StringType()
    units = StringType()

Compound.jsc_pattern = ListType(ModelType(Jsc))

abbrv_prefix = (I(u'jsc') | I(u'Jsc') ).hide()
words_pref = (I(u'short') + I(u'circuit') + I(u'current') + I(u'density')).hide()
hyphanated_pref = (I(u'short-circuit') + I(u'current') + I(u'density')).hide()

prefix = abbrv_prefix | words_pref | hyphanated_pref

common_text = R('(\w+)?\D(\D+)+(\w+)?').hide()
units = (W(u'mA') + W(u'/') + W(u'cm') + W('2'))(u'units')
value = R(u'\d+(\.\d+)?')(u'value')

jsc_first= (prefix + ZeroOrMore(common_text) + value + units)(u'jsc')
jsc_second = (value + units + prefix)(u'jsc')

jsc_pattern = jsc_first | jsc_second

class JscParser(BaseParser):
    root = jsc_pattern

    def interpret(self, result, start, end):
        compound = Compound(
            jsc_pattern=[
                Jsc(
                    value=first(result.xpath('./value/text()')),
                    units=first(result.xpath('./units/text()'))
                )
            ]
        )
        yield compound

def parse_jsc(list_of_sentences):
    """ 
    Takes a list of sentences and parses for quantified PCE
    information and relationships to chemicals/chemical labels
    """

    Sentence.parsers.append(JscParser())

    cde_senteces = [Sentence(sent).records.serialize() for sent in list_of_sentences]
    return cde_senteces

In [13]:
class Mw(BaseModel):
    value = StringType()
    units = StringType()

Compound.mw_pattern = ListType(ModelType(Mw))

abbrv_prefix = (I(u'mw') | I(u'Mw')).hide()
words_pref = (I(u'molecular') + I(u'weight')).hide()
prefix = abbrv_prefix | words_pref

common_text = R('(\w+)?\D(\D+)+(\w+)?').hide()
units = (W(u'kg') + W(u"/") + W(u"mol"))(u'units')
value = R(u'\d+(\.\d+)?')(u'value')

mw_first= (prefix + ZeroOrMore(common_text) + value + units)(u'mw')
mw_second = (value + units + prefix)(u'mw')

mw_pattern = mw_first | mw_second

class MwParser(BaseParser):
    root = mw_pattern

    def interpret(self, result, start, end):
        compound = Compound(
            mw_pattern=[
                Mw(
                    value=first(result.xpath('./value/text()')),
                    units=first(result.xpath('./units/text()'))
                )
            ]
        )
        yield compound

def parse_mw (list_of_sentences):
    """ 
    Takes a list of sentences and parses for quantified PCE
    information and relationships to chemicals/chemical labels
    """

    Sentence.parsers.append(MwParser())

    cde_senteces = [Sentence(sent).records.serialize() for sent in list_of_sentences]
    return cde_senteces

In [14]:
Sentence.parsers.append(JscParser())
Paragraph.parsers.append(JscParser())

Sentence.parsers.append(MwParser())
Paragraph.parsers.append(MwParser())

#Sentence.parsers.append(PceParser())
#Paragraph.parsers.append(PceParser())

In [15]:
doc = Document(
    Heading('5,10,15,20-Tetra(4-carboxyphenyl)porphyrin (3).'),
    Paragraph('m.p. 90°C.'),
    Paragraph('pce 15 %'),
    Paragraph('ff 20 %'),
    Paragraph('voc 12 V'),
    Paragraph('Mw 12 kg/mol'),
    Paragraph('Jsc 12 mA/cm2'),
    Paragraph('Melting points were measured in Tetrahydrofuran (THF).'),
)

doc.records.serialize()

[{'names': ['Tetrahydrofuran', 'THF']},
 {'names': ['5,10,15,20-Tetra(4-carboxyphenyl)porphyrin'],
  'labels': ['3'],
  'melting_points': [{'value': '90',
    'units': '°C',
    'solvent': 'Tetrahydrofuran'}],
  'mw_pattern': [{'value': '12'}]}]

In [36]:

for file in glob.glob("*.pdf"):
    print(file)
    
    f = open(file, 'rb')
    # print(f)
    doc = Document.from_file(f)
    # print(doc)
    

paper0.pdf
<_io.BufferedReader name='paper0.pdf'>
<Document: 196 elements>
paper1.pdf
<_io.BufferedReader name='paper1.pdf'>
<Document: 52 elements>
paper2.pdf
<_io.BufferedReader name='paper2.pdf'>
<Document: 195 elements>
paper3.pdf
<_io.BufferedReader name='paper3.pdf'>


KeyboardInterrupt: 

## Populating Database 

Saving unstructured information (texts, journals, etc.) to database. In this case we used mongoDB. We need to populate the database after reading processed documents as shown above.

In [42]:
from pymongo import MongoClient
import sys
import glob, os

path = '../test_articles/'
os.chdir(path)

client = MongoClient()
client = MongoClient('localhost', 27017)
db = client.pymongo_test

articles = db.articles
article_data = {
    'title': file,
    'content': doc,
    'author': 'Sam'
}
result = articles.insert_one(article_data)
print('One post: {0}'.format(result.inserted_id))

bills_post = posts.find_one({'author': 'Scott'})
print(bills_post)

One post: 5d9cb9d993c8fa5824c6a68d
{'_id': ObjectId('5d9cb9b893c8fa5824c6a687'), 'title': 'Python and MongoDB', 'content': 'PyMongo is fun, you guys', 'author': 'Scott'}
