In [99]:
import pandas as pd
import numpy as np
import os
import regex as re
import ast
from collections import defaultdict
from snorkel.labeling import labeling_function

In [100]:
# Value a labeling function in this notebook returns when it declines to label a data point.
ABSTAIN = -1

In [101]:
# puts chemicals separated by or for regex structures
def helper_sep_chems_with_or(chemicals):
    """Build a regex alternation body ("a|b|c") from chemical names.

    Every name is passed through ``re.escape`` so characters such as "+",
    "(" or "." are matched literally. Empty names are skipped: an empty
    alternative would match the empty string and make the chemical slot of
    a pattern match anything.

    Args:
        chemicals: iterable of chemical-name strings.

    Returns:
        The escaped names joined with "|", or "" when there is no
        non-empty name. The caller adds the surrounding parentheses.
    """
    return "|".join(re.escape(chem) for chem in chemicals if chem)

TESTING THE REGEX LFS

In [102]:
# includes_interconvert_sym
# Label True when the sentence text contains an interconversion arrow
# ("<->" or "<-->"); otherwise abstain.
#
# POTENTIAL CHANGE: symbol must be inbetween two chemicals?
@labeling_function()
def includes_interconvert_sym(x):
    """Return True if x[0] contains "<->" or "<-->", else ABSTAIN.

    Only x[0] (the sentence text) is inspected.
    """
    sentence = x[0]
    has_arrow = any(arrow in sentence for arrow in ("<->", "<-->"))
    return True if has_arrow else ABSTAIN

In [103]:
# Smoke test for includes_interconvert_sym; the expected result follows each call.
print(includes_interconvert_sym(['chemical1 <-> chemical2']))   # True
print(includes_interconvert_sym(['chemical1 <--> chemical2']))  # True
print(includes_interconvert_sym(['chemical1 --> chemical2']))   # -1 (one-directional arrow)
print(includes_interconvert_sym(['chemical1 <-- chemical2']))   # -1 (one-directional arrow)
# This slot used to repeat the '<--' case above; it now covers a sentence with no arrow.
print(includes_interconvert_sym(['chemical1 chemical2']))       # -1 (no arrow at all)

True
True
-1
-1
-1


In [106]:
# Phil's version cleaned
# structure_jtsui_pattern_1
# MODIFIED STRUCTURE AND CHANGED ADJ NUM TO 0-3
# If part of the sentence contains the specific structure
# [trigger1] <0,3> chemical [transition] <0,3> chemical, we label True

# Transition words and trigger stems as regex alternation bodies; the *_p
# variants wrap them in a group. TRANS_p is also used by structure_jtsui_pattern_2.
TRANS = "from|to|into|by|are|yield"
TRIG1 = "phosphoryl|condens|hydrolys|metabol|reduc|conver|produc|form|oxid|transform|bioconver|synthes|react|interconver"
TRANS_p = "(" + TRANS + ")"
TRIG1_p = "(" + TRIG1 + ")"

@labeling_function()
def structure_jtsui_pattern_1(x):
    """Label True on "[trigger1] <0,3> chemical [transition] <0,3> chemical".

    Args:
        x: indexable where x[0] is the sentence text and x[1] is an
            iterable of chemical names to look for.

    Returns:
        True when the structure is found in x[0], otherwise ABSTAIN.
        Also ABSTAIN when x[1] yields no usable chemical name.
    """
    chemicals = helper_sep_chems_with_or(x[1])
    if not chemicals:
        # With no names the group would be "()", which matches the empty
        # string, so the chemical slots would no longer require a chemical.
        return ABSTAIN
    chemicals_p = "(" + chemicals + ")"

    structure = (
        r"\b" + TRIG1_p                      # trigger stem at the start of a word
        + r"\w*(\s\w*){0,3}\s" + chemicals_p  # trigger suffix, <0,3> filler words, first chemical
        + r"\s" + TRANS_p                     # transition word right after the first chemical
        + r"(\s\w*){0,3}\s" + chemicals_p     # <0,3> filler words, second chemical
    )

    if re.search(structure, x[0]):
        return True
    return ABSTAIN

In [107]:
# Test structure_jtsui_pattern_1
# Each input is [sentence, [chemical names]]; the trailing comment is the expected
# result (ABSTAIN prints as -1).
print("Test JTSUI 1")
print(structure_jtsui_pattern_1(['As for flux through serine hydroxymethyltransferase and GCS, converting serine to glycine occurred fairly rapidly, followed by GCS-mediated slow decarboxylation of the accumulated glycine',['serine', 'glycine']])) # True
print(structure_jtsui_pattern_1(['As for flux through serine hydroxymethyltransferase and GCS, converting happy serine to sad glycine occurred fairly rapidly, followed by GCS-mediated slow decarboxylation of the accumulated glycine',['serine', 'glycine']])) # True
# More than three filler words between trigger and chemical, so no match.
print(structure_jtsui_pattern_1(['As for flux through serine hydroxymethyltransferase and GCS, the conversion a b c d e of serine f g h i to j k l glycine occurred fairly rapidly, followed by GCS-mediated slow decarboxylation of the accumulated glycine',['serine', 'glycine']])) # ABSTAIN
print(structure_jtsui_pattern_1(['condense chemical1 yield chemical2',['chemical1', 'chemical2']])) # True
# No trigger word before the first chemical.
print(structure_jtsui_pattern_1(['chemical1 yield Phillium',['chemical1', 'Phillium']])) # ABSTAIN
# No transition word between the two chemicals.
print(structure_jtsui_pattern_1(['condense chemical1 chemical2',['chemical1', 'chemical2']])) # ABSTAIN
print(structure_jtsui_pattern_1(['conver chemical1 to chemical2',['chemical1', 'chemical2']])) # True


Test JTSUI 1
True
True
-1
True
-1
-1
True


In [109]:
# PHIL'S VERSIONS
# structure_jtsui_pattern_2
# If part of the sentence contains the specific structure
# chemical <0,1> [trigger2] <0,1> [transition] <0,1> chemical, we label True
TRIG2 = 'conver|oxid|produc|interconver'
TRIG2_p = "(" + TRIG2 + ")"
@labeling_function()
def structure_jtsui_pattern_2(x):
    """Label True on "chemical <0,1> [trigger2] <0,1> [transition] <0,1> chemical".

    Args:
        x: indexable where x[0] is the sentence text and x[1] is an
            iterable of chemical names to look for.

    Returns:
        True when the structure is found in x[0], otherwise ABSTAIN.
        Also ABSTAIN when x[1] yields no usable chemical name.
    """
    chemicals = helper_sep_chems_with_or(x[1])
    if not chemicals:
        # With no names the group would be "()", which matches the empty
        # string, so the pattern would fire without any chemical present.
        return ABSTAIN
    chemicals_p = "(" + chemicals + ")"

    # The leading guard is (?<!\w) rather than \b: \b needs a word character
    # on one side, so it rejects names that start with "(" or "-" when they
    # follow a space or open the sentence. (?<!\w) behaves the same as \b for
    # names starting with a word character.
    structure = (
        r"(?<!\w)" + chemicals_p               # first chemical
        + r"(\s\w*){0,1}\s\w*" + TRIG2_p       # <0,1> filler word, trigger stem (prefix allowed)
        + r"\w*(\s\w*){0,1}\s" + TRANS_p       # trigger suffix, <0,1> filler word, transition word
        + r"(\s\w*){0,1}\s" + chemicals_p      # <0,1> filler word, second chemical
    )

    if re.search(structure, x[0]):
        return True
    return ABSTAIN


# structure_jtsui_pattern_3
# If part of the sentence contains the specific structure
# chemical [trigger3] <0,1> chemical, we label True
TRIG3 = 'yield|generat'
TRIG3_p = "(" + TRIG3 + ")"
@labeling_function()
def structure_jtsui_pattern_3(x):
    """Label True on "chemical [trigger3] <0,1> chemical".

    The trigger word must be the word directly after the first chemical,
    separated by a single whitespace character.

    Args:
        x: indexable where x[0] is the sentence text and x[1] is an
            iterable of chemical names to look for.

    Returns:
        True when the structure is found in x[0], otherwise ABSTAIN.
        Also ABSTAIN when x[1] yields no usable chemical name.
    """
    chemicals = helper_sep_chems_with_or(x[1])
    if not chemicals:
        # With no names the group would be "()", which matches the empty
        # string, so e.g. "to yield pyruvate" would be labeled True.
        return ABSTAIN
    chemicals_p = "(" + chemicals + ")"

    # (?<!\w) replaces \b so names starting with a non-word character such
    # as "(" are still found after a space or at the start of the sentence.
    structure = (
        r"(?<!\w)" + chemicals_p               # first chemical
        + r"\s\w*" + TRIG3_p                   # one whitespace, trigger stem (prefix allowed)
        + r"\w*(\s\w*){0,1}\s" + chemicals_p   # trigger suffix, <0,1> filler word, second chemical
    )

    if re.search(structure, x[0]):
        return True
    return ABSTAIN



# structure_jtsui_pattern_4
# If part of the sentence contains the specific structure
# [trigger4] <0,1> chemical, we label True
TRIG4 = 'conver|interconver'
TRIG4_p = "(" + TRIG4 + ")"
@labeling_function()
def structure_jtsui_pattern_4(x):
    """Label True on "[trigger4] <0,1> chemical".

    Args:
        x: indexable where x[0] is the sentence text and x[1] is an
            iterable of chemical names to look for.

    Returns:
        True when the structure is found in x[0], otherwise ABSTAIN.
        Also ABSTAIN when x[1] yields no usable chemical name.
    """
    chemicals = helper_sep_chems_with_or(x[1])
    if not chemicals:
        # With no names the group would be "()", which matches the empty
        # string, so any "convert..." followed by another word would be
        # labeled True.
        return ABSTAIN
    chemicals_p = "(" + chemicals + ")"

    # The trailing guard is (?!\w) rather than \b: \b after a name ending in
    # a non-word character (e.g. "NAD+") fails when a space or the end of the
    # sentence follows. (?!\w) behaves the same as \b for names ending in a
    # word character.
    structure = (
        r"\b\w*" + TRIG4_p                     # trigger stem, optional word prefix
        + r"\w*(\s\w*){0,1}\s" + chemicals_p   # trigger suffix, <0,1> filler word, chemical
        + r"(?!\w)"                            # chemical must not run on into a longer word
    )

    if re.search(structure, x[0]):
        return True
    return ABSTAIN


# structure_jtsui_pattern_5
# If part of the sentence contains the specific structure
# chemical [transition5] <0,1> [trigger5] <0,1> chemical, we label True
TRIG5 = 'produc|metaboli'
TRIG5_p = "(" + TRIG5 + ")"
TRANS5 = 'is|are'
TRANS5_p = "(" + TRANS5 + ")"
@labeling_function()
def structure_jtsui_pattern_5(x):
    """Label True on "chemical [transition5] <0,1> [trigger5] <0,1> chemical".

    Args:
        x: indexable where x[0] is the sentence text and x[1] is an
            iterable of chemical names to look for.

    Returns:
        True when the structure is found in x[0], otherwise ABSTAIN.
        Also ABSTAIN when x[1] yields no usable chemical name.
    """
    chemicals = helper_sep_chems_with_or(x[1])
    if not chemicals:
        # With no names the group would be "()", which matches the empty
        # string, so the pattern would fire without any chemical present.
        return ABSTAIN
    chemicals_p = "(" + chemicals + ")"

    # (?<!\w) replaces \b so names starting with a non-word character such
    # as "(" are still found after a space or at the start of the sentence.
    # NOTE(review): the \w* on both sides of the transition also accepts
    # words that merely contain "is"/"are" (e.g. "this") - confirm intended.
    structure = (
        r"(?<!\w)" + chemicals_p                  # first chemical
        + r"\s\w*" + TRANS5_p                     # one whitespace, word containing is/are
        + r"\w*(\s\w*){0,1}\s\w*" + TRIG5_p       # <0,1> filler word, trigger stem (prefix allowed)
        + r"\w*(\s\w*){0,1}\s" + chemicals_p      # trigger suffix, <0,1> filler word, second chemical
    )

    if re.search(structure, x[0]):
        return True
    return ABSTAIN


In [110]:
# Test structure_jtsui_pattern_2
# Each input is [sentence, [chemical names]]; the trailing comment is the result
# the call prints (ABSTAIN prints as -1).
print("Test JTSUI 2")
print(structure_jtsui_pattern_2(['chemical1 adj conver adj into adj chemical2', ['chemical1', 'chemical2']])) # True
print(structure_jtsui_pattern_2(['Phillium adj conversion adj into adj chemical2', ['Phillium', 'chemical2']])) # True
print(structure_jtsui_pattern_2(['chemical1 conversion into chemical2', ['chemical1', 'chemical2']])) # True
print(structure_jtsui_pattern_2(['conversion into chemical2', ['chemical1', 'chemical2']])) # -1
print(structure_jtsui_pattern_2(['adj conversion adj into adj chemical2', ['chemical1', 'chemical2']])) # -1 (no leading chemical)
print(structure_jtsui_pattern_2(['chemical1 adj adj into adj chemical2', ['chemical1', 'chemical2']])) # -1 (no trigger word)
print(structure_jtsui_pattern_2(['chemical1 adj conver adj into adj', ['chemical1', 'chemical2']])) # -1 (no trailing chemical)
print()


# Test structure_jtsui_pattern_3
print("Test JTSUI 3")
print(structure_jtsui_pattern_3(['chemical1 yield adj Phillium', ['chemical1', 'Phillium']])) # True
print(structure_jtsui_pattern_3(['chemical1 yield chemical2', ['chemical1', 'chemical2']])) # True
print()


# Test structure_jtsui_pattern_4
print("Test JTSUI 4")
print(structure_jtsui_pattern_4(['conver adj chemical1', ['chemical1']])) # True
print(structure_jtsui_pattern_4(['interconverting adj chemical1', ['chemical1']])) # True
print(structure_jtsui_pattern_4(['reconverting adj chemical1', ['chemical1']])) # True
print(structure_jtsui_pattern_4(['reconverting Phillium', ['Phillium']])) # True
print()

# Test structure_jtsui_pattern_5
print("Test JTSUI 5")
print(structure_jtsui_pattern_5(['Phillium are adj producing adj chemical2', ['Phillium', 'chemical2']])) # True
print(structure_jtsui_pattern_5(['chemical1 is remetabolize adj chemical2', ['chemical1', 'chemical2']])) # True
print(structure_jtsui_pattern_5(['chemical1 is unproducing adj chemical1', ['chemical1', 'chemical2']])) # True
print(structure_jtsui_pattern_5(['chemical1 is producing chemical2', ['chemical1', 'chemical2']])) # True
print(structure_jtsui_pattern_5(['chemical1 is adj chemical2', ['chemical1', 'chemical2']])) # -1
print(structure_jtsui_pattern_5(['chemical1 is unproducing adj', ['chemical1', 'chemical2']])) # -1

Test JTSUI 2
True
True
True
-1
-1
-1
-1

Test JTSUI 3
True
True

Test JTSUI 4
True
True
True
True

Test JTSUI 5
True
True
True
True
-1
-1


In [111]:
# What are these for?
# Probes of structure_jtsui_pattern_3 on longer sentences; every call prints -1 in the
# recorded output. The pattern needs the yield/generat word directly after the first
# chemical, so e.g. "l-threonine to yield pyruvate" (with "to" in between) does not match.
print(structure_jtsui_pattern_3(['l-serine dehydratase sdh, a member of the beta-family of pyridoxal phosphate-dependent plp enzymes, catalyzes the deamination of l-serine and l-threonine to yield pyruvate or 2-oxobutyrate',['l-threonine', 'pyruvate', 'l-serine', '2-oxobutyrate']]))
print(structure_jtsui_pattern_3(['l-threonine to yield pyruvate', ['l-threonine', 'pyruvate']]))
print(structure_jtsui_pattern_3(['l-serine dehydratase sdh, a member of the beta-family of pyridoxal phosphate-dependent plp enzymes, catalyzes the deamination of l-serine and l-threonine to generate pyruvate or 2-oxobutyrate',['l-threonine', 'pyruvate', 'l-serine', '2-oxobutyrate']]))
print(structure_jtsui_pattern_3(['the mitochondrial respiratory chain complex iv cytochrome c oxidase is a multi-subunit enzyme that transfers electrons from cytochrome c to molecular oxygen, yielding water',['oxygen']]))
print(structure_jtsui_pattern_3(['As for flux through serine hydroxymethyltransferase and GCS, the conversion a b c d e of serine f g h i to j k l glycine occurred fairly rapidly, followed by GCS-mediated slow decarboxylation of the accumulated glycine',['serine', 'glycine']]))
print(structure_jtsui_pattern_3(['chemical1 <--> chemical2',['chemical1', 'chemical2']]))

-1
-1
-1
-1
-1
-1


In [112]:
# Directory holding the BRENDA export. Set the CHERMIT_DATA_DIR environment
# variable to point at your copy; the fallback is the original hard-coded
# path, which only exists on the original author's machine.
DATA_DIR = os.environ.get("CHERMIT_DATA_DIR", "/Users/maxlee/chermit data")

# BRENDA reactions table (one row per recorded reaction, with DOI column).
rxns = pd.read_csv(os.path.join(DATA_DIR, "brenda_reactions_with_dois.csv"))

FileNotFoundError: [Errno 2] No such file or directory: '/Users/maxlee/chermit data/brenda_reactions_with_dois.csv'

In [27]:
# Preview the first rows of the BRENDA reactions table loaded from brenda_reactions_with_dois.csv.
rxns.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,literatureProducts,literatureSubstrates,organismSubstrates,commentarySubstrates,reversibility,organismProducts,products,commentaryProducts,substrates,ecNumber,pubmedId,doi
0,0,0,,695811.0,Thermus sp.,1% activity compared to cyclohexanone,?,,? + NAD+,,(-)-carvone + NADH + H+,1.1.1.1,18704396.0,10.1007/s00253-008-1606-z
1,1,1,,695811.0,Thermus sp.,125% activity compared to cyclohexanol,r,,(rac)-3-methylcyclohexanone + NADH + H+,163% activity compared to cyclohexanone,"(1S,3S)-3-methylcyclohexanol + NAD+",1.1.1.1,18704396.0,10.1007/s00253-008-1606-z
2,2,2,,735498.0,Yokenella sp.,"41.5% of the activity with (2E)-but-2-enal, yi...",?,,(2E)-2-methylpent-2-en-1-ol + NADP+,,(2E)-2-methylpent-2-enal + NADPH + H+,1.1.1.1,24509923.0,10.1128/AEM.03980-13
3,3,3,,735498.0,Yokenella sp.,"37.4% of the activity with (2E)-but-2-enal, yi...",?,,"(2E)-3,7-dimethylocta-2,6-dien-1-ol + NADP+",,"(2E)-3,7-dimethylocta-2,6-dienal + NADPH + H+",1.1.1.1,24509923.0,10.1128/AEM.03980-13
4,4,4,,691867.0,Homo sapiens,substrate of isozyme ADH4,r,,"(2E)-non-2-ene-1,4-diol + NAD+",,(2E)-4-hydroxynon-2-enal + NADH + H+,1.1.1.1,18505683.0,10.1016/j.cca.2008.05.001


The first labeling functions that should provide perfect results are ones that check if a substrate/product combination is already recorded in BRENDA. 

Currently this uses the brenda_reactions CSV.

In [None]:
# Word forms of "oxidize", in both -ize and -ise spellings, for the prototype oxidation LF.
# NOTE(review): no code in this part of the notebook reads this list yet - confirm where it is used.
oxidationWords = ["oxidize", "oxidise", "oxidizes", "oxidises", "oxidizing", "oxidising", 
                  "oxidized", "oxidised", "oxidation", "oxidatively", "oxidant", "oxidizer", 
                  "oxidiser", "oxidative"]

This is an early prototype for an oxidation LF. It will return TRUE if any form of the word "oxidize" appears in the sentence. This LF assumes that the sentence has a potential substrate-product pair in it.

In [34]:
# have this data one folder up
# NOTE(review): the path below has no "../" prefix, so it is resolved against the
# current working directory rather than its parent - confirm where the CSV should live.
uncleaned_df = pd.read_csv("sentence_annotations_elsevier_pmid_split6.csv")

In [35]:
# Display the raw sentence-annotation frame (the recorded output elides the middle rows with "...").
uncleaned_df

Unnamed: 0,lit_id,indices,start,end,sentence,sentence_pos,enzymes,enzyme_locations,chemical_entities_full,chemical_names,chemical_smiles,name_smile_tuples
0,10.1002/jps.20686,0,0,1297,serial JL 313843 291210 291727 291789 291928 3...,"[('serial', 'JJ'), ('JL', 'NN'), ('313843', 'C...",,[],[],,,[]
1,10.1002/jps.20686,1,1298,1324,Published by Elsevier Inc.,"[('Published', 'VBN'), ('by', 'IN'), ('Elsevie...",,[],[],,,[]
2,10.1002/jps.20686,2,1325,1345,All rights reserved.,"[('All', 'DT'), ('rights', 'NNS'), ('reserved'...",,[],[],,,[]
3,10.1002/jps.20686,3,1346,9469,KINETICANALYSESFORSPECIESDIFFERENCESINPGLYCOPR...,[('KINETICANALYSESFORSPECIESDIFFERENCESINPGLYC...,,[],"[{'text': 'Diltiazem', 'start': 1718, 'end': 1...","Diltiazem,, Cyclosporin%20A,, Dexamethasone",COc1ccc(cc1)[C@@H]2Sc3ccccc3N(CCN(C)C)C(=O)[C@...,"[('Diltiazem', 'COc1ccc(cc1)[C@@H]2Sc3ccccc3N(..."
4,10.1002/jps.20686,4,9470,9599,Immunoblot analyses of P-gp expressed in MDR1 ...,"[('Immunoblot', 'NN'), ('analyses', 'NNS'), ('...",,[],"[{'text': 'H241', 'start': 9594, 'end': 9598, ...",H241,C[C@]12C[C@](O)(C[C@@H]1CC[C@@H]3[C@@H]2CC[C@@...,"[('H241', 'C[C@]12C[C@](O)(C[C@@H]1CC[C@@H]3[C..."
...,...,...,...,...,...,...,...,...,...,...,...,...
952542,10.1263/jbb.99.623,154,29565,29683,A novel ATP regeneration system using polyphos...,"[('A', 'DT'), ('novel', 'JJ'), ('ATP', 'NN'), ...","phosphotransferase, kinase","[('phosphotransferase', 9, 9), ('kinase', 12, ...","[{'text': 'ATP', 'start': 29573, 'end': 29576,...",ATP,Nc1ncnc2n(cnc12)C3OC(CO[P](O)(=O)O[P](O)(=O)O[...,"[('ATP', 'Nc1ncnc2n(cnc12)C3OC(CO[P](O)(=O)O[P..."
952543,10.1263/jbb.99.623,155,29684,29723,91 2001 557 563 20 Fujio T. Maruyama A.,"[('91', 'CD'), ('2001', 'CD'), ('557', 'CD'), ...",,[],[],,,[]
952544,10.1263/jbb.99.623,156,29724,29731,Mori H.,"[('Mori', 'NNP'), ('H', 'NNP'), ('.', '.')]",,[],"[{'text': 'H', 'start': 29729, 'end': 29730, '...",H,[H],"[('H', '[H]')]"
952545,10.1263/jbb.99.623,157,29732,29882,Production of useful substances by the couplin...,"[('Production', 'NN'), ('of', 'IN'), ('useful'...",,[],"[{'text': 'ATP', 'start': 29797, 'end': 29800,...","ATP,, ATP",Nc1ncnc2n(cnc12)C3OC(CO[P](O)(=O)O[P](O)(=O)O[...,"[('ATP', 'Nc1ncnc2n(cnc12)C3OC(CO[P](O)(=O)O[P..."
