# Keep only those sentences or PMIDs that mention the food or compound name

Use Jaro-Winkler distance and ngrams to filter the sentences

Depends on hits and pmids pickle files that are created by download_food_ids.ipynb.

In [6]:
from __future__ import print_function
import sys,pickle, re, jellyfish
import pubmed.utils as pb
from collections import defaultdict
from nltk.tokenize import sent_tokenize


# load temporary results
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# were produced by a trusted run of download_food_ids.ipynb.
# `with` closes each file handle as soon as its contents have been read.
with open("hits_0_4859.pickle", "rb") as hits_file:
    hits = pickle.load(hits_file)
with open("pmids_0_4859.pickle", "rb") as pmids_file:
    pmids = pickle.load(pmids_file)

def splitSentences(abstract):
    ''' Split an abstract into sentences with NLTK's sent_tokenize and
        return the tokenizer's result unchanged.
    '''
    return sent_tokenize(abstract)

def flat_map(sentences):
    ''' Flatten one level of nesting: concatenate the items of every inner
        iterable of `sentences` into a single new list, keeping their order.
    '''
    flattened = []
    for group in sentences:
        flattened.extend(group)
    return flattened

def findFood(sentence, foods, limit=0.95):
    ''' Report whether any food name approximately occurs in the sentence.

        Every food name is compared, case-insensitively, with each n-gram of
        the sentence using jellyfish's Jaro-Winkler score, where n is the
        number of words in the food name (capped at 3).

        sentence: text to search
        foods:    iterable of food (or compound) names
        limit:    score that a match must exceed; defaults to 0.95, the
                  threshold that was previously hard-coded here

        Returns True as soon as one n-gram scores > limit for any food,
        otherwise False (including when `foods` is empty).
    '''
    for f in foods:
        n = min(3, len(f.split()))  # Assuming max as trigram
        # Note: when using jaro_winkler, need to convert into unicode format.
        # The food name is loop-invariant, so prepare it once per food.
        food_name = u"{}".format(f.lower())
        # Note: punctuation at the end of the sentence stays attached to the
        # last word. Accepted for now, on the assumption that the score of
        # such an n-gram still exceeds the limit.
        for ngram in find_ngrams(sentence, n):
            if jellyfish.jaro_winkler(food_name, u"{}".format(ngram.lower())) > limit:
                # One hit is enough; no need to scan the remaining foods.
                return True
    return False

def findDrug(sentence, dglist, limit):
    ''' Report whether any drug name approximately occurs in the sentence.

        Every drug name is compared, case-insensitively, with each n-gram of
        the sentence using jellyfish's Jaro-Winkler score, where n is the
        number of words in the drug name (capped at 5).

        sentence: text to search
        dglist:   iterable of drug names
        limit:    score that a match must exceed

        Returns True as soon as one n-gram scores > limit for any drug,
        otherwise False (including when `dglist` is empty).
    '''
    for drug in dglist:
        n = min(5, len(drug.split()))  # Assuming max as 5-gram
        # Note: when using jaro_winkler, need to convert into unicode format.
        # The drug name is loop-invariant, so prepare it once per drug.
        drug_name = u"{}".format(drug.lower())
        # Note: punctuation at the end of the sentence stays attached to the
        # last word. Accepted for now, on the assumption that the score of
        # such an n-gram still exceeds the limit.
        for ngram in find_ngrams(sentence, n):
            ngram_distance = jellyfish.jaro_winkler(drug_name, u"{}".format(ngram.lower()))
            if ngram_distance > limit:
                # One hit is enough; no need to scan the remaining drugs.
                return True
    return False


def find_ngrams(sentence, n):
    ''' Build the word n-grams of a sentence.

        The sentence is split on whitespace; each run of n consecutive words
        is returned as one space-separated string, in sentence order. The
        result is empty when the sentence has fewer than n words or n is 0.
    '''
    tokens = sentence.split()
    # shifted[k] is the token list starting at word k; zipping the shifted
    # lists yields every window of n consecutive words.
    shifted = [tokens[offset:] for offset in range(n)]
    return [' '.join(window) for window in zip(*shifted)]



## Build food - id - sentence dictionary
Keep only those foods or compounds that have fewer than 100 PMIDs (searches returning more than that are likely not valid results).

In [7]:
# Map each food/compound name to the PMID list returned for its search term.
foods_pmids = defaultdict(list)
# food names are between quotation marks in the search term
RE = re.compile(r'"(.*?)"')
for fc, pmid in zip(hits, pmids):
    # `pmid` is the whole collection of ids found for search term `fc`;
    # collections of 100 or more ids are dropped.
    if len(pmid) < 100:
        quoted = RE.search(fc)
        if quoted is None:
            # No quoted name in this search term: skip it rather than
            # crash with an IndexError.
            continue
        # group(1) is the first quoted name in the search term
        food_or_compound = quoted.group(1)
        foods_pmids[food_or_compound] = pmid

In [10]:
# filtered_sentences[food][pmid] -> sentences that mention both the food and
# one of the drug names below.
filtered_sentences = defaultdict(lambda : defaultdict(list))
N_FOODS = len(foods_pmids)
# Drug names searched for in every sentence and the score they must exceed.
DRUG_NAMES = ['angiotensin-converting enzyme', 'ACE']
DRUG_LIMIT = 0.9

# The per-food id list is called `food_pmids` so that it does not overwrite
# the module-level `pmids` loaded from the pickle file.
for i, (food, food_pmids) in enumerate(foods_pmids.items()):
    # start dynamic iteration counter to track where we are
    print('\rFood or compound item:', i+1, '/', N_FOODS,end='')
    sys.stdout.flush()
    try:
        abstracts = pb.PubMedQuery.abstract_getter(food_pmids)
        sentences = [splitSentences(abstract) for abstract in abstracts.values()]
        # NOTE(review): this pairing assumes abstracts.values() comes back in
        # the same order as food_pmids - confirm against abstract_getter.
        for pmid, sents in zip(food_pmids, sentences):
            # Lists instead of filter(): on Python 3 filter() returns an
            # iterator, which has no len().
            food_filtered = [sent for sent in sents if findFood(sent, [food])]
            food_drug_filtered = [sent for sent in food_filtered
                                  if findDrug(sent, DRUG_NAMES, DRUG_LIMIT)]
            if food_drug_filtered:
                filtered_sentences[food][pmid] = food_drug_filtered
    except Exception as err:
        # Best effort: one failing food must not stop the whole run, but the
        # failure is reported instead of being dropped silently.
        # KeyboardInterrupt is not caught, so the run can still be aborted.
        print('\nSkipped', repr(food), 'because of', repr(err), file=sys.stderr)


    


Food or compound item: 557 / 557

In [12]:
# Show the sentences kept for the food 'ginger' under the PMID key '23875904'
filtered_sentences[u'ginger'][u'23875904']

[u"Rubra) and white ginger (Zingiber officinale Roscoe) were prepared and the ability of the extracts to inhibit ACE along with Fe(2+)- and SNP-induced lipid peroxidation was determined in rat's heart in vitro.",
 u'However, red ginger extract (EC50=27.5 \u03bcg/mL) had a significantly (P<.05) higher inhibitory effect on ACE than white ginger extract (EC50=87.0 \u03bcg/mL).',
 u'This suggests that the possible mechanism through which ginger exerts its antihypertensive properties may be through inhibition of ACE activity and prevention of lipid peroxidation in the heart.',
 u'Furthermore, red ginger showed stronger inhibition of ACE than white ginger.']