# Baseline : Food-drug Interaction Project

# Step 1: Load food names or compounds into a list of unique items

### Option 1 (not used): directly load csv data in Pandas dataframe

In [1]:
import pandas as pd
import numpy as np
import json

In [46]:
db_food = pd.read_csv("data/contents copy.csv", encoding = 'utf8')

In [47]:
db_food.head()

Unnamed: 0,id,orig_food_common_name
0,1,Kiwi
1,2,Onion
2,3,Onion
3,4,Chives
4,5,Cashew


In [48]:
food = db_food["orig_food_common_name"].tolist()  # somehow, converting to a set instead of list didn't work...

In [49]:
len(food)

1048575

In [87]:
foodlist = set()

for f in food:
    for term in str(f).lower().split(","):
        if len(term) >=3:  # in case some single letter or determinant is included?
            foodlist.add(term.strip())



In [68]:
i = 0
for f in foodlist:
    print f
    i += 1
    if i >20:
        break

smart balance light buttery spread
yellow
baking chocolate
swiss chard stems [red]
low salt (includes oyster
dry mix
diet strawberry kiwi
chocolate sandwich
endive [escarole]
no cholesterol
ready -to-heat
pancakes plain
standard-type
immature seeds
chocolate cake
taco with chicken
regular (10 minute)
pepeao
broiler
wholemeal
broiled


In [69]:
len(foodlist)

6143

In [70]:
with codecs.open("foodlist.txt", "w", "utf-8") as fdlist:
    for item in foodlist:
        fdlist.write(item + "\n")

### Option 2: Using Adam's pickle file (food common name as sample)

For pickling the data from foodb.ca database, see Adam's notebook **compound_food_id.ipynb**  

As a first test, we will use only the food common name (not scientific name) only. Compounds names will be added once this test passes.

In [115]:
import pickle

# food_common.pickle: Dictionary with common English food names as keys, compounds as values
test = pickle.load(open( "data/food_common.pickle", "rb"))

In [101]:
for i, item in enumerate(test.iteritems()):
    if i == 0:
        print 'food name'
        print '-'*10
    print '{0}'.format(item[0])
    if i == 10: break

food name
----------
Oregon yampah
Okra
Black mulberry
Avocado
Parsley
Elderberry
Sugar
Sweet bay
Common bean
Fig
Lard


In [104]:
foodlist_test = set(test.keys())  # Will work with a set rather than a list. Faster search for later (hash)

In [116]:
with open("foodlist.txt", "w") as fdlist:
    for item in foodlist_test:
        fdlist.write(item + "\n")

In [132]:
# Problem: there are 2 words with unusual symbols (dragee, cupuacu) and have been modified in the file and renamed
# as "foodlisr2.txt"
# Any ideas how to deal with this without changing the file manually?? Looks like codecs does not work on pickle file

foodlist = set()

with open("foodlist2.txt", "r") as fdlist2:
    for line in fdlist2:
        foodlist.add(str(line).lower().strip())


# Step 2: Filter sentences from abstract with drug keyword and food names

Note: Test with only the abstracts' first json file. Once works, we can add all 100 remaining files.

In [147]:
import pyspark
from pyspark import SparkContext
sc = SparkContext()

In [52]:
import json
# UTF-8 support
import codecs

name = "pbabstract1.json"
with codecs.open(name,"r","utf-8") as data_file:
    data = json.load(data_file)
abstractRDD = sc.parallelize(data.values())  # To load only the values and not the key (ID number)

In [29]:
abstractRDD.take(2)

[u'Heart failure still has a significant disease burden with poor outcomes worldwide despite advances in therapy. The standard therapies have been focused on blockade of renin-angiotensin-aldosterone system with angiotensin-converting enzyme inhibitors, angiotensin receptor blockers and mineralocorticoid antagonists and the sympathetic nervous system with \u03b2-blockers. The natriuretic peptide system is a potential counter-regulatory system that promotes vasodilatation and natriuresis. Angiotensin receptor neprilysin inhibitors are a new class drug capable of blocking the renin-angiotensin-aldosterone system and enhancing the natriuretic peptide system to improve neurohormonal balance. The success of the PARADIGM-HF trial with LCZ696 and its approval for heart failure treatment is likely to generate a paradigm shift. This review summarises the current knowledge of LCZ696 with a focus on pharmacology, pharmacokinetics and pharmacodynamics, mechanisms of action, clinical efficacy and s

In [53]:
# To use PubMed API
import pubmed.utils as pb

# Split abstracts to sentences
from nltk.tokenize import sent_tokenize



def splitSentences(abstract):
    sentences = sent_tokenize(abstract)
    return sentences

In [88]:
drugkeyword = "ACEI"

In [319]:
def find_ngrams(sentence, n):
    ''' Return list of ngrams from a sentence
    '''
    words_list = sentence.split()
    ngrams = zip(*[words_list[i:] for i in range(n)])
    return [''.join([str(w)+' ' for w in ngram if type(w)==str]).strip() for ngram in ngrams]

    

#example:
string = "We are working hard on 266 project baseline"
find_ngrams(string, 3)

['We are working',
 'are working hard',
 'working hard on',
 'hard on 266',
 'on 266 project',
 '266 project baseline']

In [325]:
import jellyfish

# Method 1 - NOT USED
# NOT USED since will return True if finds a food name within a word in the sentence
# E.g.: "pie" food name and "therapies" word in sentence: return True since "pie" in "therapies"
#def includeFoodCmpd(sentence, fdlist):
#    if any(word in sentence for word in fdlist):
#        return True
#    else:
#        return False

# Method 2 - solution to method 2
def includeFoodCmpd(sentence, fdlist):
    ''' Calculates the Jaro Wrinkler distance between food name and ngrams in the sentence.
        Returns True if distance > 0.95
    '''
    result = False
    for food in fdlist:
        n = min(3, len(food.split()))  # Assuming max as trigram        
        try:
            sentence = sentence.encode("utf-8")
            sentence_ngrams = find_ngrams(sentence, n)  # Note: punctuation at end of sentence will be included with
                                                    # last word. For now ok, since the JW will still be > 0.95
            for ngram in sentence_ngrams:
                # Note: when using jaro_winkler, need to convert into unicode format
                print food, ngram, jellyfish.jaro_winkler(u"{}".format(food.lower()), u"{}".format(ngram.lower()))
                if jellyfish.jaro_winkler(u"{}".format(food.lower()), u"{}".format(ngram.lower())) > 0.95:  
                    result = True
                    break
        except:
            next

    return result

In [326]:
# Testing includeFoodCmpd function
ss = u'Panda is eating a pie'
testlist = ["hello","pie"]
includeFoodCmpd(ss, testlist)

hello Panda 0.0
hello is 0.0
hello eating 0.455555555556
hello a 0.0
hello pie 0.511111111111
pie Panda 0.511111111111
pie is 0.0
pie eating 0.5
pie a 0.0
pie pie 1.0


True

In [330]:
# Note: tried to braodcast the foodlist but got an error message when used it in below filter
# "TypeError: 'Broadcast' object is not iterable".... any idea why?
# foodlist_bcast = sc.broadcast(foodlist)

In [333]:
sentences = abstractRDD.flatMap(splitSentences) \
                       .map(lambda a: pb.ace_substitutor(a, drugkeyword)) \
                       .filter(lambda a: drugkeyword in a)\
                       .filter(lambda a: includeFoodCmpd(a, foodlist))

In [328]:
sentences.take(2)

[u'in this study, we examined the separated caseins and whey proteins of goat milk for the presence of ACEI inhibitory peptides.',
 u'digestion of isolated whey proteins and caseins of goat milk by gastric pepsin generated soluble hydrolysates exhibiting significant inhibition of ACEI compared to weak inhibition by undigested proteins.']

In [None]:
# sentences.collect()

In [337]:
s = "in this study, we examined the separated caseins and whey proteins of goat milk for the presence of ACEI inhibitory peptides."

In [338]:
def findFoodItems(sentence, foods):
    for item in foods:
        if item in sentence:
            print item

In [339]:
findFoodItems(s, foodlist)

casein
oat
whey


In [340]:
s2 = "digestion of isolated whey proteins and caseins of goat milk by gastric pepsin generated soluble hydrolysates exhibiting significant inhibition of ACEI compared to weak inhibition by undigested proteins."
findFoodItems(s2, foodlist)

casein
oat
whey


## Step 3: Classify sentences as positive or negative based on a sentiment lexicon

Sentiment lexicon used is the Harvard General Inquirer (http://www.wjh.harvard.edu/~inquirer/spreadsheet_guide.htm). It contains 1,915 positive words and 2,291 negative words and is free for research use.

In [149]:
db_sentiment = pd.read_csv("data/inquirerbasic.csv", encoding = 'utf8')

  interactivity=interactivity, compiler=compiler, result=result)


In [150]:
db_sentiment.head()

Unnamed: 0,Entry,Source,Positiv,Negativ,Pstv,Affil,Ngtv,Hostile,Strong,Power,...,Anomie,NegAff,PosAff,SureLw,If,NotLw,TimeSpc,FormLw,Othtags,Defined
0,A,H4Lvd,,,,,,,,,...,,,,,,,,,DET ART,| article: Indefinite singular article--some o...
1,ABANDON,H4Lvd,,Negativ,,,Ngtv,,,,...,,,,,,,,,SUPV,|
2,ABANDONMENT,H4,,Negativ,,,,,,,...,,,,,,,,,Noun,|
3,ABATE,H4Lvd,,Negativ,,,,,,,...,,,,,,,,,SUPV,|
4,ABATEMENT,Lvd,,,,,,,,,...,,,,,,,,,Noun,


In [153]:
# Look at all data fields available
for column in db_sentiment.columns:
    print column

Entry
Source
Positiv
Negativ
Pstv
Affil
Ngtv
Hostile
Strong
Power
Weak
Submit
Active
Passive
Pleasur
Pain
Feel
Arousal
EMOT
Virtue
Vice
Ovrst
Undrst
Academ
Doctrin
Econ@
Exch
ECON
Exprsv
Legal
Milit
Polit@
POLIT
Relig
Role
COLL
Work
Ritual
SocRel
Race
Kin@
MALE
Female
Nonadlt
HU
ANI
PLACE
Social
Region
Route
Aquatic
Land
Sky
Object
Tool
Food
Vehicle
BldgPt
ComnObj
NatObj
BodyPt
ComForm
COM
Say
Need
Goal
Try
Means
Persist
Complet
Fail
NatrPro
Begin
Vary
Increas
Decreas
Finish
Stay
Rise
Exert
Fetch
Travel
Fall
Think
Know
Causal
Ought
Perceiv
Compare
Eval@
EVAL
Solve
Abs@
ABS
Quality
Quan
NUMB
ORD
CARD
FREQ
DIST
Time@
TIME
Space
POS
DIM
Rel
COLOR
Self
Our
You
Name
Yes
No
Negate
Intrj
IAV
DAV
SV
IPadj
IndAdj
PowGain
PowLoss
PowEnds
PowAren
PowCon
PowCoop
PowAuPt
PowPt
PowDoct
PowAuth
PowOth
PowTot
RcEthic
RcRelig
RcGain
RcLoss
RcEnds
RcTot
RspGain
RspLoss
RspOth
RspTot
AffGain
AffLoss
AffPt
AffOth
AffTot
WltPt
WltTran
WltOth
WltTot
WlbGain
WlbLoss
WlbPhys
WlbPsyc
WlbPt
WlbTot
EnlGain
EnlLo

**Note:** some columns seem quite interesting for analysing relationship other than simply positive or negative sentiment (e.g. "causal", etc.). For the baseline, we will only use the "positive" and "negative" columns

In [156]:
# Filter only the words labeled positive or negative
positive = db_sentiment[db_sentiment.Positiv == "Positiv"].Entry.map(lambda x: x.lower()).tolist()
negative = db_sentiment[db_sentiment.Negativ == "Negativ"].Entry.map(lambda x: x.lower()).tolist()

In [157]:
positive[:4]

[u'abide', u'ability', u'able', u'abound']

In [158]:
negative[:4]

[u'abandon', u'abandonment', u'abate', u'abdicate']

In [159]:
# Transform list into sets for faster search
positive = set(positive)
negative = set(negative)

Below is an attempt to classify whether a sentence is positive or negative.  
Note the main weaknesses:  
1. It is "positively" biased for now since looks at the positive words first and if it finds it, then it immediately returns positive. Thus, it may not look at the entire sentence in case of both positive or negative words.  
2. Negation of a positive word is not taken into account

In [160]:
def includeSentiment(sentence, poslist, neglist): 
    ''' Classify sentence as positive or negative based on first word found
        in the lexicon
    '''
    if any(word in sentence for word in poslist):
        return ("positive", sentence)
    elif any(word in sentence for word in neglist):
        return ("negative", sentence)
    else:
        return ("neutral", sentence)

In [161]:
# Add sentiment as key in the RDD
sentiments = sentences.map(lambda a: includeSentiment(a, positive, negative))


In [341]:
# Peek at 2 first lines
sentiments.take(2)

[('positive',
  u'in this study, we examined the separated caseins and whey proteins of goat milk for the presence of ACEI inhibitory peptides.'),
 ('positive',
  u'digestion of isolated whey proteins and caseins of goat milk by gastric pepsin generated soluble hydrolysates exhibiting significant inhibition of ACEI compared to weak inhibition by undigested proteins.')]

In [342]:
# Create 2 lists of filtered sentences: one positive list and one negative list
pos_sentiments = sentiments.lookup("positive")
neg_sentiments = sentiments.lookup("negative")

In [343]:
# Examples of sentences with positive sentiment lexicon
pos_sentiments[:10]

[u'in this study, we examined the separated caseins and whey proteins of goat milk for the presence of ACEI inhibitory peptides.',
 u'digestion of isolated whey proteins and caseins of goat milk by gastric pepsin generated soluble hydrolysates exhibiting significant inhibition of ACEI compared to weak inhibition by undigested proteins.',
 u'the results introduce, for the first time, new potent ACEI-inhibitory peptides that can be released by gastric pepsin of goat milk whey and caseins and thus may pave the way for their candidacy as anti-hypertensive bioactive peptides and prevention of associated disorders.',
 u'ACEI use between 1 january 2003 and the index date were determined by the date of hospitalization for acute pancreatitis among the cases.',
 u'this meta-analysis of randomized parallel controlled trials was designed to compare the efficacy of atenolol with ACEI in changing pulse wave velocity (pwv), peripheral blood pressure and heart rate (hr) among patients with essential h

In [344]:
# Examples of sentences with negative sentiment lexicon
neg_sentiments[:2]

[u'the late-eluting fraction (f4) of either whey or caseins exhibited greater ACEI inhibition.',
 u'ACEI inhibition relies on the formation of hydrogen bonds between c-terminal residues of lentil peptides and residues of the ACEI catalytic site.']

In [345]:
def findSentiment(sentence, sentiment, poslist, neglist):
    '''Print out the lexicon word that classified the sentence as positive or negative
    '''
    if sentiment == "positive":
        lexicon = poslist
    else:
        lexicon = neglist
    for word in lexicon:
        # Note: had to use this "try/except" since there was an unicode ascii error... any ways
        # to fix this without this try/except? if left the same, then we won't be able to see
        # some sentiment word in some sentences.
        try:
            if word in sentence:
                print word
        except:
            next

In [346]:
s5 = "digestion of isolated whey proteins and caseins of goat milk by gastric pepsin generated soluble hydrolysates exhibiting significant inhibition of ACEI compared to weak inhibition by undigested proteins"
findFoodItems(s5, foodlist)
findSentiment(s5, "positive", positive, negative)

casein
oat
whey
pro
generate
significant


In [347]:
s6 = "the late-eluting fraction (f4) of either whey or caseins exhibited greater ACEI inhibition"
findFoodItems(s6, foodlist)
findSentiment(s6, "negative", positive, negative)

casein
whey
inhibit
inhibition


In [348]:
def findTags(sentence, sentiment, foods, poslist, neglist):
    ''' Returns the tags of the sentence 
        Both lexicon word that classified the sentence as positive or negative and food name
    '''
    
    sent = []
    food = []
    
    if sentiment == "positive":
        lexicon = poslist
    else:
        lexicon = neglist
    for word in lexicon:
        # Note: had to use this "try/except" since there was an unicode ascii error... any ways
        # to fix this without this try/except? if left the same, then we won't be able to see
        # some sentiment word in some sentences.
        try:
            if word in sentence:
                sent.append(word)
        except:
            next
            
    # If using simple test of if food name "in" sentence method
    #for f in foods:
    #    if f in sentence:
    #        food.append(f)
    
    # If using string distance method:
    for f in foods:
        n = min(3, len(f.split()))  # Assuming max as trigram        
        try:
            sentence = sentence.encode("utf-8")
            sentence_ngrams = find_ngrams(sentence, n)  # Note: punctuation at end of sentence will be included with
                                                    # last word. For now ok, since the JW will still be > 0.95
            for ngram in sentence_ngrams:
                # Note: when using jaro_winkler, need to convert into unicode format
                if jellyfish.jaro_winkler(u"{}".format(f.lower()), u"{}".format(ngram.lower())) > 0.95:
                    food.append(f)
                    
        except:
            next


            
    return [sent, food, sentence]

In [197]:
# Save results in a text file
# Note: this could have been also done in Spark! But felt lazy to code... feel free to try!

In [349]:
with open("data/Positive.txt", "w") as pos:
    for sentence in pos_sentiments:
        tags = findTags(sentence, "positive", foodlist, positive, negative)
        pos.writelines(str(tags)+ "\n")
    
    

In [350]:
with open("data/Negative.txt", "w") as pos:
    for sentence in pos_sentiments:
        tags = findTags(sentence, "negative", foodlist, positive, negative)
        pos.writelines(str(tags)+ "\n")

**Final Notes**  
1. In our baseline, some words like "date" that appear in a sentence will be interpreted as the fruit "date" instead of a calendar date and thus, will be filtered as outputs sentences. This can only be solved if we take into account the context of the sentence and we will need ML to model this!  
2. Sentiment analysis need a major improvement: only basing on the positive and negative words without how the food and drug are connected through these words is not a good model.