# Baseline : Food-drug Interaction Project

In [1]:
import os
import sys
spark_home = os.environ['SPARK_HOME'] = '/Users/lisabarcelo/Downloads/spark-2.0.0-bin-hadoop2.7'
if not spark_home:
    raise ValueError('SPARK_HOME enviroment variable is not set')
sys.path.insert(0,os.path.join(spark_home,'python'))
sys.path.insert(0,os.path.join(spark_home,'python/lib/py4j-0.10.1-src.zip'))
execfile(os.path.join(spark_home,'python/pyspark/shell.py'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.0.0
      /_/

Using Python version 2.7.12 (default, Jul  2 2016 17:43:17)
SparkSession available as 'spark'.


# Step 1: Load food names or compounds into a list of unique items

### Option 1 (not used): directly load csv data in Pandas dataframe

In [198]:
import pandas as pd
import numpy as np
import json
import pickle
import codecs
from collections import defaultdict
import iteritems

ImportError: No module named iteritems

In [5]:
db_food = pd.read_csv("data/contents copy.csv", encoding = 'utf8')

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
db_food.head()

Unnamed: 0,id,orig_food_common_name
0,1,Kiwi
1,2,Onion
2,3,Onion
3,4,Chives
4,5,Cashew


In [7]:
food = db_food["orig_food_common_name"].tolist()  # somehow, converting to a set instead of list didn't work...

In [8]:
len(food)

1048575

In [9]:
foodlist = set()

for f in food:
    for term in str(f).lower().split(","):
        if len(term) >=3:  # in case some single letter or determinant is included?
            foodlist.add(term.strip())



In [10]:
# i = 0
# for f in foodlist:
#     print f
#     i += 1
#     if i >20:
#         break

In [11]:
len(foodlist)

6143

In [12]:
with codecs.open("foodlist.txt", "w", "utf-8") as fdlist:
    for item in foodlist:
        fdlist.write(item + "\n")

### Option 2: Using Adam's pickle file (food common name as sample)

For pickling the data from foodb.ca database, see Adam's notebook **compound_food_id.ipynb**  

As a first test, we will use only the food common name (not scientific name) only. Compounds names will be added once this test passes.

In [13]:
import pickle

# food_common.pickle: Dictionary with common English food names as keys, compounds as values
test = pickle.load(open( "data/food_common.pickle", "rb"))

In [14]:
# for i, item in enumerate(test.iteritems()):
#     if i == 0:
#         print 'food name'
#         print '-'*10
#     print '{0}'.format(item[0])
#     if i == 10: break

In [15]:
foodlist_test = set(test.keys())  # Will work with a set rather than a list. Faster search for later (hash)

In [16]:
with open("foodlist.txt", "w") as fdlist:
    for item in foodlist_test:
        fdlist.write(item + "\n")

In [43]:
# Problem: there are 2 words with unusual symbols (dragee, cupuacu) and have been modified in the file and renamed
# as "foodlisr2.txt"
# Any ideas how to deal with this without changing the file manually?? Looks like codecs does not work on pickle file

foodlist = set()

with open("foodlist.txt", "r") as fdlist2:
    for line in fdlist2:
        foodlist.add(str(line).lower().strip())

# Step 2: Filter sentences from abstract with drug keyword and food names

Note: Test with only the abstracts' first json file. Once works, we can add all 100 remaining files.

In [18]:
# import pyspark
# from pyspark import SparkContext
# sc = SparkContext()

In [19]:
# import json
# # UTF-8 support
# import codecs

name = "pbabstract1.json"
with codecs.open(name,"r","utf-8") as data_file:
    data = json.load(data_file)
abstractRDD = sc.parallelize(data.values())  # To load only the values and not the key (ID number)

In [20]:
abstractRDD.take(2)

[u'To evaluate the economic outcomes that arose from the introduction of therapeutic reference pricing (TRP) into Slovenian practice in 2013, based on the first three therapeutic classes, namely proton-pump inhibitors (PPIs), angiotensin-converting-enzyme inhibitors (ACEIs), and lipid-lowering agents (LLAs).',
 u'National health claims data on prescription medicines from January 2011 to December 2015 were analyzed. Monthly medicine expenditure, medicine consumption, changes in medicine use, and market competition (Herfindahl-Hirschman index) were determined to assess the TRP impact on market dynamics. Interrupted time series analysis was used to assess the TRP cost-saving potential.']

In [21]:
# To use PubMed API
import pubmed.utils as pb

# Split abstracts to sentences
from nltk.tokenize import sent_tokenize

def splitSentences(abstract):
    sentences = sent_tokenize(abstract)
    return sentences

In [22]:
drugkeyword = "ACEI"

In [23]:
def find_ngrams(sentence, n):
    ''' Return list of ngrams from a sentence
    '''
    words_list = sentence.split()
    ngrams = zip(*[words_list[i:] for i in range(n)])
    return [''.join([str(w)+' ' for w in ngram if type(w)==str]).strip() for ngram in ngrams]


#example:
string = "We are working hard on 266 project baseline"
find_ngrams(string, 3)

['We are working',
 'are working hard',
 'working hard on',
 'hard on 266',
 'on 266 project',
 '266 project baseline']

In [25]:
import jellyfish

# Method 1 - NOT USED
# NOT USED since will return True if finds a food name within a word in the sentence
# E.g.: "pie" food name and "therapies" word in sentence: return True since "pie" in "therapies"
#def includeFoodCmpd(sentence, fdlist):
#    if any(word in sentence for word in fdlist):
#        return True
#    else:
#        return False

# Method 2 - solution to method 2
def includeFoodCmpd(sentence, fdlist):
    ''' Calculates the Jaro Wrinkler distance between food name and ngrams in the sentence.
        Returns True if distance > 0.95
    '''
    result = False
    for food in fdlist:
        n = min(3, len(food.split()))  # Assuming max as trigram        
        try:
            sentence = sentence.encode("utf-8")
            sentence_ngrams = find_ngrams(sentence, n)  # Note: punctuation at end of sentence will be included with
                                                    # last word. For now ok, since the JW will still be > 0.95
            for ngram in sentence_ngrams:
                # Note: when using jaro_winkler, need to convert into unicode format
                print food, ngram, jellyfish.jaro_winkler(u"{}".format(food.lower()), u"{}".format(ngram.lower()))
                if jellyfish.jaro_winkler(u"{}".format(food.lower()), u"{}".format(ngram.lower())) > 0.95:  
                    result = True
                    break
        except:
            next

    return result

In [26]:
# Testing includeFoodCmpd function
ss = u'Panda is eating a pie'
testlist = ["hello","pie"]
includeFoodCmpd(ss, testlist)

hello Panda 0.0
hello is 0.0
hello eating 0.455555555556
hello a 0.0
hello pie 0.511111111111
pie Panda 0.511111111111
pie is 0.0
pie eating 0.5
pie a 0.0
pie pie 1.0


True

In [27]:
# Note: tried to braodcast the foodlist but got an error message when used it in below filter
# "TypeError: 'Broadcast' object is not iterable".... any idea why?
# foodlist_bcast = sc.broadcast(foodlist)

In [45]:
sentences = abstractRDD.flatMap(splitSentences) \
                       .map(lambda a: pb.ace_substitutor(a, drugkeyword)) \
                       .filter(lambda a: drugkeyword in a)\
                       .filter(lambda a: includeFoodCmpd(a, foodlist))

In [46]:
sentences.take(2)

[u'this meta-analysis of randomized parallel controlled trials was designed to compare the efficacy of atenolol with ACEI in changing pulse wave velocity (pwv), peripheral blood pressure and heart rate (hr) among patients with essential hypertension.',
 u'using the ualdo:c and a relatively stringent definition of abt, it appears that incomplete raas blockade is common in dogs with mmvd receiving an ACEI.']

In [47]:
sentences.collect()

[u'this meta-analysis of randomized parallel controlled trials was designed to compare the efficacy of atenolol with ACEI in changing pulse wave velocity (pwv), peripheral blood pressure and heart rate (hr) among patients with essential hypertension.',
 u'using the ualdo:c and a relatively stringent definition of abt, it appears that incomplete raas blockade is common in dogs with mmvd receiving an ACEI.',
 u'we investigated the molecular mechanisms involved in the ACEI (ACEI) inhibition by (-)-epigallocatechin-3-gallate (egcg), a major tea catechin.',
 u'ACEI/arb use is common in patients initiating pd in the u.s. but was not associated with a lower risk of anuria.']

In [48]:
s = "in this study, we examined the separated caseins and whey proteins of goat milk for the presence of ACEI inhibitory peptides."

In [163]:
def findFoodItems(sentence, foods):
    for item in foods:
        if item in sentence:
            print item

In [164]:
findFoodItems(s, foodlist)

casein
oat
whey


In [51]:
s2 = "digestion of isolated whey proteins and caseins of goat milk by gastric pepsin generated soluble hydrolysates exhibiting significant inhibition of ACEI compared to weak inhibition by undigested proteins."
findFoodItems(s2, foodlist)

casein
oat
whey


## Step 3: Classify sentences as positive or negative based on a sentiment lexicon

Sentiment lexicon used is the Harvard General Inquirer (http://www.wjh.harvard.edu/~inquirer/spreadsheet_guide.htm). It contains 1,915 positive words and 2,291 negative words and is free for research use.

In [52]:
db_sentiment = pd.read_csv("data/inquirerbasic.csv", encoding = 'utf8')

  interactivity=interactivity, compiler=compiler, result=result)


In [53]:
db_sentiment.head()

Unnamed: 0,Entry,Source,Positiv,Negativ,Pstv,Affil,Ngtv,Hostile,Strong,Power,...,Anomie,NegAff,PosAff,SureLw,If,NotLw,TimeSpc,FormLw,Othtags,Defined
0,A,H4Lvd,,,,,,,,,...,,,,,,,,,DET ART,| article: Indefinite singular article--some o...
1,ABANDON,H4Lvd,,Negativ,,,Ngtv,,,,...,,,,,,,,,SUPV,|
2,ABANDONMENT,H4,,Negativ,,,,,,,...,,,,,,,,,Noun,|
3,ABATE,H4Lvd,,Negativ,,,,,,,...,,,,,,,,,SUPV,|
4,ABATEMENT,Lvd,,,,,,,,,...,,,,,,,,,Noun,


In [87]:
#To be continued... Notice that we do have some relationship words that are positive AND causaul
db_sentiment[['Entry','Positiv','Causal']].sort_values(by=['Positiv','Causal']).head()

Unnamed: 0,Entry,Positiv,Causal
95,ACCOUNTABLE,Positiv,Causal
3973,FEASIBLE,Positiv,Causal
4018,FERTILE,Positiv,Causal
5290,IMPETUS,Positiv,Causal
5412,INDICATIVE,Positiv,Causal


In [82]:
print "Number of Positive Words: ", db_sentiment['Positiv'].value_counts()
print "Number of Negative Words: ", db_sentiment['Negativ'].value_counts()
print "Number of Causaul Words: ", db_sentiment['Causal'].value_counts()

 Number of Positive Words:  Positiv    1915
Name: Positiv, dtype: int64
Number of Negative Words:  Negativ    2291
Name: Negativ, dtype: int64
Number of Causaul Words:  Causal    112
Name: Causal, dtype: int64


In [59]:
# # Look at all data fields available
# for column in db_sentiment.columns:
#     print column

**Note:** some columns seem quite interesting for analysing relationship other than simply positive or negative sentiment (e.g. "causal", etc.). For the baseline, we will only use the "positive" and "negative" columns

In [83]:
# Filter only the words labeled positive or negative
positive = db_sentiment[db_sentiment.Positiv == "Positiv"].Entry.map(lambda x: x.lower()).tolist()
negative = db_sentiment[db_sentiment.Negativ == "Negativ"].Entry.map(lambda x: x.lower()).tolist()

In [84]:
positive[:4]

[u'abide', u'ability', u'able', u'abound']

In [85]:
negative[:4]

[u'abandon', u'abandonment', u'abate', u'abdicate']

In [86]:
# Transform list into sets for faster search
positive = set(positive)
negative = set(negative)

Below is an attempt to classify whether a sentence is positive or negative.  
Note the main weaknesses:  
1. It is "positively" biased for now since looks at the positive words first and if it finds it, then it immediately returns positive. Thus, it may not look at the entire sentence in case of both positive or negative words.  
2. Negation of a positive word is not taken into account

In [88]:
def includeSentiment(sentence, poslist, neglist): 
    ''' Classify sentence as positive or negative based on first word found
        in the lexicon
    '''
    if any(word in sentence for word in poslist):
        return ("positive", sentence)
    elif any(word in sentence for word in neglist):
        return ("negative", sentence)
    else:
        return ("neutral", sentence)

In [89]:
# Add sentiment as key in the RDD
sentiments = sentences.map(lambda a: includeSentiment(a, positive, negative))

In [90]:
# Peek at 2 first lines
sentiments.take(2)

[('positive',
  u'this meta-analysis of randomized parallel controlled trials was designed to compare the efficacy of atenolol with ACEI in changing pulse wave velocity (pwv), peripheral blood pressure and heart rate (hr) among patients with essential hypertension.'),
 ('negative',
  u'using the ualdo:c and a relatively stringent definition of abt, it appears that incomplete raas blockade is common in dogs with mmvd receiving an ACEI.')]

In [91]:
# Create 2 lists of filtered sentences: one positive list and one negative list
pos_sentiments = sentiments.lookup("positive")
neg_sentiments = sentiments.lookup("negative")

In [92]:
# Examples of sentences with positive sentiment lexicon
pos_sentiments[:2]

[u'this meta-analysis of randomized parallel controlled trials was designed to compare the efficacy of atenolol with ACEI in changing pulse wave velocity (pwv), peripheral blood pressure and heart rate (hr) among patients with essential hypertension.']

In [93]:
# Examples of sentences with negative sentiment lexicon
neg_sentiments[:2]

[u'using the ualdo:c and a relatively stringent definition of abt, it appears that incomplete raas blockade is common in dogs with mmvd receiving an ACEI.',
 u'we investigated the molecular mechanisms involved in the ACEI (ACEI) inhibition by (-)-epigallocatechin-3-gallate (egcg), a major tea catechin.']

In [345]:
def findSentiment(sentence, sentiment, poslist, neglist):
    '''Print out the lexicon word that classified the sentence as positive or negative
    '''
    if sentiment == "positive":
        lexicon = poslist
    else:
        lexicon = neglist
    for word in lexicon:
        # Note: had to use this "try/except" since there was an unicode ascii error... any ways
        # to fix this without this try/except? if left the same, then we won't be able to see
        # some sentiment word in some sentences.
        try:
            if word in sentence:
                print word
        except:
            next

In [346]:
s5 = "digestion of isolated whey proteins and caseins of goat milk by gastric pepsin generated soluble hydrolysates exhibiting significant inhibition of ACEI compared to weak inhibition by undigested proteins"
findFoodItems(s5, foodlist)
findSentiment(s5, "positive", positive, negative)

casein
oat
whey
pro
generate
significant


In [347]:
s6 = "the late-eluting fraction (f4) of either whey or caseins exhibited greater ACEI inhibition"
findFoodItems(s6, foodlist)
findSentiment(s6, "negative", positive, negative)

casein
whey
inhibit
inhibition


In [348]:
def findTags(sentence, sentiment, foods, poslist, neglist):
    ''' Returns the tags of the sentence 
        Both lexicon word that classified the sentence as positive or negative and food name
    '''
    
    sent = []
    food = []
    
    if sentiment == "positive":
        lexicon = poslist
    else:
        lexicon = neglist
    for word in lexicon:
        # Note: had to use this "try/except" since there was an unicode ascii error... any ways
        # to fix this without this try/except? if left the same, then we won't be able to see
        # some sentiment word in some sentences.
        try:
            if word in sentence:
                sent.append(word)
        except:
            next
            
    # If using simple test of if food name "in" sentence method
    #for f in foods:
    #    if f in sentence:
    #        food.append(f)
    
    # If using string distance method:
    for f in foods:
        n = min(3, len(f.split()))  # Assuming max as trigram        
        try:
            sentence = sentence.encode("utf-8")
            sentence_ngrams = find_ngrams(sentence, n)  # Note: punctuation at end of sentence will be included with
                                                    # last word. For now ok, since the JW will still be > 0.95
            for ngram in sentence_ngrams:
                # Note: when using jaro_winkler, need to convert into unicode format
                if jellyfish.jaro_winkler(u"{}".format(f.lower()), u"{}".format(ngram.lower())) > 0.95:
                    food.append(f)
                    
        except:
            next


            
    return [sent, food, sentence]

In [197]:
# Save results in a text file
# Note: this could have been also done in Spark! But felt lazy to code... feel free to try!

In [349]:
with open("data/Positive.txt", "w") as pos:
    for sentence in pos_sentiments:
        tags = findTags(sentence, "positive", foodlist, positive, negative)
        pos.writelines(str(tags)+ "\n")
    
    

In [350]:
with open("data/Negative.txt", "w") as pos:
    for sentence in pos_sentiments:
        tags = findTags(sentence, "negative", foodlist, positive, negative)
        pos.writelines(str(tags)+ "\n")

**Final Notes**  
1. In our baseline, some words like "date" that appear in a sentence will be interpreted as the fruit "date" instead of a calendar date and thus, will be filtered as outputs sentences. This can only be solved if we take into account the context of the sentence and we will need ML to model this!  
2. Sentiment analysis need a major improvement: only basing on the positive and negative words without how the food and drug are connected through these words is not a good model.

## **Slight modification to sentiment analysis**

I wanted to see if making tuples of the words found in the sentences would help.

In [165]:
def findFoods(sentence, foods):
    '''Making this a generator function'''
    for item in foods:
        if item in sentence:
            yield item

In [137]:
def maxSentiment(sentence, poslist, neglist): 
    ''' Count the number of positive and negative words in the sentence to ascertain
    the type of sentence.
    '''
    pos = 0
    neg = 0
    for word in sentence.split(" "):
        if word in poslist:
            pos += 1
        elif word in neglist:
            neg += 1
        else:
            next
    if pos > neg:
        return ('positive',sentence)
    elif neg > pos:
        return ('negative',sentence)
    else:
        return ('neutral',sentence)

In [138]:
maxSentiment('this is bad and terrible and also good', positive, negative)

('negative', 'this is bad and terrible and also good')

In [139]:
s5 = "digestion of isolated whey proteins and caseins of goat milk by gastric pepsin generated soluble hydrolysates exhibiting significant inhibition of ACEI compared to weak inhibition by undigested proteins"
findFoodItems(s5, foodlist)
maxSentiment(s5, positive, negative)

casein
oat
whey


('negative',
 'digestion of isolated whey proteins and caseins of goat milk by gastric pepsin generated soluble hydrolysates exhibiting significant inhibition of ACEI compared to weak inhibition by undigested proteins')

In [140]:
s6 = "the late-eluting fraction (f4) of either whey or caseins exhibited greater ACEI inhibition"
findFoodItems(s6, foodlist)
maxSentiment(s6, positive, negative)

casein
whey


('negative',
 'the late-eluting fraction (f4) of either whey or caseins exhibited greater ACEI inhibition')

## Trying to look at pairs of words

In [249]:
s5a = 'oat flakes are generally positively correlated with ACEI activity'
s5b = 'this is a dummy sentence that will not even show up in the dictionary'
s6a = 'oats and milk cause inhibition of ACEI'
s6b = 'milk upregulates ACEI'

In [200]:
def food_dict_maker(sentence):
    '''Creating a dictionary of the positive and negative relationships between foods and the drug of choice'''
    food_dict = defaultdict(list)

    #Get the food item from generator function!
    for item in list(findFoods(sentence,foodlist)):
        try:
            food_dict[(item, maxSentiment(sentence, positive, negative)[0])] += 1
        except:
            food_dict[(item, maxSentiment(sentence, positive, negative)[0])] = 1

    yield food_dict

In [246]:
def food_dict_maker2(list_of_sentences):
    '''Creating a dictionary of the positive and negative relationships between foods and the drug of choice'''
    #food_dict = defaultdict(list)
    food_dict = defaultdict(dict)

    #Get the food item from generator function!
    for sentence in list_of_sentences:
        for item in list(findFoods(sentence,foodlist)):
            try:
                #food_dict[(item, maxSentiment(sentence, positive, negative)[0])] += 1
                food_dict[item][maxSentiment(sentence, positive, negative)[0]] += 1
            except:
                #food_dict[(item, maxSentiment(sentence, positive, negative)[0])] = 1
                food_dict[item][maxSentiment(sentence, positive, negative)[0]] = 1

    yield food_dict

In [252]:
#The idea would be to pass in a TON of sentences here and you get a dictionary of dictionaries!
#You're obviously still getting the error of 'oat' with 'goat' though :(
for item in food_dict_maker2([s5a, s5b, s5, s6, s6a, s6b]):
    print item

 defaultdict(<type 'dict'>, {'oat': {'neutral': 1, 'negative': 2}, 'whey': {'negative': 2}, 'casein': {'negative': 2}})


## Here is what you could do with this dictionary

In [254]:
sample_dict = {'oat': {'neutral': 1, 'negative': 2}, 
               'whey': {'negative': 2}, 'casein': {'negative': 2}}

In [256]:
#Find relationships, use for viz
sample_dict['oat']['negative']

2