# Data Filtering : Food-drug Interaction Project

## Drug: Statin
## Food: common food

# Step 1: Load food names or compounds into a list of unique items

In [16]:
import pandas as pd
import numpy as np
import json
import pickle
import json
import jellyfish

# UTF-8 support
import codecs

# To use PubMed API
import pubmed.utils as pb

# Split abstracts to sentences
from nltk.tokenize import sent_tokenize

import random


from collections import defaultdict
import re

### Functions for Step 1

In [30]:
def replaceNonASCII(text):
    ''' Substitute the literal string 'unk' for every character whose code point
        is 128 or above. This works around a Unicode encode/decode bug met when
        using PySpark.
        Returns None when text is None.
    '''
    if text is None:
        return None
    return ''.join(ch if ord(ch) < 128 else 'unk' for ch in text)

    
def list_loader(data):
    ''' Load a pickled dictionary and return its keys as a set.

        data: path of the pickle file to read.

        Each key is decoded from UTF-8 when it is a byte string, lowercased and
        passed through replaceNonASCII. A set is returned rather than a list so
        that later membership tests are hash lookups.
    '''
    # NOTE: pickle.load can execute arbitrary code - only use it on trusted files.
    with open(data, "rb") as pickle_file:  # the with-block closes the file handle
        food_dict = pickle.load(pickle_file)

    foods = set()
    for food in food_dict.keys():
        # Keys that are already text must not be decoded again: in Python 2 that
        # triggers an implicit ASCII encode, which fails on non-ASCII names.
        if isinstance(food, bytes):
            food = food.decode('utf-8')
        foods.add(replaceNonASCII(food.lower()))

    return foods

### Using Adam's pickle file (food common name as sample)

For pickling the data from foodb.ca database, see Adam's notebook **compound_food_id.ipynb**  

As a first test, we will use only the food common names (not the scientific names). Compound names will be added once this test passes.

In [3]:
# food_common.pickle: Dictionary with common English food names as keys, compounds as values
# The with-block closes the file handle as soon as the dictionary is loaded.
with open("data/food_common.pickle", "rb") as food_pickle:
    test = pickle.load(food_pickle)

In [4]:
# Quick peek at the loaded data
for i, item in enumerate(test.iteritems()):
    if i == 0:
        print 'food name'
        print '-'*10
    print '{0}'.format(item[0])
    if i == 10: break

food name
----------
Oregon yampah
Okra
Black mulberry
Avocado
Parsley
Elderberry
Sugar
Sweet bay
Common bean
Fig
Lard


In [5]:
foodlist = list_loader('data/food_common.pickle')

In [6]:
i = 0
for food in foodlist:
    print food
    i += 1
    if i > 5:
        break

atlantic pollock
mixed nuts
rose hip
black mulberry
pheasant
whiting


# Step 2: Create list of words for drug keyword

In [7]:
drug_options = ['statin', 'statins', 'HMG-CoA reductase inhibitors', 'HMG-CoA reductase inhibitor', 
                'HMG-CoA reductase-inhibitors',  'HMG-CoA reductase-inhibitor',
                'HMG-CoA-reductase-inhibitors',  'HMG-CoA-reductase-inhibitor',
                'HMGA reductase inhibitors', 'HMGA reductase inhibitor',
                'HMG-CoA reductase', 'HMG-CoA-reductase', 'HMG CoA reductase',
                '(HMG-CoA) reductase inhibitors', '(HMG-CoA) reductase inhibitor', '(HMG-CoA) reductase',
                'hydroxy-methylglutaryl-coenzyme A reductase', '(HMG-CoA reductase)',
                'atorvastatin', 'fluvastatin', 'lovastatin', 'pitavastatin', 
                'pravastatin', 'rosuvastatin', 'simvastatin',
                'atorvastatins', 'fluvastatins', 'lovastatins', 'pitavastatins', 
                'pravastatins', 'rosuvastatins', 'simvastatins']

# Step 3: Filter sentences from abstract with drug keyword and food names

Note: Test with only the abstracts' first json file. Once works, we can add all 100 remaining files.

In [8]:
import pyspark
from pyspark import SparkContext
sc = SparkContext()

### Functions for Step 3

In [9]:
# Load the values of the first json file only, replacing their non-ASCII characters
name = ['pbabstract1.json']
data_list = []
for jsonfile in name:
    json_path = 'statins_pbabstract/' + jsonfile
    with open(json_path, "r") as data_file:
        data = json.load(data_file)
        values = list(map(replaceNonASCII, data.values()))
        data_list.extend(values)


    


In [10]:
print data_list[:2]

[u'Clinically stable patients who underwent DES implantation 12 months previously and received aspirin monotherapy were randomly assigned to receive either high-intensity (40mg atorvastatin, n = 1000) or low-intensity (20mg pravastatin, n = 1000) statin treatment. The primary endpoint was adverse clinical events at 12-month follow-up (a composite of all death, myocardial infarction, revascularization, stent thrombosis, stroke, renal deterioration, intervention for peripheral artery disease, and admission for cardiac events).', u'The primary endpoint at 12-month follow-up occurred in 25 patients (2.5%) receiving high-intensity statin treatment and in 40 patients (4.1%) receiving low-intensity statin treatment (HR, 0.58; 95%CI, 0.36-0.92; P = .018). This difference was mainly driven by a lower rate of cardiac death (0 vs 0.4%, P = .025) and nontarget vessel myocardial infarction (0.1 vs 0.7%, P = .033) in the high-intensity statin treatment group.']


In [None]:
# Scratch version of the loading loop: with an empty `name` list it loads nothing.
# The reusable form of this loop is the abstract_loader function.
name = []
for jsonfile in name:
    # The directory is a string prefix of the file name.
    with codecs.open('statins_pbabstract/' + jsonfile, "r", "utf-8") as data_file:
        data = json.load(data_file)
        values = [replaceNonASCII(value) for value in data.values()]
        data_list.extend(values)

In [27]:
def abstract_loader(name, data_dir='statins_pbabstract/'):
    ''' Load abstracts from the json files in the provided list.

        name: iterable of json file names; each file is opened as data_dir + file name.
        data_dir: directory prefix of the json files (keep the trailing slash).
                  Defaults to the directory used so far, so existing calls are unchanged.

        Returns one flat list holding every value of every file after
        replaceNonASCII has been applied to it.
    '''
    data_list = []
    for jsonfile in name:
        with codecs.open(data_dir + jsonfile, "r", "utf-8") as data_file:
            data = json.load(data_file)
        data_list.extend(replaceNonASCII(value) for value in data.values())

    return data_list


def splitSentences(abstract):
    ''' Split an abstract into sentences with NLTK's sent_tokenize and return the result.
    '''
    return sent_tokenize(abstract)


def find_ngrams(sentence, n):
    ''' Build every run of n consecutive whitespace-separated words of the sentence.
        Returns the runs as a list of strings, the words joined by single spaces.
    '''
    tokens = sentence.split()
    # Shifting the token list by 0..n-1 and zipping yields the sliding windows
    shifted = [tokens[start:] for start in range(n)]
    return [' '.join(window) for window in zip(*shifted)]


#######################
# Method 1 - NOT USED #
#######################

# NOT USED since will return True if finds a food name within a word in the sentence
# E.g.: "pie" food name and "therapies" word in sentence: return True since "pie" in "therapies"
#def includeFoodCmpd(sentence, fdlist):
#    if any(word in sentence for word in fdlist):
#        return True
#    else:
#        return False


###################################
# Method 2 - solution to method 1 #
###################################

def includeDrug(sentence, dglist, limit, verbose=False):
    ''' Tell whether the sentence mentions one of the drug names.

        Each drug name is compared, lowercased, with every n-gram of the sentence,
        n being the number of words of the drug name capped at 5. The comparison
        is jellyfish's Jaro-Winkler score.

        sentence: text to search.
        dglist: iterable of drug names.
        limit: a score strictly greater than limit counts as a match.
        verbose: when True, print drug, n-gram and score of each comparison made.

        Returns True as soon as one n-gram matches, False when none does.
    '''
    # n -> [(ngram, lowercased unicode ngram)], built once per n instead of once per drug
    ngrams_by_size = {}
    for drug in dglist:
        n = min(5, len(drug.split()))  # Assuming max as 5-gram
        if n not in ngrams_by_size:
            # Note: punctuation at the end of the sentence stays attached to the last word
            ngrams_by_size[n] = [(ngram, u"{}".format(ngram.lower()))
                                 for ngram in find_ngrams(sentence, n)]
        # Note: when using jaro_winkler, need to convert into unicode format
        drug_key = u"{}".format(drug.lower())
        for ngram, ngram_key in ngrams_by_size[n]:
            ngram_distance = jellyfish.jaro_winkler(drug_key, ngram_key)
            if verbose:
                print(u"{} {} {}".format(drug, ngram, ngram_distance))
            if ngram_distance > limit:
                # Stop at the first match: scanning the remaining drugs cannot change the result
                return True

    return False



def includeFoodCmpd(sentence, fdlist, limit, verbose=False):
    ''' Tell whether the sentence mentions one of the food names.

        Each food name is compared, lowercased, with every n-gram of the sentence,
        n being the number of words of the food name capped at 3. The comparison
        is jellyfish's Jaro-Winkler score.

        sentence: text to search.
        fdlist: iterable of food names.
        limit: a score greater than or equal to limit counts as a match.
        verbose: when True, print food, n-gram and score of each comparison made.

        Returns True as soon as one n-gram matches, False when none does.
    '''
    # n -> [(ngram, lowercased unicode ngram)], built once per n instead of once per food
    ngrams_by_size = {}
    for food in fdlist:
        n = min(3, len(food.split()))  # Assuming max as trigram
        if n not in ngrams_by_size:
            # Note: punctuation at the end of the sentence stays attached to the last word
            # Note: when using jaro_winkler, need to convert into unicode format
            ngrams_by_size[n] = [(ngram, u"{}".format(ngram.lower()))
                                 for ngram in find_ngrams(sentence, n)]
        food_key = food.lower()
        for ngram, ngram_key in ngrams_by_size[n]:
            ngram_distance = jellyfish.jaro_winkler(food_key, ngram_key)
            if verbose:
                print(u"{} {} {}".format(food, ngram, ngram_distance))
            if ngram_distance >= limit:
                # Stop at the first match: scanning the remaining foods cannot change the result
                return True

    return False


def findFood(sentence, foods, limit=0.95):
    ''' Return the food names found in the sentence with the string distance method.

        sentence: text to search.
        foods: iterable of food names.
        limit: a Jaro-Winkler score strictly greater than limit counts as a match.
               Defaults to 0.95, the threshold used so far.

        Each food name is compared, lowercased, with every n-gram of the sentence,
        n being the number of words of the food name capped at 3.
        Returns a list holding each matching food once, in the iteration order of foods.
    '''
    # n -> lowercased unicode n-grams, built once per n instead of once per food
    ngrams_by_size = {}
    food = []

    for f in foods:
        n = min(3, len(f.split()))  # Assuming max as trigram
        if n not in ngrams_by_size:
            # Note: punctuation at the end of the sentence stays attached to the last word
            # Note: when using jaro_winkler, need to convert into unicode format
            ngrams_by_size[n] = [u"{}".format(ngram.lower())
                                 for ngram in find_ngrams(sentence, n)]
        food_key = f.lower()
        for ngram_key in ngrams_by_size[n]:
            if jellyfish.jaro_winkler(food_key, ngram_key) > limit:
                food.append(f)
                break  # one entry per food, even when it occurs several times

    return food


# Not used since we will use the string distance for filtering
def findFoodItems(sentence, foods):
    ''' Print each entry of foods that occurs in the sentence as a plain substring.
        Not used: the filtering relies on the string distance functions instead.
    '''
    contained = (candidate for candidate in foods if candidate in sentence)
    for candidate in contained:
        print(candidate)
            
def is_relevant(sentence, irrelevant_list, fdlist):
    ''' Keep only the sentences whose detected foods include none of the irrelevant ones.
        Returns False when findFood reports a food listed in irrelevant_list,
        True otherwise.
    '''
    detected = findFood(sentence, fdlist)
    for word in irrelevant_list:
        if word in detected:
            return False
    return True

In [10]:
# Test find_ngrams:
string = u"We are working hard on 266 project baseline"
find_ngrams(string, 3)

[u'We are working',
 u'are working hard',
 u'working hard on',
 u'hard on 266',
 u'on 266 project',
 u'266 project baseline']

In [11]:
# Testing includeFoodCmpd function
ss = u'Panda is eating a pie'
testlist = [u"hello",u"pie"]
includeFoodCmpd(ss, testlist, 0.98, verbose=True)


hello Panda 0.0
hello is 0.0
hello eating 0.455555555556
hello a 0.0
hello pie 0.511111111111
pie Panda 0.511111111111
pie is 0.0
pie eating 0.5
pie a 0.0
pie pie 1.0


True

In [15]:
# Testing includeDrug function
ss = u'Panda is eating a pie'
testlist = [u"hello",u"pie"]
includeDrug(ss, testlist, 0.99, verbose=True)

hello Panda 0.0
hello is 0.0
hello eating 0.455555555556
hello a 0.0
hello pie 0.511111111111
pie Panda 0.511111111111
pie is 0.0
pie eating 0.5
pie a 0.0
pie pie 1.0


True

In [12]:
# Load all abstracts - the PubMed download consists of 414 json files, numbered from 1
filename_list = ["pbabstract" + str(i) + ".json" for i in xrange(1, 415)]

abstract = abstract_loader(filename_list)

In [13]:
abstractRDD = sc.parallelize(abstract)\
                .filter(lambda a: a is not None)  #Some lines were empty: need to filter out
abstractRDD.take(2)

[u'Clinically stable patients who underwent DES implantation 12 months previously and received aspirin monotherapy were randomly assigned to receive either high-intensity (40mg atorvastatin, n = 1000) or low-intensity (20mg pravastatin, n = 1000) statin treatment. The primary endpoint was adverse clinical events at 12-month follow-up (a composite of all death, myocardial infarction, revascularization, stent thrombosis, stroke, renal deterioration, intervention for peripheral artery disease, and admission for cardiac events).',
 u'The primary endpoint at 12-month follow-up occurred in 25 patients (2.5%) receiving high-intensity statin treatment and in 40 patients (4.1%) receiving low-intensity statin treatment (HR, 0.58; 95%CI, 0.36-0.92; P = .018). This difference was mainly driven by a lower rate of cardiac death (0 vs 0.4%, P = .025) and nontarget vessel myocardial infarction (0.1 vs 0.7%, P = .033) in the high-intensity statin treatment group.']

In [330]:
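# Note: tried to broadcast the foodlist but got an error message when using it in the filter below:
# "TypeError: 'Broadcast' object is not iterable". A Broadcast object is only a wrapper:
# the set itself has to be read through foodlist_bcast.value inside the lambda.
# foodlist_bcast = sc.broadcast(foodlist)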

In [184]:
# If substituting drugs related words by "ACEI" then filter
#drugkeyword = "ACEI"

#sentences = abstractRDD.flatMap(splitSentences) \
#                       .map(lambda a: pb.ace_substitutor(a, drugkeyword)) \
#                       .filter(lambda a: drugkeyword in a)\
#                       .filter(lambda a: includeFoodCmpd(a, foodlist, 0.98))

#%%time
#filtered_sentences = sentences.collect()
### results:
### CPU times: user 71.2 ms, sys: 30.3 ms, total: 101 ms
### Wall time: 21min 30s

#len(filtered_sentences) -> 938

In [14]:
sentences = abstractRDD.flatMap(splitSentences) \
                       .filter(lambda a: includeDrug(a, drug_options , 0.99))\
                       .filter(lambda a: includeFoodCmpd(a, foodlist, 0.98))

In [15]:
sentences.take(2)

[u'18 samples of red yeast rice powder and 18 samples of lovastatin were collected.',
 u'In this study, we demonstrate the efficacy of topical statin treatment in reducing scar in our validated rabbit ear scar model.']

In [16]:
%%time
filtered_sentences = sentences.collect()

CPU times: user 113 ms, sys: 52.4 ms, total: 166 ms
Wall time: 38min 26s


In [17]:
len(filtered_sentences)

835

In [53]:
# Save filtered sentences containing drug and food name in textfile
#with open("data/FilteredSentences_ACEI_commonFood.txt", "w") as outcomes:
#    for sentence in filtered_sentences:
#        food = findFood(sentence, foodlist)
#        outcomes.writelines(str(food) + sentence + "\n")

In [18]:
# Delete some sentences with irrelevant food names
irrelevant = ['date', 'rabbit', 'shortening', 'arepas', 'water', 'cocktail']

relevant_sentences = sc.parallelize(filtered_sentences)\
                       .filter(lambda a: is_relevant(a, irrelevant, foodlist))\
                       .collect()


In [19]:
len(relevant_sentences)

370

In [20]:
# One line per relevant sentence: the list of detected foods immediately followed by the sentence
with open("data/FilteredSentences_Statin_commonFood_clean.txt", "w") as outcomes:
    for sentence in relevant_sentences:
        food = findFood(sentence, foodlist)
        outcomes.write(str(food) + sentence + "\n")

## Step 4: Classify sentences as positive or negative based on a sentiment lexicon

There are 370 sentences containing both a statin drug keyword and a food name after the filtering of Step 3. They will be labeled and then compared to the classifications of the 2 baseline models for model evaluation.

## BASELINE 1: Uniform random classification

Our first model is to simply classify the interaction between drug and food in the sentence as positive, negative or neutral based on equal probability of 1/3 in each class.  

Evaluation metrics: the labels of 100 randomly chosen sentences from the outcomes of Step 3 (i.e. sentences with drug and food names) will be compared with the random labels from the baseline model.

In [21]:
# Draw the labels from a dedicated, seeded generator so that the random baseline
# is the same on every run of the notebook.
baseline_rng = random.Random(266)
classification = [(baseline_rng.choice(["positive", "negative", "neutral"]), sentence)
                  for sentence in relevant_sentences]

# -------------- SECTION BELOW HAS NOT BEEN UPDATED --------------

## BASELINE 2: Sentiment lexicon

Sentiment lexicon used is the Harvard General Inquirer (http://www.wjh.harvard.edu/~inquirer/spreadsheet_guide.htm). It contains 1,915 positive words and 2,291 negative words and is free for research use.

In [208]:
db_sentiment = pd.read_csv("data/inquirerbasic.csv", encoding = 'utf8')

  interactivity=interactivity, compiler=compiler, result=result)


In [209]:
db_sentiment.head()

Unnamed: 0,Entry,Source,Positiv,Negativ,Pstv,Affil,Ngtv,Hostile,Strong,Power,...,Anomie,NegAff,PosAff,SureLw,If,NotLw,TimeSpc,FormLw,Othtags,Defined
0,A,H4Lvd,,,,,,,,,...,,,,,,,,,DET ART,| article: Indefinite singular article--some o...
1,ABANDON,H4Lvd,,Negativ,,,Ngtv,,,,...,,,,,,,,,SUPV,|
2,ABANDONMENT,H4,,Negativ,,,,,,,...,,,,,,,,,Noun,|
3,ABATE,H4Lvd,,Negativ,,,,,,,...,,,,,,,,,SUPV,|
4,ABATEMENT,Lvd,,,,,,,,,...,,,,,,,,,Noun,


In [210]:
# Look at all data fields available
for column in db_sentiment.columns:
    print column

Entry
Source
Positiv
Negativ
Pstv
Affil
Ngtv
Hostile
Strong
Power
Weak
Submit
Active
Passive
Pleasur
Pain
Feel
Arousal
EMOT
Virtue
Vice
Ovrst
Undrst
Academ
Doctrin
Econ@
Exch
ECON
Exprsv
Legal
Milit
Polit@
POLIT
Relig
Role
COLL
Work
Ritual
SocRel
Race
Kin@
MALE
Female
Nonadlt
HU
ANI
PLACE
Social
Region
Route
Aquatic
Land
Sky
Object
Tool
Food
Vehicle
BldgPt
ComnObj
NatObj
BodyPt
ComForm
COM
Say
Need
Goal
Try
Means
Persist
Complet
Fail
NatrPro
Begin
Vary
Increas
Decreas
Finish
Stay
Rise
Exert
Fetch
Travel
Fall
Think
Know
Causal
Ought
Perceiv
Compare
Eval@
EVAL
Solve
Abs@
ABS
Quality
Quan
NUMB
ORD
CARD
FREQ
DIST
Time@
TIME
Space
POS
DIM
Rel
COLOR
Self
Our
You
Name
Yes
No
Negate
Intrj
IAV
DAV
SV
IPadj
IndAdj
PowGain
PowLoss
PowEnds
PowAren
PowCon
PowCoop
PowAuPt
PowPt
PowDoct
PowAuth
PowOth
PowTot
RcEthic
RcRelig
RcGain
RcLoss
RcEnds
RcTot
RspGain
RspLoss
RspOth
RspTot
AffGain
AffLoss
AffPt
AffOth
AffTot
WltPt
WltTran
WltOth
WltTot
WlbGain
WlbLoss
WlbPhys
WlbPsyc
WlbPt
WlbTot
EnlGain
EnlLo

**Note:** some columns seem quite interesting for analysing relationship other than simply positive or negative sentiment (e.g. "causal", etc.). For the baseline, we will only use the "positive" and "negative" columns

In [211]:
# Filter only the words labeled positive or negative
positive = db_sentiment[db_sentiment.Positiv == "Positiv"].Entry.map(lambda x: x.lower()).tolist()
negative = db_sentiment[db_sentiment.Negativ == "Negativ"].Entry.map(lambda x: x.lower()).tolist()

In [212]:
positive[:4]

[u'abide', u'ability', u'able', u'abound']

In [213]:
negative[:4]

[u'abandon', u'abandonment', u'abate', u'abdicate']

In [214]:
# Transform list into sets for faster search
positive = set(positive)
negative = set(negative)

Below is an attempt to classify whether a sentence is positive or negative.  
Note the main weaknesses:  
1. It is "positively" biased for now since looks at the positive words first and if it finds it, then it immediately returns positive. Thus, it may not look at the entire sentence in case of both positive or negative words.  
2. Negation of a positive word is not taken into account

In [215]:
def includeSentiment(sentence, poslist, neglist):
    ''' Label a sentence "positive", "negative" or "neutral" from the lexicon lists.
        An entry counts as found when it occurs anywhere in the sentence as a
        substring. The positive list is checked first, so a sentence holding
        entries of both lists is labelled "positive".
        Returns the tuple (label, sentence).
    '''
    label = "neutral"
    for candidate, lexicon in (("positive", poslist), ("negative", neglist)):
        if any(entry in sentence for entry in lexicon):
            label = candidate
            break
    return (label, sentence)

In [216]:
# Add sentiment as key in the RDD

filtered_sentencesRDD = sc.parallelize(filtered_sentences)
sentiments = filtered_sentencesRDD.map(lambda a: includeSentiment(a, positive, negative))


In [217]:
# Peek at 2 first lines
sentiments.take(2)

[('positive',
  u'in this study, we examined the separated caseins and whey proteins of goat milk for the presence of ACEI inhibitory peptides.'),
 ('positive',
  u'digestion of isolated whey proteins and caseins of goat milk by gastric pepsin generated soluble hydrolysates exhibiting significant inhibition of ACEI compared to weak inhibition by undigested proteins.')]

In [218]:
# Create 2 lists of filtered sentences: one positive list and one negative list
pos_sentiments = sentiments.lookup("positive")
neg_sentiments = sentiments.lookup("negative")

In [219]:
# Examples of sentences with positive sentiment lexicon
pos_sentiments[:10]

[u'in this study, we examined the separated caseins and whey proteins of goat milk for the presence of ACEI inhibitory peptides.',
 u'digestion of isolated whey proteins and caseins of goat milk by gastric pepsin generated soluble hydrolysates exhibiting significant inhibition of ACEI compared to weak inhibition by undigested proteins.',
 u'the peptides from whey and caseins exert significant ACEI inhibitory activities comparable to that of captopril, an antihypertensive drug, exhibiting ic50 values of 4.45unkunkm and 4.27unkunkm, respectively.',
 u'the results introduce, for the first time, new potent ACEI-inhibitory peptides that can be released by gastric pepsin of goat milk whey and caseins and thus may pave the way for their candidacy as anti-hypertensive bioactive peptides and prevention of associated disorders.',
 u'ACEI use between 1 january 2003 and the index date were determined by the date of hospitalization for acute pancreatitis among the cases.',
 u'compared to hydrolysat

In [220]:
# Examples of sentences with negative sentiment lexicon
neg_sentiments[:2]

[u'the late-eluting fraction (f4) of either whey or caseins exhibited greater ACEI inhibition.',
 u'we investigated the molecular mechanisms involved in the ACEI (ACEI) inhibition by (-)-epigallocatechin-3-gallate (egcg), a major tea catechin.']

In [221]:
def findSentiment(sentence, sentiment, poslist, neglist):
    '''Print the lexicon entries that occur in the sentence.

       sentence: text to inspect.
       sentiment: "positive" selects poslist; any other value selects neglist.
       poslist, neglist: iterables of lexicon words.

       An entry is printed when it is a substring of the sentence. An entry whose
       test or printing raises a Unicode error is skipped, so it is not shown.
    '''
    lexicon = poslist if sentiment == "positive" else neglist
    for word in lexicon:
        try:
            if word in sentence:
                print(word)
        except UnicodeError:
            # Skip only the entry that hit the unicode/ascii problem; any other
            # error (and KeyboardInterrupt) is no longer silently swallowed.
            continue

In [223]:
s5 = u"digestion of isolated whey proteins and caseins of goat milk by gastric pepsin generated soluble hydrolysates exhibiting significant inhibition of ACEI compared to weak inhibition by undigested proteins"
print findFood(s5, foodlist)
findSentiment(s5, "positive", positive, negative)

[u'casein', u'whey']
pro
generate
significant


In [224]:
s6 = u"the late-eluting fraction (f4) of either whey or caseins exhibited greater ACEI inhibition"
print findFood(s6, foodlist)
findSentiment(s6, "negative", positive, negative)

[u'casein', u'whey']
inhibit
inhibition


In [225]:
def findTags(sentence, sentiment, foods, poslist, neglist, limit):
    ''' Return the tags of the sentence: lexicon entries and food names found in it.

        sentence: text to inspect.
        sentiment: "positive" selects poslist; any other value selects neglist.
        foods: iterable of food names.
        poslist, neglist: iterables of lexicon words.
        limit: a Jaro-Winkler score strictly greater than limit counts as a food match.

        A lexicon entry is kept when it is a substring of the sentence. Each food
        name is compared, lowercased, with every n-gram of the sentence, n being
        the number of words of the food name capped at 3.

        Returns [lexicon entries found, foods found, sentence]; each matching food
        is listed once.
    '''
    lexicon = poslist if sentiment == "positive" else neglist

    sent = []
    for word in lexicon:
        try:
            if word in sentence:
                sent.append(word)
        except UnicodeError:
            # Skip only the entry that hit the unicode/ascii problem; any other
            # error (and KeyboardInterrupt) is no longer silently swallowed.
            continue

    # String distance method.
    # n -> lowercased unicode n-grams, built once per n instead of once per food
    ngrams_by_size = {}
    food = []
    for f in foods:
        n = min(3, len(f.split()))  # Assuming max as trigram
        if n not in ngrams_by_size:
            # Note: punctuation at the end of the sentence stays attached to the last word
            # Note: when using jaro_winkler, need to convert into unicode format
            ngrams_by_size[n] = [u"{}".format(ngram.lower())
                                 for ngram in find_ngrams(sentence, n)]
        food_key = f.lower()
        for ngram_key in ngrams_by_size[n]:
            if jellyfish.jaro_winkler(food_key, ngram_key) > limit:
                food.append(f)
                break  # one entry per food, even when it occurs several times

    return [sent, food, sentence]

In [197]:
# Save results in a text file
# Note: this could have been also done in Spark! But felt lazy to code... feel free to try!

In [226]:
# One line per positive sentence: [lexicon words found, foods found, sentence]
with open("data/Positive.txt", "w") as positive_file:
    for sentence in pos_sentiments:
        tags = findTags(sentence, "positive", foodlist, positive, negative, 0.95)
        positive_file.write(str(tags) + "\n")
    
    

In [227]:
# One line per negative sentence: [lexicon words found, foods found, sentence].
# The loop runs over neg_sentiments: iterating pos_sentiments here filled
# Negative.txt with the positive sentences.
with open("data/Negative.txt", "w") as neg:
    for sentence in neg_sentiments:
        tags = findTags(sentence, "negative", foodlist, positive, negative, 0.95)
        neg.write(str(tags) + "\n")

**Final Notes**  
1. In our baseline, a word like "date" in a sentence is interpreted as the fruit "date" rather than a calendar date, so the sentence wrongly passes the filter. This can only be solved by taking the context of the sentence into account, and we will need ML to model this!  
2. Sentiment analysis needs a major improvement: relying only on the positive and negative words, without looking at how the food and the drug are connected through these words, is not a good model.

## BELOW: Latest Update as of 8-4-2017 [Lisa B]

## Sentiment Classifier Using Stanford Model

#### First import the sentences, and then run the sentiment analysis on them via command line

In [4]:
!head data/FilteredSentences_Statin_commonFood_clean.txt

[u'rice']18 samples of red yeast rice powder and 18 samples of lovastatin were collected.
[u'rice']: Purpose: Red yeast rice (RYR) supplementation has become a popular alternative to statin therapy in treating hypercholesterolemia.
[u'rice']We examined whether FDA's manufacturing standards led to standard concentrations of the statin monacolin K in red yeast rice supplements.
[u'rice']Red yeast rice contains a fungus (Monascus purpureus), which was utilized in the original production of lovastatin (MEVACOR, Merck & Co, Whitehouse Station, NJ), the first marketed pharmaceutical statin, and is chemically identical to such product.
[u'oyster mushroom', u'rice', u'mushrooms']Matrix effect-free UHPLC-MS/MS method was developed and validated for the determination of cholesterol-lowering lovastatin in food samples represented by Pu-erh tea, oyster mushroom, and red yeast rice.
[u'rice']Suitability of the resulting MISPE-UHPLC-MS/MS procedure for real sample analysis was verified by the d

# Testing with PCFG Model*

*notice I made a change to the lexparser file to allow for more memory

In [9]:
!cat ./lexparser.sh

#!/usr/bin/env bash
#
# Runs the English PCFG parser on one or more files, printing trees only

if [ ! $# -ge 1 ]; then
  echo Usage: `basename $0` 'file(s)'
  echo
  exit
fi

scriptdir=`dirname $0`

java -mx500m -cp "$scriptdir/*:" edu.stanford.nlp.parser.lexparser.LexicalizedParser \
 -outputFormat "penn,typedDependencies" edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz $*


In [10]:
%%timeit
! ./lexparser.sh  data/FilteredSentences_Statin_commonFood_clean.txt

[main] INFO edu.stanford.nlp.parser.lexparser.LexicalizedParser - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... done [0.8 sec].
Parsing file: data/FilteredSentences_Statin_commonFood_clean.txt
Parsing [sent. 1 len. 21]: -LSB- u ` rice ' -RSB- 18 samples of red yeast rice powder and 18 samples of lovastatin were collected .
(ROOT
  (S
    (S
      (NP (JJ -LSB-) (NN u) (`` `) (NN rice) ('' '))
      (VP (VBZ -RSB-)
        (NP
          (NP (CD 18) (NNS samples))
          (PP (IN of)
            (NP (JJ red) (NN yeast) (NN rice) (NN powder))))))
    (CC and)
    (S
      (NP
        (NP (CD 18) (NNS samples))
        (PP (IN of)
          (NP (NN lovastatin))))
      (VP (VBD were)
        (VP (VBN collected))))
    (. .)))

amod(rice-4, -LSB--1)
compound(rice-4, u-2)
nsubj(-RSB--6, rice-4)
root(ROOT-0, -RSB--6)
nummod(samples-8, 18-7)
dobj(-RSB--6, samples-8)
case(powder-13, of-9)
amod(powder-13, red-10)
compound(powder-13, yeast-11)
comp

### Command Line Parser

In [11]:
!java -cp "*" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP \
-annotators tokenize,ssplit,pos,lemma,ner,parse,dcoref,sentiment \
-file data/FilteredSentences_Statin_commonFood_clean.txt \
-outputFormat text

[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.TokenizerAnnotator - No tokenizer type provided. Defaulting to PTBTokenizer.
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator pos
[main] INFO edu.stanford.nlp.tagger.maxent.MaxentTagger - Loading POS tagger from edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger ... done [1.4 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ner
[main] INFO edu.stanford.nlp.ie.AbstractSequenceClassifier - Loading classifier from edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz ... done [1.9 sec].
[main] INFO edu.stanford.nlp.ie.AbstractSequenceClassifier - Loading classifier from edu/stanford/nlp/models/ner/english.muc.7cla

## Let's see a few sentences:

In [12]:
!head FilteredSentences_Statin_commonFood_clean.txt.out

Sentence #1 (21 tokens, sentiment: Negative):
[u'rice']18 samples of red yeast rice powder and 18 samples of lovastatin were collected.
[Text=-LSB- CharacterOffsetBegin=0 CharacterOffsetEnd=1 PartOfSpeech=-LRB- Lemma=-lsb- NamedEntityTag=O SentimentClass=Neutral]
[Text=u CharacterOffsetBegin=1 CharacterOffsetEnd=2 PartOfSpeech=FW Lemma=u NamedEntityTag=O SentimentClass=Neutral]
[Text=` CharacterOffsetBegin=2 CharacterOffsetEnd=3 PartOfSpeech=`` Lemma=` NamedEntityTag=O SentimentClass=Neutral]
[Text=rice CharacterOffsetBegin=3 CharacterOffsetEnd=7 PartOfSpeech=NN Lemma=rice NamedEntityTag=O SentimentClass=Neutral]
[Text=' CharacterOffsetBegin=7 CharacterOffsetEnd=8 PartOfSpeech='' Lemma=' NamedEntityTag=O SentimentClass=Neutral]
[Text=-RSB- CharacterOffsetBegin=8 CharacterOffsetEnd=9 PartOfSpeech=-RRB- Lemma=-rsb- NamedEntityTag=O SentimentClass=Neutral]
[Text=18 CharacterOffsetBegin=9 CharacterOffsetEnd=11 PartOfSpeech=CD Lemma=18 NamedEntityTag=NUMBER NormalizedNamedEntityTag=

## Creating Dictionaries

In [17]:
# Maps (sentiment, sentence id) -> lowercased tokens of that sentence,
# read from the text output written by CoreNLP
co_occurrence_dict = defaultdict(list)

# The with-block closes the output file; iterating over the handle avoids
# loading every line in memory at once.
with open('FilteredSentences_Statin_commonFood_clean.txt.out') as corenlp_output:
    for line in corenlp_output:
        if 'Sentence #' in line:
            # Header line such as "Sentence #1 (21 tokens, sentiment: Negative):"
            header = str(line.strip('\n'))
            sentence = header.split(" ")[1]  # second space-separated field, e.g. "#1"
            sentiment = header.split(":")[1]
            # Drop punctuation, then digits; the leading space is kept
            sentiment = re.sub(r"\d+", "", re.sub(r'[^\w\s]', '', sentiment))
        elif line[0:5] == '[Text':
            # Token line such as "[Text=rice CharacterOffsetBegin=3 ... PartOfSpeech=NN ...]"
            word = str(line.split("=")[1]).split(" ")[0].lower()
            pos = str(line.split("=")[4]).split(" ")[0]  # PartOfSpeech value; not used in the cells shown here
            co_occurrence_dict[(sentiment, sentence)].append(word)

## Creating Dictionary for Visualization

In [31]:
foodlist = list_loader('data/food_common.pickle')

In [106]:
# Root node of the tree used for the visualization
treeData = defaultdict(list)
treeData["name"] = "Statins"
treeData["parent"] = "null"

# One child node per sentiment class shown in the tree
treeData_b = defaultdict(list)
treeData_b["name"] = "Negative"
treeData_b["parent"] = "Statins"

treeData_c = defaultdict(list)
treeData_c["name"] = "Very Negative"
treeData_c["parent"] = "Statins"

# Sentiment label found in co_occurrence_dict -> (branch node, food names already attached).
# The set replaces the rebuild of the list of attached names for every token.
branches = {
    'Negative': (treeData_b, set()),
    'Very negative': (treeData_c, set()),
}

for k, v in co_occurrence_dict.iteritems():
    sentiment = str(k[0]).strip(" ")
    if sentiment not in branches:
        continue  # other sentiment classes are not part of the tree
    branch, attached = branches[sentiment]
    for vv in v:
        if vv in foodlist and vv not in attached:
            attached.add(vv)
            # "parent" repeats the name of the branch node, so that it also matches
            # for "Very Negative", whose sentiment label is spelled "Very negative"
            branch["children"].append({"parent": branch["name"],
                                       "name": vv})

# Adding all children dictionaries
treeData["children"].append(treeData_b)
treeData["children"].append(treeData_c)


treeData

defaultdict(list,
            {'children': [defaultdict(list,
                          {'children': [{'name': 'rice', 'parent': 'Negative'},
                            {'name': 'grapefruit', 'parent': 'Negative'},
                            {'name': 'spread', 'parent': 'Negative'},
                            {'name': 'gelatin', 'parent': 'Negative'},
                            {'name': 'eggs', 'parent': 'Negative'},
                            {'name': 'pomegranate', 'parent': 'Negative'},
                            {'name': 'coffee', 'parent': 'Negative'},
                            {'name': 'margarine', 'parent': 'Negative'},
                            {'name': 'olive', 'parent': 'Negative'},
                            {'name': 'sunflower', 'parent': 'Negative'},
                            {'name': 'casein', 'parent': 'Negative'},
                            {'name': 'tea', 'parent': 'Negative'},
                            {'name': 'opossum', 'parent': 'Negative'},
       