In [1]:
#!/usr/bin/env python
# coding: utf-8

# ## Evaluation Notebook

# !python -m spacy download en_core_web_sm
#pip install spacy

import pandas as pd
from tqdm import tqdm
import spacy
from spacy.tokens import DocBin
import re
import os
from spacy import displacy
from sklearn import metrics
#from sklearn.metrics import plot_confusion_matrix

from spacy.util import filter_spans

# --- Configuration: file locations used by the cells below ---
FOODKEEPER_PATH = "datasets/FoodKeeper-Data.xls"  # Excel workbook; its "Product" sheet is loaded with pd.read_excel
TRAINING_DATA_PATH = "datasets/data.csv"  # header-less CSV loaded into all_data
MODEL_PATH = "output/model-last"  # location handed to spacy.load
TEST_DATA_PATH = "datasets/test_data.csv"  # only referenced from commented-out code in the visible cells
REAL_TWITTER_DATA_PATH = "datasets/data.csv"  # NOTE(review): identical to TRAINING_DATA_PATH -- confirm this is intended
PREDICTED_DATA_PATH = "datasets/new_test_tweets.csv"  # CSV loaded into live_tweets and test_data
#STARTING_KEYWORD_COUNT = 10
#TRAINING_LOOP_ITERATIONS = 3
#REQUIRED_KEYWORDS = 3

# Silence pandas' chained-assignment warning for the whole session.
pd.options.mode.chained_assignment = None

In [2]:
# Load the input tables used by this notebook.
# "Product" sheet of the FoodKeeper workbook.
# NOTE(review): food_data is not read by any uncommented code in the visible cells.
food_data = pd.read_excel(FOODKEEPER_PATH, sheet_name = "Product")
# Training tweets: header=None, so columns are addressed by integer position (0, 1, ...).
all_data = pd.read_csv(TRAINING_DATA_PATH,index_col = False, header = None)
# Evaluation tweets: the first line of the file is skipped and columns are integer-named.
# The evaluation functions read column 0 as the tweet text and compare predictions to column 1.
live_tweets = pd.read_csv(PREDICTED_DATA_PATH, skiprows = [0], header=None)  
    
#Gathers all the keywords from the FoodKeeper database
#def foodKeeperInfo():
def foodKeeperInfo(spreadsheet):
    """Collect a de-duplicated list of normalised keywords from column ``0``.

    Each string found in ``spreadsheet[0]`` is normalised by:
      * replacing the connective ``" or "`` with a single space,
      * replacing ``/`` and ``,`` with spaces,
      * trimming leading/trailing whitespace,
      * lower-casing.

    Args:
        spreadsheet: Any object for which ``spreadsheet[0]`` yields an iterable
            of cell values (for example a DataFrame with a column named ``0``).

    Returns:
        list[str]: Normalised keywords in first-seen order, without duplicates.
    """
    keywords = []
    seen = set()  # O(1) membership test; `keywords` keeps first-seen order
    for word in spreadsheet[0]:
        # Empty cells arrive from pandas as NaN (a float) or None; they have no
        # string methods and carry no keyword, so skip them instead of crashing.
        if not isinstance(word, str):
            continue
        # TODO: break up phrases with commas like "Pastries, danish" into separate words
        word = word.replace(" or ", " ")
        word = re.sub('[/,]', ' ', word)
        word = word.strip().lower()
        if word not in seen:
            seen.add(word)
            keywords.append(word)
    return keywords


def preProcess(tweet):
    """Normalise a tweet before it is handed to the model.

    Steps, in order:
      1. lower-case the text;
      2. delete ``@username`` mentions (they are removed, not replaced by a placeholder);
      3. delete URLs (anything starting with ``http`` up to the next whitespace);
      4. delete ``.``, ``,`` and ``-`` characters;
      5. replace the HTML entity ``&amp;`` with the word ``and``.

    Args:
        tweet (str): Raw tweet text.

    Returns:
        str: The cleaned tweet. Whitespace around removed tokens is left as is.
    """
    tweet = tweet.lower()
    # The text is already lower-case, so a-z suffices. The previous class used the
    # range `A-z`, which also swallowed the punctuation [ \ ] ^ ` that sits between
    # 'Z' and 'a' in ASCII; the underscore is now listed explicitly instead.
    tweet = re.sub(r'@[a-z0-9_]*', '', tweet)
    # Consume the whole URL up to the next whitespace. The previous character class
    # stopped at '-', '?', '=', '_' outside it and left URL fragments in the text.
    tweet = re.sub(r'http\S*', '', tweet)
    tweet = re.sub(r'[.,-]+', '', tweet)
    tweet = tweet.replace('&amp;', 'and')
    return tweet


#---------------------------------------------

# NOTE(review): despite its name, this list is built from column 0 of live_tweets
# (the evaluation tweets), not from the FoodKeeper sheet -- confirm this is intended.
foodKeeperKeywords = foodKeeperInfo(live_tweets)
print(foodKeeperKeywords)

# Clean every training tweet in one column-level assignment. Writing through
# chained indexing (all_data[0][i] = ...) may update a temporary copy instead of
# the frame, and does nothing at all when pandas Copy-on-Write is enabled.
all_data[0] = all_data[0].map(preProcess)

['@tismenic70 gold (fish) finger ðÿ¥º', 'retail has taught me that those bitches in math problems- ya know  jonny bought 200 boxes of orange juice and twice that amount of apple juice- yeah those people are fucking real. yes  someone actually bought 600 boxes of juice. itâ€™s actually more common than you think', '@rock_n_rye @cardpurchaser @upperdecksports not at all but hey thatâ€™s y itâ€™s called gambling', "@euphorsla when you're making spaghetti sauce from scratch  yes sugar is necessary and it is also a banger in rice ðÿ˜œ", '@pinkyd124 oh mary ann without doubt! i mean  she could whip up a coconut cream pie with no modern conveniences ðÿ¥¥ðÿ¥§ https:  t.co j8glry41v1', "@hothotcocoa05 one thing i do is make smoothies  put in banana  then add berries other fruit  lettuce spinach  and soy other milk. the banana is sweet and the milk cuts the flavor if it's strong.", 'a rhubarb cobbler where the sweet-tart flavors sing https:  t.co y1nstix088 https:  t.co imejy6p8xn', '@josh550am 

In [3]:
# code with spacy

    
def ent_recognize(text):
    """Run the module-level ``nlp`` pipeline on ``text`` and render its entities with displaCy."""
    parsed = nlp(text)
    displacy.render(parsed, style="ent")
    
def predict(tweet):
    """Render the entities of ``tweet`` with displaCy, but only when at least one was found.

    ``tweet`` is converted with ``str()`` before being passed to the module-level
    ``nlp`` pipeline. Nothing is rendered when the document has no entities.
    """
    parsed = nlp(str(tweet))
    if not parsed.ents:
        return
    displacy.render(parsed, style="ent")

# update this function to return the individual word
# Pipelines already loaded by _load_model_cached, keyed by the path they came from.
_MODEL_CACHE = {}


def _load_model_cached(path):
    """Load the spaCy pipeline stored at ``path`` once and reuse it on later calls."""
    if path not in _MODEL_CACHE:
        _MODEL_CACHE[path] = spacy.load(path)
    return _MODEL_CACHE[path]


def returnPrediction(tweet, model=None):
    """Return the text of the first ``FOOD`` entity found in ``tweet``.

    Args:
        tweet: The tweet to analyse; it is converted with ``str()`` first.
        model: Optional callable that turns text into a document exposing
            ``.ents`` (each entity having ``.label_`` and ``.text``). When
            omitted, the pipeline at ``MODEL_PATH`` is used. That pipeline is
            loaded from disk only on the first call and cached afterwards;
            previously it was reloaded for every single tweet.

    Returns:
        str | None: Text of the first entity labelled ``'FOOD'``, or ``None``
        when the tweet contains no such entity.

    Note:
        Only the first match is returned. Returning every food mentioned in a
        tweet is a planned extension.
    """
    if model is None:
        model = _load_model_cached(MODEL_PATH)
    doc = model(str(tweet))
    for ent in doc.ents:
        if ent.label_ == 'FOOD':
            return ent.text
    return None
    
def get_predictions():
    """Predict one food word per tweet in column 0 of ``live_tweets``.

    Returns:
        list[str]: ``str(returnPrediction(tweet))`` for every tweet, in order.
        A tweet without a prediction therefore yields the string ``'None'``.
    """
    return [str(returnPrediction(tweet)) for tweet in live_tweets[0].tolist()]
    
def eval_model():
    """Print the error ratio and a per-tweet verdict table for the model.

    A prediction counts as ``Correct`` when its string equals the string form
    of the label in column 1 of ``live_tweets``, or when no food was predicted
    (``'None'``) and the label is missing (``'nan'``). Everything else is
    ``Wrong``.

    Returns:
        None. All results are printed.
    """
    predictions = get_predictions()
    if not predictions:
        # Avoid dividing by zero below when there is nothing to evaluate.
        print("No predictions to evaluate.")
        return

    # Positional list of labels, stringified the same way the predictions are.
    expected = [str(label) for label in live_tweets[1].tolist()]

    listOfIncorrect = []
    for predicted, label in zip(predictions, expected):
        is_correct = predicted == label or (predicted == 'None' and label == 'nan')
        listOfIncorrect.append('Correct' if is_correct else 'Wrong')
    numWrong = listOfIncorrect.count('Wrong')

    print("Ratio of incorrect predictions: ", round(numWrong/len(predictions),3), "\n")

    print("{:<6s}{:<20s}{:<10s}".format("Count", "Predicted Word", "Accuracy"))
    print("---------------------------")

    # One row per tweet: running count, predicted word, verdict.
    for count, (item1, item2) in enumerate(zip(predictions, listOfIncorrect), start=1):
        print("{:<6d}{:<20s}{:<10s}".format(count, item1, item2))

def show_tp():
    """Print and count the tweets whose predicted word equals the label in column 1.

    Returns:
        int: Number of tweets for which ``str(prediction) == str(label)``.
    """
    tweets = live_tweets[0].tolist()
    predictions = get_predictions()
    hits = 0
    for idx, predicted in enumerate(predictions):
        if str(predicted) != str(live_tweets[1][idx]):
            continue
        print("True positives:", tweets[idx], "\n")
        hits += 1
    print("Number of True Positives: ", hits, "\n")
    print("Ratio of True Positives: ", round(hits/len(predictions),3), "\n")
    return hits
    
def show_tn():
    """Print and count the tweets with no prediction (``'None'``) and no label (``'nan'``).

    Returns:
        int: Number of such tweets.
    """
    predictions = get_predictions()
    tweets = live_tweets[0].tolist()
    hits = 0
    for idx, predicted in enumerate(predictions):
        if predicted != 'None' or str(live_tweets[1][idx]) != 'nan':
            continue
        print("True Negative:", tweets[idx], "\n")
        hits += 1
    print("Number of True Negatives: ", hits, "\n")
    print("Ratio of True Negatives: ", round(hits/len(predictions),3), "\n")
    return hits
    
def show_fn():
    """Print and count the tweets with no prediction (``"None"``) although a label is present.

    Returns:
        int: Number of such tweets.
    """
    predictions = get_predictions()
    tweets = live_tweets[0].tolist()
    misses = 0
    for idx, predicted in enumerate(predictions):
        if predicted != "None" or str(live_tweets[1][idx]) == 'nan':
            continue
        print("False Negative:", tweets[idx], "\n")
        misses += 1
    print("Number of False Negatives: ", misses, "\n")
    print("Ratio of False Negatives: ", round(misses/len(predictions),3), "\n")
    return misses
    
def show_fp():
    """Count the tweets with a prediction although the label is missing (``'nan'``).

    For each such tweet a heading is printed and, when the module-level ``nlp``
    pipeline finds entities in it, they are rendered with displaCy.

    Returns:
        int: Number of such tweets.
    """
    predictions = get_predictions()
    tweets = live_tweets[0].tolist()
    false_alarms = 0
    for idx, predicted in enumerate(predictions):
        if predicted == "None" or str(live_tweets[1][idx]) != 'nan':
            continue
        false_alarms += 1
        print("False Positive:")
        parsed = nlp(str(tweets[idx]))
        if parsed.ents:
            displacy.render(parsed, style="ent")
    print("Number of False Positives: ", false_alarms, "\n")
    print("Ratio of False Positives: ", round(false_alarms/len(predictions),3), "\n")
    return false_alarms

#test_data = pd.read_csv(TEST_DATA_PATH)
# Same file as live_tweets, but read with pandas' default header handling.
# NOTE(review): test_data is not used by any uncommented code in the visible cells.
test_data = pd.read_csv(PREDICTED_DATA_PATH)

#y = test_data['food'].tolist()
# Module-level pipeline read by ent_recognize, predict and show_fp; it must be
# assigned before any of those functions is called.
nlp = spacy.load(MODEL_PATH)
#nlp = spacy.blank("en")
#print(nlp.pipe_names)
#ent_recognize("My rice cakes is tasty")
#ent_recognize(" peanut butter and jelly is the classic for me stuff that includes fruit sugars (like apples) are a good choice too imo")


# # Checking for overlapping words
   
#     print(data)
# keywords = foodKeeperInfo()
# testdata = convertToTrainingFormat("My rice cakes is tasty ", keywords)

# # for word in keywords:
# #     for word2 in wordsInFoodkeeper:
# #         if word in word2 and word != word2:
# #             print(word,word2)



# ## Use the function below to check individual sentences
# nlp = spacy.load("en_core_web_sm")
# ent_recognize("my friend is chicken because he is scared")
# print(live_tweets)
# testTweets = live_tweets[5]
# for tweet in testTweets[:500]:
#     if nlp(preProcess(tweet)).ents:
#         ent_recognize(preProcess(tweet))


# ## Use the function below to check model performance on the entire test set
eval_model()

# ## Use the functions below to see TP, TN, FP, FN respectively
tp = show_tp()
tn = show_tn()
fn = show_fn()
fp = show_fp()

# NOTE(review): a tweet whose prediction and label are both present but differ is
# counted by none of the four functions above, so it does not lower precision or
# recall here -- confirm that this is the intended metric definition.
# Both ratios are undefined when their denominator is zero; report NaN instead of
# raising ZeroDivisionError.
print("Precision: ", tp/(tp+fp) if (tp+fp) else float("nan"))
print("Recall: ", tp/(tp+fn) if (tp+fn) else float("nan"))

# foodkeeper = foodKeeperInfo()
# print(foodkeeper)
# sortedKeywords =  sorted(keywordRanker, key=keywordRanker.get, reverse=True)

# for i in range(15): #sortedKeywords
#     keywords.append(sortedKeywords[i])
# print(keywords)
# keywords.append("chicken")
# #keywords.append("cream cheese")

# abc = convertToTrainingFormat("I like to eat cream and cheese with chicken test", keywords)
# print(abc)

# # See what keywords are found by the created model
# keywordsFound = []
# nlp = spacy.load(MODEL_PATH)
# print(nlp.pipeline)
# for tweet in live_tweets[5]: #test_data['tweet']:
#     modeledTweet = nlp(preProcess(tweet))
#     for token in modeledTweet.doc.ents:
#         if str(token) in keywordsFound: continue
#         keywordsFound.append(str(token))


# # If the model finds food keywords print the Tweet
# To help visualize which keywords are being found, this loop iterates through the Tweets and runs the model on each one to see what keywords it finds. If it finds a keyword in a Tweet, it prints the Tweet and highlights the keyword.

# for tweet in test_data['tweet'][:500]:
#     ents = nlp(preProcess(tweet))
#     #if ents.doc.ents:
#     ent_recognize(preProcess(tweet))



Ratio of incorrect predictions:  0.552 

Count Predicted Word      Accuracy  
---------------------------
1     None                Correct   
2     boxes               Wrong     
3     None                Correct   
4     spaghetti sauce     Correct   
5     coconut cream       Wrong     
6     berries             Wrong     
7     None                Correct   
8     None                Correct   
9     water               Wrong     
10    mango               Wrong     
11    Cinnamon            Wrong     
12    turkey bacon        Wrong     
13    Last                Wrong     
14    None                Wrong     
15    None                Correct   
16    flour               Wrong     
17    None                Correct   
18    pecans              Wrong     
19    None                Correct   
20    barbecue sauce      Correct   
21    cinnamon            Wrong     
22    beef                Wrong     
23    cherimoya           Wrong     
24    None                Correct   
25    

True positives: @EUPHORSlA When you're making spaghetti sauce from scratch, yes sugar is necessary and it is also a banger in rice ðŸ˜Œ 

True positives: @Manya_bts7 Nope, but I can't stand spicy food or barbecue sauce so ðŸ˜¬ 

True positives: @SmokeAndSomno @kiwi_sunset I refuse to thank fruit 

True positives: @MINSWH00RE mustard is nasty as hell and doesnâ€™t go with anything 

True positives: @mushHUNT1 Thank you my friend ðŸ˜Š I am looking forward to taking many pictures of mushrooms with cameras. 

True positives: this lady got a can of mini vegetables like for soups and broths and just dumped it out into a bowl and now sheâ€™s eating it like itâ€™s cereal. i could scream 

True positives: @ElbowRoomier Brazilian pulled pork, fried yuca, collard greens, yams with chorizo, and ginger beer for dinner and movies on the TV with bae. 

True positives: Cooking is happening. DW trying to caramelize (blacken?) onions for lentils and rice. â€œNot for Rockyâ€ being said a lot. https://t.

True Negative: @tismenic70 Gold (fish) finger ðŸ¥º 

True Negative: @Rock_n_Rye @CardPurchaser @UpperDeckSports Not at all but hey thatâ€™s y itâ€™s called gambling 

True Negative: A Rhubarb Cobbler Where the Sweet-Tart Flavors Sing https://t.co/y1NStIx088 https://t.co/imeJy6p8XN 

True Negative: @josh550am WE LIKE YOU GARDEN SALSA WE REALLY LIKE YOU 

True Negative: @shanshan_cha HELLO LEMONS DAD CAN I COME AND BUY BAMBOO SHOOTS FROM YOU 

True Negative: @anorexiccore WHAT !! ITS NOT FAIR ugh canâ€™t they make phones with all the good qualities damn 

True Negative: @sophandbud @giblets_s Maybe we should ask #WeeWillyWinkie??? #FreeWilly ... 

True Negative: CELERY ROOT #deliciousandnutritious shop now at https://t.co/qlZsutdoV8 https://t.co/2JQrEo8NTn 

True Negative: Kitchen Waitress Serves Dirty Anchovies https://t.co/HFmb5tKJyi 

True Negative: @Toastcat4618 Polenta!!! Delicious! Enjoy! 

True Negative: @kittykatlady505 @MysterySolvent soooo pork rinds? 

True Negative: https://t

KeyboardInterrupt: 