In [2]:
#!/usr/bin/env python
# coding: utf-8

# ## Evaluation Notebook

# !python -m spacy download en_core_web_sm
#pip install spacy
import random
import pandas as pd
from tqdm import tqdm
import spacy
from spacy.tokens import DocBin
import re
import os
from spacy import displacy
from sklearn import metrics
#from sklearn.metrics import plot_confusion_matrix

from spacy.util import filter_spans

# Input/output locations, all relative to the notebook's working directory.
# FoodKeeper workbook; its "Product" sheet is read into food_data below.
FOODKEEPER_PATH = "datasets/FoodKeeper-Data.xls"
# Headerless CSV read into all_data below.
TRAINING_DATA_PATH = "datasets/data.csv"
# Directory of the trained spaCy pipeline handed to spacy.load().
MODEL_PATH = "output/model-last"
# Labelled tweets read into checked_tweets (text in column 0, 0/1 label in column 1).
TEST_DATA_PATH = "datasets/test_data.csv"
# NOTE(review): same file as TRAINING_DATA_PATH and not referenced in the code visible here.
REAL_TWITTER_DATA_PATH = "datasets/data.csv"
# CSV that is reloaded into test_data in the evaluation cell.
PREDICTED_DATA_PATH = "datasets/new_test_tweets.csv"
#STARTING_KEYWORD_COUNT = 10
#TRAINING_LOOP_ITERATIONS = 3
#REQUIRED_KEYWORDS = 3

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None

In [3]:
# Load the three inputs: the FoodKeeper "Product" sheet, the tweet corpus and the labelled test tweets.
food_data = pd.read_excel(FOODKEEPER_PATH, sheet_name = "Product")
# header=None: the file has no header row, so columns are addressed by position (all_data[0] is iterated as tweets below).
all_data = pd.read_csv(TRAINING_DATA_PATH,index_col = False, header = None)
#live_tweets = pd.read_csv(PREDICTED_DATA_PATH, skiprows = [0], header=None)  
# skiprows=[0] discards the file's first line; header=None then labels the columns 0, 1, ...
checked_tweets = pd.read_csv(TEST_DATA_PATH, skiprows = [0], header=None)  
print(checked_tweets)
#Gathers all the keywords from the FoodKeeper database
#def foodKeeperInfo():
def foodKeeperInfo(spreadsheet, column=0):
    """Collect the distinct, normalised keywords found in one column of a table.

    Each cell is cleaned by removing the connective " or ", turning '/' and ','
    into spaces, collapsing runs of whitespace, and lowercasing.

    Args:
        spreadsheet: Anything indexable by ``column`` that yields an iterable of
            cell values (for example a pandas DataFrame or a dict of lists).
        column: Label of the column to read. Defaults to 0, the column this
            function has always read.

    Returns:
        list[str]: Unique lowercase keywords in first-seen order. Cells that
        are not strings (e.g. NaN for blank spreadsheet cells) and cells that
        are empty after cleaning are skipped.
    """
    keywords = []
    seen = set()  # O(1) duplicate check; `keywords` keeps the first-seen order
    #for word in food_data['Name']: # food_data['Keywords']:
    for word in spreadsheet[column]:
        if not isinstance(word, str):
            # Blank cells come back from pandas as float NaN and have no .replace().
            continue
        word = word.replace(" or ", " ") # TO DO: break up phrases with commas like "Pastries, danish" into separate words
        word = re.sub('[/,]', ' ', word)
        # "Pastries, danish" becomes "Pastries  danish" after the substitution;
        # split/join trims the ends and collapses the doubled space.
        keyword = " ".join(word.split()).lower()
        if keyword and keyword not in seen:
            seen.add(keyword)
            keywords.append(keyword)

    return keywords


def preProcess(tweet):
    """Normalise a tweet for matching: lowercase it and strip noise.

    Steps, in order: lowercase; delete @mentions; delete URLs; delete the
    characters '.', ',' and '-'; replace the HTML entity '&amp;' with 'and'.

    Args:
        tweet: Raw tweet text (str).

    Returns:
        str: The cleaned text. Mentions and URLs are removed outright (no
        placeholder token is inserted) and whitespace is not re-collapsed, so
        a removed item can leave two adjacent spaces behind.
    """
    tweet = tweet.lower()
    # Handles consist of letters, digits and underscores. The previous class
    # used `A-z`, which also spans the ASCII punctuation between 'Z' and 'a'.
    tweet = re.sub(r'@[A-Za-z0-9_]*', '', tweet)
    # Consume the link up to the next whitespace so hyphens, query strings,
    # '%', '=' and similar characters do not survive as fragments.
    tweet = re.sub(r'http\S*', '', tweet)
    tweet = re.sub(r'[.,-]+', '', tweet)
    tweet = tweet.replace('&amp;', 'and')
    return tweet


#---------------------------------------------

# Build the food vocabulary from the FoodKeeper product names. foodKeeperInfo()
# reads column 0 of the table it is given, so the 'Name' column is exposed under
# that label. (Passing checked_tweets here made every "keyword" a whole tweet.)
foodKeeperKeywords = foodKeeperInfo(
    food_data["Name"].dropna().astype(str).to_frame(name=0)
)
#print(foodKeeperKeywords)

SAMPLE_SIZE = 250   # number of tweets to draw for the new test set
SAMPLE_SEED = 42    # fixed seed so Restart & Run All redraws the same sample

# Split each keyword into its words once, up front. Iterating a keyword string
# directly yields single characters, which turned the filter into "tweet
# contains every letter of the keyword". Keywords with no words are dropped
# because all([]) is True and would select every tweet.
keyword_terms = [terms for terms in (kw.split() for kw in foodKeeperKeywords) if terms]

# select tweets which contain every word of at least one foodKeeperKeyword
# (case-insensitive substring match, so "egg" also matches "eggs")
selected_tweets = []
for tweet in all_data[0]:
    if not isinstance(tweet, str):
        continue  # blank CSV cells are read as float NaN
    tweet_lower = tweet.lower()
    if any(all(term in tweet_lower for term in terms) for terms in keyword_terms):
        selected_tweets.append(tweet)

# then select up to SAMPLE_SIZE random tweets of these; this is the test_data.
# random.sample raises ValueError when asked for more items than are available.
sample_size = min(SAMPLE_SIZE, len(selected_tweets))
if sample_size < SAMPLE_SIZE:
    print("Only", sample_size, "tweets matched a keyword; sampling all of them")
test_data = random.Random(SAMPLE_SEED).sample(selected_tweets, sample_size)
test_data_df = pd.DataFrame(test_data, columns=['Random Tweet'])
# save test_data in the "random_test_tweets.csv" file
test_data_df.to_csv('datasets/random_test_tweets.csv', index=False)

# Read the random_test_tweets.csv file
random_test_data_df = pd.read_csv('datasets/random_test_tweets.csv')

# Read the new_test_tweets.csv file
new_test_data_df = pd.read_csv(PREDICTED_DATA_PATH)

# Update the 'Tweet' column in new_test_data_df with the random tweets from random_test_data_df.
# Assignment aligns on the row index; any other columns of the file are left untouched.
new_test_data_df['Tweet'] = random_test_data_df['Random Tweet']

# Save the updated new_test_data_df to the new_test_tweets.csv file
new_test_data_df.to_csv(PREDICTED_DATA_PATH, index=False)

                                                    0  1
0   just microwaved a kashi chicken and spinach th...  1
1   @amyg0716 thats really sad i wolud hate that! ...  1
2   @DTizzler and it took me my entire walk to the...  0
3   just finished cooking spag bol from scratch.. ...  1
4   Oh noooooo Kath is back from Annual Leave!!!!!...  0
..                                                ... ..
78  sick roomie gave me her cold my throats sore (...  0
79  No flying to Ponca City today for breakfast Oa...  1
80  just walked by marks&amp;spencers food n didn'...  1
81  if lucas till and taylor swift start dating i ...  0
82                          Bed time. Back to reality  0

[83 rows x 2 columns]


In [4]:
# code with spacy

    
def ent_recognize(text):
    """Run ``text`` through the module-level ``nlp`` pipeline and hand the result to displaCy.

    The parsed document is passed to ``displacy.render`` with the "ent" style.
    ``nlp`` must already be bound at module level when this is called.
    """
    displacy.render(nlp(text), style="ent")
    
def predict(tweet):
    """Render a tweet's entities with displaCy, but only if the pipeline found any.

    ``tweet`` is converted with ``str()`` before being parsed by the
    module-level ``nlp`` pipeline. Nothing is rendered when ``doc.ents`` is empty.
    """
    parsed = nlp(str(tweet))
    if not parsed.ents:
        return
    displacy.render(parsed, style="ent")

# update this function to return the individual word
# Pipelines already loaded by _get_model(), keyed by model path.
_LOADED_MODELS = {}


def _get_model(path):
    """Return the spaCy pipeline stored at ``path``, loading it from disk only once."""
    if path not in _LOADED_MODELS:
        _LOADED_MODELS[path] = spacy.load(path)
    return _LOADED_MODELS[path]


def returnPrediction(tweet, nlp=None):
    """Return the text of the first FOOD entity the model finds in a tweet.

    Args:
        tweet: The tweet to classify; converted with ``str()`` before parsing.
        nlp: Optional callable that maps text to a document exposing ``.ents``.
            When omitted, the pipeline at ``MODEL_PATH`` is used. It is loaded
            on the first call and reused afterwards, instead of being read
            from disk again for every tweet.

    Returns:
        str | None: ``text`` of the first entity whose ``label_`` is 'FOOD', or
        None when the document has no such entity.
    """
    if nlp is None:
        nlp = _get_model(MODEL_PATH)
    doc = nlp(str(tweet))
    # Future task: collect every FOOD entity into a list instead of stopping
    # at the first one.
    for ent in doc.ents:
        if ent.label_ == 'FOOD':
            return ent.text
    return None
    
def get_predictions():
    """Run returnPrediction over every entry of checked_tweets[0].

    Returns:
        list[str]: One string per tweet, in order. Each result is passed
        through ``str()``, so a tweet with no prediction appears as the
        literal string 'None'.
    """
    tweet_texts = checked_tweets[0].tolist()
    return [str(returnPrediction(text)) for text in tweet_texts]
# Computed once here and then read as a module-level global by eval_model() and the show_* functions.
predictions = get_predictions()
def eval_model(preds=None, labels=None):
    """Print the share of wrong predictions and a per-tweet Correct/Wrong table.

    A prediction counts as "food found" when its string form is not 'None'.
    It is Correct when food was found and the label is 1, or no food was found
    and the label is 0; every other combination is Wrong.

    Args:
        preds: Sequence of predicted words ('None' or None meaning no food).
            Defaults to the module-level ``predictions``.
        labels: Sequence of 0/1 human labels aligned with ``preds``.
            Defaults to column 1 of the module-level ``checked_tweets``.

    Returns:
        None. All results are printed.
    """
    if preds is None:
        preds = predictions
    if labels is None:
        labels = checked_tweets[1].tolist()

    # logic for determining if model's prediction matches human prediction
    verdicts = []
    for pred, label in zip(preds, labels):
        found_food = str(pred) != 'None'
        is_correct = (found_food and label == 1) or (not found_food and label == 0)
        verdicts.append('Correct' if is_correct else 'Wrong')

    numWrong = verdicts.count('Wrong')
    total = len(labels)
    # An empty test set has no errors; report 0.0 rather than dividing by zero.
    ratio = round(numWrong / total, 3) if total else 0.0
    print("Ratio of incorrect predictions: ", ratio, "\n")

    print("{:<6s}{:<20s}{:<10s}".format("Count", "Predicted Word", "Accuracy"))
    print("---------------------------")

    # One row per tweet, numbered from 1. str() keeps the 's' format spec
    # valid even if a caller passes None instead of 'None'.
    for count, (pred, verdict) in enumerate(zip(preds, verdicts), start=1):
        print("{:<6d}{:<20s}{:<10s}".format(count, str(pred), verdict))

def _count_outcome(title, want_prediction, want_label, preds=None, labels=None):
    """Count, print and return one cell of the confusion matrix.

    Args:
        title: Name used in the printed lines, e.g. "True Positives".
        want_prediction: True to count tweets where the model produced a word
            (string form is not 'None'), False to count tweets where it did not.
        want_label: The human label (0 or 1) a tweet must carry to be counted.
        preds: Predicted words; defaults to the module-level ``predictions``.
        labels: 0/1 labels aligned with ``preds``; defaults to column 1 of the
            module-level ``checked_tweets``.

    Returns:
        int: Number of tweets matching both conditions.
    """
    if preds is None:
        preds = predictions
    if labels is None:
        labels = checked_tweets[1].tolist()
    counter = sum(
        1
        for pred, label in zip(preds, labels)
        if (str(pred) != 'None') == want_prediction and label == want_label
    )
    total = len(labels)
    # Avoid ZeroDivisionError on an empty test set.
    ratio = round(counter / total, 3) if total else 0.0
    print("Number of " + title + ": ", counter, "\n")
    print("Ratio of " + title + ": ", ratio, "\n")
    return counter


def show_tp(preds=None, labels=None):
    """Print and return the number of true positives (word predicted, label 1)."""
    return _count_outcome("True Positives", True, 1, preds, labels)


def show_tn(preds=None, labels=None):
    """Print and return the number of true negatives (nothing predicted, label 0)."""
    return _count_outcome("True Negatives", False, 0, preds, labels)


def show_fn(preds=None, labels=None):
    """Print and return the number of false negatives (nothing predicted, label 1)."""
    return _count_outcome("False Negatives", False, 1, preds, labels)


def show_fp(preds=None, labels=None):
    """Print and return the number of false positives (word predicted, label 0)."""
    return _count_outcome("False Positives", True, 0, preds, labels)

#test_data = pd.read_csv(TEST_DATA_PATH)
# NOTE(review): this rebinds test_data, which the sampling cell above assigned as a list of tweets.
test_data = pd.read_csv(PREDICTED_DATA_PATH)

#y = test_data['food'].tolist()
# Module-level pipeline that ent_recognize() and predict() read as a global.
nlp = spacy.load(MODEL_PATH)
#nlp = spacy.blank("en")
#print(nlp.pipe_names)
#ent_recognize("My rice cakes is tasty")
#ent_recognize(" peanut butter and jelly is the classic for me stuff that includes fruit sugars (like apples) are a good choice too imo")


# # Checking for overlapping words
   
#     print(data)
# keywords = foodKeeperInfo()
# testdata = convertToTrainingFormat("My rice cakes is tasty ", keywords)

# # for word in keywords:
# #     for word2 in wordsInFoodkeeper:
# #         if word in word2 and word != word2:
# #             print(word,word2)



# ## Use the function below to check individual sentences
# nlp = spacy.load("en_core_web_sm")
# ent_recognize("my friend is chicken because he is scared")
# print(live_tweets)
# testTweets = live_tweets[5]
# for tweet in testTweets[:500]:
#     if nlp(preProcess(tweet)).ents:
#         ent_recognize(preProcess(tweet))


print(predictions)
# ## Use the function below to check model performance on the entire test set
eval_model()

# ## Use the functions below to see TP, TN, FN, FP respectively
tp = show_tp()
tn = show_tn()
fn = show_fn()
fp = show_fp()

# Precision is undefined when the model predicted no food at all (tp + fp == 0)
# and recall is undefined when no tweet is labelled as food (tp + fn == 0).
# Report NaN in those cases instead of raising ZeroDivisionError.
precision = tp / (tp + fp) if (tp + fp) else float("nan")
recall = tp / (tp + fn) if (tp + fn) else float("nan")
print("Precision: ", precision)
print("Recall: ", recall)

# foodkeeper = foodKeeperInfo()
# print(foodkeeper)
# sortedKeywords =  sorted(keywordRanker, key=keywordRanker.get, reverse=True)

# for i in range(15): #sortedKeywords
#     keywords.append(sortedKeywords[i])
# print(keywords)
# keywords.append("chicken")
# #keywords.append("cream cheese")

# abc = convertToTrainingFormat("I like to eat cream and cheese with chicken test", keywords)
# print(abc)

# # See what keywords are found by the created model
# keywordsFound = []
# nlp = spacy.load(MODEL_PATH)
# print(nlp.pipeline)
# for tweet in live_tweets[5]: #test_data['tweet']:
#     modeledTweet = nlp(preProcess(tweet))
#     for token in modeledTweet.doc.ents:
#         if str(token) in keywordsFound: continue
#         keywordsFound.append(str(token))


# # If the model finds food keywords print the Tweet
# To help visualize which keywords are being found this loop iterates through the Tweets testing the model to see what keywords it finds. If it finds a keyword in the Tweet it will print the Tweet and highlight the keyword

# for tweet in test_data['tweet'][:500]:
#     ents = nlp(preProcess(tweet))
#     #if ents.doc.ents:
#     ent_recognize(preProcess(tweet))



['chicken', 'milk', 'None', 'None', 'None', 'eggs', 'potatoes', 'None', 'None', 'None', 'None', 'None', 'salsa', 'salsa', 'None', 'None', 'ice cream', 'None', 'None', 'None', 'None', 'None', 'crab meat', 'butter', 'crab legs', 'crab legs', 'shellfish', 'quail', 'None', 'None', 'None', 'None', 'pastrami', 'pastrami', 'parsley', 'Some', 'None', 'None', 'None', 'chocolate', 'None', '@Sabbyaz', 'chocolate', 'None', 'None', 'None', 'buttermilk', 'popcorn', 'peas', 'None', 'None', 'None', 'pork', 'None', 'None', 'string cheese', 'cheese', 'None', 'None', 'chocolate', 'cheese', 'None', 'cheese', 'None', 'rabbit', 'None', 'None', 'None', 'pizza', 'Pizza', 'Pizza', 'pizza', 'pizza', 'rice', 'None', 'None', 'None', 'None', 'None', 'None', 'honey', 'None', 'None']
Ratio of incorrect predictions:  0.133 

Count Predicted Word      Accuracy  
---------------------------
1     chicken             Correct   
2     milk                Correct   
3     None                Correct   
4     None         