# Food Recommender (based off Burpple Reviews)
### By: Team Omnomnom 

#### Annabella Lee, Bernice Tan, Fernanda Tan, Rachel Khong

## Contents
+ Part 1: Web Scraping
+ Part 2: Data Preparation
    + Cleaning
    + Rating
+ Part 3: Telegram Bot

## Part 1: Web Scraping

### Load the packages

In [None]:
from datetime import datetime
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import pandas as pd

### Chrome Setup

In [None]:
# Configure Chrome to start with an unpacked extension loaded.
chrome_options = webdriver.ChromeOptions()
# NOTE(review): machine-specific absolute path to an extension directory (presumably an ad blocker, going by the variable name) — adjust before running elsewhere.
adblock = r'C:\Users\user\AppData\Local\Google\Chrome\User Data\Profile 2\Extensions\gighmmpiobklfepjocnamgkkbiglidom\4.15.0_0'
chrome_options.add_argument("load-extension=" + adblock)

# NOTE(review): machine-specific absolute path to the chromedriver binary — adjust before running elsewhere.
chromedriver = r'C:\Users\user\Documents\Scraping\chromedriver.exe'

# Start the browser session that all the scraping cells below drive through `driver`.
driver = webdriver.Chrome(executable_path=chromedriver,options=chrome_options)

### Get list of places (the urls) from a neighbourhood

In [None]:
# Open the Burpple search results for the query "Bugis"; the following cells scrape this page.
driver.get(r'https://www.burpple.com/search/sg?q=Bugis')

In [None]:
# Keep clicking the "view more" link until it can no longer be clicked.
# Timestamps are printed before and after so the elapsed time is visible.
print(datetime.now())
while True:
    try:
        more_link = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, "masonryViewMore-link"))
        )
        ActionChains(driver).move_to_element(more_link).click(more_link).perform()
        sleep(2)
    except Exception as e:
        # Any failure ends the loop (presumably the wait timing out once the link is gone).
        print(e)
        break

print(datetime.now())

In [None]:
# Collect the venue page URL from every search-result card currently loaded.
# find_elements(By.CSS_SELECTOR, ...) is used instead of the
# find_elements_by_css_selector helper, which newer Selenium releases no
# longer provide; the By-based call works on both old and new versions.
link_element = driver.find_elements(
    By.CSS_SELECTOR,
    "div[class='searchVenue-header card-item card-item--header']>a")
link = [anchor.get_attribute("href") for anchor in link_element]

In [None]:
# Number of venue links collected from the search page.
len(link)

In [None]:
# Persist the collected venue URLs (one per row) to links.csv in the working directory.
lk = pd.DataFrame(link)
lk.to_csv("links.csv", index=False)

### Scraping for the Dataset

In [None]:
# Accumulators filled by the scraping loop: one tuple per review / per venue.
reviews, places = [], []

In [None]:
# Visit every venue page, expand all of its reviews, then record one row for
# the venue in `places` and one row per review card in `reviews`.
for url in link:
    driver.get(url)

    # Keep clicking "load more reviews" until the button stops becoming clickable.
    while True:
        try:
            load_more = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.ID, "load-more-reviews")))
            # Build a fresh ActionChains for every click (as the search-page
            # loop does): a reused chain keeps its previously queued actions,
            # so each perform() would replay all the earlier clicks as well.
            ActionChains(driver).move_to_element(load_more).click().perform()
        except TimeoutException:
            break

    sleep(5)

    # Venue-level details.
    place = driver.find_element(By.CSS_SELECTOR, "h1[class='venue-title']>a").text

    address = driver.find_element(
        By.CSS_SELECTOR, "div[class='venue-details__item-body']>p").text
    address = address.replace("\n", " ")

    venue_tag = [tag.text
                 for tag in driver.find_elements(By.CSS_SELECTOR, "a[class='venue-tag']")]

    # Stored under its own name so the loop variable `url` is not overwritten.
    place_url = driver.current_url

    places.append((place, address, venue_tag, place_url))

    # One row per review card; cards without body text are recorded as "None".
    cards = driver.find_elements(By.CSS_SELECTOR, "div[class='food card feed-item']")
    for card in cards:
        user_url = card.find_element(
            By.CSS_SELECTOR, "div[class='card-item-set--link-title']>a").get_attribute("href")
        username = user_url.replace("https://www.burpple.com/@", "")

        try:
            review = card.find_element(
                By.CSS_SELECTOR, "div[class='food-description-body']>p").text
        except NoSuchElementException:
            review = "None"

        reviews.append((place, username, review))

    # Pause between venues before requesting the next page.
    sleep(3)

In [None]:
# Close the browser and end the WebDriver session now that scraping is finished.
driver.quit()

In [None]:
# Save the scraped rows under the hyphenated file names that Part 2 loads
# ("reviews-bugis.csv" / "places-bugis.csv"). The earlier underscore names
# did not match the names passed to pd.read_csv in Part 2.
# Column names are set on the frames themselves so they are also usable in memory.
rv = pd.DataFrame(reviews, columns=['Place', 'Username', 'Review'])
rv.to_csv("reviews-bugis.csv", index=False)

pl = pd.DataFrame(places, columns=['Place', 'Address', 'Tags', 'url'])
pl.to_csv("places-bugis.csv", index=False)

## Part 2: Data Preparation

### Load the packages

In [1]:
import re
import pandas as pd
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from snownlp import SnowNLP
from nltk import word_tokenize
from nltk.stem import PorterStemmer

In [2]:
# Load the places table and the reviews table from CSV files in the working directory.
pl = pd.read_csv("places-bugis.csv")
rv = pd.read_csv("reviews-bugis.csv")

### Data Cleaning

In [3]:
# Remove duplicates (reassigning instead of inplace=True keeps the cell easy to follow)
pl = pl.drop_duplicates(keep="first")
rv = rv.drop_duplicates(keep="first")

# Remove rows with no review (the scraper stores the placeholder text 'None' for those)
rv = rv[rv.Review != 'None']

# Keep only reviews that contain at least one letter.
# na=False makes a missing review (NaN after read_csv) count as "no match";
# without it the mask contains NaN and the boolean indexing fails.
# .copy() gives an independent frame, so the rating columns added later are
# not written to a slice of the original.
rv = rv[rv['Review'].str.contains('[A-Za-z]', na=False)].copy()

In [4]:
# Report how many reviews remain after cleaning (fixes the duplicated "to" in the message).
print(len(rv),"reviews to work on.")

15071 reviews to to work on.


### Sentiment Analysis

In [6]:
# Use only TextBlob's polarity, shifted and scaled with (polarity + 1) * 2.5 so the score is out of 5.
rv['rating-textblob'] = rv['Review'].apply(
    lambda text: 2.5 * (TextBlob(text).sentiment.polarity + 1)
)

In [7]:
# Get only the compound score. Change the score to be upon 5.
# The analyzer is created once and reused for every review instead of being
# constructed again inside the lambda for each row.
analyzer = SentimentIntensityAnalyzer()
rv['rating-vader'] = rv['Review'].apply(
    lambda Review: (analyzer.polarity_scores(Review)['compound']+1)*2.5)

#### Some examples on the sentiment scoring differences between TextBlob and VADER 

In [8]:
# Show one review (positional column 2) next to its TextBlob (3) and VADER (4) ratings.
n = 0
print(
    rv.iloc[n, 2],
    "\nRating (TextBlob):", rv.iloc[n, 3],
    "\nRating (VADER):", rv.iloc[n, 4],
)

Food was good and delicious with generous portions. If anyone who planned to used Burpple, please ask the staff before you order. Because I failed to use it although that day was not the exclusion day listed by the Burpple but told that the restaurant was having in house event. Kinda disappointed, I did not enjoy anything from the event also ☹️. So, don't want to say much about it. 
Rating (TextBlob): 2.6875 
Rating (VADER): 0.6637499999999998


**Expected Output:**<br>
Food was good and delicious with generous portions. If anyone who planned to used Burpple, please ask the staff before you order. Because I failed to use it although that day was not the exclusion day listed by the Burpple but told that the restaurant was having in house event. Kinda disappointed, I did not enjoy anything from the event also ☹️. So, don't want to say much about it. <br>
Rating (TextBlob): 2.6875 <br>
Rating (VADER): 0.6637499999999998<br>
<br>
**Comment:** The reviewer complimented the food, but in only one sentence. The complaint that follows is much longer, and VADER took it into account, giving a far lower score than TextBlob.

In [9]:
# Show one review (positional column 2) next to its TextBlob (3) and VADER (4) ratings.
n = 38
print(
    rv.iloc[n, 2],
    "\nRating (TextBlob):", rv.iloc[n, 3],
    "\nRating (VADER):", rv.iloc[n, 4],
)

Bourbon Pecan Pie-The Winner! Seriously, sooo good 4.5/5 
Rating (TextBlob): 2.9583333333333335 
Rating (VADER): 4.356


**Expected Output:**<br>
Bourbon Pecan Pie-The Winner! Seriously, sooo good 4.5/5 <br>
Rating (TextBlob): 2.9583333333333335 <br>
Rating (VADER): 4.356<br>
<br>
**Comment:** The rating by VADER is actually pretty close to what the user explicitly gave in the review (4.5/5). Either VADER is good, or the reviewer is good at expressing feelings in words. Most probably both!

In [10]:
# Show one review with both ratings; note the row displayed is at position n-1.
n = 4646
print(
    rv.iloc[n-1, 2],
    "\nRating (TextBlob):", rv.iloc[n-1, 3],
    "\nRating (VADER):", rv.iloc[n-1, 4],
)

Ordered a Rosy Nose ($9.80), Salmon & Dill Quiche ($7), Salted Caramel Cheesecake ($7) and Galaxy Rainbow Cheesecake ($8). 
Rating (TextBlob): 3.75 
Rating (VADER): 2.5


**Expected Output:**<br>
Ordered a Rosy Nose (\\$9.80), Salmon & Dill Quiche (\\$7), Salted Caramel Cheesecake (\\$7) and Galaxy Rainbow Cheesecake (\\$8). <br>
Rating (TextBlob): 3.75 <br>
Rating (VADER): 2.5<br>
<br>
**Comment:** Purely stating what was ordered and the prices gives a neutral rating from VADER (2.5 out of 5).

#### Final bits to put together

In [12]:
# Drop the ratings by TextBlob, rename the one by VADER as the one we want to use
rv = rv.drop(columns='rating-textblob').rename(columns={'rating-vader': 'Rating'})

# Getting the mean rating of each place.
# The default (inner) merge keeps only places that have at least one rated review.
temp = rv.groupby(['Place'])['Rating'].mean().to_frame()
pl = pd.merge(pl, temp, on="Place")

# Round Rating to 2 d.p.
pl['Rating'] = pl['Rating'].round(decimals=2)

# Remove the [''] in the Tags column e.g. ['Desserts'] to Desserts.
# regex is passed explicitly because the default of str.replace differs
# between pandas versions and a bare "[" is not a valid regular expression;
# the character class strips every "[", "]" and "'" in one pass.
pl['Tags'] = pl['Tags'].str.replace(r"[\[\]']", "", regex=True)

# Tokenize tags to new dataframe: lower-case, drop the commas, then split into words
pl_tokens = pd.DataFrame()
pl_tokens['tags_tokenize'] = (
    pl['Tags']
    .str.lower()
    .str.replace(",", "", regex=False)
    .apply(word_tokenize)
)

In [13]:
# Write the rated places and their tokenized tags to two CSV files.
# Both are saved without the index, so the two files only correspond row by row (by position).
pl.to_csv("places-bugis-ratings.csv", index=False)
pl_tokens.to_csv("places-bugis-tags-tokens.csv", index=False)

In [14]:
# Gather every distinct lower-cased tag across all places and report them.
all_tags = set()
for place_tags in pl['Tags'].str.lower().str.split(', '):
    all_tags.update(place_tags)
print(all_tags)
print("Total:",len(all_tags),"tags")

{'', 'fine dining', 'pasta', 'mexican', 'cheap & good', 'buffets', 'french', 'dinner with drinks', 'breakfast & brunch', 'bubble tea', 'ice cream & yoghurt', 'chinese', 'indonesian', 'fast food', 'cafes & coffee', 'newly opened', 'korean', 'korean desserts', 'ramen', 'chicken rice', 'japanese', 'mediterranean', 'waffles', 'supper', 'burgers', 'bread', 'takeaway available', 'turkish', 'zi char', 'bak kut teh', 'desserts', 'taiwanese', 'korean bbq', 'hidden gem', 'rainy day comforts', 'chirashi', 'cakes', 'italian', 'fruit tea', 'peranakan', 'salads', 'pizza', 'vegetarian', 'western', 'malay', 'sandwiches', 'soup', 'thai', 'late night', 'nasi lemak', 'middle eastern', 'european', 'burpple guides', 'steamboat', 'bars', 'dim sum', 'steak', 'healthier choice', '1-for-1 deals', 'halal', 'indian', 'seafood', 'kopitiam', 'sushi', 'date night', 'spanish', 'vietnamese', 'healthy', 'hawker food', 'korean fried chicken', 'local delights', 'great view', 'interesting', 'bbq', 'craft beer', 'cocktail

## Part 3: Telegram Bot (@nom_what_bot)

In [None]:
import pandas as pd
import re
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import time
import logging
from telegram import ReplyKeyboardMarkup
from telegram.ext import (Updater, CommandHandler, MessageHandler, Filters,
                          ConversationHandler)

# Places (with ratings) and their tokenized tags; the handlers below treat
# row i of tag_tokens as the tags of row i of df_places.
df_places = pd.read_csv("places-bugis-ratings.csv")
tag_tokens = pd.read_csv("places-bugis-tags-tokens.csv")

# Enable logging
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                    level=logging.INFO)

logger = logging.getLogger(__name__)

# Conversation states used as keys in the ConversationHandler's `states` mapping.
START_ANOT, RECOMMEND, END_ANOT = range(3)

# Reply keyboard offered with the opening yes/no question.
yes_no_keyboard = [['No', 'Yes']]
yes_no_markup = ReplyKeyboardMarkup(yes_no_keyboard, one_time_keyboard=True)

# Reply keyboard offered after the recommendations: refine further or finish.
more_done_keyboard = [['No, not yet! (refine further)', 'Yeah, done!']]
more_done_markup = ReplyKeyboardMarkup(more_done_keyboard, one_time_keyboard=True)

def counts(row, tokens_filtered):
    """Return the total number of keyword occurrences in a row's tag text.

    The first value of `row` is taken as the tag string; each keyword in
    `tokens_filtered` is counted as a (non-overlapping) substring of it and
    the counts are summed.
    """
    tag_text = row.values[0]
    return sum(tag_text.count(keyword) for keyword in tokens_filtered)

def start(update, context):
    """Greet the user by first name and ask whether they want help choosing.

    Returns START_ANOT so the conversation next waits for the yes/no answer.
    """
    first_name = update.message.from_user.first_name
    greeting = "Omnomnom 🍪 " + first_name
    context.bot.send_message(chat_id=update.effective_chat.id, text=greeting)

    # Ask the opening question together with the Yes/No reply keyboard.
    update.message.reply_text("Need help deciding where to eat?", reply_markup=yes_no_markup)
    return START_ANOT

def yes_start(update, context):
    """Ask the user for food keywords and move the conversation to RECOMMEND.

    The example keyword is spelled "Dim Sum" so that it matches the
    "dim sum" tag in the data (it previously read "Dim Sim").
    """
    update.message.reply_text('Alright! Whatchu feeling? \nKeyword Examples: Pasta / Sushi / Dim Sum / Steak / etc. ')
    return RECOMMEND

def no_start(update, context):
    """Acknowledge that no help is wanted and end the conversation."""
    farewell = "Okay that's fine. Use /start when you need help again :)"
    update.message.reply_text(farewell)
    return ConversationHandler.END
    
def recommend(update, context):
    """Rank places against the user's keywords and send the best matches.

    The message text is lower-cased, tokenized, stripped of English stop
    words and of tokens containing digits or punctuation, and stemmed. Each
    place is scored by how often the keywords occur in its tag tokens; places
    are ordered by that score and then by Rating (both descending) and up to
    five are sent.

    Returns END_ANOT so the conversation next asks whether to refine or stop.
    """
    text = update.message.text

    # Build the stop-word set once instead of reloading the list for every token.
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()
    # NOTE(review): keywords are stemmed but the tag tokens are not, so e.g.
    # "rainy" becomes "raini" and does not match the tag "rainy" — confirm intended.
    tokens_filtered = [
        ps.stem(x)
        for x in word_tokenize(text.lower())
        if x not in stop_words
        and not re.search("[-0123456789`>(</',;:!?.)&]", x)
    ]

    # Rank a copy rather than sorting df_places in place. tag_tokens always
    # stays in file order, so once df_places had been sorted and re-indexed,
    # every later request attached the match counts to the wrong places.
    match_counts = tag_tokens.apply(lambda row: counts(row, tokens_filtered), axis=1)
    ranked = (
        df_places
        .assign(match_counts=match_counts)
        .sort_values(by=['match_counts', 'Rating'], ascending=False)
        .reset_index(drop=True)
    )

    # Place, Rating, Location, Tags, url — one entry per place, top five at most.
    entries = []
    for rank, (_, place) in enumerate(ranked.head(5).iterrows(), start=1):
        # A place saved with no tags is read back from the CSV as NaN, not as a string.
        tags = place['Tags'] if isinstance(place['Tags'], str) else ""
        entries.append(
            str(rank) + ". " + place['Place']
            + "\nRating: " + str(place['Rating'])
            + "\n" + place['Address']
            + "\nRelated Tags: " + tags
            + "\n" + place['url']
        )

    time.sleep(1)
    context.bot.send_message(
        chat_id=update.effective_chat.id,
        disable_web_page_preview=True,
        text="\n\n".join(entries)
    )

    time.sleep(1)
    update.message.reply_text(
        'Done making up your mind on where to eat already? Or refine your recommendations further?',
        reply_markup=more_done_markup)
    return END_ANOT

def more_keyword(update, context):
    """Ask for further keywords and send the conversation back to RECOMMEND."""
    prompt = "Okay more! Tell me more keywords to refine your recommendations."
    update.message.reply_text(prompt)
    return RECOMMEND

def done(update, context):
    """Say goodbye and end the conversation."""
    farewell = "Nice to have helped! Use /start when you need help again :)"
    update.message.reply_text(farewell)
    return ConversationHandler.END

def main():
    """Build the Telegram bot, register the conversation flow and poll until stopped.

    The bot token is read from the TELEGRAM_BOT_TOKEN environment variable so
    it does not have to be written into the notebook.

    Raises:
        RuntimeError: if TELEGRAM_BOT_TOKEN is unset or empty.
    """
    import os  # local import: the notebook's import lines do not include os

    # The original placeholder `telegram-bot-token` is not a valid expression
    # (it parses as a subtraction of undefined names), so read the real token
    # from the environment instead.
    token = os.environ.get("TELEGRAM_BOT_TOKEN")
    if not token:
        raise RuntimeError("Set the TELEGRAM_BOT_TOKEN environment variable to your bot token.")

    # Create the Updater and pass it your bot's token.
    # Make sure to set use_context=True to use the new context based callbacks
    # Post version 12 this will no longer be necessary
    updater = Updater(token, use_context=True)

    # Get the dispatcher to register handlers
    dp = updater.dispatcher

    # Add conversation handler with the states START_ANOT, RECOMMEND and END_ANOT
    conv_handler = ConversationHandler(
        entry_points=[CommandHandler('start', start)],

        states={
            START_ANOT: [MessageHandler(Filters.regex(re.compile('(yes|yas|yea|yeah|ye)', re.IGNORECASE)),
                                        yes_start),
                        MessageHandler(Filters.regex(re.compile('(no|nah)', re.IGNORECASE)),
                                       no_start)
                        ],
            RECOMMEND: [MessageHandler(Filters.text,
                                       recommend)
                       ],
            # The parentheses of the button label are escaped: unescaped they
            # form a regex group, so the literal text
            # "No, not yet! (refine further)" sent by the keyboard never matched.
            END_ANOT: [MessageHandler(Filters.regex(re.compile(r'^(No, not yet! \(refine further\)|more)$', re.IGNORECASE)),
                                      more_keyword),
                       MessageHandler(Filters.regex(re.compile('^(Yeah, done!|done)$', re.IGNORECASE)),
                                      done),
                      ],
        },

        fallbacks=[MessageHandler(Filters.regex('^Done$'), done)]
    )

    dp.add_handler(conv_handler)

    # Start the Bot
    updater.start_polling()

    # Run the bot until you press Ctrl-C or the process receives SIGINT,
    # SIGTERM or SIGABRT. This should be used most of the time, since
    # start_polling() is non-blocking and will stop the bot gracefully.
    updater.idle()

if __name__ == '__main__':
    main()

2020-07-22 06:33:22,772 - telegram.ext.updater - INFO - Received signal 2 (SIGINT), stopping...


In [1]:
# for testing
import pandas as pd
import re
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
# Reload both tables from disk so this test cell starts from unsorted data.
df_places = pd.read_csv("places-bugis-ratings.csv")
tag_tokens = pd.read_csv("places-bugis-tags-tokens.csv")

def counts(row, tokens_filtered):
    """Return the total number of keyword occurrences in a row's tag text.

    The first value of `row` is taken as the tag string; each keyword in
    `tokens_filtered` is counted as a (non-overlapping) substring of it and
    the counts are summed.
    """
    tag_text = row.values[0]
    return sum(tag_text.count(keyword) for keyword in tokens_filtered)

def recommend():
    """Offline check of the ranking logic with a fixed test sentence.

    Adds a `match_counts` column to the global df_places and sorts it in
    place by match count and then Rating (both descending), so the cell can
    display the ranked frame afterwards. Returns nothing.
    """
    text="some hot soup for the rainy day?" # user input test

    # Build the stop-word set once instead of reloading the list for every token.
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()
    tokens_filtered = [
        ps.stem(x)
        for x in word_tokenize(text.lower())
        if x not in stop_words
        and not re.search("[-0123456789`>(</',;:!?.&)]", x)
    ]

    df_places['match_counts'] = tag_tokens.apply(lambda row: counts(row, tokens_filtered), axis=1)
    df_places.sort_values(by = ['match_counts', 'Rating'], ascending=False, inplace=True)
    df_places.reset_index(drop = True, inplace=True)

# Run the test query, then show the top rows of df_places (recommend() sorts it in place).
recommend()
df_places.head()

Unnamed: 0,Place,Address,Tags,url,Rating,match_counts
0,Blanco Court Fried Fish Noodles,325 Beach Road Singapore 199559,"Rainy Day Comforts, Soup, Hawker Food",https://www.burpple.com/blanco-court-fish-soup,3.75,2
1,Angel Horse Teochew Fish Soup (Albert Centre M...,270 Queen Street #01-95 Albert Centre Market &...,"Rainy Day Comforts, Hawker Food",https://www.burpple.com/angel-horse-teochew-fi...,4.79,1
2,Xun Wei Hotpot,28 Liang Seah Street Singapore 189049,"Rainy Day Comforts, Buffets, Supper, Chinese, ...",https://www.burpple.com/xun-wei-hotpot,4.63,1
3,Beauty Nutritious Soup (Bugis Junction),200 Victoria Street #03-30 Bugis Junction Food...,"Soup, Hawker Food",https://www.burpple.com/beauty-nutritious-soup...,4.41,1
4,Kuan Kuan Spicy Hotpot & Nourishing Soup,32 Liang Seah Street Singapore 189053,"Rainy Day Comforts, Chinese",https://www.burpple.com/kuan-kuan-spicy-hotpot,4.41,1
