In [31]:
import json
import os
import re
import time
import warnings

warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns

from ipywidgets import widgets, interact

from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize, RegexpTokenizer, sent_tokenize
from nltk.corpus import stopwords
#nltk.download('averaged_perceptron_tagger')

from rake_nltk import rake

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from wordcloud import WordCloud, ImageColorGenerator

# User Input

First, we'll have the user select the type of destination they'd like recommendations for.

In [32]:
# categories of destination the recommender can handle
destination_options = ['Coffee Shop', 'Boutique Shop', 'Nightlife']

# prompt the user, then build the dropdown they pick a category from
print("Please select where you'd like to go.")
destination_category = widgets.Dropdown(options=destination_options)

# leaving the widget as the cell's last expression renders it below the cell
destination_category

Please select where you'd like to go.


Dropdown(options=('Coffee Shop', 'Boutique Shop', 'Nightlife'), value='Coffee Shop')

Next, we'll request that our user input the Yelp URL of one of their favorite local businesses which falls under the umbrella of the previously selected category. 

In [33]:
# ask the user for the Yelp page of a business they already like
yelp_url_prompt = "Please input the Yelp URL of your favorite local business relating to your selected destination category."
user_url = input(yelp_url_prompt)

Please input the Yelp URL of your favorite local business relating to your selected destination category.https://www.yelp.com/biz/butler-brooklyn-2?osq=coffee


# Scrape Review Data

First we'll post a request to have the review data scraped.

In [34]:
# endpoint the scrape request is posted to
request_url = "https://app.datashake.com/api/v2/profiles/add"

# url of yelp page to scrape
querystring = {"url": user_url,
               "blocks": 100}

# provide api key: read it from the environment so the secret is never stored in the notebook
# NOTE(review): a key that has been committed to a notebook should be revoked and reissued
api_token = os.environ.get("DATASHAKE_API_TOKEN")
if not api_token:
    raise RuntimeError("Set the DATASHAKE_API_TOKEN environment variable before running this cell.")
headers = {
    'spiderman-token': api_token,
}

# request and print response (the timeout keeps a dead connection from hanging the kernel)
response = requests.request("POST", request_url, headers=headers, params=querystring, timeout=30)
print(response.text)

# stop here on an HTTP error instead of failing later on a missing 'job_id' key
response.raise_for_status()

# obtain job_id, used by the following cells to poll for and fetch the scraped reviews
job_id = response.json()['job_id']

{"success":true,"job_id":126872120,"status":200,"message":"Added this profile to the queue..."}


This next cell will continuously check the status of our scrape request until fulfilled. 

In [35]:
# url to check status
url = "https://app.datashake.com/api/v2/profiles/info"

# specify job_id to check status of
querystring = {"job_id":job_id}

# specify token: read it from the environment so the secret is never stored in the notebook
payload = ""
api_token = os.environ.get("DATASHAKE_API_TOKEN")
if not api_token:
    raise RuntimeError("Set the DATASHAKE_API_TOKEN environment variable before running this cell.")
headers = {
    'spiderman-token': api_token,
    }

# seconds to wait between status checks, and the longest we are willing to wait overall
POLL_INTERVAL_SECONDS = 15
MAX_WAIT_SECONDS = 30 * 60

# keep checking the status of the scrape until it is 100% complete

# initialize percentage complete at zero
percentage_complete = 0
deadline = time.monotonic() + MAX_WAIT_SECONDS

while True:
    # check web scraper status
    response = requests.request("GET", url, data=payload, headers=headers, params=querystring, timeout=30)
    response.raise_for_status()
    # check percentage complete (a null value is treated as "not started yet")
    percentage_complete = response.json()['percentage_complete'] or 0
    # leave immediately once finished, without a trailing sleep
    if percentage_complete >= 100:
        break
    # give up with a clear error rather than looping forever on a stalled job
    if time.monotonic() >= deadline:
        raise TimeoutError("Scrape job %s did not finish within %d seconds." % (job_id, MAX_WAIT_SECONDS))
    # wait before checking again
    time.sleep(POLL_INTERVAL_SECONDS)

Next, we'll have that data retrieved and compile it into a dataframe. 

In [36]:
# start from an empty frame that already has the review columns filled in by the next cell
review_columns = ['business_name','source_url','review_ratings', 'review_text']
user_url_reviews_df = pd.DataFrame(columns=review_columns)

In [37]:
# url the scraped reviews are fetched from (identical for every page)
url = "https://app.datashake.com/api/v2/profiles/reviews"

# define API token: read it from the environment so the secret is never stored in the notebook
api_token = os.environ.get("DATASHAKE_API_TOKEN")
if not api_token:
    raise RuntimeError("Set the DATASHAKE_API_TOKEN environment variable before running this cell.")
headers = {
    'spiderman-token': api_token,
    }

# one small frame per page; they are combined once after the loop
# (DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop is quadratic)
page_frames = []

# iterate through first five pages of Yelp review data
for page in [1,2,3,4,5]:
    # specify page to retrieve
    querystring = {"job_id": job_id,
                   "page": page}

    # request reviews for this page
    response_reviews = requests.request("GET", url, headers=headers, params=querystring, timeout=30)
    response_reviews.raise_for_status()

    # convert response to json object
    json_response = response_reviews.json()

    # grab source url
    source_url = json_response['source_url']

    # grab business name ('meta_data' is itself a JSON-encoded string)
    business_name = json.loads(json_response['meta_data'])['name']

    # grab review ratings and review text data
    reviews = json_response['reviews']
    review_ratings = [review['rating_value'] for review in reviews]
    review_texts = [review['review_text'] for review in reviews]

    # the scalar name/url are broadcast across this page's reviews
    page_frames.append(pd.DataFrame({'business_name': business_name,
                                     'source_url': source_url,
                                     'review_ratings': review_ratings,
                                     'review_text': review_texts}))

# add all pages to the dataframe in a single concatenation
user_url_reviews_df = pd.concat([user_url_reviews_df] + page_frames)

In [38]:
# reset index for cleanliness: every page's rows were added with their own 0-based labels,
# so replace the repeated labels with one continuous 0..n-1 index (the old labels are discarded)
user_url_reviews_df = user_url_reviews_df.reset_index(drop=True)

# Clean New Review Data

## Remove Reviews with Less Than Four Stars

In [39]:
# filter out all reviews with less than four stars
# .copy() makes the result an independent frame, so the column assignments in the
# following cells do not write to a slice of the unfiltered data
user_url_reviews_df = user_url_reviews_df[user_url_reviews_df['review_ratings'].isin([4.0,5.0])].copy()

## Expand Contractions

In [40]:
# some reviews carry the HTML entity "&#39;" where an apostrophe belongs; swap it back
# (str() also turns any non-string entry into text first)
user_url_reviews_df['review_text'] = user_url_reviews_df['review_text'].map(
    lambda raw_text: str(raw_text).replace("&#39;", "'")
)

In [41]:
# lowercase every review so later matching does not depend on capitalisation
user_url_reviews_df['review_text'] = user_url_reviews_df['review_text'].map(
    lambda review: review.lower()
)

In [42]:
# Dictionary of English Contractions
# Maps each contraction (key) to the expanded phrase that replaces it (value).
# The bare "'s" entry expands any "'s" suffix to " is", which also rewrites possessives
# (e.g. "shop's" becomes "shop is").
# NOTE(review): the "I'..." keys are capitalised, while the reviews are lowercased in the cell
# above before expansion, so these keys only match if the lookup is case-insensitive.
contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not",
                     "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not",
                     "couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
                     "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would",
                     "he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                     "how'd": "how did","how'd'y": "how do you","how'll": "how will",
                     "I'd": "I would", "I'd've": "I would have","I'll": "I will",
                     "I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it'd": "it would","it'd've": "it would have","it'll": "it will", "it's": "it is",
                     "it'll've": "it will have", "let's": "let us","ma'am": "madam",
                     "mayn't": "may not","might've": "might have","mightn't": "might not", 
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not",
                     "mustn't've": "must not have", "needn't": "need not",
                     "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
                     "shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
                     "she'll": "she will", "she'll've": "she will have","should've": "should have",
                     "shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
                     "that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would",
                     "they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have",
                     "to've": "to have","wasn't": "was not","we'd": "we would",
                     "we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                     "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have",
                     "when've": "when have","where'd": "where did", "where've": "where have",
                     "who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not",
                     "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                     "wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
                     "y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
                     "you'll": "you will","you'll've": "you will have", "you're": "you are",
                     "you've": "you have"}

In [43]:
# Regular expression for finding contractions
# - longest keys first: alternation takes the first alternative that matches, so "can't"
#   listed ahead of "can't've" would expand only the prefix and leave "'ve" behind
# - IGNORECASE: the reviews are lowercased before this step, so capitalised keys such as
#   "I'm" would otherwise never match
# - re.escape: keys are matched literally even if one ever contains a regex metacharacter
contractions_re = re.compile(
    '(%s)' % '|'.join(re.escape(key) for key in sorted(contractions_dict, key=len, reverse=True)),
    flags=re.IGNORECASE,
)

# Function for expanding contractions
def expand_contractions(text, contractions_dict=contractions_dict):
    """Replace every contraction found in `text` with its expanded form.

    Parameters
    ----------
    text : str
        The text to expand.
    contractions_dict : dict, optional
        Mapping of contraction -> expansion used to look up replacements.
        Matches are located with the module-level `contractions_re`.

    Returns
    -------
    str
        `text` with each matched contraction replaced. An exact-case key is
        preferred; otherwise the match is looked up case-insensitively.
    """
    # case-insensitive view of the mapping, for matches whose case differs from the key
    lowercase_lookup = {key.lower(): value for key, value in contractions_dict.items()}

    def replace(match):
        found = match.group(0)
        if found in contractions_dict:
            return contractions_dict[found]
        return lowercase_lookup[found.lower()]

    return contractions_re.sub(replace, text)

# Expanding Contractions in the reviews
user_url_reviews_df['review_text']=user_url_reviews_df['review_text'].apply(expand_contractions)

## Tokenize Review Data

In [44]:
# English stop words, held in a set for fast membership checks
stop_words = set(stopwords.words('english'))

# tokenizer that keeps runs of word characters (\w+), which leaves punctuation out
tokenizer = RegexpTokenizer(r'\w+')

def tokenize_review(review):
    """Split one review into lowercase tokens with stop words removed.

    "<br>" markers are replaced by spaces and the text is lowercased before it is
    handed to the module-level `tokenizer`; tokens found in `stop_words` are dropped.
    Returns the remaining tokens as a list, in their original order.
    """
    normalised = review.replace("<br>", ' ').lower()
    return [word for word in tokenizer.tokenize(normalised) if word not in stop_words]

In [45]:
# tokenize every review and keep the token lists in a new column
user_url_reviews_df['review_tokens'] = user_url_reviews_df['review_text'].map(tokenize_review)

## Create Bag of Words

In [46]:
# turn a list of tokens into a single bag-of-words string
def bag_of_words(list):
    """Concatenate the tokens in `list`, each followed by one space.

    The result therefore ends with a trailing space; an empty list gives ''.
    """
    return ''.join(token + ' ' for token in list)

In [47]:
# build each review's bag-of-words string from its token list
user_url_reviews_df['bag_of_words'] = user_url_reviews_df['review_tokens'].map(bag_of_words)

## Aggregate BOW Data

In [48]:
# aggregate the BOW strings of all reviews into a single BOW per business
# (summing strings concatenates them)
user_url_reviews_grouped = (
    user_url_reviews_df
    .groupby('business_name')
    .agg(bag_of_words=('bag_of_words', 'sum'))
)

In [49]:
# display the grouped frame: indexed by business_name with a single bag_of_words column
user_url_reviews_grouped

Unnamed: 0_level_0,bag_of_words
business_name,Unnamed: 1_level_1
Butler,best coffee dumbo great spot quickly grab coff...


In [50]:
# define business name for future reference: the grouped frame is indexed by business_name,
# so its first index label is taken as the user's business
# NOTE(review): this raises IndexError if no review survived the star filter -- confirm that is acceptable
user_selected_business = user_url_reviews_grouped.index[0]

## Append Cleaned Dataframe to Appropriate Seattle Businesses BOW Dataframe

In [51]:
# load the saved bag-of-words table for each Seattle destination category
(seattle_coffee_reviews_grouped,
 seattle_boutique_reviews_grouped,
 seattle_adult_reviews_grouped) = map(pd.read_csv, ('seattle_coffee_reviews_grouped.csv',
                                                    'seattle_boutique_reviews_grouped.csv',
                                                    'seattle_adult_reviews_grouped.csv'))

In [52]:
# index each Seattle table by business name
seattle_coffee_reviews_grouped = seattle_coffee_reviews_grouped.set_index('business_name')
seattle_boutique_reviews_grouped = seattle_boutique_reviews_grouped.set_index('business_name')
seattle_adult_reviews_grouped = seattle_adult_reviews_grouped.set_index('business_name')

In [53]:
# pick the Seattle table that matches the chosen category and add the user's business to it
def return_appropriate_dataframe(destination_category):
    """Return the Seattle bag-of-words table for the selected category with the
    user's grouped reviews concatenated after it.

    `destination_category` is an object exposing the selection as `.value`:
    'Coffee Shop' selects the coffee table, 'Boutique Shop' the boutique table,
    and any other value the adult table.
    """
    selected = destination_category.value
    if selected == 'Coffee Shop':
        seattle_reviews = seattle_coffee_reviews_grouped
    elif selected == 'Boutique Shop':
        seattle_reviews = seattle_boutique_reviews_grouped
    else:
        seattle_reviews = seattle_adult_reviews_grouped
    return pd.concat([seattle_reviews, user_url_reviews_grouped])

In [54]:
# create dataframe with the user-provided business appended to the Seattle table
# that matches the category chosen in the dropdown
df = return_appropriate_dataframe(destination_category)

# Generate Recommendations and Similarity Plots

In [55]:
# create numerical indices for future reference: a Series built from df's index labels,
# so position 0..n-1 can be mapped back to the label (business name) at that row
df_indices = pd.Series(df.index)

In [56]:
# initialize a TF-IDF vectorizer with default settings, then fit it on the bag_of_words
# column and transform that same column into the TF-IDF matrix used for similarity
df_tfidf = TfidfVectorizer()
df_tfidf__matrix = df_tfidf.fit_transform(df['bag_of_words'])

In [57]:
# cosine similarity between every pair of rows of the TF-IDF matrix
# (given a single argument, the matrix is compared against itself)
df_cosine_sim = cosine_similarity(df_tfidf__matrix)
df_cosine_sim

array([[1.        , 0.31739343, 0.46874318, ..., 0.46699533, 0.49337816,
        0.37129738],
       [0.31739343, 1.        , 0.28896629, ..., 0.30740182, 0.32237263,
        0.2952029 ],
       [0.46874318, 0.28896629, 1.        , ..., 0.42238264, 0.45453209,
        0.32103873],
       ...,
       [0.46699533, 0.30740182, 0.42238264, ..., 1.        , 0.80334119,
        0.32840929],
       [0.49337816, 0.32237263, 0.45453209, ..., 0.80334119, 1.        ,
        0.35349163],
       [0.37129738, 0.2952029 , 0.32103873, ..., 0.32840929, 0.35349163,
        1.        ]])

In [58]:
# function that takes in a business name as input and returns its most similar businesses
def recommendations(business, cosine_sim, indices, top_n=5):
    """Return the names of the businesses most similar to `business`.

    Parameters
    ----------
    business : str
        Name to look up in `indices`. If it occurs more than once, the first
        occurrence is used.
    cosine_sim : 2-D array-like
        Pairwise similarity scores; row i holds the scores of business i
        against every business.
    indices : pd.Series
        Business names, with index labels that double as row numbers of
        `cosine_sim` (i.e. a default 0..n-1 index).
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list
        Up to `top_n` business names, most similar first. The queried
        business itself is never included.

    Raises
    ------
    IndexError
        If `business` does not appear in `indices`.
    """
    # obtain index of business which matches input
    matches = indices[indices == business].index
    if len(matches) == 0:
        raise IndexError("business %r was not found in indices" % (business,))
    idx = matches[0]

    # similarity of the chosen business to every business, labelled by row number
    scores = pd.Series(cosine_sim[idx])

    # remove the business itself explicitly instead of assuming it sorts first: another
    # business with an equal score (e.g. 1.0) could otherwise push it into the results.
    # A stable sort keeps tied scores in their original row order.
    ranked = scores.drop(idx).sort_values(ascending=False, kind='mergesort')

    # translate the best-scoring row numbers back into business names
    return [indices[position] for position in ranked.index[:top_n]]

In [59]:
# recommend businesses from the combined table that are most similar to the user's business
recommendations(user_selected_business, df_cosine_sim, df_indices)

['Five Stones Coffee Company',
 'Down Pour Coffee Bar',
 'Sugar Bakery & Coffeehouse',
 'Capitol Coffee Works',
 'Street Bean Coffee Roasters']