# Libraries

**Machine Learning**

In [1]:
import pandas as pd
import numpy as np
import nltk
import pickle
from sklearn.metrics.pairwise import cosine_similarity

**Word2Vec**

In [2]:
from nltk.tokenize import word_tokenize
import gensim 
from gensim.models import Word2Vec
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/hongphuc95/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

**Visualization**

In [3]:
import folium
import html

In [4]:
data_path = "/home/hongphuc95/notebookteam/dataset/"

In [5]:
api_path = "/home/hongphuc95/notebookteam/api/"

# 1. Load 

**Data**

In [6]:
business_df = pd.read_json(data_path + "business.json", lines=True)

In [7]:
review_df = pd.read_json(data_path + "cleaned/review_cleaned_2016_2019_Vegas.json", lines=True)

In [8]:
review_by_business = review_df.groupby('business_id')['text'].agg(lambda col: ' '.join(col)).reset_index()

In [9]:
review_by_business.shape

(25280, 2)

In [10]:
all_stopwords = stopwords.words('english')
def clean_text(text):
    text_tokens = word_tokenize(text.lower())
    tokens_without_sw = [word for word in text_tokens if not word in all_stopwords]
    return tokens_without_sw

In [11]:
review_by_business["text"] = review_by_business["text"].apply(lambda x: clean_text(x))

**Model**

In [6]:
with open("review_tokenized.pickle", "rb") as f:
    review_by_business = pickle.load(f)

In [7]:
model = Word2Vec.load("review_full.model")

In [8]:
review_by_business.head(5)

Unnamed: 0,business_id,text
0,--1UhMGODdWsrMastO9DZw,"[last, review, mention, get, charge, extra, ,,..."
1,--6MefnULPED_I942VcFNA,"[decent, food, decent, price, ., standard, chi..."
2,--7zmmkVg-IMGaXbuVd0SQ,"[recent, tour, lake, norman, area, brewery, ,,..."
3,--8LPVSo5i0Oo61X01sV9A,"[dr., purcell, good, thorough, ., office, staf..."
4,--9QQLMTbFzLJ_oT-ON3Xw,"[ever, believe, check, time, ., always, 20, mi..."


In [12]:
#business_df.head(5)

## Some others useful functions

In [16]:
def business_details(business_df, review_df):
    business_df = business_df[["business_id", "name", "categories", "address", "city", "state", "latitude", "longitude", "stars", "review_count"]]
    new_df = pd.merge(business_df, review_df, how="inner", on="business_id")
    return new_df

In [17]:
def showInMap(df):
    
    mp = folium.Map(location=[43.70011, -79.4163], zoom_start=12)

    for _, r in df.iterrows():
        folium.Marker(
                    location =[r.latitude, r.longitude], 
                    popup = html.escape(
                        r["name"]) + '<br>' + 
                        'Stars: ' + str(r.stars) + '<br>' + 
                        'Reviews: ' + str(r.review_count) + '<br>' + 
                        'Categories: ' + str(r.categories) + '<br>',    
                    icon = folium.Icon(color='green')).add_to(mp)
    return mp

# 2. Engine

In [18]:
#def get_vect(word, model):
#    try:
#        return model.wv[word]
#    except KeyError:
#        return [np.zeros((model.vector_size,))]

In [9]:
def avg_feature_vector(sentence, model, n_features):
    index2word_set = set(model.wv.index2word)
    feature_vec = np.zeros((n_features, ), dtype='float32')
    n_words = 0
    for word in sentence:
        if word in index2word_set:
            n_words += 1
            feature_vec = np.add(feature_vec, model.wv[word])
    if (n_words > 0):
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec

In [10]:
review_by_business["text_vec"] = review_by_business["text"].apply(lambda x: avg_feature_vector(x, model=model, n_features=200))

In [11]:
review_by_business.head(5)

Unnamed: 0,business_id,text,text_vec
0,--1UhMGODdWsrMastO9DZw,"[last, review, mention, get, charge, extra, ,,...","[-0.6699545, 0.07671176, 0.6018739, -0.1426049..."
1,--6MefnULPED_I942VcFNA,"[decent, food, decent, price, ., standard, chi...","[-0.70146334, 0.04727568, 0.6008351, -0.009889..."
2,--7zmmkVg-IMGaXbuVd0SQ,"[recent, tour, lake, norman, area, brewery, ,,...","[-0.475734, 0.54039925, 0.31210145, 0.02571510..."
3,--8LPVSo5i0Oo61X01sV9A,"[dr., purcell, good, thorough, ., office, staf...","[0.62700117, 0.45426702, 0.26523137, -0.732726..."
4,--9QQLMTbFzLJ_oT-ON3Xw,"[ever, believe, check, time, ., always, 20, mi...","[0.36767393, 0.47562706, 0.7197984, -0.4986300..."


In [13]:
review_by_business.drop("text", axis=1, inplace=True)

In [14]:
#with open(data_path + "cleaned/review_2016_2019_veconly_vectorized.pickle", "wb") as f:
#    pickle.dump(review_by_business, f, protocol=pickle.HIGHEST_PROTOCOL)

In [13]:
model.wv.vectors.shape

(67716, 200)

In [8]:
model.wv.most_similar("vietnamese")

[('chinese', 0.7128885388374329),
 ('taiwanese', 0.7126409411430359),
 ('cantonese', 0.7020187377929688),
 ('viet', 0.687890887260437),
 ('filipino', 0.662496030330658),
 ('asian', 0.6172739267349243),
 ('korean', 0.6125739812850952),
 ('cuban', 0.6119341850280762),
 ('ethiopian', 0.6098645925521851),
 ('colombian', 0.6047247648239136)]

In [8]:
def keyword_recommend(input_str, docvecs, model, top_n):
    business_similarity = pd.DataFrame()
    
    from nltk.tokenize import word_tokenize
    input_vec = pd.DataFrame({"text" : [input_str]})
    input_vec["text"] = input_vec["text"].apply(lambda x: word_tokenize(x.lower()))
    input_vec["text_vec"] = input_vec["text"].apply(lambda x: avg_feature_vector(x, model=model, n_features=200))
    business_similarity = docvecs[["business_id", "text_vec"]]
    
    #compute similarity array
    business_similarity["score"] = business_similarity["text_vec"].apply(lambda x: cosine_similarity([x], [input_vec["text_vec"].values[0]])[0][0])
    business_similarity = business_similarity.drop(columns=["text_vec"]).sort_values(ascending=False, by="score").head(top_n)
    
    return business_similarity

In [34]:
c = keyword_recommend("chicken cheese", review_by_business, model, 10)

In [35]:
d = business_details(business_df, c)

In [36]:
d[["name", "categories", "city", "score"]]

Unnamed: 0,name,categories,city,score
0,Arby's,"Sandwiches, Fast Food, Burgers, Restaurants",Amherst,0.691516
1,Swiss Chalet Rotisserie & Grill,"American (Traditional), Restaurants, Barbeque,...",Mississauga,0.744961
2,KFC,"Chicken Wings, Fast Food, Chicken Shop, Restau...",South Euclid,0.70279
3,Seaboat,"Soul Food, Restaurants, Southern",Champaign,0.712854
4,Superpumper,"Gas Stations, Automotive",Phoenix,0.691757
5,Nana's Soul Food Kitchen,"Breakfast & Brunch, Restaurants, Cajun/Creole,...",Matthews,0.690423
6,Ollie's Pizza,"Restaurants, Pizza",Canonsburg,0.758181
7,The Family Fry Guy,"Street Vendors, Food",Calgary,0.706818
8,Ming Moon,"Restaurants, Chinese",Cleveland,0.709589
9,Domino's Pizza,"Pizza, Chicken Wings, Restaurants, Sandwiches",Kannapolis,0.7199


In [108]:
showInMap(d)

In [40]:
def avg_feature_vector(sentence, model, n_features):
    index2word_set = set(model.wv.index2word)
    feature_vec = np.zeros((n_features, ), dtype='float32')
    n_words = 0
    for word in sentence:
        if word in index2word_set:
            n_words += 1
            feature_vec = np.add(feature_vec, model.wv[word])
    if (n_words > 0):
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec

In [41]:
from sklearn.metrics.pairwise import cosine_similarity
def business_similarity(business_ids, docvecs, top_n):
    businesses_similarity = pd.DataFrame()
    for business_id in business_ids:
        business_similarity = pd.DataFrame()
        input_vec = docvecs[docvecs["business_id"] == business_id]
        business_similarity = docvecs[["business_id", "text_vec"]]

        #compute similarity array
        business_similarity["score"] = business_similarity["text_vec"].apply(lambda x: cosine_similarity([x] , [input_vec["text_vec"].values[0]])[0][0])
        business_similarity = business_similarity.drop(columns=["text_vec"]).sort_values(ascending=False, by="score")
        
        #Filter business with the same id
        business_similarity = business_similarity[business_similarity["business_id"] != business_id].head(top_n)
        business_similarity["input_business_id"] = business_id
        businesses_similarity= pd.concat([businesses_similarity, business_similarity])
        
    return businesses_similarity

In [42]:
a = business_similarity(["-0BxAGlIk5DJAGVkpqBXxg", "--7zmmkVg-IMGaXbuVd0SQ"], review_by_business, 10)

In [88]:
b = business_details(business_df, a)

In [100]:
showInMap(b)