# Libraries

In [1]:
import sys
pathModulesES = '../sauceforyall/'
sys.path.append(pathModulesES)

**Elasticsearch Query**

In [2]:
from yelpquery import YelpQuery
from pandasticsearch import Select
ye = YelpQuery()

**Machine Learning**

In [3]:
import pandas as pd
import numpy as np
import nltk
import pickle

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

**Visualize**

In [5]:
import matplotlib.pyplot as plt
import seaborn as sns

**Others**

In [6]:
import re
import string

**Warning**

In [7]:
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

**Index name**

In [8]:
index_business = "yelp-business*"
index_review = "yelp-review*"
index_tip = "yelp-tip*"
index_user = "yelp-user*"

In [9]:
data_path = "/home/hongphuc95/notebookteam/dataset/"

# 1. Data Pre-Processing

## 1.1 Load data

Retrieve all the reviews from the last 3 years, this helps reduce the volume of data

In [10]:
df_review = pd.read_json(data_path + "cleaned/restaurant_review_cleaned_2018.json", lines=True)

## 1.2 Handle missing data

There are businesses that don't have any reviews in the particular year we're looking for

In [11]:
df_review.isna().any()

business_id       False
name              False
categories        False
city              False
state             False
business_stars    False
cool               True
useful             True
funny              True
stars              True
review_id          True
user_id            True
date               True
text               True
dtype: bool

In [12]:
df_review = df_review.dropna(subset=["review_id"])

In [13]:
df_review.shape

(712379, 14)

In [14]:
df_review.head()

Unnamed: 0,business_id,name,categories,city,state,business_stars,cool,useful,funny,stars,review_id,user_id,date,text
2,oITu5Qwnmv0hsEMc21XjXw,The Chef's Table at Quickhatch,[Restaurants],Pittsburgh,PA,4.0,0.0,0.0,0.0,5.0,AEbyPwGD6_A9i3R0lwqrUA,XL1D6UF2Bl3tmkcfqxgLJg,2018-02-18 13:27:12,"We be in love with Quickhatch ... the food, Ch..."
3,oITu5Qwnmv0hsEMc21XjXw,The Chef's Table at Quickhatch,[Restaurants],Pittsburgh,PA,4.0,0.0,1.0,0.0,5.0,ybqOQcrY5hM3zWo4lSduUA,WcrRl0U659RxXHkKANZSdQ,2018-01-20 22:20:38,Phenomenal dinner last night. The chef really ...
4,oITu5Qwnmv0hsEMc21XjXw,The Chef's Table at Quickhatch,[Restaurants],Pittsburgh,PA,4.0,0.0,0.0,0.0,5.0,rnarbYk8Nlbxfmrid78Jug,XmYUCXBf5LxSFppvNmfxFA,2018-01-15 16:54:34,Our group of eight have an outstanding multi-c...
5,oITu5Qwnmv0hsEMc21XjXw,The Chef's Table at Quickhatch,[Restaurants],Pittsburgh,PA,4.0,0.0,1.0,0.0,1.0,7AQcBhn6XQ7-M0QOme8K7g,xZknZsB-yxEQbI8JLollXw,2018-01-24 12:47:42,it be restaurant week. we read over the menu l...
6,C2wtzldZi02IwFSp8zgT0w,Kona Grill Corporate Office,[Restaurants],Scottsdale,AZ,2.0,1.0,2.0,0.0,4.0,5pk-1aFG1nF0-OsZqVkOeA,vwi68eg4bmJHMSYq3jP2OQ,2018-01-14 07:05:43,Had dinner with our friend from the west valle...


# 2. Clustering the review for all the restaurants

**Features**

In [15]:
comments = df_review["text"]

**Train/Test Split**

In [16]:
comment_train, comment_test = train_test_split(comments, test_size=0.3)

## 2.1 NLP Representation

In [17]:
vectorizer = TfidfVectorizer(analyzer = 'word', stop_words = 'english', 
                             lowercase = True, max_features = 5000,
                             ngram_range = (1, 1)
                            )

Train the model with the users comments vectorized

In [36]:
tfidf = vectorizer.fit(comment_train)

In [37]:
comment_train_vec = vectorizer.transform(comment_train)

Feature extraction, in layman term we try to get the vocabulary of your TF-IDF

In [38]:
words = vectorizer.get_feature_names()

Transform all the original reviews using the trained model

In [39]:
comment_vec = vectorizer.transform(comments)

## 2.2 Cluster reviews with K-Means

**Two classes guess**

We test our assumption that we can really split the behaviour of all the restaurant reviews by positive/negative reviews

In [40]:
from sklearn.cluster import KMeans

In [None]:
n_clusters = 2
clf_km = KMeans(n_clusters = n_clusters)
clf_km.fit(comment_train_vec)

In [None]:
cluster = clf_km.predict(comment_vec)

**Inspect the centroids**

We inspect the centroids to have a better idea what topics KMeans has figured out.

The next step is reverse the vector space back to word space to make that redeable by humains. 

In [28]:
clf_km.cluster_centers_

array([[1.23271364e-04, 2.76300209e-04, 8.46512122e-03, ...,
        3.54384110e-04, 1.16711320e-04, 8.27614753e-05],
       [1.52693934e-05, 7.38953324e-05, 3.85173994e-03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

**Top features per cluster**

Now we show the most relevant features in each cluster, 20 per cluster seems ought to be enough. We're interested in the most present words ake feature with the greatest represention in the centroid

We sort each centroid vector to find the top features

In [35]:
top_features_cluster = list()
n_clusters = clf_km.n_clusters

for i in range(n_clusters):
    feature_sorted = np.argsort(clf_km.cluster_centers_[i][::-1][:100])
    top_features_cluster.append(feature_sorted)
    
for num, centroid in enumerate(top_features_cluster):
    word_centroid = ", ".join(words[i] for i in centroid)
    print("%d: %s" % (num, word_centroid))

0: 05, 100, 12, 0pm, 36, 2am, 3am, 2x, 45pm, 21, 16, 1000, 10pm, 120, 48, 13, absolute, 95, 75, 34, 38, 10am, 30am, 44, 8pm, 32, 1st, 2017, 28, 3x, 7pm, 50, absurd, 150, 2pm, 300, 5th, 22, 25, 2nd, 80, 29, 19, 1pm, 18, 2018, 10, 42, 15, 6pm, 30min, 39, 99, aburi, 30, 9am, 40, 1am, 90, 5pm, abundant, 101, 500, 23, absolutely, 9pm, 8oz, abundance, 24, acai, 27, 55, 7th, 17, 65, 20, 11, 70, 3rd, 4th, 15pm, 45, 49, able, 14, 31, 33, 200, 35, 60, 21st, ability, 130, 85, ac, 30pm, a1, 26, 4pm, 3pm
1: 05, 0pm, 10, 100, 1000, absurd, 32, 30am, 75, 3x, 1st, 38, 36, 21, 120, 13, 44, 30min, 95, 15, 19, 28, 90, 80, 2x, 12, 50, 10am, absolute, 34, 45pm, 3am, 18, 2017, 16, 2pm, 2am, 7pm, 10pm, 48, 1pm, 2018, 8pm, 150, 300, 24, 11, 29, 25, 42, 3rd, 22, 5pm, 6pm, 7th, 1am, 2nd, 39, 40, abundant, 30, 5th, 99, 101, 27, 55, 9am, 8oz, 65, aburi, 23, 500, 70, absolutely, abundance, 9pm, able, 17, 4th, 20, 31, 15pm, acai, 200, 49, 33, 45, 21st, 14, 30pm, 35, 85, ac, a1, 130, ability, 26, 4pm, 3pm, 60
