# Libraries

In [1]:
import sys
pathModulesES = '../sauceforyall/'
sys.path.append(pathModulesES)

**Elasticsearch Query**

In [2]:
from yelpquery import YelpQuery
from pandasticsearch import Select
ye = YelpQuery()

**Machine Learning**

In [3]:
import pandas as pd
import numpy as np
import nltk
import pickle

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

**Visualize**

In [5]:
import matplotlib.pyplot as plt
import seaborn as sns

**Others**

In [6]:
import re
import string

**Warning**

In [44]:
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

**Index name**

In [7]:
index_business = "yelp-business*"
index_review = "yelp-review*"
index_tip = "yelp-tip*"
index_user = "yelp-user*"

In [8]:
yelp_path = "/home/server/yelp_dataset/"
business_file = yelp_path + "business.json"
review_file = yelp_path + "review.json"
tip_file = yelp_path + "tip.json"
user_file = yelp_path + "user.json"
checkin_file = yelp_path +  "checkin.json"

# 1. Data Pre-Processing

## 1.1 Load data

Retrieve all the reviews from the last 3 years, this helps reduce the volume of data

In [9]:
#firstReview = ye.getRangeTerm(index=index_review, term="date", beginDate="2017-01-01T00:00:00.000",endDate="2019-12-31T23:59:59.000", size=2500)

In [10]:
#df_review = ye.getResultScrolling(firstReview)

In [11]:
df_review = pd.read_json("./review_cleaned_2018.json", lines=True)

In [12]:
example = df_review[df_review["review_id"] == "Fp1cxGOP-e-sFONFiwypPw"]["text"]

In [13]:
example

20    1 star be for their food. It be pretty good i ...
Name: text, dtype: object

## 1.2 Text Cleaning

### 1.2.1 Remove Carriage Return and Newline Characters

In [11]:
def remove_carriage(text):
    REPLACE_NEW_LINE = re.compile('\s*\n+\s*')
    text = REPLACE_NEW_LINE.sub(' ', text)
    return text

def remove_spaces(text):
    REPLACE_SPACE = re.compile('\s+')
    text = REPLACE_SPACE.sub(' ', text)
    return text

### 1.2.2 Remove Strange Character

In [12]:
def remove_weird_character(text):
    REPLACE_NO_SPACE = re.compile('[!"#$%&\()/<=>@[\\]^_`{|}~]')
    #REPLACE_NO_SPACE = re.compile('[!"#$%&\()*+-/<=>?@[\\]^_`{|}~]')
    text = REPLACE_NO_SPACE.sub(' ', text)
    return text

### 1.2.3 Remove Contractions

In [13]:
def remove_contractions(text):
    patterns = [
    (r'won\'t', 'will not'),
    (r'can\'t', 'cannot'),
    (r'i\'m', 'i am'),
    (r'I\'m', 'I am'),
    (r'ain\'t', 'is not'),
    (r'(\w+)\'ll', '\g<1> will'),
    (r'(\w+)n\'t', '\g<1> not'),
    (r'(\w+)\'ve', '\g<1> have'),
    (r'(\w+)\'s', '\g<1> is'),
    (r'(\w+)\'re', '\g<1> are'),
    (r'(\w+)\'d', '\g<1> would'),
    ]
    
    for (pattern, repl) in patterns:
        text = re.sub(pattern, repl, text)
    return text

### 1.2.4 Remove Repeated Characters

In [14]:
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 

[nltk_data] Downloading package wordnet to /home/server/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/server/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/server/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [15]:
def remove_repeated(word):
    repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
    repl = r'\1\2\3'
    if wordnet.synsets(word):
        return word
    
    repl_word = repeat_regexp.sub(repl, word)
    
    if repl_word != word:
        return remove_repeated(repl_word)
    else:
        return repl_word

In [16]:
test_text = 'Awesome food anyone??? Go check out Modern Steak in Scottsdale, AZ.\r\nGorgeous dining room! Excellent service (with our server Gabe)! And the food was AAAMMMAZZING!\r\n\r\nIt\'s located at Fashion Mall, but it\'s NOT a "mall" restaurant. A MUST go!'

In [17]:
print(remove_repeated("AAAMMMAZZING!"))

AMAZING!


### 1.2.5 Check Spelling

In [18]:
##TODO
def spelling(word):
    pass

### 1.2.6 Lemmatization

In [19]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

def lemmatize_sentence(document):
    lemmatizer = WordNetLemmatizer()
    sentences = nltk.sent_tokenize(document)
    
    tagged_sentences = []
    for sent in sentences:
        sent = nltk.word_tokenize(sent)
        corrected_words = []
        for word in sent:
            word = remove_repeated(word)
            #word = spelling(word)
            word = lemmatizer.lemmatize(word, get_wordnet_pos(word))
            corrected_words.append(word)
        
        corrected_words = "".join([" "+i if not i.startswith("'") and i not in string.punctuation else i for i in corrected_words]).strip()      
        tagged_sentences.append(corrected_words)
        
    tagged_sentences = " ".join(tagged_sentences)
    
    #lemmatized_sentence = []
    #for word, tag in pos_tag(tokens):
    #    if tag.startswith('NN' ):
    #        pos = wordnet.NOUN 
    #    elif tag.startswith('VB'):
    #        pos = wordnet.VERB
    #    elif tag.startswith('RB'):
    #        pos = wordnet.ADV
    #    elif tag.startswith('JJ'):
    #        pos = wordnet.ADJ
    #    else:
    #        pos = wordnet.NOUN
    #    lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
    return tagged_sentences

## 1.3 Text Processing

**Example**

In [22]:
a = example[20]
print(a)

KeyError: 20

In [None]:
b = remove_carriage(a)
c = remove_weird_character(b)
d = remove_spaces(c)
e = remove_contractions(d)
f = lemmatize_sentence(e)

In [None]:
f

In [20]:
#Remove carriage
df_review["text"] = df_review["text"].apply(lambda x: remove_carriage(x))

In [21]:
#Remove weird characters
df_review["text"] = df_review["text"].apply(lambda x: remove_weird_character(x))

In [22]:
#Remove double whitespace
df_review["text"] = df_review["text"].apply(lambda x: remove_spaces(x))

In [None]:
#Remove contractions
df_review["text"] = df_review["text"].apply(lambda x: remove_contractions(x))

In [None]:
#Lemmatization
df_review["text"] = df_review["text"].apply(lambda x: lemmatize_sentence(x))

In [None]:
df_review = df_review.drop("_index", axis = 1)
df_review = df_review.drop("_type", axis = 1)
df_review = df_review.drop("_id", axis = 1)
df_review = df_review.drop("@timestamp", axis = 1)
df_review = df_review.drop("@version", axis = 1)
df_review = df_review.drop("_score", axis = 1)

In [None]:
df_review.to_json("./review_cleaned_2017_2019.json", orient='records', lines=True)

# Features Engineering

**Feature/Target**

In [12]:
comments = df_review["text"]
target = df_review["stars"]

In [13]:
df_review['stars'].value_counts()

5    600250
1    205182
4    190985
3     99910
2     81335
Name: stars, dtype: int64

**Train/Test Split**

In [14]:
comment_train, comment_test, target_train, target_test = \
train_test_split(comments, target, random_state = 1, test_size=0.3)

# NLP Representation

**TF-IDF**

In [15]:
vectorizer = TfidfVectorizer(analyzer = 'word', stop_words = 'english', 
                             lowercase = True, max_features = 5000,
                             ngram_range = (1, 1)
                            )

Train the model with the users comments vectorized

In [16]:
tfidf = vectorizer.fit(comment_train)
#pickle.dump(tfidf, open("tfidf_2018.pickle", "wb"))

In [17]:
comment_train_vec = vectorizer.transform(comment_train)
#pickle.dump(comment_train_vec, open("comment_train_2018.pickle", "wb"))

Feature extraction, in layman term we try to get the vocabulary of your TF-IDF

In [18]:
words = vectorizer.get_feature_names()

Test tfidf on test set

In [19]:
comment_test_vec = vectorizer.transform(comment_test)

# Similar review (Optional)

**Get top/bottom value (Similarity)**

In [20]:
def get_top_values(lst, n, labels):
    return [labels[i] for i in np.argsort(lst)[::-1][:n]]

def get_bottom_values(lst, n, labels):
    return [labels[i] for i in np.argsort(lst)[:n]]

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

#Unseen comment
arbitrary_review = np.random.choice(comment_test, 1)

In [22]:
arbitrary_review[0]

'We come here for their hibachi tepenyaki. The service be great. The server kept refill our drink and be on top of all the dirty dish. Thanks Heath The chef cook, be the downfall. Everything taste the same. Way too much garlic in everything He use the same flavor profile for chicken, steak, and shrimp. Butter, garlic, soy sauce, salt and pepper. On everything. The vegetable too Add in some sesame seed. Because why not. There be very little entertainment from the chef. The chef cook cater to the other family. He have the server bring them out sake and drank with the other family. Any extra portion be give to the other family. Unimpressed and perturbed.'

In [23]:
#Transform the random review to vector
arbitrary_review_vec = vectorizer.transform([arbitrary_review[0]])

In [24]:
similarity_score = cosine_similarity(arbitrary_review_vec, comment_train_vec)

Top 5 similar review

In [25]:
n=5
similar_reviews = get_top_values(similarity_score[0], n, list(comment_train))

In [26]:
print('Top %s similar reviews:' % n)
for i in range(n):
    print('No. %d review is %s.\n' % (i+1, similar_reviews[i]))

Top 5 similar reviews:
No. 1 review is This be a family friendly place. My family and I come here twice a month. Service be great and the tepenyaki chef keep you proactive with their cook and give a wonderful experience everytime we go Keep it up Ohjah OhjahRegulars.

No. 2 review is We be here on our anniversary and food be really delicious. Chef have put up fire show that we really enjoy .i have hibachi steak and my family order chicken and shrimp hibachi. There be all well cooked I front of u. We have a nice experience and can recommend them..

No. 3 review is Chef be great and awesome Cook the fry chicken wing in our way Very kind hearted Taste be amaze.

No. 4 review is This be the best Hibachi place in Vegas outside of the strip. I have be come here for 4 year now and nothing else compare. The chef be all good but our favorite be the head chef Eddy. Great place to bring family and food have always be cooked perfectly. Please me if you can find a well hibachi joint in Vegas- I do 

# Predection Models

## Naive Bayes Classifier

In [27]:
from sklearn.naive_bayes import MultinomialNB

In [28]:
clf_nb = MultinomialNB()
clf_nb.fit(comment_train_vec, target_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [29]:
score_training = clf_nb.score(comment_train_vec, target_train)
score_test = clf_nb.score(comment_test_vec, target_test)
print("Accuracy for the training set is %f, the test set is %f" % \
     (score_training, score_test))

Accuracy for the training set is 0.673133, the test set is 0.672029


## Logistic Regression

In [30]:
from sklearn.linear_model import LogisticRegression

In [33]:
clf_logr = LogisticRegression(multi_class='ovr')
clf_logr.fit(comment_train_vec, target_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='ovr', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [34]:
score_training = clf_logr.score(comment_train_vec, target_train)
score_test = clf_logr.score(comment_test_vec, target_test)
print("Accuracy for the training set is %f, the test set is %f" % \
     (score_training, score_test))

Accuracy for the training set is 0.729808, the test set is 0.723506


Compared with Naive-Bayes model, Logistic Regression model improces a little bit. Let me find out the top 20 most important words given by Logistic Regression model.

In [35]:
n = 20
print('Top 20 words by ranking are %s.' % \
      (", ".join(i for i in \
        get_top_values(clf_logr.coef_[0], n, words))))

Top 20 words by ranking are worst, horrible, zero, scam, poison, disgust, terrible, waste, awful, rude, bad, unprofessional, joke, refuse, disgusting, beware, pathetic, incompetent, racist, worse.


**What are the key features(words) that make the negative prediction?**

In [36]:
print('Top 20 words for negative prediction are %s.' % \
      (", ".join(i for i in \
        get_top_values(clf_logr.coef_[0], n, words))))

Top 20 words for negative prediction are worst, horrible, zero, scam, poison, disgust, terrible, waste, awful, rude, bad, unprofessional, joke, refuse, disgusting, beware, pathetic, incompetent, racist, worse.


**What are the key features(words) that make the positive prediction?**

In [39]:
print('Top 20 words for positive prediction are %s.' % \
      (", ".join(i for i in \
        get_bottom_values(clf_logr.coef_[0], n, words))))

Top 20 words for positive prediction are delicious, great, amaze, excellent, awesome, love, best, fantastic, amazing, perfect, good, friendly, wonderful, highly, yummy, outstanding, nice, notch, thank, favorite.


## Random Forest Classifier

In [41]:
from sklearn.ensemble import RandomForestClassifier

In [43]:
clf_rf = RandomForestClassifier(n_estimators=100, random_state = 1)
clf_rf.fit(comment_train_vec, target_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [45]:
score_training = clf_rf.score(comment_train_vec, target_train)
score_test = clf_rf.score(comment_test_vec, target_test)
print("Accuracy for the training set is %f, the test set is %f" % \
     (score_training, score_test))

Accuracy for the training set is 0.999643, the test set is 0.682832


**What are important features (words) by inspecting the RFC model?**

In [47]:
print('Top 20 words for positive prediction are %s.' % \
      (", ".join(i for i in \
        get_top_values(clf_rf.feature_importances_, n, words))))

Top 20 words for positive prediction are great, good, best, amaze, bad, love, rude, told, horrible, delicious, order, say, food, friendly, ask, like, recommend, definitely, terrible, ok.


## Evaluate all models with Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

**Naive Bayes Classifier**

In [None]:
score_nb = cross_val_score(clf_nb, comment_train_vec, target_train, cv=10)
print("Accuracy for Naive Bayes: %f" % (score_nb.mean()))

**Logistic Regression**

In [None]:
score_logr = cross_val_score(clf_logr, comment_train_vec, target_train, cv=10)
print("Accuracy for Logistic Regression: %f" % (score_logr.mean()))

**Random Forest**

In [None]:
score_rf = cross_val_score(clf_rf, comment_train_vec, target_train, cv=10)
print("Accuracy for Random Forest: %f" % (score_rf.mean()))

# Reduce dimenionality with PCA

## Features Normalization

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [None]:
X_train_scaled = scaler.fit_transform(comment_train_vec)
X_test_scaled = scaler.fit_transform(comment_test_vec)

## Get Principal Components

In [None]:
from sklearn.decomposition import PCA
nb_components = 100
pca = PCA(n_components = nb_components)

In [None]:
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

Plot most important PCA