This file implements an end-to-end example of the use-case of our model:

1. Identify the business for the example and collect its customer reviews
2. Load mullti-lbl2vec and segregate the reviews into 5 classes
3. Load semantic analysis model and calculate percentage of positive examples per class

In [15]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
#imports

import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import torch
import joblib
import pickle
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
#Kaggle direct access
! pip install -q kaggle
from google.colab import files
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets list

In [None]:
#Downloads file online
!kaggle datasets download -d yelp-dataset/yelp-dataset
!unzip yelp-dataset.zip

Downloading yelp-dataset.zip to /content
100% 4.06G/4.07G [00:36<00:00, 138MB/s]
100% 4.07G/4.07G [00:36<00:00, 119MB/s]
Archive:  yelp-dataset.zip
  inflating: Dataset_User_Agreement.pdf  
  inflating: yelp_academic_dataset_business.json  
  inflating: yelp_academic_dataset_checkin.json  
  inflating: yelp_academic_dataset_review.json  
  inflating: yelp_academic_dataset_tip.json  
  inflating: yelp_academic_dataset_user.json  


In [None]:
#Reads business daraframe to get information on businesses 
#Uses jsons and creating dataframe
df_business = pd.read_json('yelp_academic_dataset_business.json', lines=True)

In [None]:
#Reads review json file. The file is very large and hence we break into chunks

size = 100000
review = pd.read_json('yelp_academic_dataset_review.json', lines=True,
                      dtype={'review_id':str,'user_id':str,
                             'business_id':str,'stars':int,
                             'date':str,'text':str,'useful':int,
                             'funny':int,'cool':int},
                      chunksize=size)

#Gets first "size" reviews
lst = []
for chunk_review in review:
  lst.append(chunk_review)
  break
df_review = pd.concat(lst)

In [None]:
#Creating table of businesses with descending order of number of reviews along with their average star rating in our 100000 reviews subset
y = df_review.groupby(['business_id']).agg(
    number_of_reviews = pd.NamedAgg(column='review_id', aggfunc='count'), 
    average_stars = pd.NamedAgg(column='stars', aggfunc=np.mean)
    ).sort_values(by = ['number_of_reviews'], ascending = False).reset_index()

In [None]:
#Table of businesses used to identify target businesses
#Number of reviews > 300 
#Average Stars < 3.8
y.head(20)

Unnamed: 0,business_id,number_of_reviews,average_stars
0,GBTPC53ZrG1ZBY3DT8Mbcw,950,4.149474
1,PY9GRfzr4nTZeINf346QOw,460,3.943478
2,W4ZEKkva9HpAdZG88juwyQ,433,4.221709
3,vN6v8m4DO45Z4pp8yxxF_w,404,4.457921
4,pSmOH4a3HNNpYM82J5ycLA,384,3.911458
5,8uF-bhJFgT4Tn6DTb27viA,365,4.419178
6,SZU9c8V2GuREDN5KgyHFJw,351,3.931624
7,Zi-F-YvyVOK0k5QD7lrLOg,342,4.371345
8,UCMSWPqzXjd7QHq7v8PJjQ,341,4.445748
9,M0r9lUn2gLFYgIwIfG8-bQ,327,3.743119


In [None]:
#Chosen business based on criterion
business_id = y["business_id"][9]

In [None]:
#The business for which we will do a full end to end model
#We find business in the top 20 frequency of reviews with stars < 4.0

df_business[df_business["business_id"] == business_id]

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
11426,M0r9lUn2gLFYgIwIfG8-bQ,Baileys' Range,920 Olive St,Saint Louis,MO,63101,38.628864,-90.194542,3.5,1567,1,"{'OutdoorSeating': 'False', 'RestaurantsAttire...","Ice Cream & Frozen Yogurt, Burgers, Food, Nigh...","{'Monday': '0:0-0:0', 'Tuesday': '11:0-21:0', ..."


The found business is Baileys'Range, Saint Louis. It has an average star rating of 3.5 in the complete data set and 3.7 in our 100000 reviews. We will run our two models - review segregator & sentiment analyzer - to give recommendations so that its rating can improve!

In [None]:
#Combine all reviews of business_id = M0r9lUn2gLFYgIwIfG8-bQ into csv file
df_reviews_business = df_review[df_review['business_id'] == business_id].reset_index()

In [None]:
df_reviews_business

Unnamed: 0,index,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,208,651IreQ5u5j2WiVID_HK6A,Zn8uhC3DjoKjjwBiuM8oQg,M0r9lUn2gLFYgIwIfG8-bQ,3,1,0,0,"Average or slightly above, despite the hype. ...",2015-11-06 02:27:35
1,400,3aCWNBbYgO6dv1MjUEVL0g,E1olskL76b9TrcClgD9oBw,M0r9lUn2gLFYgIwIfG8-bQ,4,0,0,0,A great spot to take out-of-town guests. I tak...,2016-06-22 21:47:29
2,507,ii4RHFJZ3srDe5XlCKEaIQ,cWAKzsMt0iMjBNPuNJVgDQ,M0r9lUn2gLFYgIwIfG8-bQ,4,0,0,0,The burgers were very good very flavorful. I h...,2016-08-03 22:45:57
3,525,JaGqBpLhxvC27juhu0lGwA,n2pX5Ae8xCUi2_WlwcTkXQ,M0r9lUn2gLFYgIwIfG8-bQ,5,0,0,0,"I loved this place. Walking by, I almost didn...",2013-08-03 18:25:32
4,1296,NXVHlbvlWcwPaD5Et04uMQ,SQtIhJLfyPCSQvoA-qy4uA,M0r9lUn2gLFYgIwIfG8-bQ,2,1,0,0,"Burger was OK, but not at all worth the price....",2014-09-09 18:46:04
...,...,...,...,...,...,...,...,...,...,...
322,98107,HlCOuEsIhFtflsiZd26z0g,unVZ7sQs8w7N4ZB0LZgukw,M0r9lUn2gLFYgIwIfG8-bQ,4,0,0,0,This meal was so wonderful! Bailey's is casua...,2013-04-17 12:45:29
323,98219,AjSaCIZka2PYOAb4qbus2A,oJqCSWW_KFwX09yDWlt5WQ,M0r9lUn2gLFYgIwIfG8-bQ,1,0,0,0,Small portions. Overpriced. Bad service. Healt...,2016-08-06 02:00:29
324,98769,dLiyLMeVFyYaRZIS2NrESA,zSfUm57pBJ1I2nw8lVgARg,M0r9lUn2gLFYgIwIfG8-bQ,4,0,0,0,I had been wanting to go to this place for wha...,2012-06-04 19:17:15
325,99524,IE4uo4IBAQfO3VTa8ePLcQ,tcOrgw2kXjfVwUVY3wBzcg,M0r9lUn2gLFYgIwIfG8-bQ,3,0,0,0,Good ice cream made into a shake.. horrible e...,2016-11-16 02:37:52


In [17]:
#Reading and storing csv file

#df_reviews_business.to_csv("business_specific_reviews.csv")
df_reviews_business = pd.read_csv("/content/drive/My Drive/CS 7650 Final Project/business_specific_reviews.csv")

In [18]:
#Using pretrained review segregator
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

#Loading trained doc2vec_model
doc2vec_model = Doc2Vec.load("/content/drive/My Drive/CS 7650 Final Project/doc2vec_model_large")

In [19]:
#Get vector representations of reviews for business
data = None
for i in range(len(df_reviews_business)):
  index = df_reviews_business["index"].iloc[i]
  if index < 70000: #We used 70000 reviews for doc2vec model
    if i == 0:
      data = torch.tensor(doc2vec_model.docvecs[index].reshape((1, 300)))
    else:
      y = torch.tensor(doc2vec_model.docvecs[i].reshape((1, 300)))
      data = torch.cat((data, y), 0)

  else: #if review number > 700000 We wiil infer the vector using our model 
    vec = doc2vec_model.infer_vector(nltk.word_tokenize(df_reviews_business["text"].iloc[i]), alpha=None, min_alpha=None, epochs=25)
    if i == 1:
      data = torch.tensor(vec.reshape((1, 300)))
    else:
      y = torch.tensor(vec.reshape((1, 300)))
      data = torch.cat((data, y), 0)

In [20]:
#Use review_segregator model on vector representation:

#Import trained label vectors:

from numpy import loadtxt
labels = loadtxt("/content/drive/My Drive/CS 7650 Final Project/labels.csv", delimiter=",")

In [21]:
#Predict top 2 label class for each review of business

import torch
import torch.nn as nn
import random
import numpy as np

idxToStringMap = {0:"Ambience", 1:"Food", 2:"Service", 3:"Price", 4:"Time"}


def predict_model_reviews(keywordEmbeddings, data):
  
  AmbienceVec = torch.tensor(labels[0])
  TasteVec = torch.tensor(labels[1])
  ServiceVec = torch.tensor(labels[2])
  PriceVec = torch.tensor(labels[3])
  TimeVec = torch.tensor(labels[4])

  labelsList = [AmbienceVec, TasteVec, ServiceVec, PriceVec, TimeVec]

  similarityMatrix = torch.zeros(data.shape[0], len(labelsList) + 1)

  topTwoPredictedReviewsList = []

  matchingLabelSimilarityScore = 0
  for rowIdx in range(len(data)):
    for colIdx in range(len(labelsList)):
      cos = nn.CosineSimilarity(dim = 0)
      similarityMatrix[rowIdx][colIdx] = float(cos(data[rowIdx], labelsList[colIdx]))

    topIdx, secondIdx = torch.topk(similarityMatrix[rowIdx][0 : -1], 2)[1]
    topIdx = int(topIdx)
    secondIdx = int(secondIdx)


    topTwoPredictedReviewsList.append((idxToStringMap[topIdx], idxToStringMap[secondIdx]))
    matchingLabelSimilarityScore += (similarityMatrix[rowIdx][topIdx] + similarityMatrix[rowIdx][secondIdx]) / 2

    nonMatchingLabelsSimilarityScore = 0
    for idx in range(len(similarityMatrix[rowIdx])):
      if idx != topIdx and idx != secondIdx:
        nonMatchingLabelsSimilarityScore += similarityMatrix[rowIdx][idx]
    
    similarityMatrix[rowIdx][-1] = nonMatchingLabelsSimilarityScore / (len(labelsList) - 2)
    
  print("Average Similarity for All Reviews: ")
  print(matchingLabelSimilarityScore / data.shape[0])


  print("Average Similarity for Non-Matching Labels: ")
  print(torch.mean(similarityMatrix, 0)[-1])

  print("Predicted Labels for Training: ")
  print(topTwoPredictedReviewsList)
  print()

  return topTwoPredictedReviewsList 

predictions = predict_model_reviews(None, data)

Average Similarity for All Reviews: 
tensor(0.3532)
Average Similarity for Non-Matching Labels: 
tensor(0.2524)
Predicted Labels for Training: 
[('Food', 'Service'), ('Service', 'Time'), ('Price', 'Service'), ('Ambience', 'Service'), ('Price', 'Service'), ('Service', 'Food'), ('Price', 'Food'), ('Service', 'Price'), ('Time', 'Service'), ('Service', 'Food'), ('Service', 'Ambience'), ('Service', 'Ambience'), ('Service', 'Ambience'), ('Service', 'Food'), ('Time', 'Service'), ('Service', 'Price'), ('Service', 'Time'), ('Service', 'Food'), ('Service', 'Ambience'), ('Service', 'Price'), ('Food', 'Service'), ('Food', 'Service'), ('Service', 'Price'), ('Service', 'Ambience'), ('Food', 'Service'), ('Service', 'Price'), ('Service', 'Ambience'), ('Food', 'Service'), ('Service', 'Ambience'), ('Service', 'Food'), ('Service', 'Price'), ('Price', 'Service'), ('Service', 'Time'), ('Service', 'Ambience'), ('Food', 'Time'), ('Price', 'Service'), ('Time', 'Service'), ('Price', 'Ambience'), ('Service', 'F

In [22]:
#Segregate reviews based on labels

mp = {"Ambience":[], "Food":[], "Service":[],"Price":[], "Time":[]}

for i in range(len(predictions)):
  review_text = df_reviews_business["text"].iloc[i]
  pred1, pred2 = predictions[i]

  mp[pred1].append(review_text)
  mp[pred2].append(review_text)

reviews_ambience = mp["Ambience"]
reviews_food = mp["Food"]
reviews_services = mp["Service"]
reviews_price = mp["Price"]
reviews_time = mp["Time"]

In [23]:
#Total number of reviews
print("Total number of reviews are: " + str(len(data)))

Total number of reviews are: 327


In [24]:
#Number of reviews per class
count = 0
for key, value in mp.items():
  print(key+": " + str(len(value)) + " reviews (" + str(len(value)*100/len(data)) + " percent)")
  count += len(value)
  print()

Ambience: 49 reviews (14.984709480122325 percent)

Food: 141 reviews (43.11926605504587 percent)

Service: 251 reviews (76.75840978593273 percent)

Price: 130 reviews (39.75535168195719 percent)

Time: 83 reviews (25.382262996941897 percent)



In [25]:
def predict_sentiment(data):

  # define helper functions
  def load_count_vectorizer(): 
    # load count vectorizer 
    with open('/content/drive/My Drive/CS 7650 Final Project/count_vectorizer.pkl', 'rb') as f: 
        count_vectorizer = pickle.load(f)
    
    # return count vectorizer 
    return count_vectorizer
  
  def load_tfidf_transformer(): 
    # load count vectorizer 
    with open('/content/drive/My Drive/CS 7650 Final Project/tfidf_transformer.pkl', 'rb') as f: 
        tfidf_transformer = pickle.load(f)
    
    # return count vectorizer 
    return tfidf_transformer

  def load_random_forest(): 
    # load random forest model 
    random_forest = joblib.load('/content/drive/My Drive/CS 7650 Final Project/random_forest.joblib')

    # return loaded model 
    return random_forest 

  def get_clean_tokens(review): 
    # tokenize review 
    tokens = review.split() 

    # clean each token 
    clean_tokens =[]
    for token in tokens: 
      token = token.strip() 

      # remove non-alphaneumeric characters
      regex = re.compile('[^a-zA-Z0-9]')
      token = regex.sub('', token)

      # record cleaned token
      if len(token) != 0: 
        clean_tokens.append(token)

    # return list of clean tokens 
    return clean_tokens

  # load stemmer and vectorizers 
  snowball_stemmer = SnowballStemmer(language='english')
  count_vectorizer = load_count_vectorizer()
  tfidf_transformer = load_tfidf_transformer() 
  random_forest = load_random_forest() 

  rf_predictions_pos = []
  rf_predictions_neg = []

  # find predictions for each review
  for index, d in data.iterrows():
    # transform review 
    review = d['text']
    try: 
        eng_stopwords = stopwords.words('english')
    except: 
        nltk.download('stopwords')
        eng_stopwords = stopwords.words('english')
    tokens = get_clean_tokens(review)
    new_tokens = []
    for token in tokens: 
        new_tokens.append(snowball_stemmer.stem(token)) 
    tokens = new_tokens 
    tokens = list(filter(lambda token: token not in eng_stopwords, tokens))
    review_counts = count_vectorizer.transform(pd.DataFrame([review])[0])
    review_tfidf = tfidf_transformer.transform(review_counts)

    rf_prediction = random_forest.predict_proba(review_tfidf)[0]
    
    rf_predictions_pos.append(rf_prediction[1])
    rf_predictions_neg.append(rf_prediction[0])

  # append prediction probabilities into dataframe
  data["RF Positive"] = rf_predictions_pos
  data["RF Negative"] = rf_predictions_neg

  data["Output"] = np.where(data['RF Positive'] >= data["RF Negative"], 1, 0)

  # print percentage of positive reviews
  percent_positive = data["Output"].mean()
  #print("Percentage of positive reviews: ", percent_positive)

  return data, percent_positive

In [26]:
#Predict sentiment of reviews per label 

for key, value in mp.items():
  data = pd.DataFrame(value, columns =['text'])
  results_df, percent_positive = predict_sentiment(data)
  print(key+": "+ str(percent_positive) + " of the reviews are positive")
  print(key+": "+ str(1-percent_positive) + " of the reviews are negative")
  print()

Ambience: 0.7959183673469388 of the reviews are positive
Ambience: 0.20408163265306123 of the reviews are negative

Food: 0.8226950354609929 of the reviews are positive
Food: 0.17730496453900713 of the reviews are negative

Service: 0.8366533864541833 of the reviews are positive
Service: 0.1633466135458167 of the reviews are negative

Price: 0.8307692307692308 of the reviews are positive
Price: 0.16923076923076918 of the reviews are negative

Time: 0.8554216867469879 of the reviews are positive
Time: 0.14457831325301207 of the reviews are negative

