
RUN AS JUPYTER NOTEBOOK AS COLAB DOES NOT SUPPORT PYTHON VERSION >= 3.8

This file uses the standard lbl2vec library out of the box:

1. Analyzes reviews to find the top 5 labels and their associated keywords
2. Trains a doc2vec model to get a vector for each document
3. Imports the standard lbl2vec library and uses the trained doc2vec model
4. Adds a custom F1 scorer adapted for multiclass labeling

In [None]:
#imports
from lbl2vec import Lbl2Vec
import pandas as pd
from gensim.utils import simple_preprocess
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from gensim.parsing.preprocessing import strip_tags
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
# import torch

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aditjindal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
#Kaggle direct access - jupyter
#The "!pip" line is an IPython shell escape, so this cell only runs inside a notebook.
#NOTE(review): od.download presumably needs Kaggle API credentials - confirm before running unattended.

!pip install opendatasets
import opendatasets as od
od.download("https://www.kaggle.com/datasets/yelp-dataset/yelp-dataset")

In [None]:
#Reads review json file. The file is very large, so it is read in chunks and
#only the first chunk (the first "size" reviews) is kept.

size = 35000
# With `chunksize`, read_json returns a lazy reader rather than a DataFrame.
# Using the reader as a context manager closes the underlying file as soon as
# the first chunk has been taken, instead of leaving the handle open.
with pd.read_json('yelp_academic_dataset_review.json', lines=True,
                  dtype={'review_id':str,'user_id':str,
                         'business_id':str,'stars':int,
                         'date':str,'text':str,'useful':int,
                         'funny':int,'cool':int},
                  chunksize=size) as review:
    #Gets first "size" reviews
    lst = []
    for chunk_review in review:
        lst.append(chunk_review)
        break  # one chunk is enough; the rest of the file is never parsed
df_review = pd.concat(lst)

In [None]:
#Reads review files:
#  df_reviews       <- "raw_reviews_labelled.csv"
#  df_reviews_large <- "reviews_100000.csv"

df_reviews, df_reviews_large = (
    pd.read_csv(csv_path)
    for csv_path in ("raw_reviews_labelled.csv", "reviews_100000.csv")
)

In [None]:
#Preprocessing:
#  1. discard the first column of each frame
#     (presumably a saved index column - TODO confirm)
#  2. tokenize the "text" column with NLTK

df_reviews = df_reviews.drop(columns=df_reviews.columns[0])
df_reviews_large = df_reviews_large.drop(columns=df_reviews_large.columns[0])

reviews, reviews_large = (
    frame["text"].apply(nltk.word_tokenize)
    for frame in (df_reviews, df_reviews_large)
)

In [None]:
#Create mappings between tokens and indices.
#Code Inspired from CS 7650 Projects

from collections import Counter

# Token frequencies over all tokenized reviews, counted in a single pass.
wordCounts = Counter(w for l in reviews for w in l)

#Build dictionaries to map from words to indices and vice versa.
#Indices 0 and 1 are reserved for the padding and "UNK" tokens.

padding_token = 0
unk_token = 1

# Enumerate the Counter's keys instead of a set(): the Counter keeps
# first-occurrence order, so each word gets the same index on every run,
# whereas set iteration order for strings can change between interpreter runs.
# Reusing the Counter also avoids flattening the corpus a second time.
word2i = {w: i for i, w in enumerate(wordCounts, start=2)}
i2word = {i: w for w, i in word2i.items()}
# Two reserved ids plus one id per distinct word. Unlike max(...) + 1 this is
# also defined (== 2) when the corpus is empty.
vocab_size = len(word2i) + 2

#Map a list of sentences from words to indices.
def sentences2indices(reviews, dictionary=word2i):
    """Encode tokenized reviews as lists of vocabulary indices.

    Each element of ``reviews`` is iterated as a sequence of tokens. A token
    that is missing from ``dictionary`` is encoded as ``unk_token``.
    Note that the default ``dictionary`` is bound when this function is
    defined, not when it is called.
    """
    encoded = []
    for tokens in reviews:
        encoded.append([dictionary.get(token, unk_token) for token in tokens])
    return encoded
    
def indices2sentence(review, dictionary=i2word):
    """Decode one review from vocabulary indices back to tokens.

    An index that is missing from ``dictionary`` is decoded as ``"UNK"``.
    Note that the default ``dictionary`` is bound when this function is
    defined, not when it is called.
    """
    words = []
    for index in review:
        words.append(dictionary.get(index, "UNK"))
    return words

#Indices: every tokenized review encoded as a list of vocabulary indices
#(tokens missing from word2i become unk_token)
X = sentences2indices(reviews, word2i)

In [None]:
#Sample data
#Code Inspired from CS 7650 Projects
print("vocab size:", vocab_size)
print()

# Use .get() so this demo cell does not raise KeyError when the corpus has no
# token 'the' or fewer than 47984 vocabulary entries (e.g. a smaller sample).
print("index of word 'the':", word2i.get("the", unk_token))
print("word of index 47983:", i2word.get(47983, "UNK"))
print()

# First two reviews decoded back to text. Slicing tolerates a corpus with
# fewer than two reviews.
for encoded_review in X[:2]:
    print(" ".join(indices2sentence(encoded_review, i2word)))

print()

# Raw indices of the first review, then the same review decoded to tokens.
if X:
    print(X[0])
    print(indices2sentence(X[0]))

vocab size: 66333

index of word 'the': 29973
word of index 47983: murals

If you decide to eat here , just be aware it is going to take about 2 hours from beginning to end . We have tried it multiple times , because I want to like it ! I have been to it 's other locations in NJ and never had a bad experience . The food is good , but it takes a very long time to come out . The waitstaff is very young , but usually pleasant . We have just had too many experiences where we spent way too long waiting . We usually opt for another diner or restaurant on the weekends , in order to be done quicker .
I 've taken a lot of spin classes over the years , and nothing compares to the classes at Body Cycle . From the nice , clean space and amazing bikes , to the welcoming and motivating instructors , every class is a top notch work out . For anyone who struggles to fit workouts in , the online scheduling system makes it easy to plan ahead ( and there 's no need to line up way in advanced like many gy

In [None]:
# Step 3: Creating topics list and their associated keywords 

In [None]:
#Finding top 360 words based on frequency
#(returns (token, count) pairs; wordCounts is built from raw tokens, so
# punctuation and stop words appear in the list too)

wordCounts.most_common(360)

[('.', 219407),
 ('the', 147451),
 (',', 124723),
 ('and', 123083),
 ('I', 100634),
 ('a', 90495),
 ('to', 80632),
 ('was', 67300),
 ('of', 50741),
 ('is', 44822),
 ('!', 43322),
 ('it', 41850),
 ('for', 40609),
 ('in', 38497),
 ('The', 34295),
 ('with', 28983),
 ('that', 28668),
 ('but', 25558),
 ('my', 24682),
 ('you', 24550),
 ('on', 23955),
 ("n't", 22852),
 ('have', 21821),
 ('had', 21558),
 ("'s", 20791),
 ('were', 20685),
 ('this', 20372),
 ('they', 19618),
 ('we', 19187),
 ('not', 18881),
 ('are', 18319),
 ('food', 17530),
 ('place', 16608),
 ('at', 16604),
 ('good', 16587),
 ('so', 15569),
 ('be', 14935),
 (')', 14347),
 ('as', 13585),
 ('(', 13312),
 ('great', 13145),
 ('very', 12916),
 ('We', 12739),
 ('here', 11856),
 ('out', 11796),
 ('me', 11735),
 ('there', 11426),
 ('It', 10763),
 ('like', 10562),
 ('all', 10550),
 ('time', 10396),
 ('our', 10371),
 ('just', 9955),
 ('get', 9884),
 ('would', 9766),
 ('service', 9696),
 ('from', 9576),
 ('do', 9489),
 ('back', 9243),
 ('

In [None]:
#Finding 95th percentile of frequencies for creating a cutoff frequency to identify keywords
#(the percentile is taken over the per-token counts, one value per distinct token)
np.percentile(list(wordCounts.values()), 95)

70.0

In [None]:
#Creates Labels and keywords
# Maps each label name to the keywords that describe it. The insertion order
# of the labels fixes the order of label_names and label_keys below.
labels_mp = {
    "Ambience": [
        "place", "experience", "bar", "area", "location", "clean", "music",
        "atmosphere", "environment", "patio", "rooftop", "seating", "decor",
        "lighting", "vibe",
    ],
    "Food": [
        "food", "delicious", "menu", "chicken", "fresh", "cheese", "sauce",
        "eat", "pizza", "meal", "salad", "coffee", "burger", "hot", "tasty",
        "flavor", "yum", "beer",
    ],
    "Service": [
        "service", "staff", "friendly", "people", "happy", "server",
        "professional", "hire", "waiter", "rude", "attentive", "tip",
    ],
    "Price": [
        "$", "price", "worth", "dollars", "free", "cost", "expensive",
        "money", "cheap", "overpriced", "economical", "luxury", "reasonable",
    ],
    "Time": [
        "time", "wait", "room", "table", "2", "minutes", "hour", "long",
        "weekends", "busy", "reservations", "slow", "crowded", "rush", "fast",
    ],
}

# Parallel lists: label_names[k] is the label described by label_keys[k].
label_names = list(labels_mp)
label_keys = [labels_mp[name] for name in label_names]

In [None]:
# Documents: a list of TaggedDocuments (each review is a list of tokens,
# tagged with its position inside the training slice).

reviews2 = reviews_large.copy()
# The first 200 reviews are held out for evaluation; the next 70,000 are
# used to train Doc2Vec.
test_data = reviews2[:200]
documents = [
    TaggedDocument(words=tokens, tags=[tag])
    for tag, tokens in enumerate(reviews2[200:70200])
]

# Define Doc2Vec training parameters
doc2vec_parameters = dict(
    documents=documents,
    epochs=10,
    vector_size=300,
    min_count=35,
    window=15,
    sample=1e-5,
    negative=5,
    workers=3,
    hs=1,
    dm=0,
    dbow_words=1,
)

# Create Doc2Vec Model (passing `documents` to the constructor also trains it)
doc2vec_model = Doc2Vec(**doc2vec_parameters)
# doc2vec_model.save("doc2vec_model_large")

In [None]:
#Loading trained doc2vec_model
#(optional: uncomment one of the loads below to reuse a saved model instead of
# the doc2vec_model trained in the previous cell)

# doc2vec_model = Doc2Vec.load("/content/drive/My Drive/CS 7650 Final Project/doc2vec_model_large") #doc2vec trained on 70K reviews
# doc2vec_model = Doc2Vec.load("/content/drive/My Drive/CS 7650 Final Project/doc2vec_model") #doc2vec trained on 30k reviews

#Running lbl2vec
#keywords_list and label_names are parallel lists built from labels_mp.
#NOTE(review): similarity_threshold presumably limits which documents feed each label embedding - confirm against the Lbl2Vec docs.
lbl2vec_model = Lbl2Vec(keywords_list=label_keys, doc2vec_model=doc2vec_model, 
                        label_names=label_names, similarity_threshold = 0.4,
                        epochs = 10)

In [None]:
#Fitting lbl2vec_model
#(the log output shows two stages: loading document/word embeddings, then training label embeddings)
lbl2vec_model.fit()

2022-12-05 12:25:34,592 - Lbl2Vec - INFO - Load document and word embeddings
2022-12-05 12:25:34,594 - Lbl2Vec - INFO - Train label embeddings


In [None]:
#Predicting documents
#Scores the documents already held by the model (the Doc2Vec training set) against each label.
#The result has one row per document: doc_key, most_similar_label,
#highest_similarity_score and one similarity column per label.
model_docs_lbl_similarities = lbl2vec_model.predict_model_docs()

2022-12-05 12:25:40,331 - Lbl2Vec - INFO - Get document embeddings from model
2022-12-05 12:25:40,402 - Lbl2Vec - INFO - Calculate document<->label similarities


In [None]:
#Shows similarity scores on the training data
model_docs_lbl_similarities

Unnamed: 0,doc_key,most_similar_label,highest_similarity_score,Ambience,Food,Service,Price,Time
0,0,Price,0.287185,0.242892,0.095695,0.209143,0.287185,0.271835
1,1,Food,0.455057,0.293761,0.455057,0.244224,0.273743,0.320072
2,2,Price,0.603554,0.555430,0.519190,0.583959,0.603554,0.585078
3,3,Service,0.534049,0.510943,0.496424,0.534049,0.526653,0.532441
4,4,Food,0.432735,0.308291,0.432735,0.265216,0.342118,0.345178
...,...,...,...,...,...,...,...,...
69995,69995,Service,0.360154,0.277638,0.232137,0.360154,0.338433,0.261580
69996,69996,Service,0.310787,0.221131,0.112703,0.310787,0.263512,0.200335
69997,69997,Service,0.758122,0.729824,0.706887,0.758122,0.751771,0.730500
69998,69998,Food,0.505485,0.483829,0.505485,0.501684,0.481419,0.459763


In [None]:
#Evaluation on labeled data
# Wrap each held-out review in a TaggedDocument (tag = its position) and score
# it against the label embeddings.
# NOTE(review): test_data is rebound to its tagged form, so running this cell
# twice would wrap already-tagged documents again.
test_data = [
    TaggedDocument(words=tokens, tags=[doc_id])
    for doc_id, tokens in enumerate(test_data)
]
model_docs_lbl_similarities2 = lbl2vec_model.predict_new_docs(test_data)

2022-12-05 12:28:57,748 - Lbl2Vec - INFO - Calculate document embeddings
2022-12-05 12:28:57,980 - Lbl2Vec - INFO - Calculate document<->label similarities


In [None]:
#Shows similarity scores for the held-out evaluation reviews
model_docs_lbl_similarities2

Unnamed: 0,doc_key,most_similar_label,highest_similarity_score,Ambience,Food,Service,Price,Time
0,0,Service,0.412309,0.310322,0.234235,0.412309,0.361744,0.410905
1,1,Service,0.415801,0.337938,0.226146,0.415801,0.371169,0.313575
2,2,Food,0.442460,0.412480,0.442460,0.333177,0.389110,0.332117
3,3,Food,0.427933,0.353788,0.427933,0.360483,0.353473,0.368821
4,4,Food,0.373938,0.320274,0.373938,0.294779,0.283225,0.300572
...,...,...,...,...,...,...,...,...
195,195,Ambience,0.571758,0.571758,0.499892,0.547320,0.542303,0.476021
196,196,Ambience,0.570258,0.570258,0.478917,0.467977,0.500035,0.519603
197,197,Food,0.360710,0.297469,0.360710,0.217079,0.283373,0.295104
198,198,Food,0.335494,0.290741,0.335494,0.299271,0.241890,0.316849


In [None]:
#Finds top 2 labels
# For every evaluated review, pred1 holds the label with the highest
# similarity score and pred2 the label with the second highest.

# Column position -> label name, in the order the score columns are read below.
mp = {0: 'Ambience', 1: 'Food', 2: 'Service', 3: 'Price', 4: 'Time'}

# Pull all label scores out once as an (n_reviews, 5) array instead of doing
# five .iloc lookups per row inside a Python loop.
label_scores = model_docs_lbl_similarities2[[mp[k] for k in sorted(mp)]].to_numpy(dtype=float)

# argsort is ascending, so the last two positions of each row are the two best
# labels; reversing them puts the best label first.
top2 = np.argsort(label_scores, axis=1)[:, -2:][:, ::-1]

pred1 = [mp[k] for k in top2[:, 0]]
pred2 = [mp[k] for k in top2[:, 1]]

In [None]:
# Saves the fitted Lbl2Vec model under the name "lib2vec"
# NOTE(review): the name is spelled "lib2vec", not "lbl2vec" - confirm this is intended before loading it elsewhere.
lbl2vec_model.save("lib2vec")

In [None]:
#Getting truth values of first 200 reviews
# Data.csv carries two ground-truth labels per review, in the columns
# "Label-1" and "Label-2".

labels = pd.read_csv("Data.csv")
label_1_true, label_2_true = (
    labels[column][:200] for column in ("Label-1", "Label-2")
)

In [None]:
#Calculates f1 score
# Micro-averaged F1 for each (true label column, prediction rank) pairing,
# printed in the order label-1/pred1, label-1/pred2, label-2/pred1, label-2/pred2.

for true_name, y_true in (("label-1", label_1_true), ("label-2", label_2_true)):
    for pred_name, y_pred in (("pred1", pred1), ("pred2", pred2)):
        score = f1_score(list(y_true), y_pred, average='micro')
        print(f"f1 score on comparing true {true_name} with {pred_name}: {score!s}")

f1 score on comparing true label-1 with pred1: 0.47500000000000003
f1 score on comparing true label-1 with pred2: 0.15
f1 score on comparing true label-2 with pred1: 0.18
f1 score on comparing true label-2 with pred2: 0.225


In [None]:
#Using New f-1 score to take into consideration top-2 labels
#Predict top 2 label class for each review of business
#Notice how this new f-1 score makes a better understanding of the model's performance
#
#classScoresMatrix has one row per label and three columns:
#  [0] hits            - a predicted label that is one of the two true labels (+1)
#  [1] false positives - a predicted label that is neither true label (+0.5)
#  [2] false negatives - a true label that is neither of the two predictions (+0.5)

idxToStringMap = {0:"Ambience", 1:"Food", 2:"Service", 3:"Price", 4:"Time"}
StringToidxMap = {"Ambience":0, "Food":1, "Service":2, "Price":3, "Time":4}


def _clean_label(value):
    """Return a true label without surrounding whitespace, or None if it is missing (not a string, e.g. NaN)."""
    return value.strip() if isinstance(value, str) else None


def _ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return float(numerator / denominator) if denominator else 0.0


classScoresMatrix = np.zeros((5, 3))
for first_pred, second_pred, raw_true_1, raw_true_2 in zip(pred1, pred2, label_1_true, label_2_true):
    # Strip once up front so that matching and the matrix lookup see the same
    # text; otherwise a true label such as " Food" never matches "Food".
    true_labels = (_clean_label(raw_true_1), _clean_label(raw_true_2))

    # Reviews that do not have both true labels are left out of the score.
    if None in true_labels:
        continue

    predicted_labels = (first_pred, second_pred)

    for predicted in predicted_labels:
        if predicted in true_labels:
            classScoresMatrix[StringToidxMap[predicted]][0] += 1
        else:
            classScoresMatrix[StringToidxMap[predicted]][1] += 0.5

    for truth in true_labels:
        if truth not in predicted_labels:
            classScoresMatrix[StringToidxMap[truth]][2] += 0.5

print(classScoresMatrix)

f1Scores = []

for idx, (hits, false_pos, false_neg) in enumerate(classScoresMatrix):
    # A class with no counts scores 0.0 instead of NaN, so a single empty
    # class cannot turn the macro average into NaN.
    precision = _ratio(hits, hits + false_pos)
    recall = _ratio(hits, hits + false_neg)

    f1Score = _ratio(2 * recall * precision, recall + precision)
    f1Scores.append(f1Score)

    print("F1 Score for " + str(idxToStringMap[idx]) + ": " + str(float(f1Score)))
    print()

print("Macro F1 Score : " + str(sum(f1Scores) / len(f1Scores) ))

[[32.  13.  11. ]
 [65.   4.  28.5]
 [49.  12.  16.5]
 [15.  19.  14. ]
 [17.  30.   8. ]]
F1 Score for Ambience: 0.7272727272727273

F1 Score for Food: 0.8

F1 Score for Service: 0.774703557312253

F1 Score for Price: 0.47619047619047616

F1 Score for Time: 0.4722222222222222

Macro F1 Score : 0.6500777965995358
