This notebook contains our extended implementation of Lbl2Vec, called Multi-lbl2vec. We add several features to better fit the model to the Yelp Review dataset, a multiclass unsupervised learning problem.

Additions:


1. Outlier detection using LOF
2. Max_Docs parameter
3. Multiclass F-1 scorer

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#Review dataframe
"""

*   Column 1 - Unique Business ID
*   Column 2 - Date of Review
*   Column 3 - Review ID
*   Column 4 - Stars given by the user
*   Column 5 - Review given by the user
*   Column 6 - Type of text entered - Review
*   Column 7 - Unique User ID
*   Column 8 - Cool column: The number of cool votes the review received
*   Column 9 - Useful column: The number of useful votes the review received
*   Column 10 - Funny Column: The number of funny votes the review received <br>

"""

In [None]:
#imports

import pandas as pd
import numpy as np
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import nltk
nltk.download('punkt')
import torch

In [None]:
#Kaggle direct access
! pip install -q kaggle
from google.colab import files
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets list

In [None]:
#Downloads file online
!kaggle datasets download -d yelp-dataset/yelp-dataset
!unzip yelp-dataset.zip

In [None]:
# Reads the Yelp review JSON (one JSON object per line) into a dataframe.
# The file is very large, so it is read lazily in chunks of `size` rows and
# only the first chunk is kept.

#size = 100000
size = 35000
review = pd.read_json('yelp_academic_dataset_review.json', lines=True,
                      dtype={'review_id':str,'user_id':str,
                             'business_id':str,'stars':int,
                             'date':str,'text':str,'useful':int,
                             'funny':int,'cool':int},
                      chunksize=size)

# Take the first chunk (the first `size` reviews) and stop reading immediately.
lst = []
for chunk_review in review:
  lst.append(chunk_review)
  break
df_review = pd.concat(lst)

In [None]:
# Preview the first rows of the loaded review chunk.
df_review.head()

In [None]:
# Load the reviews CSV from Google Drive and keep only its "text" column,
# so `reviews` is a Series of raw review strings from here on.
# reviews = pd.read_csv("/content/drive/My Drive/CS 7650 Final Project/raw_reviews.csv")
reviews = pd.read_csv("/content/drive/My Drive/CS 7650 Final Project/reviews_100000.csv")
reviews = reviews["text"]

In [None]:
#Preprocessing:
#Tokenization - every review string is replaced by the list of tokens that
#nltk.word_tokenize produces for it.

# reviews = df_review["text"].apply(nltk.word_tokenize)
reviews = reviews.apply(nltk.word_tokenize)

In [None]:
#Create mappings between tokens and indices.
#Code Inspired from CS 7650 Projects

from collections import Counter

# Token frequencies over the whole corpus. A Counter keeps its keys in
# first-seen order, so deriving the index mapping from it (instead of from a
# set, whose iteration order for strings changes between interpreter runs)
# makes the token indices reproducible and avoids flattening the corpus twice.
wordCounts = Counter(w for l in reviews for w in l)

#Build dictionaries to map from words to indices and vice versa.
#The first two indices are reserved for the padding and "UNK" tokens.

padding_token = 0
unk_token = 1

word2i = {w: i + 2 for i, w in enumerate(wordCounts)}
i2word = {i: w for w, i in word2i.items()}
# Real tokens occupy indices 2 .. len(word2i) + 1, so together with the two
# reserved indices the vocabulary has len(word2i) + 2 entries. Unlike
# max(word2i.values()) + 1 this is also defined when the corpus is empty.
vocab_size = len(word2i) + 2

#Map a list of sentences from words to indices.
def sentences2indices(reviews, dictionary=word2i):
    """Convert tokenized reviews into lists of vocabulary indices.

    Every token is looked up in ``dictionary``; tokens that are not present
    are mapped to ``unk_token``. Returns one list of indices per review.
    """
    def encode(tokens):
        return [dictionary.get(token, unk_token) for token in tokens]

    return list(map(encode, reviews))
    
def indices2sentence(review, dictionary=i2word):
    """Convert a sequence of vocabulary indices back into tokens.

    Indices that are not present in ``dictionary`` become the string "UNK".
    """
    tokens = []
    for index in review:
        tokens.append(dictionary.get(index, "UNK"))
    return tokens

#Indices: every tokenized review converted to a list of vocabulary indices
X = sentences2indices(reviews, word2i)

In [None]:
#Sample data
#Code Inspired from CS 7650 Projects
print("vocab size:", vocab_size)
print()

# Use .get() so that a small corpus (where "the" or index 47983 does not
# exist) prints a placeholder instead of aborting the cell with a KeyError.
print("index of word 'the':", word2i.get("the", unk_token))
print("word of index 47983:", i2word.get(47983, "UNK"))
print()

# Slicing copes with fewer than two reviews; indexing X[0] and X[1] would not.
for indices in X[:2]:
    print(" ".join([i2word.get(w,'UNK') for w in indices]))

print()

if X:
    print(X[0])
    print(indices2sentence(X[0]))

In [None]:
#Padding and truncating:

def padding(X, max_review_length=100):
  """Pad or truncate every index sequence to exactly ``max_review_length``.

  Args:
    X: Sequence of index sequences (one per review).
    max_review_length: Width of the returned matrix.

  Returns:
    A ``torch.long`` tensor of shape ``(len(X), max_review_length)``. Shorter
    reviews are right-padded with 0, longer reviews are cut off.
  """
  # Allocate the final matrix directly. Padding everything to the longest
  # review first and slicing afterwards both wastes memory and yields a matrix
  # narrower than max_review_length whenever all reviews are short.
  X_padded = torch.zeros((len(X), max_review_length), dtype=torch.long)
  for row, indices in enumerate(X):
    clipped = torch.as_tensor(indices[:max_review_length], dtype=torch.long)
    X_padded[row, :len(clipped)] = clipped
  return X_padded

# Pad/truncate the indexed reviews to at most 100 tokens and show the resulting shape.
X_padded = padding(X, max_review_length = 100)
print(X_padded.shape)

In [None]:
# Step 3: Creating topics list and their associated keywords 

# Keywords: a list of lists with keywords (strings) that describe a corresponding review topic. For more details on how we found the keywords, look at "lbl2vec_library.ipynb"

# Labels: a list of topics to classify reviews (index of topic must correspond to the index of associated keywords)

# Topic name -> keywords describing that topic. The insertion order of the
# topics defines the label index used everywhere else (0 = Ambience, ...).
labels_mp = {}
labels_mp["Ambience"] = ["music","atmosphere", "environment", "patio", "rooftop", "indoor", "outdoor", "seating", "location", "decor", "lighting", "vibe", "cold", "warm"]
labels_mp["Food"] = ["yum","vegan","spicy","salty","pasta","wings","sushi", "taste", "menu", "food", "delicious", "yummy", "disgusting","choices", "fresh", "flavor", "chicken","meal","curry"]
# "waiter" used to be listed twice; the duplicate has been dropped.
labels_mp["Service"] = ["professional", "hire","service", "server", "waiter", "staff", "friendly", "rude", "waitstaff", "attentive", "talkative", "tip"]
labels_mp["Price"] = ["dollars","free","$", "price", "cost", "expensive", "money", "cheap", "student", "overpriced", "economical", "luxury", "reasonable"]
labels_mp["Time"] = ["time", "weekends", "busy", "reservations", "slow", "crowded", "waiting", "rush", "hours", "minutes", "long", "tables", "fast"]

keywords = list(labels_mp.values())
labels = list(labels_mp.keys())

# Create a dataframe with labels and corresponding keywords
labels_df = pd.DataFrame(list(zip(labels, keywords)), columns=['label', 'keywords'])

In [None]:
# Documents: a list of TaggedDocuments (each document (review) is represented as a list of tokens).
# Every review is tagged with its position, so document vectors can later be
# looked up by that integer. Only the first 70000 reviews are used.

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(reviews[:70000])]

# Define Doc2Vec training parameters
# (300-dimensional vectors; later cells reshape document vectors to (1, 300)).
# NOTE(review): per the gensim Doc2Vec documentation, dm=0 selects PV-DBOW and
# dbow_words=1 additionally trains word vectors, which the keyword lookup in
# get_similar_documents depends on - confirm against the installed gensim version.
doc2vec_parameters = {"documents": documents,
                      "epochs": 10,
                      "vector_size": 300,
                      "min_count": 35,
                      "window": 15,
                      "sample": 1e-5,
                      "negative": 5,
                      "workers": 3,
                      "hs": 1,
                      "dm": 0,
                      "dbow_words": 1}

# Create Doc2Vec Model (passing "documents" to the constructor trains it right away)
doc2vec_model = Doc2Vec(**doc2vec_parameters)

In [None]:
# Load a previously saved model from Google Drive. This rebinds doc2vec_model,
# so any model trained earlier in the notebook is replaced by the loaded one.
#doc2vec_model.save("doc2vec_model_large")
doc2vec_model = Doc2Vec.load("/content/drive/My Drive/CS 7650 Final Project/doc2vec_model_large")

In [None]:
# Find documents that are similar to the keywords and add their document keys and similarity scores
def get_similar_documents(doc2vec_model, keywords, similarity_threshold, max_docs):
  """Return the documents most similar to one label's keywords.

  Args:
    doc2vec_model: Doc2Vec model whose word vectors (``wv``) and document
      vectors (``docvecs``) are queried.
    keywords: Keyword strings describing one label.
    similarity_threshold: Only documents whose similarity is strictly greater
      than this value are kept.
    max_docs: Maximum number of documents to return; ``None`` means every
      document in the model.

  Returns:
    A ``pd.Series`` indexed by ``['document_keys', 'similarity_scores']`` that
    holds two parallel lists in the order ``most_similar`` returned them.
    Both lists are empty when none of the keywords is in the model vocabulary
    or no document exceeds the threshold.
  """
  document_keys = []
  similarity_scores = []

  if max_docs is None:
    max_docs = len(doc2vec_model.docvecs)

  # gensim 4 renamed `wv.vocab` to `wv.key_to_index`; accept either so the
  # function works with both major versions.
  vocabulary = getattr(doc2vec_model.wv, "key_to_index", None)
  if vocabulary is None:
    vocabulary = doc2vec_model.wv.vocab

  # Keep only keywords the model knows, de-duplicated in their original order.
  filtered_keywords_list = [keyword for keyword in dict.fromkeys(keywords) if keyword in vocabulary]

  # Without any known keyword there is nothing to compare documents against.
  if filtered_keywords_list:
    keyword_vectors = [doc2vec_model.wv[keyword] for keyword in filtered_keywords_list]
    similar_documents = doc2vec_model.docvecs.most_similar(positive=keyword_vectors, topn=max_docs)

    # Iterate over what was actually returned: most_similar can yield fewer
    # than max_docs results, in which case indexing by range(max_docs) raises
    # an IndexError. The early break relies on the results being sorted by
    # decreasing similarity.
    for document_key, similarity_score in similar_documents:
      if similarity_score <= similarity_threshold:
        break
      document_keys.append(document_key)
      similarity_scores.append(similarity_score)

  return pd.Series([document_keys, similarity_scores], index=['document_keys', 'similarity_scores'])

# Only documents with a similarity strictly above this value are used for a label.
similarity_threshold = 0.45
# The last argument (5) is max_docs: at most five documents are kept per label.
labels_df[['document_keys', 'similarity_scores']] = labels_df['keywords'].apply(lambda row: get_similar_documents(doc2vec_model, row, similarity_threshold, 5))

# Verify that every label has documents for calculating label embeddings
if len(labels_df[labels_df['document_keys'].str.len() != 0]) != len(labels_df):
  print('Model did not find documents for every label.') # Solution: lower similarity_threshold to increase the number of similar documents found for each label

In [None]:
from sklearn.neighbors import LocalOutlierFactor

# Get document vectors from document keys
def get_document_vectors(doc2vec, doc_keys):
  """Look up ``doc2vec[key]`` for every key in ``doc_keys``, preserving order."""
  vectors = []
  for doc_key in doc_keys:
    vectors.append(doc2vec[doc_key])
  return vectors

# Attach to every label the vectors of the documents selected for it.
labels_df['document_vectors'] = labels_df['document_keys'].apply(lambda row: get_document_vectors(doc2vec_model, row))

# Calculate centroid of document vectors as new label vector
def get_centroid(doc_vectors):
  """Return the centroid (element-wise mean) of a collection of vectors.

  Args:
    doc_vectors: Non-empty sequence of equal-length 1-D vectors.

  Returns:
    A 1-D ``np.ndarray`` with one mean value per vector dimension.

  Raises:
    ValueError: If ``doc_vectors`` is empty or does not form a 2-D array,
      e.g. because no documents were found for a label.
  """
  vectors = np.asarray(doc_vectors)
  if vectors.ndim != 2 or vectors.shape[0] == 0:
    raise ValueError(
        "get_centroid needs a non-empty list of equal-length vectors, "
        "got an array of shape %s" % (vectors.shape,))
  # One vectorized reduction instead of a Python loop over every dimension.
  return vectors.mean(axis=0)


#### Outlier Detection ###

#We use Local Outlier Factor Method

def remove_outliers(doc_vectors, n_neighbors=2):
  """Drop the document vectors that Local Outlier Factor flags as outliers.

  Args:
    doc_vectors: List of document vectors belonging to one label.
    n_neighbors: Number of neighbours LOF compares every vector against.
      Defaults to 2, the value that used to be hard-coded.

  Returns:
    The vectors LOF predicts as inliers (prediction == 1), in their original
    order. Lists with fewer than two vectors are returned unchanged, and so
    is the input when LOF would reject every vector.
  """
  num_vectors = len(doc_vectors)
  # Outliers are only meaningful with at least two vectors per label.
  if num_vectors < 2:
    return doc_vectors

  # A vector has at most num_vectors - 1 neighbours besides itself.
  effective_neighbors = max(1, min(n_neighbors, num_vectors - 1))
  lof_predictions = LocalOutlierFactor(n_neighbors=effective_neighbors).fit_predict(doc_vectors)
  inliers = [vector for vector, prediction in zip(doc_vectors, lof_predictions) if prediction == 1]

  # Never hand back an empty list: the label centroid could not be computed.
  return inliers if inliers else doc_vectors

# Replace each label's document vectors with the subset that survives outlier removal.
labels_df['document_vectors'] = labels_df['document_vectors'].apply(lambda row: remove_outliers(row))

##########################

# The label vector is the centroid of the remaining document vectors.
labels_df['label_vector'] = labels_df['document_vectors'].apply(lambda row: get_centroid(row))

In [None]:
print(len(doc2vec_model.docvecs))

# Document vectors 0-99 form the training set and 100-199 the test set.
# Stacking all rows at once avoids re-allocating the tensor on every
# torch.cat call and does not hard-code the vector size.
trainingData = torch.tensor(np.stack([doc2vec_model.docvecs[i] for i in range(0, 100)]))
testingData = torch.tensor(np.stack([doc2vec_model.docvecs[j] for j in range(100, 200)]))

print(trainingData.shape)
print(testingData.shape)

In [None]:
# Load the manually assigned labels of the training reviews (rows 0-99 of
# Data.csv) and convert them to tuples of label indices. A review without a
# second label yields a 1-tuple.
from pandas.io.formats.format import NA
idxToStringMap = {0:"Ambience", 1:"Food", 2:"Service", 3:"Price", 4:"Time"}
# Derived from idxToStringMap so the two mappings cannot drift apart.
stringToIdxMap = {name: idx for idx, name in idxToStringMap.items()}

df = pd.read_csv("/content/drive/My Drive/CS 7650 Final Project/Data.csv")
trainLabels = df.iloc[0 : 100][['Label-1', 'Label-2']]

labelStr = list(trainLabels.itertuples(index=False, name = None))

checkedLabelsTrain = []
for firstLabel, secondLabel in labelStr:
  firstTup = stringToIdxMap[firstLabel.strip()]

  # An empty Label-2 cell is read as a missing value; pd.isna states that
  # intent directly instead of relying on the missing value being a float.
  if pd.isna(secondLabel):
    checkedLabelsTrain.append((firstTup, ))

  else:
    secondTup = stringToIdxMap[secondLabel.strip()]
    checkedLabelsTrain.append((firstTup, secondTup))

In [None]:
# Load the manually assigned labels of the test reviews (rows 100-199 of
# Data.csv) and convert them to tuples of label indices. A review without a
# second label yields a 1-tuple.
from pandas.io.formats.format import NA
idxToStringMap = {0:"Ambience", 1:"Food", 2:"Service", 3:"Price", 4:"Time"}
# Derived from idxToStringMap so the two mappings cannot drift apart.
stringToIdxMap = {name: idx for idx, name in idxToStringMap.items()}

df = pd.read_csv("/content/drive/My Drive/CS 7650 Final Project/Data.csv")
testLabels = df.iloc[100 : 200][['Label-1', 'Label-2']]

labelStr = list(testLabels.itertuples(index=False, name = None))

checkedLabelsTest = []
for firstLabel, secondLabel in labelStr:
  firstTup = stringToIdxMap[firstLabel.strip()]

  # An empty Label-2 cell is read as a missing value; pd.isna states that
  # intent directly instead of relying on the missing value being a float.
  if pd.isna(secondLabel):
    checkedLabelsTest.append((firstTup, ))

  else:
    secondTup = stringToIdxMap[secondLabel.strip()]
    checkedLabelsTest.append((firstTup, secondTup))

In [None]:
!pip install transformers
# Zero-shot classification pipeline built on the facebook/bart-large-mnli
# checkpoint; predict_new_reviews calls it as the global `classifier`.
from transformers import pipeline
classifier = pipeline("zero-shot-classification", model = "facebook/bart-large-mnli")

In [None]:
import torch
import torch.nn as nn
import random
import numpy as np

idxToStringMap = {0:"Ambience", 1:"Food", 2:"Service", 3:"Price", 4:"Time"}

def _refine_label_vectors(keywordEmbeddings, trainingData, checkedLabelsTrain, update_weight):
  """Shared implementation of predict_model_reviews1 and predict_model_reviews2.

  Walks through the training reviews in order. For every review the two
  label vectors with the highest cosine similarity are the prediction. If the
  review has exactly two manually assigned labels, each assigned label that
  is missing from the prediction has its vector replaced by
  ``(label_vector + update_weight * review_vector) / 2``. Reviews with a
  single assigned label never trigger an update. Updates take effect
  immediately, so later reviews are compared against the updated vectors.

  Args:
    keywordEmbeddings: Optional sequence of initial label vectors. When
      ``None`` the vectors are read from the global
      ``labels_df['label_vector']``.
    trainingData: 2-D tensor with one review vector per row.
    checkedLabelsTrain: Per review, a tuple of manually assigned label indices.
    update_weight: Weight of the review vector in an update.

  Returns:
    A new list with one (possibly updated) tensor per label.
  """
  if keywordEmbeddings is None:
    labelsList = [torch.tensor(vector) for vector in labels_df['label_vector']]
  else:
    labelsList = [torch.as_tensor(vector) for vector in keywordEmbeddings]

  cos = nn.CosineSimilarity(dim = 0)

  for rowIdx, reviewVector in enumerate(trainingData):
    similarities = torch.tensor([float(cos(reviewVector, labelVector)) for labelVector in labelsList])
    topTwoIdxSet = {int(idx) for idx in torch.topk(similarities, 2)[1]}

    labelledIdxs = checkedLabelsTrain[rowIdx]
    if len(labelledIdxs) != 2:
      continue

    for labelledIdx in labelledIdxs:
      if labelledIdx not in topTwoIdxSet:
        # Pull the missed label's vector towards this review.
        labelsList[labelledIdx] = (labelsList[labelledIdx] + update_weight * reviewVector) / 2

  return labelsList


def predict_model_reviews1(keywordEmbeddings, trainingData, checkedLabelsTrain):
  """Refine the label vectors on the training reviews with update weight 1/8.

  Args:
    keywordEmbeddings: Optional initial label vectors; ``None`` uses the
      global ``labels_df['label_vector']``.
    trainingData: 2-D tensor with one review vector per row.
    checkedLabelsTrain: Per review, a tuple of manually assigned label indices.

  Returns:
    List with one refined label tensor per label.
  """
  return _refine_label_vectors(keywordEmbeddings, trainingData, checkedLabelsTrain, 1/8)


def predict_model_reviews2(keywordEmbeddings, trainingData, checkedLabelsTrain):
  """Refine the label vectors on the training reviews with update weight 1.

  Identical to predict_model_reviews1 except for the larger update weight.

  Args:
    keywordEmbeddings: Optional initial label vectors; ``None`` uses the
      global ``labels_df['label_vector']``.
    trainingData: 2-D tensor with one review vector per row.
    checkedLabelsTrain: Per review, a tuple of manually assigned label indices.

  Returns:
    List with one refined label tensor per label.
  """
  return _refine_label_vectors(keywordEmbeddings, trainingData, checkedLabelsTrain, 1/1)


def predict_new_reviews(keywordEmbeddings, checkedLabelsTest, data, labelsList):
  """Evaluate label vectors on the manually labelled test reviews and print F1 scores.

  For every review the two labels with the highest cosine similarity are the
  prediction. For the first 45 reviews the zero-shot ``classifier`` may
  override them (see the note in the loop). Only reviews with more than one
  manually assigned label are scored: a predicted label that is among the
  assigned labels counts as 1 true positive, any other predicted label as 0.5
  false positive, and an assigned label that was not predicted as 0.5 false
  negative.

  Args:
    keywordEmbeddings: Unused; kept for call compatibility.
    checkedLabelsTest: Per review, a tuple of manually assigned label indices.
    data: 2-D tensor with one review vector per row, aligned with
      ``checkedLabelsTest``.
    labelsList: Indexable collection of at least five label vectors in the
      order Ambience, Food, Service, Price, Time.

  Returns:
    None. The per-class F1 scores and their macro average are printed. A
    class without any counted prediction or assigned label gets F1 0 instead
    of NaN.
  """
  #Test against 100 manually verified reviews
  candidate_labels = ["Ambience", "Food", "Service", "Price", "Time"]
  labelToIdx = {label: idx for idx, label in enumerate(candidate_labels)}
  numLabels = len(candidate_labels)
  mnliRowLimit = 45

  # One row per label vector; reshape(1, -1) avoids hard-coding the vector size.
  labelsTensor = torch.cat([torch.as_tensor(labelsList[idx]).reshape(1, -1) for idx in range(numLabels)], 0)

  # Columns: 0 = true positives, 1 = false positives, 2 = false negatives.
  classScoresMatrix = torch.zeros(numLabels, 3)

  cos = torch.nn.CosineSimilarity(dim = 1)

  df = pd.read_csv("/content/drive/My Drive/CS 7650 Final Project/Data.csv")
  reviews = df.iloc[100 : 200].text

  for rowIdx in range(len(checkedLabelsTest)):

    cosSimVector = cos(data[rowIdx], labelsTensor)
    topIdx, secondIdx = (int(idx) for idx in torch.topk(cosSimVector, 2)[1])

    if rowIdx < mnliRowLimit:
      mnliRes = classifier(reviews.iloc[rowIdx], candidate_labels)

      # The zero-shot pipeline returns 'labels' and 'scores' sorted by
      # decreasing score, so positions 0 and 1 are the best and second-best
      # label, not Ambience and Food. Map the returned label names back to
      # their indices instead of forcing indices 0 and 1.
      # NOTE(review): assumes the intent was "trust MNLI when it is confident
      # enough" - confirm the thresholds 0.6 / 0.15 were tuned for that.
      if mnliRes['scores'][0] > 0.6:
        topIdx = labelToIdx[mnliRes['labels'][0]]

      if mnliRes['scores'][1] > 0.15:
        secondIdx = labelToIdx[mnliRes['labels'][1]]

    topTwoTup = (topIdx, secondIdx)

    if len(checkedLabelsTest[rowIdx]) > 1:

      for num in topTwoTup:
        if num in checkedLabelsTest[rowIdx]:
          classScoresMatrix[num][0] += 1
        else:
          classScoresMatrix[num][1] += 0.5

      for num in checkedLabelsTest[rowIdx]:
        if num not in topTwoTup:
          classScoresMatrix[num][2] += 0.5

  f1Scores = []
  for idx in range(numLabels):
    truePos, falsePos, falseNeg = (float(value) for value in classScoresMatrix[idx])

    # Guard the denominators: 0/0 on tensors silently yields NaN, which would
    # turn both the class score and the macro average into NaN.
    precision = truePos / (truePos + falsePos) if (truePos + falsePos) > 0 else 0.0
    recall = truePos / (truePos + falseNeg) if (truePos + falseNeg) > 0 else 0.0

    f1Score = 0.0

    if (recall + precision) != 0:
      f1Score = (2 * recall * precision) / (recall + precision)

    f1Scores.append(f1Score)

    print("F1 Score for " + str(idxToStringMap[idx]) + ": " + str(float(f1Score)))

  print()
  print("Macro F1 Score : " + str(sum(f1Scores) / len(f1Scores) ))
  print()
  

firstModelVecs = predict_model_reviews1(None, trainingData, checkedLabelsTrain)
secondModelVecs = predict_model_reviews2(None, trainingData, checkedLabelsTrain)

# Final label vectors: Ambience and Food from the first run, Service as the
# unrefined centroid from labels_df, Price and Time from the second run.
currentModelVecs = torch.cat([firstModelVecs[0].reshape(1, 300), firstModelVecs[1].reshape(1, 300), torch.tensor(labels_df['label_vector'][2].reshape(1,300)), secondModelVecs[3].reshape(1,300), secondModelVecs[4].reshape(1, 300)])

# testingData holds the test reviews, so it must be scored against the test
# labels. Passing checkedLabelsTrain here compared the predictions with the
# labels of different reviews.
predict_new_reviews(None, checkedLabelsTest, testingData, currentModelVecs)




In [None]:
#Saving model which is equivalent to saving the trained label vectors of dimensions (1, 300)
# The label vectors are recomputed here with the same calls as in the
# evaluation cell, so this cell can be run on its own once the training data exists.
firstModelVecs = predict_model_reviews1(None, trainingData, checkedLabelsTrain)
secondModelVecs = predict_model_reviews2(None, trainingData, checkedLabelsTrain)

# Five rows of 300 values: Ambience, Food, Service, Price, Time.
currentModelVecs = torch.cat([firstModelVecs[0].reshape(1, 300), firstModelVecs[1].reshape(1, 300), torch.tensor(labels_df['label_vector'][2].reshape(1,300)), secondModelVecs[3].reshape(1,300), secondModelVecs[4].reshape(1, 300)])

# Write the label vectors to labels.csv, one comma-separated row per label.
from numpy import savetxt
savetxt('labels.csv', currentModelVecs, delimiter=",")