In [1]:
#This notebook is for evaluating different topic models
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

In [6]:
%set_env WANDB_MODE=offline
%set_env WANDB_NOTEBOOK_NAME='./comparison_different_TM.ipynb'

env: WANDB_MODE=offline
env: WANDB_NOTEBOOK_NAME='./comparison_different_TM.ipynb'


In [7]:
#import required packages

#typical imports
import random
from tqdm import tqdm
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
import json
from multiprocessing import Pool
from time import time

#nltk imports
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

#transformer imports
from top2vec import Top2Vec
import top2vec
#miscellaneous
import fasttext
import wandb



In [8]:
#initialize W&B -> doesn't work because of problems in online connection. Maybe solving SSL certificate error will also resolve this. 
import wandb 
# wandb.login()
wandb.init(
  project='comparison_topic_modelling_booking_reviews',
  config={
      'dataset':'booking.com reviews (randomly sampled 100k)',
      'top_k_words':20
  }
)

[34m[1mwandb[0m: You can sync this run to the cloud by running:
[34m[1mwandb[0m: [33mwandb sync /home/akash/topic_modelling/wandb/offline-run-20210617_214334-2bza8col[0m


[34m[1mwandb[0m: W&B syncing is set to `offline` in this directory.  Run `wandb online` or set WANDB_MODE=online to enable cloud syncing.


In [9]:
#load fastText embeddings
ft_embeddings = fasttext.load_model('models/fast_text_embeddings.bin')



In [10]:
#load the models
# doc2vec = Top2Vec.load('models/doc2vec_top2vec_without_preprocessing.bin')
# use = Top2Vec.load('models/use_top2vec_without_preprocessing.bin')
# sbert = Top2Vec.load('models/sbert_top2vec_without_preprocessing.bin')
# tsdae = Top2Vec.load('models/tsdae_top2vec_without_preprocessing.bin')

In [11]:
#Some statistics about topic models
# print("Total num of topics in USE:", use.get_num_topics())
# print("Total num of topics in doc2vec:", doc2vec.get_num_topics())
# print("Total num of topics in sentence-bert:", sbert.get_num_topics())
# print("Total num of topics in TSDAE:", tsdae.get_num_topics())

In [12]:
with open("data/booking_sentences.txt", 'r') as f:
    documents = f.readlines()

In [13]:
#Function to calculate information gain of a dataset docs and top k representative words
def calc_IG(words, docs):
  """calculate information gain for a topic in topic model containing words and documents
  args:
    words (list): list of top words in the topic. 
    docs (list): list of docs in the topic
  returns:
    ig (float): information gain of this topic with respect to total independence b/w words and docs"""
  #sanity check
  assert(len(words)!=0)
  assert(len(docs)!=0)
  words = list(set(words)) #drop the redundant words
  #convert words to lower case
  words = [word.lower() for word in words]
  #tokenize the docs using nltk punkt
  tokenized_docs = [word_tokenize(doc.lower()) for doc in docs]
  #create inverted index between words and docs
  inverted_index = np.zeros((len(words),len(docs)))
  for i in range(len(words)):
    for j in range(len(docs)):
      inverted_index[i][j] = tokenized_docs[j].count(words[i])
  # print(words)
  # print(inverted_index)
  ig = 0.0
  P_d = 1/len(docs)
  P_w = 1/len(words)
  for i in range(len(words)):
    for j in range(len(docs)):
      if np.sum(inverted_index[i]) != 0:
        P_dw = inverted_index[i][j]/np.sum(inverted_index[i]) #total number of documents in which w occurs
        if P_dw != 0:
          ig += P_dw*P_w*np.log2(P_dw/P_d)
  
  return ig     

#test
# doc1 = "The big sharks of Belgium drink beer."
# doc2 = "Belgium has great beer. They drink beer all the time."
# doc3 = "They drink a lot of beer in Belgium"
# doc4 = "cha cha"
# words = ["so", "what"]
# docs = [doc1, doc2, doc3, doc4]
# calc_IG(words, docs)

In [14]:
#Function to calculate total information gain for a given model
def calculate_total_information_gain(model, k_words = 10):
  """This function calculates the total information gain of the model.
  args:
    model (Top2Vec): Top2Vec model used in encoding word and document vectors
    n_topics(int): Number of topics to reduce the model to
    k_words(int): To consider top k words (by distance) for each topic while calculating IG
  returns:
    total_IG (float): sum of information gain of all the clusters""" 
  
  #get topic words and ids
  topic_words, similarity_scores, topic_ids = model.get_topics(reduced=True)
  
  #Get the keywords of each topic
  #create a dictionary with key as topic_id and values as list of document ids
  # print("Building topic index for keywords...")
  topic_words_dict = {}
  for i in topic_ids:
    topic_words_dict[i] = topic_words[i][:k_words] #top k words

  #Get the topic number of each document
  #create a dictionary with key as topic_id and values as list of document ids
  # print("Building topic index for documents...")
  topic_docs_dict = {}
  for i in topic_ids: 
    topic_docs_dict[i] = []

  for i in range(len(documents)):
    topic_id, _, _, _ = model.get_documents_topics([i], reduced=True)
    topic_docs_dict[topic_id[0]].append(i)

  
  #get documents and keywords of each cluster and calculate the information gain
  # print("Calculating information gain...")
  total_ig = 0
  for key in topic_ids:
    words = topic_words_dict[key]
    doc_ids = topic_docs_dict[key]
    docs = np.take(documents, doc_ids)
    topic_ig = calc_IG(words, docs)
    # print("\nInformation gain for topic", key, ": ", topic_ig)
    total_ig += topic_ig

  return total_ig

In [15]:
def cos_similarity(vector1, vector2):
  """This functions finds the similarity between doc1 and doc2 using cosine similarity. 
  args:
    vector1(list-like): document embedding 1
    vector2(list-like): document embedding 2
  returns:
    cos_sim(float): cosine similarity score between vector1 and vector2"""
  cos_sim = np.dot(vector1, vector2)/(np.linalg.norm(vector1)*np.linalg.norm(vector2))
  return cos_sim


In [19]:
def calculate_total_coherence(model, k_words = 20):
  """calculates total coherence for a model, based on sum of pairwise word similarity of top k words in different topics""" 
  def calculate_coherence(words):
    """calculates coherence for top-k words using fastText embeddings by summing up the pairwise cosine distance
    args:
      words (list-like): list of words
    returns:
      coherence_score (float)"""
    #assert that there are no duplicate words in the list
    num_words = len(words)
    coherence_score = 0
    for i in range(num_words):
      for j in range(num_words):
        if i != j:
          vec1 = ft_embeddings[words[i]]
          vec2 = ft_embeddings[words[j]]
          if cos_similarity(vec1, vec2) != np.nan:
            cosine_dist = cos_similarity(vec1, vec2) 
            coherence_score += cosine_dist
    coherence_score = coherence_score/num_words
    return coherence_score

  #get top k words for each topic
  #get topic words and ids
  topic_words, similarity_scores, topic_ids = model.get_topics(reduced=True)
  
  #Get the keywords of each topic
  #create a dictionary with key as topic_id and values as list of document ids
  # print("Building topic index for keywords...")
  topic_words_dict = {}
  for i in topic_ids:
    topic_words_dict[i] = topic_words[i][:k_words] #top k words
  
  avg_coherence = 0
  for topic_id in topic_ids:
    avg_coherence += calculate_coherence(topic_words_dict[topic_id])
  avg_coherence = avg_coherence/len(topic_ids)
  
  return avg_coherence
  


In [None]:
use = Top2Vec.load('models/use_top2vec.bin')

In [17]:
def evaluate_model(args):
  model_path, n_topics, model_name = args[0], args[1], args[2]

  #open the model from file. (We don't pass it as an arg to not overload the memory)
  model = Top2Vec.load(model_path)

  print("\nReducing number of topics to", n_topics, "...")
  model.hierarchical_topic_reduction(n_topics)
    
  print(f"calculating IG for {model_name} for {n_topics} topics...")
  info_gain = calculate_total_information_gain(model, k_words=20)
    
  print(f"calculating coherence for {model_name} for {n_topics} topics...")
  coherence = calculate_total_coherence(model, k_words=20)

  wandb.log({'num_topics':n_topics, model_name+'_ig':info_gain, model_name+'_coherence':coherence})

  run = []
  run.append({'metric':'IG', 'model':model_name, 'num_topics':n_topics, 'top_k_words':20, 'score':info_gain})
  run.append({'metric':'coherence', 'model':model_name, 'num_topics':n_topics, 'top_k_words':20, 'score':coherence})

  with open("comparison_different_topic_models/run3.txt", 'a') as f:
    json.dump(run, f)

  #free up the memory
  del model


In [18]:
#find total information gain and total coherence

models = {'doc2vec':'./models/doc2vec_top2vec_without_preprocessing.bin', 
          'use':'./models/use_top2vec_without_preprocessing.bin', 
          'sbert':'./models/sbert_top2vec_without_preprocessing.bin', 
          'tsdae':'./models/tsdae_top2vec_without_preprocessing.bin'}

num_topics = list(range(10,11,10))
#create args
args = []
for key in models:
  for i in num_topics:
    args.append([models[key], i, key])
start = time()
pool = Pool()
pool.map(evaluate_model, args)
stop = time()
print(f"That took {stop-start} seconds.")

wandb.finish()




Reducing number of topics to 10 ...

Reducing number of topics to 10 ...

Reducing number of topics to 10 ...

Reducing number of topics to 10 ...
calculating IG for use for 10 topics...
calculating coherence for use for 10 topics...
calculating IG for doc2vec for 10 topics...
calculating coherence for doc2vec for 10 topics...
calculating IG for sbert for 10 topics...
calculating IG for tsdae for 10 topics...
calculating coherence for sbert for 10 topics...
  cos_sim = np.dot(vector1, vector2)/(np.linalg.norm(vector1)*np.linalg.norm(vector2))
calculating coherence for tsdae for 10 topics...
That took 6181.319442987442 seconds.


[34m[1mwandb[0m: You can sync this run to the cloud by running:
[34m[1mwandb[0m: [33mwandb sync /home/akash/topic_modelling/wandb/offline-run-20210617_214416-2rnak30p[0m


In [None]:
#plot results using matplotlib
df = pd.DataFrame(runs)
df.plot()

In [None]:
import matplotlib.pyplot as plt
plt.plot(x,y_use[:8], label="use")
plt.plot(x,y_sbert, label="sbert")
# plt.plot(x, y_doc2vec, label="doc2vec")
plt.legend()
plt.xlabel("Number of topics")
plt.ylabel("Average Topic Coherence")
plt.title("Coherence score for Top 20 words")
plt.show()

In [None]:
# def coherence(doc_vectors):
#   """This function calculates the topic coherence by taking pairwise cosine similarity
#   args:
#     doc_vectors(list-like): list of document vectors(512 or 768 dim)
#   returns:
#     coherence_score(float)"""
#   num_docs = len(doc_vectors)
#   coherence_score = 0
#   similarity_matrix = np.zeros((num_docs, num_docs))
#   for i in range(num_docs):
#     for j in range(num_docs):
#       if i!=j: #the similarity of doc to itself is 1 but we don't count it, so in this matrix it is 0
#         coherence_score += cos_similarity(doc_vectors[i], doc_vectors[j])

#   return coherence_score

In [None]:
#upload to W&B
#Set up weights and biases to visualize the training of TSDAE
!pip install wandb -qqq
import wandb
wandb.login()
wandb.init(
  project='comparison_of_different_TM',
  config={
      'dataset':'booking.com reviews (randomly sampled 100k)',
  }
)

In [None]:
logs = []
import pandas as pd
df = pd.DataFrame(logs)
print(df.head())
logs = {}
logs['doc2vec_ig'] = df[df['metric']=='IG'][df['model']=='doc2vec'].score.values
logs['use_ig'] = df[df['metric']=='IG'][df['model']=='use'].score.values
logs['tsdae_ig'] = df[df['metric']=='IG'][df['model']=='tsdae'].score.values
logs['sbert_ig'] =  df[df['metric']=='IG'][df['model']=='sbert'].score.values

logs['doc2vec_coherence'] = df[df['metric']=='coherence'][df['model']=='doc2vec'].score.values
logs['use_coherence'] = df[df['metric']=='coherence'][df['model']=='use'].score.values
logs['tsdae_coherence'] = df[df['metric']=='coherence'][df['model']=='tsdae'].score.values
logs['sbert_coherence'] = df[df['metric']=='coherence'][df['model']=='sbert'].score.values

for key in logs:
  num_topics = 10
  for val in logs[key]:
    wandb.log({key: val, 'num_steps':num_topics})
    num_topics +=10
wandb.finish()