In [None]:
#typical imports
import random
from tqdm import tqdm
import numpy as np
import os

#other imports
# !pip install top2vec -q
# from top2vec import Top2Vec
# import fasttext


In [None]:
!pip uninstall numpy -q
!pip install numpy==1.19.5 -q

In [None]:
#train fastText
# Imported here because the `import fasttext` line in the imports cell is
# commented out, which leaves the name undefined on a fresh kernel.
import fasttext

# NOTE(review): 'booking_sentences_processed.txt' is only downloaded by the
# doc2vec cell further down; make sure the file exists before running this cell.

#cbow model :
ft_embeddings = fasttext.train_unsupervised('booking_sentences_processed.txt', model='cbow')

# Skipgram model :
# ft_embeddings = fasttext.train_unsupervised('fastText_training_data.txt', model='skipgram', verbose=True)


In [None]:
# Write the trained fastText model to disk.
fasttext_model_path = 'fast_text_embeddings.bin'
ft_embeddings.save_model(fasttext_model_path)

In [None]:
#train doc2vec
!wget https://raw.githubusercontent.com/akashjorss/topic_modelling/main/booking_sentences_processed.txt
documents = []
with open('booking_sentences_processed.txt', 'r') as f:
    documents = f.readlines()
# doc2vec = Top2Vec(documents, embedding_model='doc2vec', workers=8, document_ids=list(range(0,len(documents))), speed="deep-learn", keep_documents=False)
# doc2vec.save('doc2vec_top2vec.bin')

In [None]:
#Build model with USE
# Imported here because the `from top2vec import Top2Vec` line in the imports
# cell is commented out, which leaves the name undefined on a fresh kernel.
from top2vec import Top2Vec

# Document ids are simply the positions of the documents in `documents`.
use = Top2Vec(
    documents,
    embedding_model='universal-sentence-encoder',
    workers=8,
    document_ids=list(range(len(documents))),
    keep_documents=False,
)
use.save('use_top2vec.bin')

In [None]:
#Build model with SBert
from sentence_transformers import SentenceTransformer
# Imported here because the `from top2vec import Top2Vec` line in the imports
# cell is commented out, which leaves the name undefined on a fresh kernel.
from top2vec import Top2Vec

#download and save the sbert model
sbert_model = SentenceTransformer('stsb-mpnet-base-v2')
# Fixed: this used to call `sbert.save(...)`, but `sbert` is only assigned
# further down; the object to save is the SentenceTransformer loaded above.
sbert_model.save("./stsb-mpnet-base-v2.pt")

#load and use SBert model
# `embedding_model` defaults to 'doc2vec', in which case `embedding_model_path`
# is not used and a doc2vec model is trained instead. Naming a
# sentence-transformers model type makes Top2Vec load the encoder from
# `embedding_model_path`.
# NOTE(review): confirm the accepted `embedding_model` names against the
# installed Top2Vec version.
sbert = Top2Vec(
    documents,
    embedding_model='distiluse-base-multilingual-cased',
    embedding_model_path='./stsb-mpnet-base-v2.pt',
    workers=8,
    document_ids=list(range(len(documents))),
    keep_documents=False,
)
sbert.save('sbert_top2vec.bin')

In [None]:
# Download NLTK's "punkt" resource.
# NOTE(review): presumably needed by the TSDAE denoising dataset used further
# down in this notebook - confirm.
import nltk

punkt_resource = 'punkt'
nltk.download(punkt_resource)

In [None]:
#Set up weights and biases to visualize the training of TSDAE
!pip install wandb -qqq
import wandb
wandb.login()
wandb.init(
  project='topic_modelling_booking_reviews',
  config={
      'pretrained_model':'bert-base-uncased',
      'model': 'TSDAE',
      'evaluation_metric': 'sts_processed',
      'dataset':'booking.com reviews (randomly sampled 100k)',
      'weight_decay':0,
      'scheduler':'constantlr',
      'optimizer_params':{'lr': 3e-5},
  }
)

In [None]:
#load processed sts data for evaluation
def _read_lines(path):
  """Return the lines of the text file at `path` without their trailing newline.

  Only the newline is removed. The previous `s[:-1]` slicing also cut off the
  last real character whenever the final line had no trailing newline
  (e.g. a final score "4.5" was silently read as 4.0).
  """
  with open(path, encoding='utf-8') as f:
    return [line.rstrip('\n') for line in f]

# The three files are parallel: line i of each file describes the same pair.
scores = [float(s) for s in _read_lines('scores.txt')]
sentences1 = _read_lines('sentences1.txt')
sentences2 = _read_lines('sentences2.txt')

# Fail early, with a readable message, if the files are out of sync.
assert len(scores) == len(sentences1) == len(sentences2), (
  "scores.txt, sentences1.txt and sentences2.txt must have the same number of lines"
)


In [None]:
#train TSDAE
!pip install sentence_transformers -q
from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers import models, util, datasets, evaluation, losses
from torch.utils.data import DataLoader

from sentence_transformers import evaluation
evaluator = evaluation.EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)

# Define your sentence transformer model using CLS pooling
model_name = 'bert-base-uncased' #can we use stsb-mpnet-base-v2?
word_embedding_model = models.Transformer(model_name)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode_mean_tokens=False, pooling_mode_cls_token=True, pooling_mode_max_tokens=False)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

model.to('cuda')

# Define a list with sentences (1k - 100k sentences)
train_sentences = documents

# Create the special denoising dataset that adds noise on-the-fly
train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences)
# train_dataset.to('cuda')

# DataLoader to batch your data
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Use the denoising auto-encoder loss
train_loss = losses.DenoisingAutoEncoderLoss(model, decoder_name_or_path=model_name, tie_encoder_decoder=True)

score_list = []
steps_list = []
import matplotlib.pyplot as plt

#callback function
def call_back(score, epoch, steps):
  wandb.log({"score":score, "steps":steps, "epoch": epoch, "num_sentences":steps*8, "loss":1-score})
  print("steps:", steps, "score:", score, "loss:", 1-score)

# Call the fit method
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    weight_decay=0,
    scheduler='constantlr',
    optimizer_params={'lr': 3e-5},
    show_progress_bar=True,
    save_best_model=True,
    checkpoint_path = './tsdae_checkpoints/',
    checkpoint_save_steps = 500, 
    checkpoint_save_total_limit = 2,
    evaluation_steps=100,
    callback=call_back,
    evaluator=evaluator

)
wandb.finish()
# model.save('tsdae-model.pt')

In [None]:
# Mount Google Drive (Colab only).
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
#save the trained TSDAE encoder, then build top2vec model using tsdae
# (These were two cells in the wrong order: the encoder was loaded from
# './tsdae.pt' before the later cell had written that file.)
import torch  # kept from the original cell; no longer used here
# Imported here because the `from top2vec import Top2Vec` line in the imports
# cell is commented out, which leaves the name undefined on a fresh kernel.
from top2vec import Top2Vec

# Save with SentenceTransformer.save(), as the SBert cell does, instead of
# torch.save(): the path handed to Top2Vec is loaded as a sentence-transformers
# model directory, not as a pickled module.
model.save('./tsdae.pt')

# `embedding_model` defaults to 'doc2vec', in which case `embedding_model_path`
# is not used; naming a sentence-transformers model type makes Top2Vec load the
# encoder from `embedding_model_path`.
# NOTE(review): confirm the accepted `embedding_model` names against the
# installed Top2Vec version.
tsdae = Top2Vec(
    documents,
    embedding_model='distiluse-base-multilingual-cased',
    embedding_model_path='./tsdae.pt',
    workers=8,
    document_ids=list(range(len(documents))),
    keep_documents=False,
)
# Own file name: this used to write 'sbert_top2vec.bin' and so overwrote the
# SBert topic model saved earlier in the notebook.
tsdae.save('tsdae_top2vec.bin')

In [None]:
"""
@article{wang-2021-TSDAE,
    title = "TSDAE: Using Transformer-based Sequential Denoising Auto-Encoder for Unsupervised Sentence Embedding Learning",
    author = "Wang, Kexin and Reimers, Nils and Gurevych, Iryna",
    journal= "arXiv preprint arXiv:2104.06979",
    month = "4",
    year = "2021",
    url = "https://arxiv.org/abs/2104.06979",
}
"""


In [None]:
"""The performance of TSDAE on STS decreased over time, possibly because
we applied lemmatization, stop-word removal, etc., so the sentence structure no longer matches.
The only fix I can think of right now is to apply the same text-processing steps to the STS data set and then evaluate."""