In [None]:
import torch

# Sanity check: reports whether PyTorch can see a CUDA device from this kernel.
torch.cuda.is_available()

##### References
- https://www.kaggle.com/code/maartengr/topic-modeling-arxiv-abstract-with-bertopic

In [None]:
from plotly.offline import init_notebook_mode
# Initialise plotly's offline notebook mode before any figure is drawn further down.
init_notebook_mode(connected=True)

### TODO revisit the list of embedding models
- paraphrase-MiniLM-L12-v2

### TODO finalize few appropriate evaluation metrics

### TODO select train, test and validation datasets, export them to Parquet (.pq), then load them using HF datasets

### TODO identify hyperparameters to tune

### TODO setup hyperparam tuning study

### TODO perform hyperparam tuning

### TODO apply dim reduction on embeddings, and visualize

### TODO start preparing comprehensive report - ReadMe.md file

### TODO create an inference service

### Load Datasets

In [None]:
from pathlib import Path

# Dataset directory, given relative to the notebook's working directory.
dataset_path = Path("../../../dataset/")
# Cache directory that is handed to `load_dataset` when the splits are read.
cache_dir = dataset_path / "cache_dir"

# Create the cache directory (and any missing parents); no error if it already exists.
cache_dir.mkdir(parents=True, exist_ok=True)

In [None]:
import os

# List the entries of the dataset directory to see which files are available.
os.listdir(dataset_path)

In [None]:
from datasets import load_dataset

def load_datasets(dataset_index: int):
    """Load the train, validation and test splits for one dataset index.

    Each split is read from ``<dataset_path>/<split>_df_dataset_<dataset_index>.pq``
    with the ``parquet`` loader of ``load_dataset``, using ``cache_dir`` as cache.

    Args:
        dataset_index: Index that selects which set of parquet files to read.

    Returns:
        A ``(train, validation, test)`` tuple, each element being the
        ``'train'`` entry of the corresponding loaded result.
    """
    def _read_split(split_name: str):
        # A single parquet file is passed, and its rows are exposed under 'train'.
        parquet_file = dataset_path / f"{split_name}_df_dataset_{dataset_index}.pq"
        loaded = load_dataset('parquet', data_files=[str(parquet_file)], cache_dir=cache_dir)
        return loaded['train']

    return tuple(_read_split(split_name) for split_name in ("train", "validation", "test"))

In [None]:
# Index of the dataset variant to load; the same value is also embedded in the
# embeddings file name built by `get_embeddings_filename`.
dataset_index = 1

train_dataset, validation_dataset, test_dataset = load_datasets(dataset_index=dataset_index)

In [None]:
import torch
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

### TODO select best embedding model

### Extract Embeddings using Pretrained Model

In [None]:
# Name of the pretrained embedding model; it is also embedded in the embeddings file name.
# NOTE(review): the commented-out alternative contains a '/', which would act as a
# path separator when the name is interpolated into the embeddings file name.
# model_name = 'sentence-transformers/all-mpnet-base-v2'
model_name = 'distilbert-base-nli-mean-tokens'

# NOTE(review): batch_size is not referenced in any visible cell — presumably it was
# used when the embeddings were generated; confirm or remove.
batch_size = 384

In [None]:
def get_embeddings_filename(split_name):
    """Return, as a string, the path of the embeddings file for ``split_name``.

    The file name combines the split name with the notebook-level
    ``dataset_index`` and ``model_name``, and is located under ``dataset_path``.
    """
    file_name = f"split-{split_name}_dataset-{dataset_index}_model-{model_name}_embeddings.npy"
    embeddings_file = dataset_path / file_name
    return str(embeddings_file)

def load_embeddings(split_name: str):
    """Load the saved embeddings for ``split_name`` from disk.

    Args:
        split_name: Split identifier forwarded to ``get_embeddings_filename``.

    Returns:
        Whatever ``np.load`` returns for the embeddings file, or ``None`` when
        the file does not exist (the miss is logged, not raised).
    """
    # Function-scope import so this cell does not rely on another cell importing logging.
    import logging

    embeddings_filename = get_embeddings_filename(split_name)

    # Try the load directly instead of checking existence first: this closes the
    # gap between the check and the read, and reports a missing file instead of
    # returning None silently.
    try:
        return np.load(embeddings_filename)
    except FileNotFoundError:
        # The previous handler used `self.logger`, which is undefined in a plain
        # function and therefore raised NameError whenever it was reached.
        logging.getLogger(__name__).error(f"Expected file named {embeddings_filename} was not found")
        return None

In [None]:
import numpy as np

In [None]:
# Load the saved train-split embeddings and show their shape.
# NOTE(review): load_embeddings returns None when the file is missing, in which
# case `.shape` fails with AttributeError.
train_embeddings = load_embeddings('train')
train_embeddings.shape

In [None]:
# Load the saved validation-split embeddings and show their shape.
# NOTE(review): load_embeddings returns None when the file is missing, in which
# case `.shape` fails with AttributeError.
validation_embeddings = load_embeddings('validation')
validation_embeddings.shape

### Dimensionality Reduction, Clustering and Visualizations

In [None]:
from cuml import UMAP

# Reduce to 3 components so the projection can be drawn with a 3-D scatter plot;
# random_state is fixed to a constant seed.
umap = UMAP(n_neighbors=15, n_components=3, metric='euclidean', n_epochs=200, learning_rate=1.0, min_dist=0.1,
           random_state=65)

In [None]:
# Sub-sample the train embeddings: keep the first `select` rows of every
# `step`-row window.
step = 30000
select = 5000

# The range stops at len(train_embeddings); the previous bound of
# len(train_embeddings) + step always added one empty trailing slice.
# Slices are collected first and concatenated once, instead of re-copying the
# accumulated array on every iteration.
selected_chunks = [
    train_embeddings[start:start + select]
    for start in range(0, len(train_embeddings), step)
]

# With zero rows there is nothing to concatenate; fall back to an empty slice so
# `select_embeddings` is still an array-like of the same kind.
select_embeddings = np.concatenate(selected_chunks) if selected_chunks else train_embeddings[:0]

In [None]:
# Shape of the sub-sampled embeddings.
select_embeddings.shape

In [None]:
%%time

# Fit UMAP on the train embeddings and project them to 3 components.
# NOTE(review): the full `train_embeddings` array is used here, not the
# `select_embeddings` sub-sample — confirm which one was intended.
train_transformed = umap.fit_transform(train_embeddings[:])
train_transformed.shape

In [None]:
import plotly.express as px

# 3-D scatter of the UMAP projection; x/y/z refer to columns 0, 1 and 2.
px.scatter_3d(data_frame=train_transformed, x=0, y=1, z=2)

### With Clustering

In [None]:
%%time
from cuml.cluster.hdbscan import HDBSCAN


hdbscan = HDBSCAN(min_cluster_size=0, cluster_selection_epsilon=0.1, metric='euclidean', cluster_selection_method='eom')
hdbscan_pred = hdbscan.fit_predict(train_embeddings)

In [None]:
# Distinct cluster labels produced by HDBSCAN.
np.unique(hdbscan_pred)

In [None]:
# Same 3-D scatter of the UMAP projection, coloured by the HDBSCAN label of each row.
px.scatter_3d(data_frame=train_transformed, x=0, y=1, z=2, color=hdbscan_pred)

### Train Model using hyperparameters

In [None]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary

In [None]:
from gensim.parsing.preprocessing import STOPWORDS

# hyperparameters
nr_topics = 30
top_n_words = 100
min_topic_size = 10
n_gram_range = (1, 1)

# TODO inputs for hyperparameters
# Inclusive bounds of the number-of-topics sweep.
min_categories = 5
max_categories = 5

# count vectorizer params
max_features = 100
max_df = 0.8
min_df = 0.05
# No trailing comma: `(1, 1),` would create the nested tuple ((1, 1),), which is
# not a valid (min_n, max_n) range for CountVectorizer.
ngram_range = (1, 1)
lowercase = True
stop_words = STOPWORDS

# metrics params
topk = 10

In [None]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary


# NOTE(review): best_num_categories / best_silhouette_score are initialised here
# but never updated in the visible cells.
best_num_categories = min_categories
best_silhouette_score = -1.0

# One list of scores per metric name; `append_score` adds to these lists.
scores = {
    'coherence': [],
    'diversity': [],
}


def append_score(metric_name: str, score):
    """Append ``score`` to the list kept for ``metric_name`` in ``scores``.

    Args:
        metric_name: Key of the metric in the notebook-level ``scores`` dict.
        score: Value to record for that metric.

    Raises:
        ValueError: If ``metric_name`` is not a key of ``scores``.
    """
    if metric_name not in scores:
        # ValueError (still an Exception subclass) that names the offending metric.
        raise ValueError(f"Invalid scoring metric: {metric_name!r}")

    # The list is mutated in place, so no re-assignment into `scores` is needed.
    scores[metric_name].append(score)


# Sweep the number of topics, fit one BERTopic model per value and record its
# coherence and diversity scores in `scores`.
for num_categories in range(min_categories, max_categories + 1):
    count_vectorizer = CountVectorizer(max_features=max_features, max_df=max_df, min_df=min_df,
                                       ngram_range=ngram_range, lowercase=lowercase, stop_words=list(STOPWORDS))

    # Create BERTopic with current number of categories
    model = BERTopic(
        nr_topics=num_categories,
        vectorizer_model=count_vectorizer,
        n_gram_range=ngram_range,
    )

    # NOTE(review): `sentences` is not defined in any visible cell — it must be
    # created (presumably from the train split's text column) before this runs.
    doc_topics, probabilities = model.fit_transform(sentences)

    # --------------------------------
    # Inputs for the coherence score: tokenise the documents with the same
    # analyzer the vectorizer uses, then build a gensim dictionary and corpus.
    cleaned_docs = model._preprocess_text(sentences)
    analyzer = model.vectorizer_model.build_analyzer()
    tokens = [analyzer(doc) for doc in cleaned_docs]

    dictionary = Dictionary(tokens)
    corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in tokens]

    # Build a filtered copy without topic -1 instead of popping it from the dict
    # returned by the model, so the fitted model's own data is left untouched.
    topics = {
        topic_id: topic_terms
        for topic_id, topic_terms in model.get_topics().items()
        if topic_id != -1
    }

    # Iterate over the actual topic ids. The previous
    # `range(len(set(topics)) - 1)` subtracted one after -1 had already been
    # removed, which dropped a real topic and assumed contiguous ids.
    # Words absent from `dictionary` (e.g. empty padding entries) are skipped so
    # that every topic word can be looked up by CoherenceModel.
    topic_words = [
        [word for word, _ in topic_terms if word in dictionary.token2id]
        for topic_terms in topics.values()
    ]
    topic_words = [words for words in topic_words if words]
    # --------------------------------

    # Metric 1 - topic coherence
    coherence_model = CoherenceModel(topics=topic_words,
                                     texts=tokens,
                                     corpus=corpus,
                                     dictionary=dictionary,
                                     coherence='c_v')

    coherence_score = coherence_model.get_coherence()
    append_score(metric_name='coherence', score=coherence_score)

    # Metric 2 - topic diversity: share of unique words among the top-`topk`
    # words of all topics (1.0 means no word is shared between topics).
    top_words = [word for words in topic_words for word in words[:topk]]
    diversity_score = len(set(top_words)) / len(top_words) if top_words else 0.0
    append_score(metric_name='diversity', score=diversity_score)

    # TODO Metric 4 - Combine all 3 metrics into one and prepare one objective function

### Hyperparameter Tuning