### **Installing and importing necessary libraries**

In [None]:
# Make `locale.getpreferredencoding()` always report "UTF-8".
# NOTE(review): presumably a workaround for Colab shell commands (`!...`) that
# refuse to run when the reported locale is not UTF-8 -- confirm it is still needed.
import locale
locale.getpreferredencoding = lambda: "UTF-8"
# Install the CUDA 11 builds of cuDF / dask-cuDF, cuML and cuGraph from NVIDIA's package index.
!pip install cudf-cu11 dask-cudf-cu11 --extra-index-url=https://pypi.nvidia.com
!pip install cuml-cu11 --extra-index-url=https://pypi.nvidia.com
!pip install cugraph-cu11 --extra-index-url=https://pypi.nvidia.com
# Install CuPy for CUDA 11.x.
# NOTE(review): the extra wheel location points at the aarch64 builds -- confirm this
# matches the architecture of the runtime being used.
!pip install cupy-cuda11x -f https://pip.cupy.dev/aarch64

In [None]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

# Installing dependencies of the saved model

!pip install -r '/content/drive/MyDrive/requirements.txt'

**Restart the runtime after running the previous cell** so that the newly installed packages are picked up, then continue with the cells below.

In [None]:
# Mount Google Drive at /content/drive; the data, helper modules and saved models
# used in this notebook are read from and written to paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# importing libraries
from cuml.cluster import HDBSCAN
import cuml
from cuml.manifold import UMAP
from cuml.cluster import HDBSCAN
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import collections
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.cluster import BaseCluster
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance

In [None]:
# General-purpose imports, followed by the project's own helper modules
# (`dimensionality.py` and `pre_processing.py`), which are loaded from Google Drive.
import pandas as pd
import re
import numpy as np
import os
import sys
import operator

# Put the Drive folder first on the module search path so the two helper modules
# below can be imported from it.
sys.path.insert(0,'/content/drive/My Drive/')
import dimensionality
import pre_processing
# NOTE(review): the star imports pull every public name of both modules into the
# notebook namespace; later cells only reference `dimensionality.Dimensionality`
# explicitly -- consider importing just the names that are needed.
from dimensionality import *
from pre_processing import *

### **Topic Modeling**

### Sentence embeddings

In [None]:
# Load the training and validation splits used for the topic model.
# `pd.read_csv` parses a CSV file into a DataFrame; writing is done with the
# DataFrame method `DataFrame.to_csv`, and pandas has no top-level `pd.to_csv`.
train_set = pd.read_csv("/content/drive/MyDrive/cs_train_set.csv")
val_set = pd.read_csv("/content/drive/MyDrive/cs_val_set.csv")

In [None]:
import numpy as np

# Location on Drive where the abstract embeddings are persisted
embeddings_file = '/content/drive/MyDrive/cs_embeddings.npy'

# The abstracts of the training set as a plain Python list
docs = train_set['abstract'].tolist()

# Encode every abstract with the all-MiniLM-L6-v2 sentence-transformer
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
cs_embeddings = model.encode(docs, show_progress_bar=True)

# Write the embeddings to Drive ...
with open(embeddings_file, 'wb') as out_file:
    np.save(out_file, cs_embeddings)

# ... and read them back from the saved file
cs_embeddings = np.load(embeddings_file)

In [None]:
# Build the vocabulary used by BERTopic's vectorizer: count every token across
# the training abstracts and keep only the tokens that occur at least 15 times.
tokenizer = CountVectorizer().build_tokenizer()
token_counts = collections.Counter()
for abstract in tqdm(docs):
    token_counts.update(tokenizer(abstract))

vocab = [token for token, count in token_counts.items() if count >= 15]

# Show the vocabulary size as the cell output
len(vocab)

100%|██████████| 192324/192324 [00:20<00:00, 9170.50it/s]


33019

### Dimensionality Reduction with UMAP

In [None]:
# Fit cuML's UMAP on the sentence embeddings and project them to 5 dimensions.
# NOTE(review): learning_rate=1e-09 is an unusually small value -- confirm it is intended.
umap_params = dict(
    n_components=5,
    n_neighbors=20,
    metric="cosine",
    learning_rate=1e-09,
    random_state=23,
    verbose=True,
)
umap_model = UMAP(**umap_params)
reduced_embeddings = umap_model.fit_transform(cs_embeddings)

[D] [14:03:03.636557] /__w/cuml/cuml/cpp/src/umap/runner.cuh:108 n_neighbors=20
[D] [14:03:03.638238] /__w/cuml/cuml/cpp/src/umap/runner.cuh:130 Calling knn graph run
[D] [14:03:13.750573] /__w/cuml/cuml/cpp/src/umap/runner.cuh:136 Done. Calling fuzzy simplicial set
[D] [14:03:13.758648] /__w/cuml/cuml/cpp/src/umap/fuzzy_simpl_set/naive.cuh:317 Smooth kNN Distances
[D] [14:03:13.758898] /__w/cuml/cuml/cpp/src/umap/fuzzy_simpl_set/naive.cuh:319 sigmas = [ 0.24085, 0.24804, 0.285179, 0.0307922, 0.203391, 0.0552785, 0.259437, 0.0258893, 0.032639, 0.220333, 0.0368831, 0.187842, 0.0399399, 0.0420644, 0.254643, 0.0347199, 0.165416, 0.170041, 0.0501342, 0.0148819, 0.124946, 0.035408, 0.0340924, 0.205509, 0.0475407 ]

[D] [14:03:13.760249] /__w/cuml/cuml/cpp/src/umap/fuzzy_simpl_set/naive.cuh:321 rhos = [ 1.19209e-07, 5.96046e-08, 5.96046e-08, 0.275533, 1.78814e-07, 0.221021, 1.78814e-07, 0.165301, 0.318987, 5.96046e-08, 0.307853, 1.78814e-07, 0.339531, 0.283018, 1.78814e-07, 0.402606, 5.96046

### Clustering with HDBSCAN

In [None]:
# Cluster the reduced embeddings with HDBSCAN; the per-document cluster labels
# are kept in `cs_clusters`.
hdbscan_model = HDBSCAN(
    min_cluster_size=100,
    min_samples=100,
    gen_min_span_tree=True,
    prediction_data=True,
    verbose=True,
)
hdbscan_model.fit(reduced_embeddings)
cs_clusters = hdbscan_model.labels_

### Getting Topic Representations

In [None]:
# Sub-models handed to BERTopic.
# The embeddings, their reduced form and the cluster labels were all computed in
# the cells above, so they are passed in rather than recomputed:
#   * `dimensionality.Dimensionality(reduced_embeddings)` wraps the reduced embeddings
#     (presumably it just returns them -- see dimensionality.py to confirm);
#   * `BaseCluster()` stands in for the clustering step, with the HDBSCAN labels
#     supplied through `y` in the `fit` call below.
representation_model = KeyBERTInspired()
vectorizer_model = CountVectorizer(vocabulary=vocab, stop_words="english")
hdbscan_model = BaseCluster()
umap_model = dimensionality.Dimensionality(reduced_embeddings)
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Assemble the topic model from the sub-models
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    top_n_words=20,
    verbose=True,
)

# Fit on the abstracts, reusing the precomputed embeddings and cluster labels
topic_model = topic_model.fit(docs, embeddings=cs_embeddings, y=cs_clusters)

2023-08-16 14:05:15,430 - BERTopic - The dimensionality reduction algorithm did not contain the `y` parameter and therefore the `y` parameter was not used
2023-08-16 14:05:15,432 - BERTopic - Reduced dimensionality
2023-08-16 14:05:15,489 - BERTopic - Clustered reduced embeddings
  idf = np.log((avg_nr_samples / df)+1)


In [None]:
# Save the fitted topic model (before any topics are merged) to Drive as a pickle.
topic_model.save('/content/drive/MyDrive/124_topic_model', serialization="pickle")

In [None]:
# Reload the topic model saved in the previous cell.
# NOTE(review): the model was saved with serialization="pickle"; loading a pickle can
# execute arbitrary code, so only load model files from a trusted source.
topic_model=BERTopic.load('/content/drive/MyDrive/124_topic_model')

### Creating Themes
Themes are created by merging similar topics into broader groups.

In [None]:
# Theme name -> ids of the fine-grained BERTopic topics that are merged into that theme.
new_dict = {
    'other': [-1,0,8,9,13,14,18,25,26,30,36,39,41,46,50,51,52,53,58,60,63,64,65,66,75,79,81,84,85,88,92,94,95,99,100,103,105,107,108,112,
              113,117,120,121,123],
    'language': [16,35,69,70,96,124],
    'sound': [1,78,122],
    'network_communication': [2,32,48,57,77],
    'programming': [5,23,68,109],
    'social_networks': [3,15,27,67,87],
    'blockchains': [7,106],
    'VR': [12],
    'vision': [20,47,54,76,86,91,119],
    'recommendation_systems': [10],
    'auctions': [40,110],
    'memory_processors': [17,37,59,98,111,116,118],
    'cybersecurity': [4,21,28,38,59,74,82,101],
    'continual_reinforcement_learning': [19,89],
    'neural_networks': [6,33,43,97,104,115],
    'robotics': [11,42],
    'games': [22,102,114],
    'grids': [24],
    'biology_chemistry': [],
    'emotion_recognition': [73],
    'fairness': [29,56,62],
    'servers': [55,61,72,90],
    'voting': [31],
    'clustering': [80,83],
    'ethics': [34],
    'quantum_computing': [44],
    'vehicles': [45,71,93],
}

# Sanity-check the grouping before it is handed to `merge_topics`. A topic id listed
# under two themes, a theme with no topics, or a topic id that is not listed anywhere
# would otherwise pass through silently.
assigned_topics = set()
listed_twice = set()
for topic_ids in new_dict.values():
    for topic_id in topic_ids:
        if topic_id in assigned_topics:
            listed_twice.add(topic_id)
        assigned_topics.add(topic_id)

duplicate_topics = sorted(listed_twice)
empty_themes = [theme for theme, topic_ids in new_dict.items() if not topic_ids]
# Assumes topic ids run contiguously from -1 up to the largest id listed -- TODO confirm
unassigned_topics = sorted(
    set(range(-1, max(assigned_topics, default=-1) + 1)) - assigned_topics
)

# Report problems without stopping the notebook; the mapping itself is left
# untouched because the intended theme cannot be derived from the ids alone.
if duplicate_topics:
    print(f"WARNING: topic ids assigned to more than one theme: {duplicate_topics}")
if empty_themes:
    print(f"WARNING: themes without any topic ids: {empty_themes}")
if unassigned_topics:
    print(f"WARNING: topic ids not assigned to any theme: {unassigned_topics}")

# One list of topic ids per theme; empty groups carry no merge information and are left out.
topics_to_merge = [topic_ids for topic_ids in new_dict.values() if topic_ids]
print(topics_to_merge)

[[-1, 0, 8, 9, 13, 14, 18, 25, 26, 30, 36, 39, 41, 46, 50, 51, 52, 53, 58, 60, 63, 64, 65, 66, 75, 79, 81, 84, 85, 88, 92, 94, 95, 99, 100, 103, 105, 107, 108, 112, 113, 117, 120, 121, 123], [16, 35, 69, 70, 96, 124], [1, 78, 122], [2, 32, 48, 57, 77], [5, 23, 68, 109], [3, 15, 27, 67, 87], [7, 106], [12], [20, 47, 54, 76, 86, 91, 119], [10], [40, 110], [17, 37, 59, 98, 111, 116, 118], [4, 21, 28, 38, 59, 74, 82, 101], [19, 89], [6, 33, 43, 97, 104, 115], [11, 42], [22, 102, 114], [24], [], [73], [29, 56, 62], [55, 61, 72, 90], [31], [80, 83], [34], [44], [45, 71, 93]]


In [None]:
# Apply the theme grouping: every inner list of `topics_to_merge` is merged into a
# single topic of `topic_model` (the model is updated in place).
topic_model.merge_topics(docs, topics_to_merge= topics_to_merge)

  idf = np.log((avg_nr_samples / df)+1)


### Setting Theme Names

In [None]:
# Human-readable theme name for each topic id of the merged model.
# NOTE(review): the ids are hard-coded against the numbering produced by the merge
# above -- re-check them whenever the model is retrained or the grouping changes.
theme_labels = {
    -1: 'other',
    0: 'network_communication',
    1: 'sound',
    2: 'cybersecurity',
    3: 'social networks',
    4: 'programming',
    5: 'neural networks',
    6: 'memory & processors',
    7: 'language',
    8: 'vision',
    9: 'robotics',
    10: 'blockchain',
    11: 'recommendation systems',
    12: 'VR',
    13: 'reinforcement learning',
    14: 'games',
    15: 'fairness',
    16: 'grids',
    17: 'servers',
    18: 'vehicles',
    19: 'voting',
    20: 'auctions',
    21: 'ethics',
    22: 'quantum computing',
    23: 'clustering',
    24: 'emotion recognition',
}
topic_model.set_topic_labels(theme_labels)

In [None]:
# Overview table of the merged topics; the bare name on the last line displays it
# as the cell output.
topic_model_df = topic_model.get_topic_info()
topic_model_df

### **Saving and Loading the Model**

In [None]:
# Save the topic model with merged topics and theme labels to Drive as a pickle.
topic_model.save('/content/drive/MyDrive/24_topics_model', serialization="pickle")

In [None]:
# Reload the merged topic model saved in the previous cell.
# NOTE(review): the model was saved with serialization="pickle"; loading a pickle can
# execute arbitrary code, so only load model files from a trusted source.
topic_model=BERTopic.load('/content/drive/MyDrive/24_topics_model')