In [1]:
from top2vec import Top2Vec
import pandas as pd
from pathlib import Path

In [2]:

# Notebook configuration: the dataset to model and the directories that the
# cells below read from and write to (all relative to the notebook's folder).
DATASET = "sample50k_health_tech"

SUBSAMPLE_PATH = Path("../data/subsamples")      # input CSV subsamples
MODEL_PATH = Path("../results/models") / DATASET  # saved Top2Vec models
TOPIC_PATH = Path("../results/topics")            # exported topic tables

# parents=True: on a fresh checkout "../results/models" may not exist yet, and
# mkdir() without it raises FileNotFoundError for the nested dataset folder.
MODEL_PATH.mkdir(parents=True, exist_ok=True)
# The topic CSV/JSON exports are written here, so make sure it exists as well.
TOPIC_PATH.mkdir(parents=True, exist_ok=True)

In [3]:
# Load the subsample: keep only the 1st and 5th CSV columns, give them stable
# names, and discard rows that have no article text.
articles_csv = SUBSAMPLE_PATH / f"{DATASET}.csv"
df = (
    pd.read_csv(articles_csv, usecols=[0, 4])
    .set_axis(["article_id", "article"], axis=1)
    .dropna(subset=["article"])
)

print(f"number of articles: {len(df)}")

number of articles: 29360


### Train Top2Vec model

In [4]:
# Training settings. Both values are also embedded in the saved model's file
# name by the cells below.
# NOTE(review): Top2Vec's docs describe `speed` as a doc2vec-only option -
# confirm whether it has any effect with a pretrained sentence encoder.
embedding_module = "universal-sentence-encoder"
speed = "learn"

# Fit the topic model on the raw article texts.
model = Top2Vec(
    documents=df["article"].values,
    embedding_model=embedding_module,
    speed=speed,
    workers=10,
)

2022-05-15 15:40:09,097 - top2vec - INFO - Pre-processing documents for training
2022-05-15 15:41:40,752 - top2vec - INFO - Downloading universal-sentence-encoder model
2022-05-15 15:41:58,549 - top2vec - INFO - Creating joint document/word embedding
INFO:top2vec:Creating joint document/word embedding
2022-05-15 15:45:15,561 - top2vec - INFO - Creating lower dimension embedding of documents
INFO:top2vec:Creating lower dimension embedding of documents
2022-05-15 15:45:54,760 - top2vec - INFO - Finding dense areas of documents
INFO:top2vec:Finding dense areas of documents
2022-05-15 15:46:00,824 - top2vec - INFO - Finding topics
INFO:top2vec:Finding topics


### Save model artifacts

Save model

In [5]:
# Persist the trained model; the file name records the embedding model and
# the speed setting it was trained with.
model_file = MODEL_PATH / f"top2vec_{embedding_module}_{speed}"
model.save(str(model_file))

Create the topic hierarchy and save it as JSON.

In [6]:
# Number of level-1 (merged) topics to reduce the model to.
NUM_TOPICS_LVL1 = 20

# Reload the model that was saved above, then group its topics.
saved_model = f"{MODEL_PATH}/top2vec_{embedding_module}_{speed}"
model = Top2Vec.load(saved_model)

# `hierarchy` is a list of lists: one entry per level-1 topic, holding the
# numbers of the subtopics grouped under it.
hierarchy = model.hierarchical_topic_reduction(NUM_TOPICS_LVL1)

In [9]:
# Build the level-1 / level-2 topic table and export it as a CSV that can be
# reviewed and hand-labelled (level-1 topic descriptions are added by hand).

NUM_TOPICS_LVL2 = model.get_num_topics()
NUM_TOPICS_LVS2 = NUM_TOPICS_LVL2  # original (misspelled) name kept as an alias

topic_words, word_scores, topic_nums = model.get_topics(NUM_TOPICS_LVL2)

# Flatten the hierarchy into (level-1 topic, level-2 topic) pairs: position i
# of `hierarchy` lists the level-2 topics (subtopics) grouped under topic i.
topic_pairs = [
    (level1, level2)
    for level1, subtopics in enumerate(hierarchy)
    for level2 in subtopics
]
# Row order only affects how the CSV reads; the join below is keyed on the
# topic number, so it no longer depends on this sort.
topic_pairs.sort(key=lambda pair: pair[1])

topic_list = pd.DataFrame(topic_pairs, columns=["topic_level1", "topic_level2"])

# Attach each level-2 topic's words and scores by topic number rather than by
# row position, so a row can never pick up another topic's words.
words_by_topic = pd.Series(list(topic_words), index=topic_nums)
scores_by_topic = pd.Series(list(word_scores), index=topic_nums)
topic_list["topic_words_level2"] = topic_list["topic_level2"].map(words_by_topic)
topic_list["word_scores_level2"] = topic_list["topic_level2"].map(scores_by_topic)
topic_list = topic_list.set_index("topic_level2")

# One row per level-2 topic: its words spread over columns, plus the level-1
# assignment and the words/scores columns from `topic_list`.
all_topics = pd.DataFrame(topic_words, index=topic_nums)
all_topics = all_topics.merge(topic_list, left_index=True, right_index=True)
all_topics.to_csv(f"{TOPIC_PATH}/{DATASET}_top2vec.csv")



In [40]:
# Combine the hand-labelled topic descriptions with each level-2 topic's
# words and scores, then write the result to a JSON file keyed by topic_level2.
label_columns = ["main_topic", "topic_level1_descr", "topic_level1", "topic_level2"]
level2_columns = ["topic_words_level2", "word_scores_level2"]

topics_final = (
    pd.read_csv(f"{TOPIC_PATH}/{DATASET}_top2vec_hand_labelled.csv")[label_columns]
    .set_index("topic_level2")
    .merge(topic_list[level2_columns], left_index=True, right_index=True)
)
topics_final.to_json(f"{TOPIC_PATH}/{DATASET}_top2vec.json", orient="index")
