In [1]:
from src.config import config
import pandas as pd
import os

  from .autonotebook import tqdm as notebook_tqdm


In [39]:
# Read only the `parsed_body` column from the merged parquet file
# that the project config points to.
dataset = pd.read_parquet(
    config.data.merged,
    columns=["parsed_body"],
)

In [41]:
# Sanity check: (rows, columns) of the loaded frame.
dataset.shape

(419094, 1)

In [42]:
## Top2Vec
# Topic modeling with Top2Vec: minimum viable example.
# References:
# [1] https://github.com/ddangelov/Top2Vec
# [2] https://top2vec.readthedocs.io/_/downloads/en/stable/pdf/
from top2vec import Top2Vec

# One entry per row of the `parsed_body` column of `dataset`.
docs = dataset.parsed_body.tolist()

# Create jointly embedded topic, document and word vectors.
model = Top2Vec(
    docs,
    embedding_model="doc2vec",  # Embedding model: see [1, 2] for supported models
    min_count=2500,             # Ignore words less frequent than this value
    umap_args=None,             # Dict of custom args for UMAP
    hdbscan_args=None,          # Dict of custom args for HDBSCAN
    workers=os.cpu_count(),     # Number of worker threads used for training
    # Valid doc2vec speeds are "fast-learn", "learn" and "deep-learn";
    # any other string makes Top2Vec raise ValueError before training starts.
    speed="deep-learn",
)

2024-01-21 15:04:34,979 - top2vec - INFO - Pre-processing documents for training
2024-01-21 15:32:14,151 - top2vec - INFO - Creating joint document/word embedding


In [None]:
# Persist the trained model to "top2vec.model" (path is relative to the working directory).
model.save("top2vec.model")

In [None]:
# Number of topics found by the model.
model.get_num_topics()

191

In [None]:
# Size of each topic together with the matching topic ids.
topic_sizes, topic_nums = model.get_topic_sizes()

In [None]:
# Display the per-topic sizes.
topic_sizes

array([508, 231, 227, 225, 211, 203, 188, 159, 156, 153, 143, 140, 133,
       131, 129, 124, 120, 116,  90,  87,  86,  85,  85,  83,  83,  83,
        81,  77,  75,  73,  72,  70,  70,  69,  66,  64,  63,  62,  61,
        61,  59,  58,  56,  56,  54,  54,  54,  51,  51,  51,  51,  50,
        49,  49,  48,  48,  47,  46,  46,  45,  45,  45,  44,  44,  44,
        44,  43,  43,  43,  43,  42,  42,  42,  41,  41,  41,  41,  41,
        41,  40,  40,  40,  40,  39,  39,  39,  39,  38,  38,  37,  37,
        37,  37,  36,  36,  36,  36,  36,  35,  35,  35,  35,  34,  34,
        34,  34,  34,  33,  33,  33,  32,  32,  32,  32,  32,  31,  31,
        31,  31,  31,  31,  31,  31,  31,  30,  30,  30,  30,  30,  30,
        30,  29,  29,  29,  29,  28,  28,  28,  28,  28,  28,  28,  27,
        27,  27,  27,  27,  27,  27,  26,  26,  26,  26,  26,  26,  25,
        25,  25,  25,  25,  24,  24,  24,  24,  24,  24,  24,  24,  23,
        23,  23,  23,  23,  22,  22,  22,  21,  21,  21,  21,  2

In [37]:
# Fetch words, word scores and topic ids for 10 topics.
# NOTE(review): this rebinds `topic_nums` from the earlier get_topic_sizes() cell.
topic_words, word_scores, topic_nums = model.get_topics(10)

In [38]:
# Preview the first ten words of each returned topic.
[words[:10] for words in topic_words]

[array(['webcast', 'replay', 'website', 'listen', 'archived', 'conference',
        'call', 'investor', 'live', 'presentation'], dtype='<U15'),
 array(['he', 'roles', 'joining', 'his', 'bachelor', 'mba', 'served', 'mr',
        'joined', 'role'], dtype='<U15'),
 array(['dividend', 'declared', 'dividends', 'payable', 'quarterly',
        'record', 'shareholders', 'directors', 'stock', 'regular'],
       dtype='<U15'),
 array(['therapeutic', 'molecule', 'disease', 'drugs', 'drug', 'therapies',
        'therapeutics', 'clinical', 'inflammatory', 'oncology'],
       dtype='<U15'),
 array(['electricity', 'utility', 'megawatts', 'electric', 'nuclear',
        'energy', 'generating', 'utilities', 'transmission', 'megawatt'],
       dtype='<U15'),
 array(['broadband', 'mobile', 'voice', 'att', 'wireless', 'coverage',
        'fi', 'wi', 'network', 'verse'], dtype='<U15'),
 array(['apparel', 'footwear', 'accessories', 'clothing', 'fashion',
        'casual', 'merchandise', 'stores', 'retailer',

In [None]:
# Keyword search: ask for the 5 topics that best match "medicine".
# NOTE(review): this rebinds topic_words / word_scores / topic_nums from the get_topics() cell.
topic_words, word_scores, topic_scores, topic_nums = model.search_topics(keywords=["medicine"], num_topics=5)

In [33]:
# Look up the single top (non-reduced) topic and its score for document ids
# 0..len(dataset)-1; the remaining two return values are discarded.
topic_nums, topic_scores, _, _ = model.get_documents_topics(
    list(range(len(dataset))),
    reduced=False,
    num_topics=1,
)


In [34]:
# Display the topic id assigned to each requested document.
topic_nums

array([ 50, 158, 158, ...,  22, 125,  78])

In [35]:
# Display the score of each document against its assigned topic.
topic_scores

array([0.77271056, 0.88665867, 0.863677  , ..., 0.5109895 , 0.46452522,
       0.24834464], dtype=float32)

TODO (open question): `search_documents_by_documents` -> K-Means?