In [6]:
import tomotopy as tp
from pathlib import Path
import re
import nltk
from nltk.stem import WordNetLemmatizer
from spacy.lang.en.stop_words import STOP_WORDS as en_stop

# import spacy
# nltk.download('omw-1.4')
# nltk.download('wordnet')


In [2]:
# Show the value tomotopy exposes as `tp.isa` (the recorded output below is "avx2").
print(tp.isa)

avx2


In [8]:
# Extend the imported stop-word set with corpus-specific filler terms.
# NOTE(review): "new york" contains a space, so it presumably never equals a
# single token produced by the tokenizer — confirm whether it is needed.
en_stop.update({
    "said",
    "say",
    "reuters",
    "london",
    "new york",
    "york",
    "new",
    "like",
    "thing",
})

In [22]:
# Directory and file names of the saved HLDA models that are loaded further down.
MODEL_PATH = Path("../results/models/tomotopy")
MODEL0, MODEL1, MODEL2, MODEL3 = (
    "test-hlda0.tmm",
    "test-hlda1.tmm",
    "test-hlda2.tmm",
    "test-hlda3.tmm",
)

In [23]:
# Load the four saved HLDA models from MODEL_PATH, one per model file name.
mdl0, mdl1, mdl2, mdl3 = (
    tp.HLDAModel.load(f"{MODEL_PATH}/{model_file}")
    for model_file in (MODEL0, MODEL1, MODEL2, MODEL3)
)


In [24]:
# Human-readable label per cluster id (0-3), named as seemed reasonable.
cluster_names = dict(enumerate([
    "Science and Technology",  # tech and business
    "Health/Science/Drugs",    # health
    "Business/Tech/BioTech",   # business
    "Entertainment/News",
]))

In [62]:
def process_text(unseen_doc):
    """
    Preprocess raw text and return it as a tomotopy corpus.

    Parameters
    ----------
    unseen_doc : str or iterable of str
        A single document, or an iterable of documents. A bare string is
        wrapped in a list so it is handled as one document instead of being
        iterated character by character.

    Returns
    -------
    tomotopy.utils.Corpus
        Corpus built from the lower-cased input documents, tokenized with
        ``SimpleTokenizer`` and filtered by the stop-word predicate below.
    """
    # A str is itself iterable; without this guard every character would be
    # fed to the corpus as its own document.
    if isinstance(unseen_doc, str):
        unseen_doc = [unseen_doc]

    lemmatizer = WordNetLemmatizer()
    # Raw string: '\w' inside a normal literal is an invalid escape sequence.
    pat = re.compile(r'\w+')

    def is_stopword(token):
        """Return True when ``token`` should be dropped from the corpus."""
        return (
            len(token) <= 2              # very short tokens
            or token in en_stop          # configured stop words
            or token.isnumeric()         # pure numbers
            or not pat.match(token)      # does not start with a word character
            # NOTE(review): lemmatize() presumably returns a non-empty string
            # for any non-empty token, so this clause likely never filters
            # anything — confirm whether the lemmatizer was meant to be passed
            # as the tokenizer's `stemmer` instead.
            or not lemmatizer.lemmatize(token)
        )

    corpus = tp.utils.Corpus(
        tokenizer=tp.utils.SimpleTokenizer(stemmer=None, lowercase=True),
        stopwords=is_stopword,
    )

    corpus.process(document.lower() for document in unseen_doc)
    return corpus


def get_best_model(list_of_models, document_text):
    """
    Pick the model that assigns the highest log-likelihood to unseen text.

    Parameters
    ----------
    list_of_models : iterable of HLDAModel
        Candidate models. Each must provide ``infer(corpus)`` returning a
        ``(result, log_likelihood)`` pair.
    document_text
        Forwarded unchanged to ``process_text``. The demo in this notebook
        passes a list holding one article string.

    Returns
    -------
    tuple
        ``(best_model, first_result)``: ``best_model`` is the entry of
        ``list_of_models`` whose inference produced the largest
        log-likelihood value (the first one wins on ties), and
        ``first_result`` is element 0 of that model's inference result.

    Raises
    ------
    ValueError
        If ``list_of_models`` is empty.
    """
    models = list(list_of_models)
    if not models:
        raise ValueError("list_of_models must contain at least one model")

    corpus = process_text(document_text)

    # One (result, log_likelihood) pair per candidate, in input order.
    results = []
    for mdl in models:
        topic_dist, ll = mdl.infer(corpus)
        results.append((topic_dist, ll))

    # max() keeps the first maximal entry, so ties resolve to the earliest model.
    best_index = max(range(len(results)), key=lambda idx: results[idx][1])
    best_topic_dist = results[best_index][0]

    # Index into the argument rather than the notebook-global `mdls`, so the
    # function works for any list of models passed in.
    return models[best_index], best_topic_dist[0]

### Demo

In [48]:
# Sample article used as the unseen document for the demo below.
sample_article = (
    'Bitcoin continued to slide after a broader stock sell-off in the U.S. last week sent the cryptocurrency market into a frenzy and prompted bitcoin to plummet by roughly 10%. '
    'Bitcoin, the world’s largest digital currency by market value, was lower by about 3% at $33,438.03 late Sunday, according to data from Coin Metrics. '
    'This year, Bitcoin has been trading in a narrow range as it attempts to reclaim its highs of late 2021. '
    'The cryptocurrency is now down 50% from its peak price of $67,802.30 in November 2021. '
    'The drop comes after the blue-chip Dow Jones Industrial Average lost more than 1,000 points on Thursday and the Nasdaq plunged by 5%. '
    'Those losses marked the worst single-day drops since 2020. '
    'The Dow and Nasdaq fell again on Friday.'
)
other_texts = {'txt': [sample_article]}
unseen = other_texts['txt']
unseen

['Bitcoin continued to slide after a broader stock sell-off in the U.S. last week sent the cryptocurrency market into a frenzy and prompted bitcoin to plummet by roughly 10%. Bitcoin, the world’s largest digital currency by market value, was lower by about 3% at $33,438.03 late Sunday, according to data from Coin Metrics. This year, Bitcoin has been trading in a narrow range as it attempts to reclaim its highs of late 2021. The cryptocurrency is now down 50% from its peak price of $67,802.30 in November 2021. The drop comes after the blue-chip Dow Jones Industrial Average lost more than 1,000 points on Thursday and the Nasdaq plunged by 5%. Those losses marked the worst single-day drops since 2020. The Dow and Nasdaq fell again on Friday.']

In [64]:
# Run model selection on the sample text, then show the chosen model together
# with the `path` attribute of the returned document.
mdls = [mdl0, mdl1, mdl2, mdl3]
final_model, topic_dist = get_best_model(list_of_models=mdls, document_text=unseen)
(final_model, topic_dist.path)

(<tomotopy.HLDAModel at 0x16a58e9ec30>, array([  0,  13, 161]))

In [55]:
# Report the depth and topic count (`k`) of the selected model, one per line.
print(
    f"model depth: {final_model.depth}",
    f"number of topic: {final_model.k}",
    sep="\n",
)


model depth: 3
number of topic: 264


------------

### DEV

Trying to get the topic tree.

In [79]:
# Ids of the topics that final_model places at level 1.
# Alternative kept for reference (level of every topic):
# topic_levels = [final_model.level(k) for k in range(final_model.k)]
topic_levels = list(
    filter(
        lambda topic_id: final_model.level(topic_id) == 1,
        range(final_model.k),
    )
)
len(topic_levels)

48

In [80]:
# Display the full list of level-1 topic ids collected in the previous cell.
topic_levels

[8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183]

In [50]:
# for doc in max_cps:
#     for path in doc.path:
#       if path==0:
#         print('Root Topic is {}'.format(cluster_names.get(max_index)))
#         print('Subtopics Level {}:\n{}'.format(path,[i[0] for i in mdl_final.get_topic_words(path)]))
#       else:
#         print('Subtopics Level {}:\n{}'.format(path,[i[0] for i in mdl_final.get_topic_words(path)]))
#     print('Original Unseen Word|Probability pairs:\n{}'.format(doc.get_words(top_n=10)))

In [81]:
# Print the `path` of every document in `max_cps`, then the document count.
# NOTE(review): `max_cps` is not defined in any cell visible here — confirm
# where it comes from before relying on a fresh-kernel run.
i = 0
for i, doc in enumerate(max_cps, start=1):
    print(doc.path)

print(i)

[  0   9 479]
1


In [23]:
# Inspect the `depth` attribute of mdl1.
mdl1.depth

3

In [70]:
# Ids of the child topics of topic 0 in mdl2.
mdl2.children_topics(0)

array([182, 180, 183, 178, 181, 179, 177, 176, 135, 132, 130, 134, 133,
       131, 129, 128, 103, 102, 101, 100,  99,  98,  97,  96,  71,  70,
        69,  68,  67,  66,  65,  64,  39,  38,  37,  36,  35,  34,  33,
        32,  15,  14,  13,  12,  11,  10,   9,   8], dtype=uint32)

In [40]:
# Ids of the child topics of topic 13 in mdl2.
mdl2.children_topics(13)

array([115,  56, 161, 110, 107,  78,  25,  24], dtype=uint32)

In [42]:
# Child topics of topic 161 in mdl2 (the recorded output is an empty array).
mdl2.children_topics(161)

array([], dtype=uint32)

In [41]:
# NOTE(review): same call as the preceding cell; this cell looks redundant.
mdl2.children_topics(161)

array([], dtype=uint32)

In [71]:
# Parent topic id of topic 1 in mdl0 (the recorded output is -1).
mdl0.parent_topic(1)


-1

In [36]:
# Level of topic 161 in mdl0.
mdl0.level(161)

2

In [78]:
# Parent topic id of topic 44 in mdl0 (the recorded output is 0).
mdl0.parent_topic(44)

0

In [50]:
# (word, probability) pairs for topic 1 of mdl1.
mdl1.get_topic_words(1) # author's note: mdl0 has 568 topics (this call is on mdl1)

[('health', 6.353240314638242e-05),
 ('people', 6.353240314638242e-05),
 ('percent', 6.353240314638242e-05),
 ('year', 6.353240314638242e-05),
 ('study', 6.353240314638242e-05),
 ('according', 6.353240314638242e-05),
 ('drug', 6.353240314638242e-05),
 ('trump', 6.353240314638242e-05),
 ('years', 6.353240314638242e-05),
 ('care', 6.353240314638242e-05)]

In [55]:
# (word, probability) pairs for topic 0 of mdl2.
mdl2.get_topic_words(0)

[('company', 0.06128843128681183),
 ('coverage', 0.04730544611811638),
 ('text', 0.04590891674160957),
 ('source', 0.04582052677869797),
 ('eikon', 0.032615356147289276),
 ('newsroom', 0.029698552563786507),
 ('gdynia', 0.024713467806577682),
 ('says', 0.016192862764000893),
 ('million', 0.011826494708657265),
 ('agreement', 0.01037693116813898)]