In [6]:
import tomotopy as tp
from pathlib import Path
import re
import nltk
from nltk.stem import WordNetLemmatizer
from spacy.lang.en.stop_words import STOP_WORDS as en_stop

# import spacy
# nltk.download('omw-1.4')
# nltk.download('wordnet')


In [2]:
# Print tomotopy's `isa` attribute (recorded output below: `avx2`);
# presumably the SIMD instruction set the installed build is using — confirm in tomotopy docs.
print(tp.isa)

avx2


In [8]:
# Extend the stop-word set with words that are redundant for this corpus.
# NOTE: en_stop is spaCy's STOP_WORDS set itself (see the import), so this
# modifies that shared set in place for the rest of the kernel session.
# NOTE(review): "new york" contains a space, so it can only ever match if the
# tokenizer emits multi-word tokens — confirm; "new" and "york" cover the
# single-token case.
en_stop.update((
    "said",
    "reuters",
    "london",
    "new york",
    "say",
    "like",
    "thing",
    "york",
    "new",
))

In [22]:
# Directory containing the saved models, followed by the file names of the
# four candidate models that are loaded from it.
MODEL_PATH = Path("../results/models/tomotopy")
MODEL0, MODEL1, MODEL2, MODEL3 = (
    "test-hlda0.tmm",
    "test-hlda1.tmm",
    "test-hlda2.tmm",
    "test-hlda3.tmm",
)

In [23]:
# Load the four saved HLDA models from MODEL_PATH, in order MODEL0..MODEL3.
mdl0, mdl1, mdl2, mdl3 = (
    tp.HLDAModel.load(f"{MODEL_PATH}/{model_file}")
    for model_file in (MODEL0, MODEL1, MODEL2, MODEL3)
)


In [24]:
# Human-readable labels keyed by cluster id (0-3), named as seemed reasonable.
cluster_names = dict(enumerate((
    "Science and Technology",  # tech and business
    "Health/Science/Drugs",    # health
    "Business/Tech/BioTech",   # business
    "Entertainment/News",
)))

In [62]:
def process_text(unseen_doc):
    """
    Preprocesses text and returns a tomotopy corpus.

    unseen_doc: str or iterable of str
        A single document, or a collection of documents. A bare string is
        treated as ONE document rather than being iterated character by
        character.

    Returns the `tp.utils.Corpus` after the (lower-cased) documents have been
    fed through it. Tokens are dropped when they are 2 characters or shorter,
    are in `en_stop`, are numeric, or do not start with a word character.
    """
    # A str is itself iterable; wrap it so it is processed as one document
    # instead of one "document" per character.
    if isinstance(unseen_doc, str):
        unseen_doc = [unseen_doc]

    lemmatizer = WordNetLemmatizer()
    # Raw string: '\w' in a normal string literal is an invalid escape sequence.
    pat = re.compile(r'\w+')

    def is_stopword(token):
        # NOTE(review): the lemmatize() result is only tested for truthiness,
        # so tokens are not actually lemmatized here — confirm this matches
        # the preprocessing used when the models were trained.
        return (
            len(token) <= 2
            or token in en_stop
            or token.isnumeric()
            or not pat.match(token)
            or not lemmatizer.lemmatize(token)
        )

    corpus = tp.utils.Corpus(
        tokenizer=tp.utils.SimpleTokenizer(stemmer=None, lowercase=True),
        stopwords=is_stopword,
    )

    corpus.process(document.lower() for document in unseen_doc)
    return corpus


def get_best_model(list_of_models, document_text):
    """
    Pick the model that best explains a piece of unseen text.

    list_of_models: iterable of HLDAModel
        Candidate models; each must provide `infer(corpus)` returning a
        `(result, log_likelihood)` pair.
    document_text: text to score, passed straight to `process_text`.

    Returns a tuple `(best_model, first_result)` where `best_model` is the
    entry of `list_of_models` with the highest total inferred log-likelihood
    (the first one wins on ties) and `first_result` is element 0 of that
    model's inference result.

    Raises ValueError if `list_of_models` is empty.
    """
    models = list(list_of_models)
    if not models:
        raise ValueError("list_of_models must contain at least one model")

    corpus = process_text(document_text)

    def total_log_likelihood(ll):
        # infer() may hand back one value per document (list/array) or a
        # scalar; reduce either form to a plain float so models can be
        # compared unambiguously.
        try:
            return float(sum(ll))
        except TypeError:
            return float(ll)

    best_model = None
    best_result = None
    best_score = None
    for mdl in models:
        result, ll = mdl.infer(corpus)
        score = total_log_likelihood(ll)
        # Strict '>' keeps the earliest model when scores tie.
        if best_score is None or score > best_score:
            best_model, best_result, best_score = mdl, result, score

    # Select from the models that were passed in, not from a notebook global.
    return best_model, best_result[0]

### Demo

In [48]:
# Sample input: one unseen news article, wrapped in a list of documents and
# also exposed under the 'txt' key of other_texts.
unseen = ['Bitcoin continued to slide after a broader stock sell-off in the U.S. last week sent the cryptocurrency market into a frenzy and prompted bitcoin to plummet by roughly 10%. Bitcoin, the world’s largest digital currency by market value, was lower by about 3% at $33,438.03 late Sunday, according to data from Coin Metrics. This year, Bitcoin has been trading in a narrow range as it attempts to reclaim its highs of late 2021. The cryptocurrency is now down 50% from its peak price of $67,802.30 in November 2021. The drop comes after the blue-chip Dow Jones Industrial Average lost more than 1,000 points on Thursday and the Nasdaq plunged by 5%. Those losses marked the worst single-day drops since 2020. The Dow and Nasdaq fell again on Friday.']
other_texts = {'txt': unseen}
unseen

['Bitcoin continued to slide after a broader stock sell-off in the U.S. last week sent the cryptocurrency market into a frenzy and prompted bitcoin to plummet by roughly 10%. Bitcoin, the world’s largest digital currency by market value, was lower by about 3% at $33,438.03 late Sunday, according to data from Coin Metrics. This year, Bitcoin has been trading in a narrow range as it attempts to reclaim its highs of late 2021. The cryptocurrency is now down 50% from its peak price of $67,802.30 in November 2021. The drop comes after the blue-chip Dow Jones Industrial Average lost more than 1,000 points on Thursday and the Nasdaq plunged by 5%. Those losses marked the worst single-day drops since 2020. The Dow and Nasdaq fell again on Friday.']

In [64]:
# Score the sample text against every loaded model and keep the best one;
# the cell displays the chosen model and the `path` of its inference result.
mdls = [mdl0, mdl1, mdl2, mdl3]
(final_model, topic_dist) = get_best_model(
    list_of_models=mdls,
    document_text=unseen,
)
(final_model, topic_dist.path)

(<tomotopy.HLDAModel at 0x16a58e9ec30>, array([  0,  13, 161]))

In [55]:
# Report the depth and topic count (`k`) of the selected model, one per line.
print(
    f"model depth: {final_model.depth}",
    f"number of topic: {final_model.k}",
    sep="\n",
)

model depth: 3
number of topic: 264
