In [21]:
from pathlib import Path
import json
import pandas as pd
from gensim import corpora
from gensim.models import LdaModel

In [22]:
# --- Run configuration -------------------------------------------------------
# Name of the dataset folder/file and of the trained LDA model to use.
DATASET = "sample50k_health_tech"
MODEL_NAME = "lda_50i4p24t"

# Inputs: the saved gensim dictionary, the saved models and the tokenized articles.
DICTIONARY_PATH = Path("../results/dictionary")
MODEL_PATH = Path("../results/models").joinpath(DATASET)
PROCESSED_PATH = Path("../data/data_processed")

# Output: per-dataset folder where the inferred topics are written as CSV.
INFERENCE_PATH = Path("../results/inference").joinpath(DATASET)


In [23]:
# Restore the persisted gensim artifacts: the token dictionary and the trained LDA model.
dictionary_file = f"{DICTIONARY_PATH/DATASET}"
model_file = f"{MODEL_PATH}/{MODEL_NAME}.model"

dictionary = corpora.Dictionary.load(dictionary_file)
lda = LdaModel.load(model_file)

In [24]:
# Collect the tokenized-article files of this dataset.
# `iterdir()` yields entries in arbitrary order, so sort them to keep the
# inference output reproducible between runs, and keep regular files only
# because every entry is later handed to `open()`.
tokenized_articles = sorted(
    entry for entry in (PROCESSED_PATH/DATASET).iterdir() if entry.is_file()
)

class MyCorpusInference(object):
    """Stream ``(article_id, topic_distribution)`` pairs from a JSON-lines file.

    Each non-blank line of the file is decoded as a JSON object with an
    ``"article_id"`` key and an ``"article"`` key (the token list passed to
    ``dictionary.doc2bow``). The bag-of-words is fed to the notebook-level
    ``lda`` model, and the result is yielded together with the article id.
    Lines are processed one at a time, so the file is never fully loaded.

    Parameters
    ----------
    path : str or path-like, optional
        File to read. When omitted, the notebook-level variable ``file_t`` is
        looked up at iteration time, which keeps existing
        ``MyCorpusInference()`` calls working.
    """

    def __init__(self, path=None):
        self.path = path

    def __iter__(self):
        # Backward-compatible fallback on the global set by the calling loop.
        source = file_t if self.path is None else self.path
        # `with` guarantees the handle is closed once iteration finishes.
        with open(source, encoding="utf-8") as handle:
            for line in handle:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                record = json.loads(line)  # decode each line only once
                yield record["article_id"], lda[dictionary.doc2bow(record["article"])]

In [25]:
# Run LDA inference over every tokenized-article file and collect one
# (article_id, topic_list) pair per article.
topics_pred = []

# NOTE: the loop variable has to be called `file_t`, because
# `MyCorpusInference()` is built without arguments and reads the file to
# process from that notebook-level name.
for file_t in tokenized_articles:
    topics_pred.extend(MyCorpusInference())

# One row per article; `topic_list` holds the (topic_id, probability) pairs
# returned by the model for that article.
article_topics = pd.DataFrame(topics_pred, columns=["article_id", "topic_list"])
article_topics["num_topics"] = article_topics["topic_list"].apply(len)

# Dominant topic of each article. `max` over the reversed list picks the LAST
# pair among equal probabilities, i.e. the same pair a stable ascending sort
# followed by `[-1]` would select, without sorting every list.
# Assumes every topic list is non-empty (an empty one raises) - TODO confirm
# against the model's minimum-probability setting.
main_topics = [
    max(reversed(topics), key=lambda pair: pair[1])
    for topics in article_topics["topic_list"]
]
# Explicit dtypes keep the columns well-typed even when there are no articles.
article_topics["main_topic"] = pd.Series(
    [topic for topic, _ in main_topics], index=article_topics.index, dtype=int
)
article_topics["main_topic_proba"] = pd.Series(
    [proba for _, proba in main_topics], index=article_topics.index, dtype=float
)

# Make sure the output folder exists so the export also works on a fresh checkout.
INFERENCE_PATH.mkdir(parents=True, exist_ok=True)
article_topics.to_csv(INFERENCE_PATH / f"{MODEL_NAME}.csv", index=False)


In [26]:
# Preview the first five rows of the per-article topic table.
article_topics.head(n=5)

Unnamed: 0,article_id,topic_list,num_topics,main_topic,main_topic_proba
0,901965,"[(14, 0.5309663), (15, 0.1799154), (16, 0.1225...",4,14,0.530966
1,1360482,"[(0, 0.12812088), (3, 0.09798139), (10, 0.6530...",5,10,0.653085
2,2557016,"[(0, 0.017679628), (1, 0.05004615), (3, 0.0709...",7,10,0.534344
3,592207,"[(7, 0.9895776)]",1,7,0.989578
4,571034,"[(3, 0.017763685), (7, 0.43124837), (11, 0.075...",5,7,0.431248


In [27]:
# Sanity check: infer the topic mixture of a small hand-written token list.
# NOTE(review): "desease" looks like a typo for "disease"; if it is not in the
# dictionary it presumably contributes nothing to the bag-of-words - confirm intent.
unseen_tokens = ["medicine", "desease", "forest", "hospital"]
unseen_doc = dictionary.doc2bow(unseen_tokens)
lda[unseen_doc]

[(0, 0.010420279),
 (1, 0.010420279),
 (2, 0.010420279),
 (3, 0.010420279),
 (4, 0.010420279),
 (5, 0.010420279),
 (6, 0.010420279),
 (7, 0.010420279),
 (8, 0.010420279),
 (9, 0.010420279),
 (10, 0.010420279),
 (11, 0.48621953),
 (12, 0.010420279),
 (13, 0.010420279),
 (14, 0.010420279),
 (15, 0.010420279),
 (16, 0.010420279),
 (17, 0.010420279),
 (18, 0.010420279),
 (19, 0.010420279),
 (20, 0.28453434),
 (21, 0.010420279),
 (22, 0.010420279),
 (23, 0.010420279)]

In [28]:
# Display the top words of a selection of topics as formatted strings.
# Pass formatted=False to get (word, probability) pairs instead of strings.
lda.show_topics(num_topics=10, num_words=10)

[(1,
  '0.033*"facebook" + 0.014*"company" + 0.012*"user" + 0.011*"social" + 0.011*"twitter" + 0.010*"people" + 0.010*"medium" + 0.009*"post" + 0.008*"video" + 0.008*"mr"'),
 (15,
  '0.032*"study" + 0.014*"research" + 0.012*"researcher" + 0.011*"woman" + 0.009*"find" + 0.009*"disease" + 0.009*"risk" + 0.009*"dr" + 0.009*"new" + 0.008*"brain"'),
 (12,
  '0.035*"china" + 0.024*"chinese" + 0.018*"united" + 0.016*"states" + 0.016*"country" + 0.016*"huawei" + 0.015*"government" + 0.013*"company" + 0.011*"world" + 0.010*"american"'),
 (18,
  '0.025*"health" + 0.018*"care" + 0.011*"people" + 0.010*"program" + 0.010*"year" + 0.009*"state" + 0.008*"plan" + 0.006*"service" + 0.006*"insurance" + 0.006*"pay"'),
 (19,
  '0.035*"cancer" + 0.020*"school" + 0.015*"student" + 0.014*"study" + 0.014*"death" + 0.013*"year" + 0.012*"rate" + 0.012*"report" + 0.011*"high" + 0.010*"increase"'),
 (9,
  '0.038*"company" + 0.024*"mr" + 0.013*"uber" + 0.011*"year" + 0.010*"business" + 0.010*"new" + 0.009*"billion