### Topic modeling exploration with LDA - Latent Dirichlet Allocation

In [None]:
def get_lda_objects(text, num_topics=5, passes=10, workers=3,
                    random_state=None, stop_words=None):
    """Train an LDA topic model on a collection of documents.

    Each document is tokenized with ``word_tokenize``; stop words and tokens
    of two characters or fewer are dropped, and the remaining tokens are
    lemmatized as verbs. The token lists are turned into a gensim
    ``Dictionary`` and a bag-of-words corpus, on which an ``LdaMulticore``
    model is fit.

    Args:
        text: Iterable of document strings (each item is passed to
            ``word_tokenize``).
        num_topics: Number of topics to fit.
        passes: Number of training passes over the corpus.
        workers: Number of worker processes handed to ``LdaMulticore``.
        random_state: Seed forwarded to ``LdaMulticore``. The default
            ``None`` leaves training unseeded, so topics may differ
            between runs.
        stop_words: Container of tokens to discard. When ``None``, the
            notebook-level ``stop`` collection is used.

    Returns:
        A ``(lda_model, bow_corpus, dic)`` tuple: the trained model, the
        bag-of-words corpus it was trained on, and the gensim dictionary
        mapping token ids to tokens.
    """
    if stop_words is None:
        # `stop` is not defined in this function; it must already exist in
        # the notebook's global scope when no explicit stop words are given.
        stop_words = stop

    lemmatizer = WordNetLemmatizer()

    def _preprocess_text(documents):
        # One token list per document: filter first, then lemmatize, so the
        # length check applies to the raw token rather than its lemma.
        return [
            [
                lemmatizer.lemmatize(token, 'v')
                for token in word_tokenize(document)
                if token not in stop_words and len(token) > 2
            ]
            for document in documents
        ]

    corpus = _preprocess_text(text)

    dic = gensim.corpora.Dictionary(corpus)
    bow_corpus = [dic.doc2bow(doc) for doc in corpus]

    lda_model = gensim.models.LdaMulticore(bow_corpus,
                                           num_topics=num_topics,
                                           id2word=dic,
                                           passes=passes,
                                           workers=workers,
                                           random_state=random_state)

    return lda_model, bow_corpus, dic


def plot_lda_vis(lda_model, bow_corpus, dic):
    """Build the pyLDAvis visualisation object for a trained LDA model.

    Switches pyLDAvis into notebook mode, then hands the model, its
    bag-of-words corpus and its dictionary to ``gensimvis.prepare`` and
    returns the prepared object.
    """
    pyLDAvis.enable_notebook()
    return gensimvis.prepare(lda_model, bow_corpus, dic)


Topic Modeling for Titles

In [None]:
# Fit an LDA model on `titles_non_stopwords_cleaned`, keeping the model, its
# bag-of-words corpus and its dictionary for inspection and plotting.
title_lda_model, title_bow_corpus, title_dic = get_lda_objects(titles_non_stopwords_cleaned)

In [None]:
# List each title topic with its highest-weighted terms.
title_lda_model.show_topics()

[(0,
  '0.019*"market" + 0.017*"know" + 0.010*"dip" + 0.009*"gain" + 0.009*"stock" + 0.008*"say" + 0.006*"broader" + 0.006*"new" + 0.005*"rate" + 0.005*"buy"'),
 (1,
  '0.040*"market" + 0.016*"global" + 0.014*"know" + 0.013*"gain" + 0.011*"report" + 0.007*"lag" + 0.007*"grow" + 0.007*"stock" + 0.006*"billion" + 0.006*"growth"'),
 (2,
  '0.009*"stock" + 0.008*"new" + 0.007*"announce" + 0.006*"2022" + 0.005*"award" + 0.005*"health" + 0.004*"move" + 0.004*"capital" + 0.004*"company" + 0.004*"group"'),
 (3,
  '0.011*"stock" + 0.009*"investors" + 0.008*"year" + 0.007*"years" + 0.006*"shareholders" + 0.005*"last" + 0.005*"update" + 0.005*"new" + 0.005*"invest" + 0.004*"limit"'),
 (4,
  '0.050*"2022" + 0.037*"announce" + 0.036*"quarter" + 0.031*"third" + 0.028*"result" + 0.020*"earn" + 0.019*"financial" + 0.016*"call" + 0.016*"conference" + 0.013*"inc."')]

In [None]:
# Render the pyLDAvis view of the title topic model.
plot_lda_vis(title_lda_model, title_bow_corpus, title_dic)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


Topic Modeling for Text

In [None]:
# Fit an LDA model on `text_non_stopwords_cleaned`, keeping the model, its
# bag-of-words corpus and its dictionary for inspection and plotting.
text_lda_model, text_bow_corpus, text_dic = get_lda_objects(
    text_non_stopwords_cleaned)


In [None]:
# List each text topic with its highest-weighted terms.
text_lda_model.show_topics()


[(0,
  '0.017*"2022" + 0.015*"million" + 0.015*"quarter" + 0.011*"share" + 0.010*"net" + 0.010*"income" + 0.010*"company" + 0.008*"september" + 0.007*"increase" + 0.007*"per"'),
 (1,
  '0.036*"market" + 0.011*"analysis" + 0.008*"report" + 0.007*"years" + 0.007*"forecast" + 0.007*"global" + 0.006*"growth" + 0.006*"table" + 0.006*"sales" + 0.006*"research"'),
 (2,
  '0.011*"company" + 0.006*"include" + 0.005*"statements" + 0.005*"2022" + 0.005*"information" + 0.005*"new" + 0.004*"forwardlooking" + 0.004*"service" + 0.004*"continue" + 0.003*"email"'),
 (3,
  '0.016*"company" + 0.011*"stock" + 0.008*"price" + 0.008*"share" + 0.006*"buy" + 0.006*"growth" + 0.006*"see" + 0.006*"article" + 0.005*"earn" + 0.005*"like"'),
 (4,
  '0.010*"say" + 0.010*"zacks" + 0.007*"stock" + 0.007*"report" + 0.007*"estimate" + 0.005*"rank" + 0.005*"company" + 0.005*"also" + 0.005*"industry" + 0.005*"research"')]

In [None]:
# Render the pyLDAvis view of the text topic model.
plot_lda_vis(text_lda_model, text_bow_corpus, text_dic)


  by='saliency', ascending=False).head(R).drop('saliency', 1)
