In [1]:
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# log progress of LDA model training

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
from gensim import corpora
from gensim.models import LdaModel, LdaMulticore

In [3]:
# Load the preprocessed news text produced by an earlier step; the displayed
# frame has token-list columns for the title and the summary.
preprocessed_path = 'data/text_preprocessed.parquet'
df = pd.read_parquet(preprocessed_path)
df.head()

Unnamed: 0_level_0,title_preprocessed,summary_preprocessed
newsID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"[oil, price, could, determine, market, react, ...","[heavy, new, round, sanction, russia, ally, li..."
2,"[zoom, provide, disappointing, revenue, foreca...","[zoom, revenue, growth, continue, slow, busine..."
3,"[wall, street, rally, west, hit, russia, new, ...","[rise, end, four, day, slide, amid, worry, esc..."
4,"[weak, manufacturing, drag, gdp, growth, oil, ...","[india, economy, grow, three, month, end, dece..."
5,"[singapore, bank, halt, lend, russian, good, j...","[singapore, big, bank, restrict, trade, financ..."


In [4]:
# df['tokens'] = [np.concatenate([col1, col2]) for col1, col2 in zip(df['title_preprocessed'], df['summary_preprocessed'])]
# df.head()

In [5]:
# Only the summary tokens are used for topic modelling (titles are left out).
texts = df['summary_preprocessed']

# Build the gensim vocabulary from the summaries, then convert every summary
# with doc2bow using that vocabulary.
dictionary = corpora.Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))

In [6]:
# Train an 8-topic LDA model on the summary corpus.
# The topic ids produced here are given hand-written labels further down in
# this notebook, so changing any of these settings (including random_state)
# may invalidate those labels.
lda_model = LdaMulticore(
  corpus=corpus,
  id2word=dictionary,
  num_topics=8,
  passes=12,
  workers=7,
  random_state=1,
)

In [7]:
# Show the 30 highest-weighted words of each of the 8 topics so they can be
# labelled by hand.
topic_summaries = lda_model.print_topics(num_topics=8, num_words=30)
for topic_id, top_terms in topic_summaries:
  print(f"Topic #{topic_id}: {top_terms}")

Topic #0: 0.023*"company" + 0.015*"announce" + 0.015*"globe" + 0.015*"newswire" + 0.011*"today" + 0.011*"new" + 0.009*"technology" + 0.009*"market" + 0.009*"global" + 0.008*"lead" + 0.006*"solution" + 0.006*"prnewswire" + 0.006*"service" + 0.006*"nasdaq" + 0.006*"industry" + 0.005*"product" + 0.004*"business" + 0.004*"platform" + 0.004*"launch" + 0.004*"base" + 0.004*"leader" + 0.004*"provider" + 0.004*"focus" + 0.004*"development" + 0.004*"data" + 0.004*"aug" + 0.003*"intelligence" + 0.003*"digital" + 0.003*"power" + 0.003*"provide"
Topic #1: 0.040*"trading" + 0.039*"day" + 0.029*"close" + 0.022*"session" + 0.022*"move" + 0.017*"late" + 0.015*"recent" + 0.015*"bitcoin" + 0.014*"price" + 0.014*"previous" + 0.014*"mark" + 0.011*"change" + 0.011*"past" + 0.010*"usd" + 0.009*"gold" + 0.007*"week" + 0.006*"prior" + 0.006*"last" + 0.006*"crypto" + 0.006*"hour" + 0.006*"musk" + 0.006*"british" + 0.005*"time" + 0.005*"current" + 0.005*"columbia" + 0.005*"high" + 0.005*"btc" + 0.005*"elon" + 0

### Results of LDA topic modelling

Topic #0: Press Releases & Corporate Announcements
- Top words: company, announce, globe, newswire, today, new, market, technology, global, lead, solution, service, prnewswire, nasdaq, industry, product, business, platform, launch, base, leader, focus, provider, development, data, aug, intelligence, digital, power, provide

Topic #1: Daily Market Movements
- Top words: trading, day, close, session, move, late, recent, price, bitcoin, previous, mark, past, change, usd, week, gold, last, time, game, crypto, prior, hour, high, musk, current, btc, experience, elon, coin, cryptocurrency

Topic #2: Legal Actions & Shareholder Litigation
- Top words: newswire, globe, company, announce, new, investor, firm, york, law, today, class, security, shareholder, action, file, board, may, lawsuit, share, corporation, director, july, llp, lead, right, june, jan, april, purchase, tsx

Topic #3: Investment Sentiment
- Top words: option, trade, investor, bullish, take, money, history, know, today, show, track, benzinga, bearish, lot, spend, stance, available, tesla, look, move, nvidia, apple, giant, publicly, make, individual, whether, big, institution, notice

Topic #4: Earnings & Financial Performance
- Top words: quarter, end, year, revenue, earnings, result, report, number, company, stock, per, clue, hold, ahead, deliver, lie, sale, surprise, financial, respectively, share, million, estimate, ago, third, billion, december, expect, fourth, first

Topic #5: Analyst Ratings
- Top words: share, report, rating, analyst, market, stock, sell, company, nasdaq, short, research, hold, issue, trading, buy, stocknews, free, get, nyse, recently, average, note, last, coverage, million, percent, since, year, friday, regular

Topic #6: Global Macro & Economic Policy
- Top words: say, year, business, new, bank, insider, rate, trump, president, market, accord, financial, time, china, make, india, state, week, one, month, could, may, come, federal, reuters, country, plan, first, interest, high

Topic #7: Investment Research & Insights
- Top words: stock, zacks, investor, earnings, key, estimate, strong, market, look, price, top, wall, street, high, growth, find, value, could, gain, metric, likely, momentum, beat, might, one, two, worth, attention, buy, take

In [8]:
import pyLDAvis
import pyLDAvis.gensim_models as gensim_vis

# Render the interactive pyLDAvis view of the trained model inline in the
# notebook, built from the same corpus and dictionary used for training.
pyLDAvis.enable_notebook()
vis = gensim_vis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)

In [16]:
# Save the trained LDA model to disk.
# The target directory is created first so that the save does not fail with a
# missing-directory error on a fresh checkout / Restart & Run All.
model_path = Path("LDA_model/LDA.model")
model_path.parent.mkdir(parents=True, exist_ok=True)
lda_model.save(str(model_path))

In [10]:
# Label each document with its single most probable topic.
def assign_topic(bow):
  """Return the dominant topic for one bag-of-words document.

  Queries the notebook-level ``lda_model`` for the document's topics and
  returns the first element of the entry whose second element is largest
  (presumably ``(topic_id, probability)`` pairs, per gensim's
  ``get_document_topics`` -- confirm against the gensim docs).

  Returns ``None`` when the model reports no topics for the document.
  """
  topic_probs = lda_model.get_document_topics(bow)
  if not topic_probs:
    return None
  best_topic_id, _ = max(topic_probs, key=lambda pair: pair[1])
  return best_topic_id

# One label per document, in corpus order (corpus was built row-by-row from df).
df['assigned_topic'] = list(map(assign_topic, corpus))

In [11]:
# Preview the first rows, which now include the assigned_topic column.
df.head(n=5)

Unnamed: 0_level_0,title_preprocessed,summary_preprocessed,assigned_topic
newsID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,"[oil, price, could, determine, market, react, ...","[heavy, new, round, sanction, russia, ally, li...",6
2,"[zoom, provide, disappointing, revenue, foreca...","[zoom, revenue, growth, continue, slow, busine...",6
3,"[wall, street, rally, west, hit, russia, new, ...","[rise, end, four, day, slide, amid, worry, esc...",6
4,"[weak, manufacturing, drag, gdp, growth, oil, ...","[india, economy, grow, three, month, end, dece...",4
5,"[singapore, bank, halt, lend, russian, good, j...","[singapore, big, bank, restrict, trade, financ...",6


In [12]:
# One-hot encode 'assigned_topic': the column is replaced by one indicator
# column per observed topic id, named 'topic_<id>'.
df_encoded = pd.get_dummies(
  df,
  prefix='topic',
  columns=['assigned_topic'],
)
df_encoded.head()

Unnamed: 0_level_0,title_preprocessed,summary_preprocessed,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7
newsID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,"[oil, price, could, determine, market, react, ...","[heavy, new, round, sanction, russia, ally, li...",False,False,False,False,False,False,True,False
2,"[zoom, provide, disappointing, revenue, foreca...","[zoom, revenue, growth, continue, slow, busine...",False,False,False,False,False,False,True,False
3,"[wall, street, rally, west, hit, russia, new, ...","[rise, end, four, day, slide, amid, worry, esc...",False,False,False,False,False,False,True,False
4,"[weak, manufacturing, drag, gdp, growth, oil, ...","[india, economy, grow, three, month, end, dece...",False,False,False,False,True,False,False,False
5,"[singapore, bank, halt, lend, russian, good, j...","[singapore, big, bank, restrict, trade, financ...",False,False,False,False,False,False,True,False


In [13]:
# Map the numeric topic columns to the hand-written topic labels.
# These labels describe one specific trained model; they need to be reviewed
# whenever the LDA model is retrained.
topic_column_names = {
  'topic_0': 'topic_announcements',
  'topic_1': 'topic_market_movements',
  'topic_2': 'topic_legal_actions',
  'topic_3': 'topic_investment_sentiment',
  'topic_4': 'topic_earnings',
  'topic_5': 'topic_analyst_ratings',
  'topic_6': 'topic_macro_econ',
  'topic_7': 'topic_research_insights',
}
df_encoded.rename(columns=topic_column_names, inplace=True)
df_encoded.head()

Unnamed: 0_level_0,title_preprocessed,summary_preprocessed,topic_announcements,topic_market_movements,topic_legal_actions,topic_investment_sentiment,topic_earnings,topic_analyst_ratings,topic_macro_econ,topic_research_insights
newsID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,"[oil, price, could, determine, market, react, ...","[heavy, new, round, sanction, russia, ally, li...",False,False,False,False,False,False,True,False
2,"[zoom, provide, disappointing, revenue, foreca...","[zoom, revenue, growth, continue, slow, busine...",False,False,False,False,False,False,True,False
3,"[wall, street, rally, west, hit, russia, new, ...","[rise, end, four, day, slide, amid, worry, esc...",False,False,False,False,False,False,True,False
4,"[weak, manufacturing, drag, gdp, growth, oil, ...","[india, economy, grow, three, month, end, dece...",False,False,False,False,True,False,False,False
5,"[singapore, bank, halt, lend, russian, good, j...","[singapore, big, bank, restrict, trade, financ...",False,False,False,False,False,False,True,False


In [14]:
# Keep only the one-hot topic indicator columns by removing the token-list
# columns.
# errors='ignore' makes this cell safe to re-run: once the token columns are
# gone, a second drop would otherwise raise KeyError.
df_encoded = df_encoded.drop(
  columns=['title_preprocessed', 'summary_preprocessed'],
  errors='ignore',
)
df_encoded.head()

Unnamed: 0_level_0,topic_announcements,topic_market_movements,topic_legal_actions,topic_investment_sentiment,topic_earnings,topic_analyst_ratings,topic_macro_econ,topic_research_insights
newsID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,False,False,False,False,False,False,True,False
2,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,True,False
4,False,False,False,False,True,False,False,False
5,False,False,False,False,False,False,True,False


In [15]:
# Persist the topic indicator features.
# index=False leaves the DataFrame index out of the written file.
# NOTE(review): the index shown in the outputs above is `newsID`, so the saved
# rows can only be matched back to articles by position -- confirm that
# downstream consumers do not need the id.
topic_features_path = 'data/topic_LDA.parquet'
df_encoded.to_parquet(topic_features_path, index=False)