In [15]:
import gensim
import nltk
import numpy as np
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer


# Fetch the NLTK resources this notebook relies on: the stop-word list and
# the WordNet corpora. quiet=True suppresses the downloader's console output.
for nltk_resource in ("stopwords", "wordnet", "omw-1.4"):
    nltk.download(nltk_resource, quiet=True)

# Shared Porter stemmer instance, used by lemmatize_stemming().
# PorterStemmer must be imported (nltk.stem.porter) in the import block above;
# without that import this line raises NameError on a fresh kernel.
stemmer = PorterStemmer()

In [16]:
def lemmatize_stemming(text):
    """Reduce a single token to its stemmed verb lemma.

    The token is first lemmatized with WordNet, treating it as a verb
    (``pos='v'``), and the resulting lemma is then passed through the
    module-level ``stemmer``.

    Args:
        text: The token to normalise.

    Returns:
        The stemmed lemma of ``text``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)
# English stop words, loaded once. Calling stopwords.words('english') inside
# the token loop rebuilds the list for every token and forces a linear scan;
# a frozenset gives constant-time membership tests.
_STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess(text):
    """Tokenise a document and normalise the tokens worth keeping.

    The text is tokenised with ``gensim.utils.simple_preprocess``. Tokens that
    are English stop words or have three or fewer characters are dropped; the
    remaining tokens are passed through ``lemmatize_stemming``.

    Args:
        text: Raw document text (here, a news headline).

    Returns:
        A list of normalised tokens, in their original order.
    """
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        # Cheap length check first; the result is the same as the original order.
        if len(token) > 3 and token not in _STOP_WORDS
    ]

In [13]:
# Load the consolidated news dataset from its parquet file
# (path is relative to the notebook's working directory).
NEWS_PARQUET_PATH = "../data/processed/news-consolidated-v1.parquet"
total_data = pd.read_parquet(NEWS_PARQUET_PATH)

In [18]:
# Run every headline through preprocess(); the result holds one token list
# per row of total_data.
headlines = total_data["headline"]
processed = headlines.apply(preprocess)

In [21]:
# Build the token -> id vocabulary from the tokenised headlines, then prune it
# in place: tokens appearing in fewer than 15 documents, or in more than 10%
# of all documents, are removed.
dictionary = Dictionary(documents=processed)
dictionary.filter_extremes(no_above=0.1, no_below=15)

In [24]:
# Convert each token list into its bag-of-words form: a list of
# (token_id, count) pairs using the ids from `dictionary`.
bow_corpus = list(map(dictionary.doc2bow, processed))

# Peek at the first five documents as a sanity check.
print(bow_corpus[:5])

[[(0, 1), (1, 1), (2, 1), (3, 1)], [(4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)], [(10, 1), (11, 1), (12, 1), (13, 1), (14, 1)], [(15, 1), (16, 1), (17, 1), (18, 1)], [(19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1)]]


In [25]:
# Train a 10-topic LDA model on the bag-of-words corpus, making 2 passes over
# the corpus with 2 worker processes.
# random_state fixes the RNG seed: without it each run starts from a different
# random initialisation, so the topics printed below could not be regenerated
# after a kernel restart.
# NOTE(review): with multiple workers, bit-for-bit reproducibility may still
# depend on scheduling — confirm if exact repeatability is required.
LDA_RANDOM_STATE = 42
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=2,
    random_state=LDA_RANDOM_STATE,
)

In [26]:
# Show each learned topic as its id followed by its weighted top terms,
# with blank lines between topics.
topic_template = 'Topic: {} \nWords: {}'
for topic_id, topic_terms in lda_model.print_topics():
    print(topic_template.format(topic_id, topic_terms))
    print("\n")

Topic: 0 
Words: 0.035*"futur" + 0.025*"ocean" + 0.020*"look" + 0.017*"impact" + 0.016*"farmer" + 0.012*"mean" + 0.012*"drought" + 0.011*"seismic" + 0.011*"polit" + 0.011*"atmospher"


Topic: 1 
Words: 0.028*"report" + 0.027*"carbon" + 0.018*"north" + 0.018*"temperatur" + 0.015*"fire" + 0.014*"record" + 0.014*"ocean" + 0.014*"extrem" + 0.014*"scientist" + 0.013*"earthquak"


Topic: 2 
Words: 0.044*"call" + 0.025*"health" + 0.023*"system" + 0.016*"nation" + 0.016*"batteri" + 0.014*"river" + 0.013*"home" + 0.013*"elect" + 0.012*"influenc" + 0.012*"pacif"


Topic: 3 
Words: 0.019*"like" + 0.018*"countri" + 0.014*"show" + 0.013*"need" + 0.012*"atlant" + 0.012*"citi" + 0.011*"structur" + 0.011*"say" + 0.011*"sustain" + 0.011*"emerg"


Topic: 4 
Words: 0.030*"emiss" + 0.027*"effect" + 0.018*"model" + 0.017*"busi" + 0.015*"make" + 0.013*"scale" + 0.012*"studi" + 0.012*"carbon" + 0.011*"coal" + 0.011*"use"


Topic: 5 
Words: 0.032*"action" + 0.024*"long" + 0.016*"term" + 0.014*"social" + 0.014