# LDA for each dataset

## Imports

In [1]:
import pandas as pd
import numpy as np
import nltk

from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel
import gensim

# Download the NLTK resources needed by the preprocessing step below:
#   - "stopwords": the English stop-word list read via nltk.corpus.stopwords
#   - "wordnet":   the lexical database used by WordNetLemmatizer
#   - "omw-1.4":   presumably required by the WordNet corpus reader in
#                  recent NLTK releases — TODO confirm it is still needed
# quiet=True keeps the download progress messages out of the cell output.
nltk.download("stopwords", quiet=True)
nltk.download("wordnet", quiet=True)
nltk.download("omw-1.4", quiet=True)

True

## Load and Preprocess Data

In [2]:
# Load the consolidated news dataset. The "source" and "headline" columns
# are the ones used by the preprocessing cells below.
total_data = pd.read_parquet("../data/processed/news-consolidated-v1.parquet")

In [3]:
# Preview the first rows to sanity-check the loaded columns.
total_data.head()

Unnamed: 0,id,source,date,headline,embedding,url
0,0,abc,2003-02-19,a g calls for infrastructure protection summit,"[0.42550426721572876, 0.5782315135002136, 0.09...",
1,1,abc,2003-02-19,epa still trying to recover chemical clean up ...,"[0.33238619565963745, -0.3517177700996399, 0.5...",
2,2,abc,2003-02-19,expressions of interest sought to build livestock,"[0.4847770035266876, 0.10000099241733551, -0.0...",
3,3,abc,2003-02-19,iraq to pay for own rebuilding white house,"[0.4847399592399597, 0.20435450971126556, 0.19...",
4,4,abc,2003-02-19,meeting to focus on broken hill water woes,"[0.3507457375526428, 0.43837735056877136, -0.0...",


In [4]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Phrases


def preProcessData(dataframe, percent_freq=0.02):
    """Tokenise, clean and bag-of-words encode the headlines in ``dataframe``.

    Same process as psb-000-mini_LDA.ipynb:

    1. Split each headline into runs of word characters, drop
       single-character tokens and lowercase the rest.
    2. Remove English stop words and lemmatize the remaining tokens.
    3. Train a gensim ``Phrases`` model on the documents and append every
       detected bigram (tokens containing ``_``) to its document.
    4. Build a gensim ``Dictionary`` and the bag-of-words corpus.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a ``"headline"`` column. Headlines that are not strings
        (e.g. ``None`` / NaN) become empty documents so that the returned
        lists stay aligned with the rows of ``dataframe``.
    percent_freq : float, default 0.02
        Bigram frequency threshold as a fraction of the number of headlines;
        ``percent_freq * len(headlines)`` is passed to ``Phrases`` as
        ``min_count``.

    Returns
    -------
    tuple
        ``(dictionary, corpus, headlines)`` where ``headlines`` is the list
        of token lists (including appended bigrams), ``dictionary`` is built
        from those token lists and ``corpus`` is their ``doc2bow`` encoding.
    """
    tokenizer = RegexpTokenizer(r"\w+")
    lemmatizer = WordNetLemmatizer()
    # A set gives constant-time membership tests; the list returned by NLTK
    # would be scanned linearly for every single token.
    stop_words = frozenset(stopwords.words("english"))

    headlines = []
    for text in dataframe["headline"].tolist():
        # The tokenizer only accepts strings; a missing headline is kept as
        # an empty document instead of raising a TypeError.
        raw_tokens = tokenizer.tokenize(text) if isinstance(text, str) else []
        # The length filter runs before lowercasing, as in the original pipeline.
        lowered = [w.lower() for w in raw_tokens if len(w) > 1]
        headlines.append(
            [lemmatizer.lemmatize(w) for w in lowered if w not in stop_words]
        )

    # Add bigrams
    bigram = Phrases(headlines, min_count=percent_freq * len(headlines))
    for doc in headlines:
        # Materialise the phrased tokens first so the document is never
        # extended while something derived from it is still being iterated.
        phrased = list(bigram[doc])
        # Token is a bigram if it contains "_": add it to the document.
        doc.extend(token for token in phrased if "_" in token)

    dictionary = Dictionary(headlines)
    corpus = [dictionary.doc2bow(doc) for doc in headlines]

    return dictionary, corpus, headlines

In [5]:
# Run the preprocessing pipeline once per news source and once on the whole
# dataset. Each result is a (dictionary, corpus, tokenised headlines) tuple.

climateDB_data = preProcessData(
    total_data.loc[total_data["source"].eq("climate-db")]
)
abcNews_data = preProcessData(
    total_data.loc[total_data["source"].eq("abc")]
)
natureNews_data = preProcessData(
    total_data.loc[total_data["source"].eq("nature")]
)
newsapi_data = preProcessData(
    total_data.loc[total_data["source"].eq("news-api")]
)

total_data_processed = preProcessData(total_data)

## Create and Train LDA Model

In [6]:
# Set training parameters.
num_topics = 20
chunksize = 1000
passes = 50
iterations = 500
eval_every = None  # Skip perplexity evaluation during training.
random_state = 42  # Fixed seed so that re-running the cell reproduces the same topics.

# The model is trained on the dictionary/corpus built from the full dataset.
total_dictionary = total_data_processed[0]
total_corpus = total_data_processed[1]

# Make an index to word dictionary.
# Dictionary.id2token is populated lazily; indexing the dictionary once
# forces it to be built before it is handed to the model.
temp = total_dictionary[0]
id2word = total_dictionary.id2token

model = LdaModel(
    corpus=total_corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha="auto",
    eta="auto",
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=random_state,
)

KeyboardInterrupt: 

In [70]:
# The model was trained with the dictionary built from the full dataset, but
# every per-source preProcessData call built its own Dictionary. The token ids
# in climateDB_data[1], abcNews_data[1], ... therefore do not line up with the
# ids the model knows, and scoring against them would compare unrelated words.
# Re-encode each source's tokenised headlines (tuple element [2]) with the
# model's dictionary first; tokens that dictionary does not contain are
# dropped by doc2bow.
model_dictionary = total_data_processed[0]


def _bow_for_model(processed):
    """Encode the tokenised headlines of a preProcessData result with the model's dictionary."""
    return [model_dictionary.doc2bow(doc) for doc in processed[2]]


climateDB_top_topics = model.top_topics(_bow_for_model(climateDB_data))
abcNews_top_topics = model.top_topics(_bow_for_model(abcNews_data))
natureNews_top_topics = model.top_topics(_bow_for_model(natureNews_data))
newsapi_top_topics = model.top_topics(_bow_for_model(newsapi_data))
# The full-dataset corpus already uses the model's dictionary.
total_data_top_topics = model.top_topics(total_data_processed[1])

In [None]:
# NOTE(review): compute_coherence_values, dictionary and corpus are not
# defined in any cell visible in this notebook, so on a fresh kernel this
# cell would raise NameError unless they come from somewhere else — TODO
# confirm where they are defined, or remove the cell.
# NOTE(review): the third argument is the raw DataFrame loaded from parquet,
# not the tokenised headlines — confirm that is what the function expects.
model_list, coherence_values = compute_coherence_values(dictionary, corpus, total_data, limit=40, start=2, step=6)


## Finding top topics by running the model and saving the outputs

In [69]:
# Save the trained LDA model held in `model` to disk.
model.save("../data/models/psb-005-LDA_each_V1.model")

In [80]:
# Tabulate the top_topics result for the full dataset: one row per topic,
# with its list of (weight, word) pairs and its score.
temp = pd.DataFrame(total_data_top_topics, columns=["topics", "score"])
temp

Unnamed: 0,topics,score
0,"[(0.30361468, climate), (0.24107258, change), ...",-8.193566
1,"[(0.050904073, cost), (0.047834627, forest), (...",-8.966744
2,"[(0.092042096, action), (0.0888979, plan), (0....",-9.682544
3,"[(0.16531192, say), (0.100059465, sea), (0.048...",-10.080515
4,"[(0.08891194, risk), (0.061359864, health), (0...",-10.164523
5,"[(0.100586936, world), (0.093343854, study), (...",-10.181912
6,"[(0.06676394, fire), (0.056533743, end), (0.05...",-10.288619
7,"[(0.060831357, people), (0.055473045, life), (...",-10.326685
8,"[(0.0988149, year), (0.062059756, pacific), (0...",-10.481036
9,"[(0.10420526, based), (0.060167354, north), (0...",-10.945895


In [83]:
# Save topics as CSV
def _save_top_topics(top_topics, csv_path):
    """Tabulate a ``top_topics`` result, write it to ``csv_path`` and return the DataFrame."""
    topics_frame = pd.DataFrame(top_topics, columns=["topics", "score"])
    topics_frame.to_csv(csv_path)
    return topics_frame


climateDB_topics = _save_top_topics(
    climateDB_top_topics,
    "../data/intermediate/top_topics/psb-005-LDA_each_v1-climateDB.csv",
)

abcNews_topics = _save_top_topics(
    abcNews_top_topics,
    "../data/intermediate/top_topics/psb-005-LDA_each_v1-abcNews.csv",
)

natureNews_topics = _save_top_topics(
    natureNews_top_topics,
    "../data/intermediate/top_topics/psb-005-LDA_each_v1-natureNews.csv",
)

newsapi_topics = _save_top_topics(
    newsapi_top_topics,
    "../data/intermediate/top_topics/psb-005-LDA_each_v1-newsapi.csv",
)

# NOTE: this rebinds total_data_top_topics from the raw top_topics list to
# its DataFrame form, exactly as the cell did before the helper was added.
total_data_top_topics = _save_top_topics(
    total_data_top_topics,
    "../data/intermediate/top_topics/psb-005-LDA_each_v1-total_data.csv",
)