# LDA for headlines

## Imports

In [2]:
import pandas as pd
import pprint

import nltk

from gensim.corpora import Dictionary
from gensim.models import LdaModel

# Download NLTK data
nltk.download("stopwords", quiet=True)
nltk.download("wordnet", quiet=True)
nltk.download("omw-1.4", quiet=True)

True

## Load & Preprocess Data

In [3]:
# Load the data
path_to_data = "../data/processed/headlines.csv"
data = pd.read_csv(path_to_data, nrows=1000)

In [4]:
data

Unnamed: 0,org,headline
0,nytimes,Obama Affirms Climate Change Goals
1,nytimes,How Investors Are Addressing Racial Injustice
2,nytimes,Two New Books Dramatically Capture the Climate...
3,nytimes,"In the Fight Against Climate Change, Young Voi..."
4,nytimes,Climate Change Is Making It Harder for Campers...
...,...,...
995,nytimes,Boris Johnson’s ‘Global Britain’: Inspired Vis...
996,nytimes,Opinion | How Can We Fix Income and Wealth Ine...
997,nytimes,Biden Tells Officials to Prepare for Climate C...
998,nytimes,Humans Are Speeding Extinction and Altering th...


In [5]:
# fetch just the headlines from the data and convert to a list
headlines = data["headline"].tolist()

In [6]:
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r"\w+")
for idx in range(len(headlines)):
    # Remove punctuation and lowercase the documents.
    headlines[idx] = tokenizer.tokenize(headlines[idx])
    headlines[idx] = [w.lower() for w in headlines[idx] if len(w) > 1]

# Remove stopwords and lemmatize the documents
stop_words = stopwords.words("english")
lemmatizer = WordNetLemmatizer()

for idx in range(len(headlines)):
    headlines[idx] = [w for w in headlines[idx] if not w in stop_words]
    headlines[idx] = [lemmatizer.lemmatize(token) for token in headlines[idx]]

In [7]:
headlines

[['obama', 'affirms', 'climate', 'change', 'goal'],
 ['investor', 'addressing', 'racial', 'injustice'],
 ['two',
  'new',
  'book',
  'dramatically',
  'capture',
  'climate',
  'change',
  'crisis'],
 ['fight', 'climate', 'change', 'young', 'voice', 'speak'],
 ['climate', 'change', 'making', 'harder', 'camper', 'beat', 'heat'],
 ['inquiry',
  'prompted',
  'trump',
  'hurricane',
  'dorian',
  'claim',
  'blocked',
  'investigator',
  'say'],
 ['california', 'tourism', 'survive', 'climate', 'change'],
 ['opinion',
  'make',
  'headway',
  'climate',
  'change',
  'let',
  'change',
  'subject'],
 ['climate', 'change', 'enters', 'therapy', 'room'],
 ['fighting', 'climate', 'change', 'even', 'landing', 'punch'],
 ['response',
  'climate',
  'change',
  'missing',
  'something',
  'big',
  'scientist',
  'say'],
 ['medical',
  'journal',
  'call',
  'climate',
  'change',
  'greatest',
  'threat',
  'global',
  'public',
  'health'],
 ['policing', 'climate', 'change', 'sweeping', 'call',

In [8]:
# Create bigrams (two-word phrases) from the headlines
from gensim.models import Phrases

percent_freq = 0.02

bigram = Phrases(headlines, min_count=percent_freq * len(headlines))
for idx in range(len(headlines)):
    for token in bigram[headlines[idx]]:
        if "_" in token:
            # Token is a bigram, add to document.
            headlines[idx].append(token)

In [9]:
# Create a dictionary representation of the documents.
dictionary = Dictionary(headlines)
corpus = [dictionary.doc2bow(doc) for doc in headlines]

## Training the LDA Model

In [10]:
# Train LDA model.

# Set training parameters.
num_topics = 10
chunksize = 200
passes = 20
iterations = 400
eval_every = None

# Make an index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha="auto",
    eta="auto",
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
)

In [11]:
top_topics = model.top_topics(corpus)
pprint.pprint(top_topics)

[([(0.0528405, 'climate'),
   (0.050339833, 'change'),
   (0.0480397, 'global'),
   (0.039749976, 'hurricane'),
   (0.039081845, 'thinking'),
   (0.03890597, 'britain'),
   (0.03821132, 'johnson'),
   (0.03821132, 'inspired_vision'),
   (0.03821132, 'inspired'),
   (0.03821132, 'global_britain'),
   (0.03821132, 'boris_johnson'),
   (0.03821132, 'boris'),
   (0.03821132, 'wishful'),
   (0.03821132, 'wishful_thinking'),
   (0.03821132, 'vision'),
   (0.037438367, 'affected'),
   (0.037298046, 'dorian'),
   (0.036743704, 'affected_hurricane'),
   (0.007447447, 'australia'),
   (0.007059898, 'bill')],
  -13.20312060867152),
 ([(0.12130226, 'climate'),
   (0.119116925, 'change'),
   (0.025267016, 'economy'),
   (0.025012378, 'impact'),
   (0.024920572, 'biden'),
   (0.024158109, 'coronavirus'),
   (0.02363346, 'official'),
   (0.023412224, 'deal'),
   (0.022765832, 'dangerous'),
   (0.02274552, 'impact_economy'),
   (0.022444038, 'longer'),
   (0.022444038, 'age'),
   (0.022423735, 'prepar