# The data
The data set we’ll use is a list of over one million news headlines published over a period of nearly 17 years (February 2003 to December 2019, as the `publish_date` column below shows). It can be downloaded from [Kaggle](https://www.kaggle.com/therohk/million-headlines/data).

In [31]:
import pandas as pd
# Show at most 10 rows whenever a DataFrame or Series is displayed.
pd.options.display.max_rows = 10

# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0,
# where passing it raises a TypeError. `on_bad_lines='warn'` (pandas >= 1.3)
# is the equivalent of the old `error_bad_lines=False` with its default
# warning: malformed rows are skipped and a warning is emitted for each one.
documents = pd.read_csv('data/abcnews-date-text.csv', on_bad_lines='warn')
# Viewing
display(documents)

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers
...,...,...
1186013,20191231,vision of flames approaching corryong in victoria
1186014,20191231,wa police and government backflip on drug amne...
1186015,20191231,we have fears for their safety: victorian premier
1186016,20191231,when do the 20s start


# Data pre-processing
We will perform the following steps:
* Tokenization: split the text into sentences and the sentences into words. Lowercase the words and remove punctuation;
* Words that have 3 or fewer characters are removed;
* All stopwords are removed;
* Words are lemmatized — words in third person are changed to first person and verbs in past and future tenses are changed into present;
* Words are stemmed — words are reduced to their root form.

In [32]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
#nltk.download('wordnet')

import numpy as np
# Seed NumPy's global random number generator up front so that later code
# drawing from it starts from a fixed state.
np.random.seed(59)

from functools import lru_cache


@lru_cache(maxsize=None)
def _stem_and_lemma_tools():
    """Build the English Snowball stemmer and the WordNet lemmatizer once.

    The pair is cached, so every later call returns the same two objects
    instead of constructing new ones.
    """
    return SnowballStemmer('english'), WordNetLemmatizer()


def lemmatize_stemming(text):
    """Lemmatize a single token as a verb, then stem the lemma.

    Parameters
    ----------
    text : str
        One token (a single word).

    Returns
    -------
    str
        The English Snowball stem of the WordNet verb lemma of ``text``.
    """
    # preprocess() calls this once per kept token, so the stemmer and
    # lemmatizer are reused rather than re-created on every call.
    stemmer, lemmatizer = _stem_and_lemma_tools()
    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))

def preprocess(text):
    """Convert one raw text string into a list of normalised tokens.

    The text is tokenised with gensim's ``simple_preprocess``. Tokens that
    are gensim stopwords, or that are 3 characters long or shorter, are
    dropped; each remaining token is passed through ``lemmatize_stemming``.

    Parameters
    ----------
    text : str
        The raw text (here, one headline).

    Returns
    -------
    list
        The processed tokens, in their original order.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(word)
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in stopwords
    ]

# Run preprocess() over every headline; each entry of the resulting Series
# is the list of tokens produced for that headline.
processed_docs = (
    documents['headline_text']
    .map(preprocess)
)
# Viewing
display(processed_docs)

0                   [decid, communiti, broadcast, licenc]
1                                      [wit, awar, defam]
2                  [call, infrastructur, protect, summit]
3                             [staff, aust, strike, rise]
4                    [strike, affect, australian, travel]
                                ...                      
1186013     [vision, flame, approach, corryong, victoria]
1186014     [polic, govern, backflip, drug, amnesti, bin]
1186015                [fear, safeti, victorian, premier]
1186016                                           [start]
1186017    [yarravill, shoot, woman, dead, critic, injur]
Name: headline_text, Length: 1186018, dtype: object

# Generate Bag of Words

In [33]:
# Build the vocabulary (token <-> integer id) from the pre-processed headlines.
dictionary = gensim.corpora.Dictionary(processed_docs)
# Prune the vocabulary in place: drop tokens found in fewer than 15 documents
# or in more than half of all documents, then keep at most the 100000 most
# frequent of what remains.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
# Encode every headline as a list of (token id, count) pairs.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))
# Viewing: print the bag-of-words entries of one sample headline.
doc_sample = bow_corpus[4310]
for token_id, count in doc_sample:
    print("Word {} \"{}\" appears {} time.".format(
        token_id,
        dictionary[token_id],
        count)
    )

Word 162 "govt" appears 1 time.
Word 240 "group" appears 1 time.
Word 292 "vote" appears 1 time.
Word 589 "local" appears 1 time.
Word 838 "want" appears 1 time.
Word 3567 "compulsori" appears 1 time.
Word 3568 "ratepay" appears 1 time.


# Generate TF-IDF

In [34]:
from gensim import corpora, models
from pprint import pprint

# Fit the TF-IDF weighting on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus=bow_corpus)
# Indexing the fitted model with the corpus yields the re-weighted corpus.
corpus_tfidf = tfidf[bow_corpus]
# Viewing: the (token id, weight) pairs of the first headline.
pprint(corpus_tfidf[0])

[(0, 0.5850076620505259),
 (1, 0.38947256567331934),
 (2, 0.4997099083387053),
 (3, 0.5063271308533074)]


# Generate LDA model

## Using Bag of Words

In [36]:
# Train a 10-topic LDA model on the raw bag-of-words counts
# (2 passes over the corpus, 2 worker processes).
lda_model_bow = gensim.models.LdaMulticore(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=10,
    passes=2,
    workers=2,
)
# Viewing: -1 asks for every topic rather than a subset.
for topic_id, topic_terms in lda_model_bow.print_topics(num_topics=-1):
    print('Topic {}: {}\n'.format(topic_id, topic_terms))

Topic 0: 0.045*"australian" + 0.024*"woman" + 0.023*"kill" + 0.021*"china" + 0.020*"crash" + 0.019*"dead" + 0.017*"die" + 0.017*"year" + 0.014*"leav" + 0.013*"royal"

Topic 1: 0.026*"polic" + 0.025*"charg" + 0.023*"court" + 0.021*"murder" + 0.017*"donald" + 0.015*"face" + 0.015*"alleg" + 0.014*"death" + 0.014*"jail" + 0.014*"drug"

Topic 2: 0.044*"trump" + 0.043*"sydney" + 0.022*"nation" + 0.014*"final" + 0.013*"lose" + 0.012*"game" + 0.011*"open" + 0.010*"beat" + 0.010*"scott" + 0.010*"morrison"

Topic 3: 0.027*"polic" + 0.026*"victoria" + 0.022*"tasmania" + 0.020*"adelaid" + 0.020*"school" + 0.014*"tasmanian" + 0.014*"countri" + 0.014*"speak" + 0.012*"children" + 0.012*"break"

Topic 4: 0.022*"stori" + 0.013*"rural" + 0.013*"health" + 0.012*"busi" + 0.012*"indigen" + 0.012*"price" + 0.012*"power" + 0.011*"help" + 0.010*"drum" + 0.010*"servic"

Topic 5: 0.021*"market" + 0.020*"feder" + 0.018*"water" + 0.018*"miss" + 0.014*"street" + 0.013*"farm" + 0.013*"gippsland" + 0.011*"search" + 

## Using TF-IDF

In [37]:
# Train a 10-topic LDA model on the TF-IDF weighted corpus
# (2 passes over the corpus, 4 worker processes).
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus=corpus_tfidf,
    id2word=dictionary,
    num_topics=10,
    passes=2,
    workers=4,
)
# Viewing: -1 asks for every topic rather than a subset.
for topic_id, topic_terms in lda_model_tfidf.print_topics(num_topics=-1):
    print('Topic {}: {}\n'.format(topic_id, topic_terms))

Topic 0: 0.011*"elect" + 0.011*"govern" + 0.007*"labor" + 0.007*"chang" + 0.007*"feder" + 0.007*"say" + 0.007*"fund" + 0.006*"climat" + 0.006*"budget" + 0.006*"liber"

Topic 1: 0.018*"countri" + 0.014*"hour" + 0.011*"royal" + 0.010*"bushfir" + 0.009*"commiss" + 0.008*"street" + 0.007*"juli" + 0.007*"health" + 0.007*"mental" + 0.007*"march"

Topic 2: 0.009*"queensland" + 0.009*"weather" + 0.008*"turnbul" + 0.008*"storm" + 0.007*"tasmania" + 0.006*"andrew" + 0.006*"shark" + 0.006*"northern" + 0.006*"violenc" + 0.005*"domest"

Topic 3: 0.018*"charg" + 0.017*"murder" + 0.014*"polic" + 0.013*"donald" + 0.012*"court" + 0.011*"alleg" + 0.010*"jail" + 0.009*"sentenc" + 0.009*"assault" + 0.009*"woman"

Topic 4: 0.016*"interview" + 0.008*"world" + 0.008*"john" + 0.006*"rugbi" + 0.006*"dairi" + 0.006*"august" + 0.006*"novemb" + 0.006*"extend" + 0.005*"australia" + 0.005*"america"

Topic 5: 0.022*"news" + 0.020*"market" + 0.019*"rural" + 0.010*"price" + 0.008*"share" + 0.008*"busi" + 0.008*"michae