## Imports

In [1]:
!pip install -U gensim

import pandas as pd
import numpy as np
import scipy as sp
import sklearn
import sys
from nltk.corpus import stopwords
import nltk
from gensim.models import ldamodel
import gensim.corpora
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
import pickle
nltk.download('stopwords')

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/2b/e0/fa6326251692056dc880a64eb22117e03269906ba55a6864864d24ec8b4e/gensim-3.8.3-cp36-cp36m-manylinux1_x86_64.whl (24.2MB)
[K     |████████████████████████████████| 24.2MB 1.3MB/s 
Installing collected packages: gensim
  Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-3.8.3


# Data Loading and Pre-processing

In [3]:
# Load the raw headlines, dropping malformed rows (with a warning) instead of
# aborting the read.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0 in
# favour of `on_bad_lines`, so try the current keyword first and fall back to
# the legacy one only when the installed pandas does not know it.
try:
    data = pd.read_csv('abcnews-date-text.csv', on_bad_lines='warn')
except TypeError:
    # Older pandas rejects the unknown `on_bad_lines` keyword with a TypeError.
    data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False)
# We only need the Headlines text column from the data
data_text = data[['headline_text']]

In [4]:
data_text = data_text.astype('str')

# Build the stop-word lookup ONCE. The previous version evaluated
# stopwords.words() for every single word of every headline and then did a
# linear scan of the resulting list; a set created up front gives exactly the
# same membership answers at a tiny fraction of the cost.
# NOTE(review): stopwords.words() is called without a language, which appears
# to return the stop words of every language in the NLTK corpus. Kept as-is to
# preserve existing results -- confirm whether stopwords.words('english') was
# intended.
stop_words = set(stopwords.words())

total = len(data_text)
tokenized_headlines = []
for idx, headline in enumerate(data_text['headline_text']):

    # split each headline on single spaces and keep only the non-stop-words.
    tokenized_headlines.append([word for word in headline.split(' ') if word not in stop_words])

    # print logs to monitor output
    if idx % 1000 == 0:
        sys.stdout.write('\rc = ' + str(idx) + ' / ' + str(total))

# Replace the column in a single assignment. Writing through
# data_text.iloc[idx]['headline_text'] is chained indexing: it assigns into a
# temporary row object and is not guaranteed to reach the DataFrame.
data_text = data_text.assign(
    headline_text=pd.Series(tokenized_headlines, index=data_text.index, dtype=object)
)

# save data because it takes very long to remove stop words
# (the context manager makes sure the file is flushed and closed)
with open('data_text.dat', 'wb') as cache_file:
    pickle.dump(data_text, cache_file)
# get the words as a list of token lists for lda input
train_headlines = data_text['headline_text'].tolist()

c = 1186000 / 1186018

# Implementing LDA

In [5]:
num_topics = 10  # number of topics to extract; used by both the LDA and the NMF model in this notebook

In [6]:
# Map every distinct token to an integer id, then encode each headline as a
# bag-of-words built from that dictionary (the corpus format passed to LdaModel).
id2word = gensim.corpora.Dictionary(train_headlines)
corpus = [id2word.doc2bow(text) for text in train_headlines]
# LDA training is randomised; pinning random_state makes the learned topics
# reproducible across kernel restarts (Restart & Run All gives the same table).
lda = ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics, random_state=42)

In [7]:
def get_lda_topics(model, num_topics, n_top_words=20):
    """Tabulate the highest-ranked words of each topic of an LDA model.

    Parameters
    ----------
    model
        Object exposing ``show_topic(topic_id, topn=...)`` that returns a
        sequence whose items have the word at position 0.
    num_topics : int
        Number of topics to list; topics ``0 .. num_topics - 1`` are queried.
    n_top_words : int, default 20
        How many words to request per topic. Previously hard-coded to 20; the
        default keeps existing two-argument calls unchanged.

    Returns
    -------
    pandas.DataFrame
        One column per topic, labelled ``'Topic # 01'``, ``'Topic # 02'``, ...,
        holding that topic's words in the order ``show_topic`` returned them.
    """
    word_dict = {}
    for topic_idx in range(num_topics):
        top_words = model.show_topic(topic_idx, topn=n_top_words)
        # Keep only the word from each entry; the rest of the entry is not needed here.
        word_dict['Topic # ' + '{:02d}'.format(topic_idx + 1)] = [entry[0] for entry in top_words]
    return pd.DataFrame(word_dict)

In [8]:
# Display the top words of every LDA topic as a table (one column per topic).
get_lda_topics(lda, num_topics)

Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05,Topic # 06,Topic # 07,Topic # 08,Topic # 09,Topic # 10
0,news,years,election,police,health,trump,year,government,australia,fire
1,bushfire,woman,donald,crash,says,sydney,first,day,wa,abc
2,us,people,victoria,new,guilty,perth,tasmania,australian,world,queensland
3,victorian,found,stories,car,life,change,top,adelaide,2019,two
4,federal,family,charged,accused,morrison,climate,women,drum,china,back
5,emergency,canberra,royal,darwin,tasmanian,drought,hit,open,melbourne,attack
6,live,beach,shooting,dead,mental,chinese,test,ban,cup,school
7,nt,final,labor,injured,former,case,win,state,north,residents
8,death,hong,scott,help,minister,farmers,record,hobart,south,sex
9,markets,eve,commission,michael,speaks,australias,island,market,afl,brisbane


# Implementing NMF

In [9]:
# CountVectorizer consumes plain strings rather than token lists, so glue each
# headline's tokens back together with single spaces.
train_headlines_sentences = list(map(' '.join, train_headlines))

In [10]:
# Word-level bag-of-words counts, with the vocabulary capped at 5000 features.
vectorizer = CountVectorizer(
    analyzer='word',
    max_features=5000,
)
x_counts = vectorizer.fit_transform(train_headlines_sentences)

In [11]:
# Re-weight the raw term counts with TF-IDF (IDF smoothing explicitly disabled).
transformer = TfidfTransformer(
    smooth_idf=False,
)
x_tfidf = transformer.fit_transform(x_counts)

In [12]:
# Rescale every row (axis=1) with the L1 norm, i.e. so that the absolute values
# in each row sum to 1. This is not unit Euclidean length.
# NOTE(review): TfidfTransformer may already normalise rows by default, in which
# case rows are normalised twice here -- confirm this is intended.
xtfidf_norm = normalize(x_tfidf, norm='l1', axis=1)

In [13]:
# Set up an NMF factorisation with one component per topic, initialised with 'nndsvd'.
model = NMF(
    n_components=num_topics,
    init='nndsvd',
)
# Fit on the normalised TF-IDF matrix; as the cell's last expression, the
# fitted estimator is also what the cell displays.
model.fit(xtfidf_norm)

NMF(alpha=0.0, beta_loss='frobenius', init='nndsvd', l1_ratio=0.0, max_iter=200,
    n_components=10, random_state=None, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

# Generating NMF topics

In [14]:
def get_nmf_topics(model, n_top_words):
    """Tabulate the highest-weighted words of each topic of a fitted NMF model.

    Parameters
    ----------
    model
        Fitted model exposing ``components_``, a 2-D array with one row per
        topic and one column per vocabulary entry.
    n_top_words : int
        Number of words to list per topic. This argument used to be ignored
        in favour of a hard-coded 20; it is now honoured.

    Returns
    -------
    pandas.DataFrame
        One column per topic, labelled ``'Topic # 01'``, ``'Topic # 02'``, ...,
        with words ordered from largest to smallest component weight.

    Notes
    -----
    Reads the module-level ``vectorizer`` to map column indices back to words.
    """
    # the word ids obtained need to be reverse-mapped to the words so we can print the topic names.
    # Prefer get_feature_names_out() where it exists (get_feature_names() was
    # removed from newer scikit-learn) and fall back to the legacy method otherwise.
    get_names = getattr(vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    feat_names = get_names()

    word_dict = {}
    # Take the topic count from the model itself rather than from a global, so
    # the function stays correct for any model passed in.
    for topic_idx in range(len(model.components_)):

        # argsort is ascending; the reversed slice walks back from the end and
        # yields the indices of the n_top_words largest weights, biggest first.
        words_ids = model.components_[topic_idx].argsort()[:-n_top_words - 1:-1]
        words = [feat_names[key] for key in words_ids]
        word_dict['Topic # ' + '{:02d}'.format(topic_idx + 1)] = words

    return pd.DataFrame(word_dict)

In [15]:
# Display the top words of every NMF topic as a table (one column per topic).
get_nmf_topics(model, 20)

Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05,Topic # 06,Topic # 07,Topic # 08,Topic # 09,Topic # 10
0,interview,police,new,abc,charged,rural,fire,says,court,crash
1,extended,missing,zealand,weather,murder,news,house,council,accused,car
2,michael,search,laws,sport,death,national,crews,australia,faces,killed
3,david,probe,year,entertainment,woman,nsw,destroys,water,murder,fatal
4,james,investigate,hospital,business,stabbing,qld,threat,us,charges,woman
5,john,death,york,news,assault,reporter,home,govt,front,road
6,nrl,hunt,home,market,trial,nrn,school,plan,told,two
7,matt,shooting,deal,analysis,sydney,closer,suspicious,report,case,dead
8,ivan,officer,centre,speaks,attack,health,factory,back,high,driver
9,andrew,arrest,president,talks,two,drought,blaze,australian,hears,hospital
