In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy as sp
import sklearn
import sys

from nltk.corpus import stopwords
import nltk
from gensim.models import ldamodel
import gensim.corpora

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
import pickle

# Input data files are available in the "../input/" directory.
# List that directory so the exact dataset filename is visible before it is loaded.

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.



['abcnews-date-text.csv']


In [2]:
# Load data and skip badly formatted lines.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0 in favour of
# `on_bad_lines`. Try the current keyword first and fall back for older installs.
# "warn" matches the old behaviour (skip the line, but report it).
try:
    data = pd.read_csv("../input/abcnews-date-text.csv", on_bad_lines="warn")
except TypeError:  # older pandas: read_csv() does not accept on_bad_lines
    data = pd.read_csv("../input/abcnews-date-text.csv", error_bad_lines=False)

# Keep only the headline column (as a one-column DataFrame) and force every value to str.
data_text = data[['headline_text']].astype(str)

In [3]:
# Work on the first MAX_HEADLINES rows only, to keep the later cells tractable.
# .copy() detaches the subset from the full frame so that assigning to its
# 'headline_text' column later does not raise SettingWithCopyWarning.
MAX_HEADLINES = 100000
data_text = data_text.iloc[:MAX_HEADLINES].copy()

In [4]:
# Remove stopwords.
# Build the stopword collection ONCE and keep it in a set: the previous version
# called stopwords.words() for every single token, which rebuilt the whole word
# list each time and made this cell extremely slow. Set membership is O(1).
# NOTE(review): stopwords.words() is called without a language, exactly as before,
# so the list is not restricted to English -- confirm this is intended.
stop_words = set(stopwords.words())

# Split each headline on single spaces and keep only the non-stopword tokens,
# so every cell of 'headline_text' becomes a list of strings.
data_text['headline_text'] = data_text['headline_text'].apply(
    lambda headline: [word for word in headline.split(' ') if word not in stop_words])

In [7]:
# Cache the tokenised headlines on disk because stopword removal takes a long time.
# The context manager guarantees the file is flushed and closed once the dump finishes.
with open('data_text.dat', 'wb') as cache_file:
    pickle.dump(data_text, cache_file)

In [14]:
# Collect the first (and only) column of the frame as a plain list of token lists.
train_headlines = data_text.iloc[:, 0].tolist()

In [18]:
# Peek at the first two tokenised headlines to sanity-check the preprocessing.
train_headlines[:2]

[['aba', 'decides', 'community', 'broadcasting', 'licence'],
 ['act', 'fire', 'witnesses', 'must', 'aware', 'defamation']]

# Implement LDA

In [19]:
num_topics = 10

# Map every unique token to an integer id, then encode each headline as a bag of words.
id2word = gensim.corpora.Dictionary(train_headlines)
corpus = [id2word.doc2bow(tokens) for tokens in train_headlines]

# LDA training is stochastic. Fixing the seed makes the discovered topics
# reproducible across kernel restarts.
lda = ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics,
                        random_state=42)

In [28]:
# Inspect the 10 highest-weighted (word, probability) pairs of the first topic.
lda.show_topic(0, topn=10)

[('new', 0.042359415),
 ('court', 0.021600535),
 ('labor', 0.012691017),
 ('rail', 0.011862572),
 ('charges', 0.011331539),
 ('centre', 0.010978327),
 ('election', 0.010083659),
 ('faces', 0.0096198525),
 ('trade', 0.009272066),
 ('backs', 0.009002132)]

In [29]:
# Generate LDA topics
def get_lda_topics(model, num_topics, n_top_words=20):
    """Tabulate the top words of each LDA topic.

    Parameters
    ----------
    model : object
        Topic model exposing ``show_topic(topic_id, topn=...)`` that returns a
        sequence of ``(word, weight)`` pairs (e.g. a gensim ``LdaModel``).
    num_topics : int
        Number of topics to include; topics ``0 .. num_topics - 1`` are queried.
    n_top_words : int, optional
        How many words to list per topic. Defaults to 20, the value that was
        previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One column per topic, labelled ``'Topic # 01'``, ``'Topic # 02'``, ...,
        with the words in the order ``show_topic`` returned them.
    """
    word_dict = {}
    for topic_idx in range(num_topics):
        top_terms = model.show_topic(topic_idx, topn=n_top_words)
        # Keep only the word from each (word, weight) pair. The column label is 1-based.
        word_dict['Topic # ' + '{:02d}'.format(topic_idx + 1)] = [pair[0] for pair in top_terms]
    return pd.DataFrame(word_dict)

In [30]:
# Display the top words of every LDA topic side by side (one column per topic).
get_lda_topics(lda, num_topics)

Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05,Topic # 06,Topic # 07,Topic # 08,Topic # 09,Topic # 10
0,new,pay,us,open,govt,win,work,fire,water,police
1,court,lead,iraq,concerns,boost,sydney,three,police,probe,killed
2,labor,australia,new,us,urged,hopes,indigenous,mp,road,two
3,rail,england,abuse,fears,budget,workers,test,health,south,crash
4,charges,record,iraqi,decision,funds,west,appeal,group,continues,murder
5,centre,strike,sex,union,deal,pakistan,school,calls,meeting,charged
6,election,rise,child,minister,call,hit,accident,set,begins,attack
7,faces,world,former,future,vic,local,power,troops,safety,car
8,trade,ahead,report,warns,council,play,review,changes,services,woman
9,backs,wins,mayor,opposition,funding,coast,hits,search,consider,found


# Implement NMF

In [31]:
# Glue each token list back into one space-separated string per headline.
train_headlines_sentences = list(map(' '.join, train_headlines))

In [33]:
# CountVectorizer returns a matrix of shape (documents x features) in which each
# cell holds the number of times that feature (word) appears in that document.
# The vocabulary is capped at 5000 features via max_features.
vectorizer = CountVectorizer(max_features=5000, analyzer='word')
x_counts = vectorizer.fit_transform(train_headlines_sentences)

In [37]:
# Re-weight the raw counts with TF-IDF (IDF smoothing disabled), then rescale
# every row (axis=1) with the L1 norm.
transformer = TfidfTransformer(smooth_idf=False)
x_tfidf = transformer.fit_transform(x_counts)
xtfidf_norm = normalize(x_tfidf, axis=1, norm='l1')

In [38]:
# Factorise the normalised TF-IDF matrix into `num_topics` components,
# using the 'nndsvd' initialisation.
model = NMF(init='nndsvd', n_components=num_topics)
model.fit(xtfidf_norm)

NMF(alpha=0.0, beta_loss='frobenius', init='nndsvd', l1_ratio=0.0,
  max_iter=200, n_components=10, random_state=None, shuffle=False,
  solver='cd', tol=0.0001, verbose=0)

In [40]:
def get_nmf_topics(model, n_top_words, feature_names=None):
    """Tabulate the highest-weighted words of each NMF topic.

    Parameters
    ----------
    model : object
        Fitted decomposition exposing ``components_``, an array with one row of
        per-feature weights for each topic (e.g. a fitted sklearn ``NMF``).
    n_top_words : int
        Number of words to list per topic. This argument was previously ignored
        and 20 was always used.
    feature_names : sequence of str, optional
        Vocabulary indexed by feature id. When omitted, it is read from the
        notebook-level ``vectorizer`` that produced the training matrix.

    Returns
    -------
    pandas.DataFrame
        One column per topic, labelled ``'Topic # 01'``, ``'Topic # 02'``, ...,
        with words ordered from highest to lowest weight.
    """
    if feature_names is None:
        # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
        # Prefer its replacement when it exists, but keep working on older releases.
        if hasattr(vectorizer, 'get_feature_names_out'):
            feature_names = vectorizer.get_feature_names_out()
        else:
            feature_names = vectorizer.get_feature_names()

    word_dict = {}
    # Iterate over the model's own components rather than a global topic count,
    # so the table always matches the model that was passed in.
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending: reverse it and keep the first n_top_words ids.
        top_ids = weights.argsort()[::-1][:n_top_words]
        word_dict['Topic # ' + '{:02d}'.format(topic_idx + 1)] = [feature_names[key] for key in top_ids]
    return pd.DataFrame(word_dict)

In [41]:
# Display the top words of every NMF topic side by side (one column per topic).
get_nmf_topics(model, 20)

Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05,Topic # 06,Topic # 07,Topic # 08,Topic # 09,Topic # 10
0,police,us,govt,new,council,court,crash,iraq,plan,says
1,probe,iraqi,urged,zealand,rejects,murder,car,troops,water,cup
2,missing,baghdad,nsw,laws,considers,charges,killed,bush,call,world
3,search,troops,vic,president,land,charged,hospital,howard,back,win
4,investigate,soldier,wa,chief,backs,faces,two,killed,restrictions,england
5,death,open,qld,set,claims,charge,accident,pm,group,back
6,shooting,forces,funding,york,seeks,front,woman,blair,support,australia
7,hunt,attack,boost,hospital,merger,drug,injured,report,fire,report
8,murder,soldiers,accused,centre,approves,case,road,soldiers,boost,wins
9,seek,korea,fed,deal,consider,hears,highway,resolution,residents,top
