In [18]:
import pandas as pd
import gensim
import nltk
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
from nltk.stem import WordNetLemmatizer

In [2]:
# Download the WordNet corpus through NLTK; the lemmatizer imported above
# (WordNetLemmatizer) looks words up in it. The call returns a status flag,
# which is what the cell displays.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/vc/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Read the headline dataset; the CSV is expected in the working directory.
input_file = pd.read_csv(filepath_or_buffer='abcnews-date-text.csv')

# Show the first five rows as a quick look at the columns.
input_file.head(n=5)

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [4]:
# Keep only the headline column. `documents` and `data_text` are two names
# for the very same single-column DataFrame (no copy is made between them).
documents = data_text = input_file[['headline_text']]

In [5]:
# Shared lemmatizer instance, created on first use by lemmatize_token().
_LEMMATIZER = None


def lemmatize_token(token):
    """Return the WordNet lemma of a single token.

    The lemmatizer is created lazily once and then reused, instead of being
    rebuilt for every token: this function is called for each surviving word
    of every headline, so per-call construction is pure overhead.

    Parameters
    ----------
    token : str
        A single, already tokenised word.

    Returns
    -------
    str
        Whatever ``WordNetLemmatizer.lemmatize`` returns for ``token`` with
        its default part-of-speech setting.
    """
    global _LEMMATIZER
    if _LEMMATIZER is None:
        _LEMMATIZER = WordNetLemmatizer()
    return _LEMMATIZER.lemmatize(token)

In [6]:
def preprocessing(text):
    """Turn a raw headline into a list of lemmatised, non-stop-word tokens.

    The text is tokenised with ``simple_preprocess`` (tokens shorter than
    three characters are dropped), tokens found in ``STOPWORDS`` are removed,
    and every remaining token is passed through ``lemmatize_token``.

    Parameters
    ----------
    text : str
        The raw document text.

    Returns
    -------
    list
        The lemmatised tokens, in their original order.
    """
    return [
        lemmatize_token(word)
        for word in simple_preprocess(text, min_len=3)
        if word not in STOPWORDS
    ]
            

In [7]:
# Sanity check: print the first five headlines next to their token lists.
sample_doc = documents.head(5)
for headline in sample_doc['headline_text']:
    print(headline)
    print(preprocessing(headline))

aba decides against community broadcasting licence
['aba', 'decides', 'community', 'broadcasting', 'licence']
act fire witnesses must be aware of defamation
['act', 'witness', 'aware', 'defamation']
a g calls for infrastructure protection summit
['call', 'infrastructure', 'protection', 'summit']
air nz staff in aust strike for pay rise
['air', 'staff', 'aust', 'strike', 'pay', 'rise']
air nz strike to affect australian travellers
['air', 'strike', 'affect', 'australian', 'traveller']


In [8]:
# Run the preprocessing over every headline; each element of the resulting
# Series is the token list for one headline.
processed_docs = documents['headline_text'].map(preprocessing)

# Preview the first five token lists.
processed_docs.head(5)

0    [aba, decides, community, broadcasting, licence]
1                   [act, witness, aware, defamation]
2          [call, infrastructure, protection, summit]
3               [air, staff, aust, strike, pay, rise]
4        [air, strike, affect, australian, traveller]
Name: headline_text, dtype: object

In [9]:
# Build the token <-> integer id vocabulary from all tokenised headlines.
dictionary = corpora.Dictionary(documents=processed_docs)

In [10]:
# Disabled debugging aid: when uncommented, prints every key and value
# held by `dictionary`. Left off because the output is very long.
#for k,v in dictionary.iteritems():
#    print(k)
#    print(v)

In [11]:
# Prune the vocabulary in place (per the gensim documentation):
#   no_below - drop tokens that occur in fewer than 15 documents
#   no_above - drop tokens that occur in more than 50% of all documents
#   keep_n   - afterwards keep at most the 100000 most frequent tokens
dictionary.filter_extremes(
    no_below=15,
    no_above=0.5,
    keep_n=100000,
)

In [12]:
# Convert every token list into its bag-of-words form: a list of
# (token_id, count) pairs, as the displayed first document shows.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

# Inspect the bag-of-words vector of the first headline.
bow_corpus[0]

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]

In [15]:
# Spot check: token list of the headline at index 78.
processed_docs[78]

['irish', 'man', 'arrested', 'omagh', 'bombing', 'bombing', 'man']

In [14]:
# Spot check: token list of the headline at index 80.
processed_docs[80]

['israeli', 'force', 'push', 'gaza', 'strip']

In [16]:
# Bag-of-words vector of document 78; a count of 2 marks a token that
# appears twice in that headline.
bow_corpus[78]

[(295, 2), (306, 1), (307, 2), (308, 1), (309, 1)]

In [17]:
# Bag-of-words vector of document 80, as (token_id, count) pairs.
bow_corpus[80]

[(314, 1), (315, 1), (316, 1), (317, 1), (318, 1)]

In [23]:
# Fit a TF-IDF weighting model on the bag-of-words corpus ...
tfidf = models.TfidfModel(corpus=bow_corpus)

# ... and wrap the corpus so that indexing/iterating it yields
# TF-IDF-weighted vectors instead of raw counts.
corpus_tfidf = tfidf[bow_corpus]

In [25]:
# TF-IDF weights of the first document, as (token_id, weight) pairs.
corpus_tfidf[0]

[(0, 0.5296211440960563),
 (1, 0.5296211440960563),
 (2, 0.2805238101225404),
 (3, 0.47735329175810276),
 (4, 0.36392734749508066)]

In [28]:
# Train a 10-topic LDA model on the raw bag-of-words counts.
# LDA training is stochastic; without a fixed `random_state` every
# Restart & Run All produces different topics, so the seed is pinned here.
# NOTE(review): with multiple workers, chunk scheduling may still cause small
# run-to-run differences - confirm against the gensim documentation.
lda_model_bow = models.LdaMulticore(
    bow_corpus,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=2,
    random_state=42,
)

In [29]:
# List every topic of the bag-of-words model with its top weighted words.
for topic_id, topic_terms in lda_model_bow.print_topics(-1):
    summary = 'Topic: {} \nWords: {}'.format(topic_id, topic_terms)
    print(summary)

Topic: 0 
Words: 0.033*"government" + 0.021*"coast" + 0.013*"price" + 0.013*"gold" + 0.013*"say" + 0.013*"league" + 0.012*"rise" + 0.012*"live" + 0.011*"rate" + 0.010*"nrl"
Topic: 1 
Words: 0.017*"market" + 0.015*"australian" + 0.012*"state" + 0.011*"share" + 0.011*"victoria" + 0.010*"business" + 0.010*"war" + 0.010*"news" + 0.009*"bank" + 0.009*"show"
Topic: 2 
Words: 0.018*"child" + 0.016*"sex" + 0.016*"family" + 0.014*"life" + 0.013*"tasmanian" + 0.013*"new" + 0.011*"missing" + 0.011*"year" + 0.010*"royal" + 0.009*"victim"
Topic: 3 
Words: 0.031*"australia" + 0.025*"election" + 0.025*"world" + 0.023*"south" + 0.021*"sydney" + 0.012*"record" + 0.009*"say" + 0.009*"cup" + 0.009*"program" + 0.008*"refugee"
Topic: 4 
Words: 0.024*"win" + 0.022*"queensland" + 0.022*"day" + 0.019*"adelaide" + 0.018*"north" + 0.015*"country" + 0.013*"australia" + 0.013*"final" + 0.012*"west" + 0.011*"test"
Topic: 5 
Words: 0.017*"national" + 0.016*"hour" + 0.011*"power" + 0.010*"violence" + 0.010*"talk" + 

In [31]:
# Train a second 10-topic LDA model, this time on the TF-IDF-weighted corpus.
# LDA training is stochastic; without a fixed `random_state` every
# Restart & Run All produces different topics, so the seed is pinned here.
# NOTE(review): with multiple workers, chunk scheduling may still cause small
# run-to-run differences - confirm against the gensim documentation.
lda_model_tfidf = models.LdaMulticore(
    corpus_tfidf,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=2,
    random_state=42,
)

In [32]:
# List every topic of the TF-IDF model with its top weighted words.
for topic_id, topic_terms in lda_model_tfidf.print_topics(-1):
    summary = 'Topic: {} \nWords: {}'.format(topic_id, topic_terms)
    print(summary)

Topic: 0 
Words: 0.009*"league" + 0.009*"afl" + 0.008*"world" + 0.008*"final" + 0.008*"win" + 0.008*"cup" + 0.007*"australia" + 0.007*"john" + 0.007*"malcolm" + 0.006*"rugby"
Topic: 1 
Words: 0.016*"market" + 0.013*"turnbull" + 0.009*"share" + 0.009*"podcast" + 0.007*"australian" + 0.007*"grandstand" + 0.007*"dollar" + 0.006*"october" + 0.006*"syria" + 0.006*"wednesday"
Topic: 2 
Words: 0.014*"donald" + 0.006*"peter" + 0.005*"islamic" + 0.005*"monday" + 0.005*"thursday" + 0.005*"kill" + 0.005*"tony" + 0.005*"australia" + 0.005*"australian" + 0.005*"island"
Topic: 3 
Words: 0.010*"christmas" + 0.008*"rio" + 0.007*"wall" + 0.007*"street" + 0.006*"truck" + 0.006*"november" + 0.006*"andrew" + 0.004*"white" + 0.004*"origin" + 0.004*"disability"
Topic: 4 
Words: 0.026*"trump" + 0.013*"queensland" + 0.012*"north" + 0.008*"west" + 0.007*"south" + 0.006*"ash" + 0.006*"korea" + 0.006*"australia" + 0.006*"weather" + 0.006*"coast"
Topic: 5 
Words: 0.017*"country" + 0.016*"hour" + 0.014*"rural" + 0