In [3]:
import pandas as pd
import numpy as np

In [4]:
# Read the first articles CSV into a DataFrame and show it as the cell output.
articles_csv = 'data/articles1.csv'
df_1 = pd.read_csv(articles_csv)
df_1

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."
...,...,...,...,...,...,...,...,...,...,...
49995,53287,73465,"Rex Tillerson Says Climate Change Is Real, but …",Atlantic,Robinson Meyer,2017-01-11,2017.0,1.0,,"As chairman and CEO of ExxonMobil, Rex Tillers..."
49996,53288,73466,The Biggest Intelligence Questions Raised by t...,Atlantic,Amy Zegart,2017-01-11,2017.0,1.0,,I’ve spent nearly 20 years looking at intellig...
49997,53289,73467,Trump Announces Plan That Does Little to Resol...,Atlantic,Jeremy Venook,2017-01-11,2017.0,1.0,,Donald Trump will not be taking necessary st...
49998,53290,73468,Dozens of For-Profit Colleges Could Soon Close,Atlantic,Emily DeRuy,2017-01-11,2017.0,1.0,,Dozens of colleges could be forced to close ...


### LOAD DICTIONARY AND ALL LISTS

In [5]:
import pickle


def load_pickled(path):
    """Return the Python object stored in the pickle file at ``path``.

    SECURITY: ``pickle.load`` can execute arbitrary code while deserialising.
    Only use this on files produced by this project, never on untrusted input.
    """
    with open(path, "rb") as fp:
        return pickle.load(fp)


# NOTE: despite the ".txt" extension these files are binary pickles
# (they are opened in "rb" mode and read with pickle.load).
bow_corpus = load_pickled("lists/bow_corpus.txt")
norm_corpus_bigrams = load_pickled("lists/norm_corpus_bigrams.txt")
norm_papers = load_pickled("lists/norm_papers.txt")
pre_papers = load_pickled("lists/pre_papers.txt")
pre_titles = load_pickled("lists/pre_titles.txt")

In [6]:
import nltk
import gensim

# Restore the previously saved gensim Dictionary from disk.
dictionary_path = 'models/dictionary.gensim'
dictionary = gensim.corpora.Dictionary.load(dictionary_path)


### LOADING MALLET

The MALLET framework is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text. MALLET stands for MAchine Learning for LanguagE Toolkit. It was developed by Andrew McCallum along with several people at the University of Massachusetts Amherst. The MALLET topic modeling toolkit contains efficient, sampling-based implementations of Latent Dirichlet Allocation, Pachinko Allocation, and Hierarchical LDA. To use MALLET’s capabilities, we need to download the framework.

In [7]:
### loading MALLET

# One-time setup: uncomment the two shell commands below to download and
# unpack the MALLET 2.0.8 distribution next to this notebook.
#!wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip

#!unzip -q mallet-2.0.8.zip

# Relative path to the MALLET launcher inside the unpacked distribution;
# it is handed to gensim's LdaMallet wrapper as `mallet_path`.
MALLET_PATH = 'mallet-2.0.8/bin/mallet'

### RUNNING MALLET

In [8]:
%%time

TOTAL_TOPICS = 10

MALLET_PATH = 'mallet-2.0.8/bin/mallet'
lda_mallet = gensim.models.wrappers.LdaMallet(mallet_path=MALLET_PATH, corpus=bow_corpus,
                                              num_topics=TOTAL_TOPICS, id2word=dictionary,
                                              iterations=500, workers=16)

CPU times: user 49.3 s, sys: 346 ms, total: 49.6 s
Wall time: 3min 53s


In [16]:
### save model

import os

# The file name carries the topic count so models trained with different
# TOTAL_TOPICS values do not overwrite one another.
model_path = 'models/mallet/model_'+str(TOTAL_TOPICS)+'.gensim'

# save() does not create missing directories, so make sure the target exists
# (no-op when it is already there).
os.makedirs(os.path.dirname(model_path), exist_ok=True)
lda_mallet.save(model_path)

### CHECKING TOPICS

In [9]:
# For every topic id, keep the 20 (term, weight) pairs returned by
# show_topic, with each weight rounded to three decimals.
topics = []
for topic_id in range(TOTAL_TOPICS):
    weighted_terms = lda_mallet.show_topic(topic_id, topn=20)
    topics.append([(term, round(wt, 3)) for term, wt in weighted_terms])

# Print the terms of each topic under a 1-based heading, followed by a blank line.
for number, weighted_terms in enumerate(topics, start=1):
    print('Topic #{}:'.format(number))
    print([term for term, _ in weighted_terms])
    print()

Topic #1:
['time', 'day', 'year', 'home', 'city', 'water', 'people', 'back', 'place', 'work', 'migrant', 'britain', 'building', 'part', 'life', 'food', 'london', 'new_york', 'hour', 'area']

Topic #2:
['people', 'woman', 'show', 'story', 'thing', 'time', 'black', 'book', 'twitter', 'medium', 'love', 'life', 'white', 'film', 'word', 'wrote', 'make', 'america', 'man', 'live']

Topic #3:
['mr', 'trump', 'president', 'obama', 'white_house', 'administration', 'meeting', 'time', 'american', 'leader', 'day', 'washington', 'campaign', 'political', 'donald_trump', 'russia', 'office', 'united_state', 'official', 'country']

Topic #4:
['american', 'people', 'year', 'percent', 'state', 'job', 'million', 'plan', 'country', 'bill', 'policy', 'government', 'program', 'money', 'number', 'worker', 'work', 'tax', 'united_state', 'economy']

Topic #5:
['police', 'family', 'officer', 'people', 'man', 'city', 'told', 'victim', 'death', 'video', 'reported', 'gun', 'day', 'authority', 'shooting', 'time', 'ho

In [12]:
# Term table: one column per topic, one row per term rank.
# Both dimensions are derived from `topics` itself (instead of a hard-coded 20
# and the model's num_topics), so the table stays valid if `topn` or
# TOTAL_TOPICS is changed in the cells that build `topics`.
terms_by_topic = [[term for term, wt in topic] for topic in topics]
terms_per_topic = max((len(terms) for terms in terms_by_topic), default=0)
topics_df = pd.DataFrame(terms_by_topic,
                         columns=['Term'+str(i) for i in range(1, terms_per_topic+1)],
                         index=['Topic '+str(t) for t in range(1, len(terms_by_topic)+1)]).T
topics_df

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10
Term1,time,people,mr,american,police,company,trump,country,child,case
Term2,day,woman,trump,people,family,china,clinton,attack,student,law
Term3,year,show,president,year,officer,million,republican,government,school,report
Term4,home,story,obama,percent,people,business,campaign,military,game,court
Term5,city,thing,white_house,state,man,year,donald_trump,muslim,year,official
Term6,water,time,administration,job,city,time,party,group,time,investigation
Term7,people,black,meeting,million,told,facebook,vote,united_state,team,state
Term8,back,book,time,plan,victim,chinese,democrat,people,university,information
Term9,place,twitter,american,country,death,car,hillary_clinton,official,woman,email
Term10,work,medium,leader,bill,video,market,candidate,war,life,statement


In [13]:
# Show the full comma-separated term list instead of truncating long cells.
# None means "no limit"; the former sentinel -1 is deprecated since pandas 1.0
# and rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

# One row per topic holding all of its terms joined into a single string.
# The index length follows `topics`, so it always matches the row count.
topics_df = pd.DataFrame([', '.join([term for term, wt in topic])
                              for topic in topics],
                         columns = ['Terms per Topic'],
                         index=['Topic'+str(t) for t in range(1, len(topics)+1)]
                         )
topics_df

Unnamed: 0,Terms per Topic
Topic1,"time, day, year, home, city, water, people, back, place, work, migrant, britain, building, part, life, food, london, new_york, hour, area"
Topic2,"people, woman, show, story, thing, time, black, book, twitter, medium, love, life, white, film, word, wrote, make, america, man, live"
Topic3,"mr, trump, president, obama, white_house, administration, meeting, time, american, leader, day, washington, campaign, political, donald_trump, russia, office, united_state, official, country"
Topic4,"american, people, year, percent, state, job, million, plan, country, bill, policy, government, program, money, number, worker, work, tax, united_state, economy"
Topic5,"police, family, officer, people, man, city, told, victim, death, video, reported, gun, day, authority, shooting, time, home, community, shot, protest"
Topic6,"company, china, million, business, year, time, facebook, chinese, car, market, technology, north_korea, product, make, employee, service, sale, system, user, apple"
Topic7,"trump, clinton, republican, campaign, donald_trump, party, vote, democrat, hillary_clinton, candidate, voter, election, state, support, cruz, conservative, people, supporter, poll, debate"
Topic8,"country, attack, government, military, muslim, group, united_state, people, official, war, isi, force, security, american, syria, leader, israel, iran, threat, international"
Topic9,"child, student, school, game, year, time, team, university, woman, life, day, dr, family, back, parent, study, player, college, sport, week"
Topic10,"case, law, report, court, official, investigation, state, information, email, statement, federal, decision, public, judge, order, claim, government, evidence, lawyer, agency"


### EVALUATING MODEL 

In [14]:
# Arguments common to both coherence evaluations of the MALLET model.
coherence_inputs = dict(model=lda_mallet,
                        corpus=bow_corpus,
                        texts=norm_corpus_bigrams,
                        dictionary=dictionary)

# C_v coherence.
cv_coherence_model_lda_mallet = gensim.models.CoherenceModel(coherence='c_v',
                                                             **coherence_inputs)
avg_coherence_cv = cv_coherence_model_lda_mallet.get_coherence()

# UMass coherence.
umass_coherence_model_lda_mallet = gensim.models.CoherenceModel(coherence='u_mass',
                                                                **coherence_inputs)
avg_coherence_umass = umass_coherence_model_lda_mallet.get_coherence()

In [15]:
# NOTE(review): perplexity is a hard-coded literal, not computed in this
# notebook -- presumably copied from the MALLET training log; confirm its source.
perplexity = -8.53533

# Print one "label value" line per evaluation metric.
metric_lines = (
    ('Avg. Coherence Score (Cv):', avg_coherence_cv),
    ('Avg. Coherence Score (UMass):', avg_coherence_umass),
    ('Model Perplexity:', perplexity),
)
for label, value in metric_lines:
    print(label, value)

Avg. Coherence Score (Cv): 0.4721805953392117
Avg. Coherence Score (UMass): -1.593064310926516
Model Perplexity: -8.53533
