In [1]:
import pandas as pd
import numpy as np

### Loading data, dictionary and lists

In [2]:
# Read the raw articles table from disk; the bare name on the last line
# makes the notebook render the frame as this cell's output.
df_1 = pd.read_csv(filepath_or_buffer='data/articles1.csv')
df_1

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."
...,...,...,...,...,...,...,...,...,...,...
49995,53287,73465,"Rex Tillerson Says Climate Change Is Real, but …",Atlantic,Robinson Meyer,2017-01-11,2017.0,1.0,,"As chairman and CEO of ExxonMobil, Rex Tillers..."
49996,53288,73466,The Biggest Intelligence Questions Raised by t...,Atlantic,Amy Zegart,2017-01-11,2017.0,1.0,,I’ve spent nearly 20 years looking at intellig...
49997,53289,73467,Trump Announces Plan That Does Little to Resol...,Atlantic,Jeremy Venook,2017-01-11,2017.0,1.0,,Donald Trump will not be taking necessary st...
49998,53290,73468,Dozens of For-Profit Colleges Could Soon Close,Atlantic,Emily DeRuy,2017-01-11,2017.0,1.0,,Dozens of colleges could be forced to close ...


In [3]:
import pickle


def _load_pickle(path):
    """Unpickle and return the object stored in the file at ``path``.

    NOTE(review): ``pickle.load`` can execute arbitrary code while
    deserializing, so only point this at files produced by this project.
    """
    with open(path, "rb") as fp:
        return pickle.load(fp)


# Despite the .txt extension, these files are opened in binary mode and
# unpickled; one call per artifact replaces five copy-pasted `with` blocks.
bow_corpus = _load_pickle("lists/bow_corpus.txt")
norm_corpus_bigrams = _load_pickle("lists/norm_corpus_bigrams.txt")
norm_papers = _load_pickle("lists/norm_papers.txt")
pre_papers = _load_pickle("lists/pre_papers.txt")
pre_titles = _load_pickle("lists/pre_titles.txt")

In [4]:
import nltk
import gensim

# Load the saved gensim Dictionary; it is handed to LdaModel as `id2word`
# and to CoherenceModel as `dictionary` further down.
dictionary_path = 'models/dictionary.gensim'
dictionary = gensim.corpora.Dictionary.load(dictionary_path)


### GENSIM LDA TEST - 10 TOPICS

In [5]:
%%time

# Number of topics for this run; also used to build the saved model's file name.
TOTAL_TOPICS = 10

# All training settings in one place so they can be read (and tuned) at a glance.
lda_params = dict(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=TOTAL_TOPICS,
    chunksize=1740,
    passes=20,
    iterations=500,
    alpha="auto",
    eta="auto",
    random_state=42,   # fixed seed so repeated runs use the same initialisation
    eval_every=None,
)
lda_model = gensim.models.LdaModel(**lda_params)


NameError: name 'topic_nums' is not defined

In [13]:
# Persist the trained LDA model; the topic count is embedded in the file name.
model_path = 'models/gensim/model_{}.gensim'.format(TOTAL_TOPICS)
lda_model.save(model_path)

In [6]:
# Print the 20 highest-weighted terms of every topic in the model.
# The topic count is read from the model itself (instead of a literal 10) so
# this cell stays correct when the model is trained with another topic count.
for topic_id, topic in lda_model.print_topics(num_topics=lda_model.num_topics,
                                              num_words=20):
    # Topic ids are 0-based in the model; shown 1-based here.
    print('Topic #'+str(topic_id+1)+':')
    print(topic)
    print()

Topic #1:
0.036*"trump" + 0.020*"president" + 0.012*"obama" + 0.009*"would" + 0.009*"white_house" + 0.009*"russia" + 0.007*"country" + 0.006*"administration" + 0.006*"american" + 0.006*"mr" + 0.006*"russian" + 0.005*"leader" + 0.005*"also" + 0.005*"united_state" + 0.005*"donald_trump" + 0.004*"official" + 0.004*"former" + 0.004*"meeting" + 0.004*"government" + 0.004*"deal"

Topic #2:
0.014*"police" + 0.009*"city" + 0.008*"family" + 0.007*"two" + 0.007*"officer" + 0.006*"people" + 0.006*"home" + 0.006*"day" + 0.005*"man" + 0.005*"child" + 0.005*"according" + 0.004*"told" + 0.004*"three" + 0.004*"shooting" + 0.004*"cnn" + 0.004*"time" + 0.004*"car" + 0.004*"state" + 0.003*"student" + 0.003*"death"

Topic #3:
0.047*"trump" + 0.021*"clinton" + 0.014*"campaign" + 0.013*"republican" + 0.012*"donald_trump" + 0.009*"say" + 0.009*"election" + 0.009*"candidate" + 0.008*"hillary_clinton" + 0.008*"voter" + 0.007*"party" + 0.007*"people" + 0.007*"vote" + 0.006*"state" + 0.006*"president" + 0.005*"c

In [7]:
# top_topics yields one (topic_terms, coherence) pair per topic;
# the average is taken over the coherence part only.
topics_coherences = lda_model.top_topics(bow_corpus, topn=20)
coherence_values = [coherence for _, coherence in topics_coherences]
avg_coherence_score = np.mean(coherence_values)
print('Avg. Coherence Score:', avg_coherence_score)

Avg. Coherence Score: -1.3628021129807837


In [8]:
# Keep only the (weight, term) lists, dropping each topic's coherence score.
topics_with_wts = [topic_terms for topic_terms, _ in topics_coherences]

print('LDA Topics with Weights')
print('='*50)
for topic_no, weighted_terms in enumerate(topics_with_wts, start=1):
    print('Topic #{}:'.format(topic_no))
    # Entries arrive as (weight, term); flip them and round the weight for display.
    print([(term, round(weight, 3)) for weight, term in weighted_terms])
    print()

LDA Topics with Weights
Topic #1:
[('people', 0.012), ('like', 0.012), ('get', 0.01), ('say', 0.01), ('would', 0.008), ('think', 0.008), ('thing', 0.008), ('time', 0.008), ('know', 0.008), ('going', 0.007), ('way', 0.007), ('want', 0.006), ('go', 0.006), ('could', 0.005), ('make', 0.005), ('even', 0.005), ('really', 0.005), ('see', 0.005), ('back', 0.005), ('much', 0.005)]

Topic #2:
[('trump', 0.047), ('clinton', 0.021), ('campaign', 0.014), ('republican', 0.013), ('donald_trump', 0.012), ('say', 0.009), ('election', 0.009), ('candidate', 0.009), ('hillary_clinton', 0.008), ('voter', 0.008), ('party', 0.007), ('people', 0.007), ('vote', 0.007), ('state', 0.006), ('president', 0.006), ('cruz', 0.005), ('democrat', 0.005), ('would', 0.005), ('poll', 0.005), ('think', 0.005)]

Topic #3:
[('trump', 0.036), ('president', 0.02), ('obama', 0.012), ('would', 0.009), ('white_house', 0.009), ('russia', 0.009), ('country', 0.007), ('administration', 0.006), ('american', 0.006), ('mr', 0.006), ('

In [9]:
# Same listing as the weighted one, but showing only the terms of each topic.
print('LDA Topics without Weights')
print('='*50)
for topic_no, weighted_terms in enumerate(topics_with_wts, start=1):
    print('Topic #{}:'.format(topic_no))
    print([term for _, term in weighted_terms])
    print()

LDA Topics without Weights
Topic #1:
['people', 'like', 'get', 'say', 'would', 'think', 'thing', 'time', 'know', 'going', 'way', 'want', 'go', 'could', 'make', 'even', 'really', 'see', 'back', 'much']

Topic #2:
['trump', 'clinton', 'campaign', 'republican', 'donald_trump', 'say', 'election', 'candidate', 'hillary_clinton', 'voter', 'party', 'people', 'vote', 'state', 'president', 'cruz', 'democrat', 'would', 'poll', 'think']

Topic #3:
['trump', 'president', 'obama', 'would', 'white_house', 'russia', 'country', 'administration', 'american', 'mr', 'russian', 'leader', 'also', 'united_state', 'donald_trump', 'official', 'former', 'meeting', 'government', 'deal']

Topic #4:
['attack', 'people', 'isi', 'country', 'government', 'group', 'muslim', 'syria', 'also', 'force', 'killed', 'military', 'city', 'war', 'many', 'official', 'year', 'security', 'turkey', 'two']

Topic #5:
['police', 'city', 'family', 'two', 'officer', 'people', 'home', 'day', 'man', 'child', 'according', 'told', 'three'

### EVALUATE MODEL

In [10]:
# Arguments shared by both coherence evaluations; only the measure differs.
coherence_args = dict(model=lda_model, corpus=bow_corpus,
                      texts=norm_corpus_bigrams, dictionary=dictionary)

cv_coherence_model_lda = gensim.models.CoherenceModel(coherence='c_v', **coherence_args)
avg_coherence_cv = cv_coherence_model_lda.get_coherence()

umass_coherence_model_lda = gensim.models.CoherenceModel(coherence='u_mass', **coherence_args)
avg_coherence_umass = umass_coherence_model_lda.get_coherence()

# NOTE(review): per the gensim docs, log_perplexity returns a per-word
# likelihood bound rather than the perplexity itself — confirm before
# comparing this number with perplexities reported elsewhere.
perplexity = lda_model.log_perplexity(bow_corpus)

for label, value in (('Avg. Coherence Score (Cv):', avg_coherence_cv),
                     ('Avg. Coherence Score (UMass):', avg_coherence_umass),
                     ('Model Perplexity:', perplexity)):
    print(label, value)

Avg. Coherence Score (Cv): 0.4611817097708893
Avg. Coherence Score (UMass): -1.3628021129807835
Model Perplexity: -8.525454085960877


### CHECKING TOPICS

In [11]:
# Tabulate the top terms: one column per topic, one row per term rank.
# Row/column labels are sized from the data itself rather than the literals
# 20 / num_topics, so the cell keeps working if `topn` is changed upstream.
# NOTE: topics_with_wts comes from lda_model.top_topics, so "Topic N" here
# follows that ordering, which differs from the topic ids shown by
# print_topics (compare the two printed listings above).
num_terms = len(topics_with_wts[0])
term_rows = [[term for wt, term in topic] for topic in topics_with_wts]
topics_df = pd.DataFrame(term_rows,
                         columns=['Term'+str(i) for i in range(1, num_terms+1)],
                         index=['Topic '+str(t) for t in range(1, len(topics_with_wts)+1)]).T
topics_df

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10
Term1,people,trump,trump,attack,police,would,investigation,woman,company,company
Term2,like,clinton,president,people,city,state,report,show,year,new
Term3,get,campaign,obama,isi,family,law,official,year,million,china
Term4,say,republican,would,country,two,republican,case,also,business,apple
Term5,would,donald_trump,white_house,government,officer,american,information,time,market,also
Term6,think,say,russia,group,people,bill,time,black,percent,car
Term7,thing,election,country,muslim,home,people,email,life,according,tesla
Term8,time,candidate,administration,syria,day,policy,according,new,money,could
Term9,know,hillary_clinton,american,also,man,president,fbi,first,price,google
Term10,going,voter,mr,force,child,house,also,like,number,would


In [12]:
# None means "never truncate cell text". The former value -1 is deprecated
# since pandas 1.0 and rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)

# One row per topic, with its top terms joined into a single string.
# NOTE: rows follow the order of topics_with_wts (from lda_model.top_topics),
# not the model's topic ids.
terms_per_topic = [', '.join([term for wt, term in topic])
                   for topic in topics_with_wts]
topics_df = pd.DataFrame(terms_per_topic,
                         columns=['Terms per Topic'],
                         index=['Topic'+str(t) for t in range(1, lda_model.num_topics+1)])
topics_df

Unnamed: 0,Terms per Topic
Topic1,"people, like, get, say, would, think, thing, time, know, going, way, want, go, could, make, even, really, see, back, much"
Topic2,"trump, clinton, campaign, republican, donald_trump, say, election, candidate, hillary_clinton, voter, party, people, vote, state, president, cruz, democrat, would, poll, think"
Topic3,"trump, president, obama, would, white_house, russia, country, administration, american, mr, russian, leader, also, united_state, donald_trump, official, former, meeting, government, deal"
Topic4,"attack, people, isi, country, government, group, muslim, syria, also, force, killed, military, city, war, many, official, year, security, turkey, two"
Topic5,"police, city, family, two, officer, people, home, day, man, child, according, told, three, shooting, cnn, time, car, state, student, death"
Topic6,"would, state, law, republican, american, bill, people, policy, president, house, right, issue, also, plan, could, senate, government, federal, gop, court"
Topic7,"investigation, report, official, case, information, time, email, according, fbi, also, statement, told, former, comey, evidence, reported, cnn, intelligence, would, charge"
Topic8,"woman, show, year, also, time, black, life, new, first, like, story, white, film, wrote, star, book, people, movie, made, family"
Topic9,"company, year, million, business, market, percent, according, money, price, number, billion, job, also, new, investor, report, cost, study, economy, since"
Topic10,"company, new, china, apple, also, car, tesla, could, google, would, technology, system, facebook, north_korea, ceo, product, use, time, chinese, service"


### INTERPRETING RESULTS

In [None]:
# Apply the trained LDA model to the bag-of-words corpus. The next cell
# iterates over the result and treats each element as one document's list of
# (topic_id, weight) pairs.
# NOTE(review): gensim is expected to evaluate this lazily, so the inference
# cost would be paid when the result is iterated — confirm against the docs.
tm_results = lda_model[bow_corpus]

In [None]:
# For every document keep only its highest-weighted (topic_id, weight) pair.
corpus_topics = []
for doc_topics in tm_results:
    ranked = sorted(doc_topics, key=lambda pair: pair[1], reverse=True)
    corpus_topics.append(ranked[0])
corpus_topics[:5]

In [None]:
# Build one row per document with its dominant topic.
#
# Fixes relative to the previous version of this cell:
# * The document index was sized with `len(papers) - 67`, but `papers` is never
#   defined in this notebook (NameError) and 67 was an unexplained constant.
#   The index is now sized from `corpus_topics`, i.e. one row per document
#   that was actually scored.
# * 'Topic Desc' used to index `topics_df` with the model's topic id, but
#   `topics_df` is ordered like lda_model.top_topics, not by topic id, so
#   descriptions were attached to the wrong topics. Descriptions are now taken
#   from the model by topic id.
TOP_TERMS = 20  # same number of terms as shown in the topic tables above
topic_desc_by_id = {
    topic_id: ', '.join(term for term, _ in lda_model.show_topic(topic_id, topn=TOP_TERMS))
    for topic_id in range(lda_model.num_topics)
}

corpus_topic_df = pd.DataFrame({
    'Document': list(range(len(corpus_topics))),
    # Topic ids are 0-based in the model; reported 1-based.
    'Dominant Topic': [topic_id + 1 for topic_id, _ in corpus_topics],
    'Contribution %': [round(weight * 100, 2) for _, weight in corpus_topics],
    'Topic Desc': [topic_desc_by_id[topic_id] for topic_id, _ in corpus_topics],
    # pandas raises ValueError if these lists do not have one entry per document.
    'Title': pre_titles,
    'Paper': pre_papers,
})


In [None]:
pd.set_option('display.max_colwidth', 200)

# Per-topic document counts and share of the corpus.
#
# Fixes relative to the previous version of this cell:
# * The nested-dict renaming form of .agg() raises SpecificationError on
#   pandas >= 1.0; groupby(...).size() gives the same counts.
# * The percentage was divided by len(papers), but `papers` is never defined
#   in this notebook; the number of rows in corpus_topic_df is used instead.
# * 'Topic Desc' was looked up by row position in `topics_df`, which is both
#   ordered differently from the model's topic ids and misaligned whenever a
#   topic has no documents. It is now looked up from the model by topic id.
total_docs = len(corpus_topic_df)
topic_stats_df = (corpus_topic_df.groupby('Dominant Topic')
                                 .size()
                                 .reset_index(name='Doc Count'))
topic_stats_df['% Total Docs'] = (topic_stats_df['Doc Count'] * 100 / total_docs).round(2)
topic_stats_df['Topic Desc'] = [
    # 'Dominant Topic' is 1-based; the model's topic ids are 0-based.
    ', '.join(term for term, _ in lda_model.show_topic(int(topic_no) - 1, topn=20))
    for topic_no in topic_stats_df['Dominant Topic']
]
topic_stats_df

### Documents whose dominant topic has the highest contribution %

In [None]:
# Rank all documents by how strongly their dominant topic contributes,
# then show the ten strongest.
ranked_by_contribution = corpus_topic_df.sort_values(by='Contribution %', ascending=False)
ranked_by_contribution.head(10)

In [None]:
pd.set_option('display.max_colwidth', 200)

# Show the rows of a hand-picked set of document numbers.
selected_doc_ids = [681, 9, 392, 1622, 17, 906, 996, 503, 13, 733]
is_selected = corpus_topic_df['Document'].isin(selected_doc_ids)
corpus_topic_df[is_selected]

In [None]:
def _top_document(topic_set):
    """Return the row with the largest 'Contribution %' within one topic group."""
    ordered = topic_set.sort_values(by=['Contribution %'], ascending=False)
    return ordered.iloc[0]


# One row per dominant topic: the document that topic contributes to most.
corpus_topic_df.groupby('Dominant Topic').apply(_top_document)