In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the first articles file.
# NOTE(review): df_1 is not referenced again in the visible part of this notebook — confirm it is still needed.
df_1 = pd.read_csv('data/articles1.csv')

In [3]:
# Load the second articles file; its leading rows are used below as the new documents to classify.
df_2 = pd.read_csv('data/articles2.csv')
# Preview the first rows to check the columns that were read.
df_2.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,53293,73471,Patriots Day Is Best When It Digs Past the Her...,Atlantic,David Sims,2017-01-11,2017.0,1.0,,"Patriots Day, Peter Berg’s new thriller that r..."
1,53294,73472,A Break in the Search for the Origin of Comple...,Atlantic,Ed Yong,2017-01-11,2017.0,1.0,,"In Norse mythology, humans and our world were ..."
2,53295,73474,Obama’s Ingenious Mention of Atticus Finch,Atlantic,Spencer Kornhaber,2017-01-11,2017.0,1.0,,“If our democracy is to work in this increasin...
3,53296,73475,"Donald Trump Meets, and Assails, the Press",Atlantic,David A. Graham,2017-01-11,2017.0,1.0,,Updated on January 11 at 5:05 p. m. In his fir...
4,53297,73476,Trump: ’I Think’ Hacking Was Russian,Atlantic,Kaveh Waddell,2017-01-11,2017.0,1.0,,Updated at 12:25 p. m. After months of equivoc...


#### Let's select the first 50 new papers

In [4]:
# Number of leading articles from df_2 that are treated as new documents.
# Kept in one place so the title and content slices cannot drift apart.
N_NEW_PAPERS = 50

# Titles and article bodies are taken from the same leading rows, so that
# position i in one array corresponds to position i in the other.
new_titles = df_2['title'][:N_NEW_PAPERS].array
new_papers = df_2['content'][:N_NEW_PAPERS].array

In [5]:
# Spot-check a single title from the selected slice.
new_titles[34]

'Trump Thanks L.L. Bean, the Latest Retailer Caught Between Him and His Critics'

In [6]:
import pickle
import nltk
import gensim

def _load_pickle(path):
    """Return the object unpickled from the file at ``path``.

    NOTE(security): ``pickle.load`` can execute arbitrary code while reading a
    file, so this must only be pointed at files produced by this project.
    """
    with open(path, "rb") as fp:
        return pickle.load(fp)


# gensim Dictionary saved to disk; it is used further down to turn token
# lists into bag-of-words vectors via ``doc2bow``.
dictionary = gensim.corpora.Dictionary.load('models/dictionary.gensim')

# Previously saved artefacts. The files carry a ``.txt`` extension but are
# binary pickles, hence the binary-mode read inside ``_load_pickle``.
bow_corpus = _load_pickle("lists/bow_corpus.txt")
norm_corpus_bigrams = _load_pickle("lists/norm_corpus_bigrams.txt")
norm_papers = _load_pickle("lists/norm_papers.txt")
pre_papers = _load_pickle("lists/pre_papers.txt")
pre_titles = _load_pickle("lists/pre_titles.txt")

### PREPROCESS NEW PAPERS

First, preprocess these new papers and extract features using the same sequence of steps we followed when building the topic models.


In [7]:
%%time
import nltk

# English stop-word list from NLTK; tokens found in it are dropped during normalisation.
stop_words = nltk.corpus.stopwords.words('english')
# Regex tokenizer that extracts runs of word characters, so punctuation and whitespace never become tokens.
wtk = nltk.tokenize.RegexpTokenizer(r'\w+')
# WordNet lemmatizer applied to each remaining token.
wnl = nltk.stem.wordnet.WordNetLemmatizer()

def normalise_corpus(papers, titles):
    """Tokenise and clean every paper, keeping papers and titles aligned.

    Each paper is lower-cased and split into word tokens with ``wtk``. Purely
    numeric tokens are discarded, the rest are lemmatised with ``wnl``, and
    lemmas that are a single character long or appear in ``stop_words`` are
    removed.

    Papers that are not strings (for example the NaN that pandas uses for a
    missing ``content`` cell) and papers that end up with no tokens are
    skipped, together with their titles.

    Parameters
    ----------
    papers : sequence of str
        Raw document texts.
    titles : sequence
        Title for each paper, indexed the same way as ``papers``.

    Returns
    -------
    norm_papers : list of list of str
        Cleaned token list for every retained paper.
    pre_papers : list of str
        Lower-cased text of every retained paper.
    pre_titles : list
        Title of every retained paper.
    """
    # Set membership is O(1); the NLTK stop-word list would otherwise be
    # scanned linearly once per token.
    stop_set = set(stop_words)

    norm_papers = []
    pre_papers = []
    pre_titles = []
    for idx, paper in enumerate(papers):
        title = titles[idx]

        # Missing values read by pandas are floats (NaN) and have no .lower().
        if not isinstance(paper, str):
            continue

        paper = paper.lower()
        paper_tokens = []
        for token in wtk.tokenize(paper):
            token = token.strip()
            if token.isnumeric():
                continue
            lemma = wnl.lemmatize(token)
            # Length and stop-word checks are applied to the lemma, not the raw token.
            if len(lemma) > 1 and lemma not in stop_set:
                paper_tokens.append(lemma)

        if paper_tokens:
            norm_papers.append(paper_tokens)
            pre_papers.append(paper)
            pre_titles.append(title)

    return norm_papers, pre_papers, pre_titles

# normalise_corpus also returns pre_papers and pre_titles because it drops papers that end up with no tokens;
# returning the surviving papers and titles keeps them aligned, position by position, with the token lists used for LDA.

CPU times: user 1.98 ms, sys: 2.28 ms, total: 4.26 ms
Wall time: 4.78 ms


Let's create a text wrangling and feature engineering pipeline that matches the steps we followed when training our topic model.


In [8]:
# Load the saved bigram Phraser (presumably the one fitted while training the topic model — confirm),
# so that new documents get the same phrase merging.
bigram_model = gensim.models.phrases.Phraser.load('models/bigram_model.gensim')

In [9]:
def text_preprocessing_pipeline(documents, normaliser_fn, bigram_model, titles):
    """Normalise documents and apply the bigram model to the resulting tokens.

    ``normaliser_fn`` is called as ``normaliser_fn(documents, titles)`` and
    must return a 3-tuple ``(token_lists, kept_documents, kept_titles)``.
    The token lists are then passed through ``bigram_model[...]``.

    Returns the bigram-transformed token lists together with the kept
    documents and kept titles, in that order.
    """
    tokenised, kept_papers, kept_titles = normaliser_fn(documents, titles)
    return bigram_model[tokenised], kept_papers, kept_titles

def bow_features_pipeline(tokenized_docs, dictionary):
    """Convert each tokenised document to bag-of-words form.

    ``dictionary.doc2bow`` is called once per document, in order, and the
    results are collected into a list.
    """
    return list(map(dictionary.doc2bow, tokenized_docs))

# Run the new papers through the text pipeline: normalisation followed by bigram merging.
# The returned papers/titles are the ones the normaliser kept.
norm_new_papers, new_pre_papers, new_pre_titles = text_preprocessing_pipeline(
    documents=new_papers,
    normaliser_fn=normalise_corpus,
    bigram_model=bigram_model,
    titles=new_titles,
)

# Map every tokenised paper onto the loaded dictionary to get bag-of-words features.
norm_bow_features = bow_features_pipeline(
    tokenized_docs=norm_new_papers,
    dictionary=dictionary,
)

In [10]:
# Peek at the first 30 tokens of the first preprocessed paper.
print(norm_new_papers[0][:30])

['patriot', 'day', 'peter', 'berg', 'new', 'thriller', 'recreates', 'boston_marathon', 'bombing', 'ensuing', 'manhunt', 'followed', 'surprisingly', 'oblique', 'morally', 'ambiguous', 'movie', 'typically', 'straightforward', 'filmmaker', 'patriot', 'day', 'take', 'unexpectedly', 'cynical', 'view', 'chaos', 'rash', 'bureaucratic', 'infighting']


In [11]:
# Peek at the first 30 bag-of-words entries of the first paper.
print(norm_bow_features[0][:30])

[(22, 1), (26, 1), (31, 3), (36, 2), (39, 2), (42, 1), (57, 1), (69, 1), (85, 2), (96, 1), (97, 1), (108, 1), (140, 1), (144, 1), (149, 1), (157, 1), (167, 1), (168, 2), (174, 2), (177, 2), (183, 1), (230, 1), (236, 1), (237, 1), (240, 1), (253, 1), (261, 2), (266, 2), (280, 1), (287, 1)]


### LOAD MALLET MODEL AND MAKE PREDICTIONS

In [12]:
# Number of topics of the saved model to load; it is part of the model file name.
TOPICS = 25

# NOTE(review): `gensim.models.wrappers` only exists in gensim releases before 4.0,
# so this cell needs a pre-4.0 gensim installed — confirm the pinned version.
load_lda_model = gensim.models.wrappers.LdaMallet.load('models/mallet/model_' + str(TOPICS) + '.gensim')

# Convert the LdaMallet wrapper to a native LdaModel. This was the only way
# found to get results out of a Mallet model after loading it from disk.
load_lda_model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(load_lda_model)

# Top 20 (term, weight) pairs for every topic, weights rounded to 3 decimals.
topics = [
    [(term, round(wt, 3)) for term, wt in load_lda_model.show_topic(n, topn=20)]
    for n in range(load_lda_model.num_topics)
]

# Do not truncate cell contents so the full term lists stay readable.
# `None` is the documented "no limit" value; the former `-1` has been
# deprecated since pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)

# One row per topic, labelled Topic1..TopicN, holding its terms as a
# comma-separated string.
topics_df = pd.DataFrame(
    [', '.join(term for term, _ in topic) for topic in topics],
    columns=['Terms per Topic'],
    index=['Topic' + str(t) for t in range(1, load_lda_model.num_topics + 1)],
)

In [13]:
# Display the terms listed for every topic.
topics_df

Unnamed: 0,Terms per Topic
Topic1,"game, team, player, sport, time, year, play, win, season, final, back, world, point, fan, athlete, match, played, league, run, day"
Topic2,"official, email, investigation, report, clinton, information, russian, fbi, intelligence, russia, campaign, comey, document, committee, evidence, government, source, time, agency, hillary_clinton"
Topic3,"military, isi, attack, syria, force, war, group, government, official, killed, islamic_state, american, syrian, iraq, people, soldier, city, civilian, fighting, army"
Topic4,"law, state, court, decision, rule, federal, case, order, supreme_court, legal, public, government, policy, issue, judge, abortion, ban, justice, statement, action"
Topic5,"water, area, people, fire, flight, plane, cnn, home, official, land, airport, state, day, passenger, storm, air, island, road, part, ship"
Topic6,"country, china, united_state, government, russia, iran, leader, north_korea, official, international, deal, russian, israel, world, putin, power, president, military, nation, japan"
Topic7,"trump, president, obama, white_house, donald_trump, administration, day, american, washington, meeting, office, policy, president_barack, national_security, presidency, friday, issue, election, told, statement"
Topic8,"muslim, attack, people, country, europe, britain, migrant, france, british, french, european, germany, london, german, islam, paris, report, uk, eu, european_union"
Topic9,"company, facebook, car, technology, user, apple, online, google, product, internet, service, employee, twitter, customer, store, site, model, brand, time, tesla"
Topic10,"million, company, money, year, business, billion, market, chinese, price, bank, financial, pay, fund, deal, percent, cost, investor, firm, paid, sale"


In [14]:
def get_topic_predictions(topic_model, corpus, topn=3):
    """Return the ``topn`` highest-weighted topics for every document.

    ``topic_model[corpus]`` must yield, per document, a sequence of
    ``(topic, weight)`` pairs. For each document the pairs are ordered by
    descending weight (ties keep their original order), cut to ``topn``
    entries, and the weights are rounded to 3 decimals.

    Returns a list with one ``[(topic, weight), ...]`` list per document.
    """
    best_topics = []
    for doc_topics in topic_model[corpus]:
        ranked = sorted(doc_topics, key=lambda pair: pair[1], reverse=True)
        best_topics.append([(topic, round(wt, 3)) for topic, wt in ranked[:topn]])
    return best_topics

In [15]:
# Keep the two highest-weighted topics for each new paper.
topic_preds = get_topic_predictions(topic_model=load_lda_model,
                                    corpus=norm_bow_features, topn=2)

In [16]:
# Build one row per (paper, predicted topic) pair straight from `topic_preds`.
#
# Building the rows in a single pass keeps topic numbers, weights and
# descriptions together. The earlier `apply(pd.Series).stack()` explode padded
# papers that have fewer predicted topics with NaN, which turned the topic
# numbers into floats and made `topics_df.iloc[...]` fail.
result_rows = [
    {
        'Papers': paper_no,                                    # 1-based paper number
        'Dominant Topics': topic_num + 1,                      # 1-based, matches the Topic<N> labels
        'Contribution %': round(wt * 100, 2),
        'Topic Desc': topics_df.iloc[topic_num]['Terms per Topic'],
        'Title': new_pre_titles[paper_no - 1][:200],
        'Paper Desc': new_pre_papers[paper_no - 1][:200],      # first 200 characters only
    }
    for paper_no, paper_preds in enumerate(topic_preds, start=1)
    for topic_num, wt in paper_preds
]

# Passing `columns` explicitly fixes the column order and keeps the frame
# well-formed even when there are no predictions at all.
results_df = pd.DataFrame(
    result_rows,
    columns=['Papers', 'Dominant Topics', 'Contribution %', 'Topic Desc', 'Title', 'Paper Desc'],
).set_index('Papers')


# Cap displayed cell width so the result table stays readable.
pd.set_option('display.max_colwidth', 300)


In [17]:
# Show the predictions: one row per (paper, dominant topic) pair.
results_df

Unnamed: 0_level_0,Dominant Topics,Contribution %,Topic Desc,Title,Paper Desc
Papers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,16,17.9,"show, film, star, movie, year, actor, character, song, music, series, love, play, hollywood, audience, book, time, fan, award, performance, story",Patriots Day Is Best When It Digs Past the Heroism,"patriots day, peter berg’s new thriller that recreates the 2013 boston marathon bombing and the ensuing manhunt that followed it, is a surprisingly oblique, morally ambiguous movie from a typically st"
1,15,15.7,"time, work, thing, back, make, day, good, food, life, world, made, find, long, place, book, kind, art, hand, dog, small",Patriots Day Is Best When It Digs Past the Heroism,"patriots day, peter berg’s new thriller that recreates the 2013 boston marathon bombing and the ensuing manhunt that followed it, is a surprisingly oblique, morally ambiguous movie from a typically st"
2,13,24.6,"study, health, people, dr, drug, case, doctor, patient, year, research, found, medical, risk, problem, hospital, treatment, researcher, test, time, result",A Break in the Search for the Origin of Complex Life,"in norse mythology, humans and our world were created by a pantheon of gods who lived in the realm of asgard. as it turns out, these stories have a grain of truth to them. thanks to a team of scientis"
2,15,21.3,"time, work, thing, back, make, day, good, food, life, world, made, find, long, place, book, kind, art, hand, dog, small",A Break in the Search for the Origin of Complex Life,"in norse mythology, humans and our world were created by a pantheon of gods who lived in the realm of asgard. as it turns out, these stories have a grain of truth to them. thanks to a team of scientis"
3,24,34.2,"people, black, america, american, white, history, church, world, christian, community, political, word, movement, nation, speech, country, life, group, gay, god",Obama’s Ingenious Mention of Atticus Finch,"“if our democracy is to work in this increasingly diverse nation,” barack obama said in his farewell address last night, “each one of us must try to heed the advice of one of the great characters in a"
...,...,...,...,...,...
48,12,11.8,"republican, bill, democrat, house, senate, congress, ryan, senator, vote, plan, session, conservative, member, committee, people, health_care, american, obamacare, support, president",Sessions Exaggerated His Record on Gun Prosecutions,"in senator jeff sessions of alabama, opponents of new firearms restrictions have an attorney general nominee ready to implement their frequent call to “enforce the laws on the books. ” during his conf"
49,2,25.0,"official, email, investigation, report, clinton, information, russian, fbi, intelligence, russia, campaign, comey, document, committee, evidence, government, source, time, agency, hillary_clinton",Why Elections Are Now Classified as ’Critical Infrastructure’,"last friday, the same day three of the top spy agencies in the u. s. released a summary of an investigation into russia’s role in cyberattacks before the election, the department of homeland security"
49,4,10.2,"law, state, court, decision, rule, federal, case, order, supreme_court, legal, public, government, policy, issue, judge, abortion, ban, justice, statement, action",Why Elections Are Now Classified as ’Critical Infrastructure’,"last friday, the same day three of the top spy agencies in the u. s. released a summary of an investigation into russia’s role in cyberattacks before the election, the department of homeland security"
50,15,31.1,"time, work, thing, back, make, day, good, food, life, world, made, find, long, place, book, kind, art, hand, dog, small",20th Century Women Is an Ode to Female Resilience,"in a scene early on in 20th century women, dorothea (annette bening) helps her son construct a birthday cake in the kitchen, plugging candles into a mess of whipped cream and strawberries. “wait a few"


In [18]:
# Order the rows from the highest to the lowest topic contribution.
results_df.sort_values(by='Contribution %', ascending=False)

Unnamed: 0_level_0,Dominant Topics,Contribution %,Topic Desc,Title,Paper Desc
Papers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
28,13,67.4,"study, health, people, dr, drug, case, doctor, patient, year, research, found, medical, risk, problem, hospital, treatment, researcher, test, time, result",Could Cancer Drugs Treat Autism?,"five years ago, on charlie ryan’s second birthday, a big lump mysteriously formed on the side of his abdomen. at the emergency room his parents took him to, doctors suggested the lump was a hernia cau"
45,12,58.8,"republican, bill, democrat, house, senate, congress, ryan, senator, vote, plan, session, conservative, member, committee, people, health_care, american, obamacare, support, president",Obamacare Repeal Moves a Step Closer to Reality,republicans in congress can now repeal most of the affordable care act with a simple majority vote. the house on friday passed a budget resolution allowing congress to bypass a democratic filibuster i
30,13,52.7,"study, health, people, dr, drug, case, doctor, patient, year, research, found, medical, risk, problem, hospital, treatment, researcher, test, time, result",Resistance to the Antibiotic of Last Resort Is Silently Spreading,"the alarm bells sounded on november 18, 2015. antibiotic resistance is usually a crisis, one of the reasons its danger can be hard to convey. one by one, over the years, the drugs used to fight the"
38,16,51.1,"show, film, star, movie, year, actor, character, song, music, series, love, play, hollywood, audience, book, time, fan, award, performance, story",The Atlantic’s Week in Culture,"don’t miss, la la land’s nostalgia — david sims analyzes the nature of time in damien chazelle’s film, which pays homage to old hollywood while portraying the limits of worshipping the past. filmpa"
5,2,47.2,"official, email, investigation, report, clinton, information, russian, fbi, intelligence, russia, campaign, comey, document, committee, evidence, government, source, time, agency, hillary_clinton",Trump: ’I Think’ Hacking Was Russian,"updated at 12:25 p. m. after months of equivocating on the origin of cyberattacks that targeted democrats before the election, donald trump said wednesday that he thinks russia was behind the intrus"
...,...,...,...,...,...
26,14,7.3,"people, medium, thing, news, time, story, breitbart_news, twitter, show, question, interview, lot, fact, breitbart, fox_news, asked, talk, reporter, tweet, guy",Is Obamacare Just Bad Branding?,"the senate began the process of dismantling obamacare early thursday morning, passing a budget blueprint that sets the stage for a formal repeal vote, potentially within a few weeks. the step is, for"
5,7,6.8,"trump, president, obama, white_house, donald_trump, administration, day, american, washington, meeting, office, policy, president_barack, national_security, presidency, friday, issue, election, told, statement",Trump: ’I Think’ Hacking Was Russian,"updated at 12:25 p. m. after months of equivocating on the origin of cyberattacks that targeted democrats before the election, donald trump said wednesday that he thinks russia was behind the intrus"
38,24,6.8,"people, black, america, american, white, history, church, world, christian, community, political, word, movement, nation, speech, country, life, group, gay, god",The Atlantic’s Week in Culture,"don’t miss, la la land’s nostalgia — david sims analyzes the nature of time in damien chazelle’s film, which pays homage to old hollywood while portraying the limits of worshipping the past. filmpa"
45,19,5.7,"american, job, people, worker, country, work, tax, year, policy, economy, economic, make, trade, america, good, plan, world, government, system, problem",Obamacare Repeal Moves a Step Closer to Reality,republicans in congress can now repeal most of the affordable care act with a simple majority vote. the house on friday passed a budget resolution allowing congress to bypass a democratic filibuster i
