In [104]:
import pandas as pd
import numpy as np

In [105]:
df_1 = pd.read_csv('data/articles1.csv')

In [106]:
df_2 = pd.read_csv('data/articles2.csv')
df_2.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,53293,73471,Patriots Day Is Best When It Digs Past the Heroism,Atlantic,David Sims,2017-01-11,2017.0,1.0,,"Patriots Day, Peter Berg’s new thriller that recreates the 2013 Boston Marathon bombing and the ensuing manhunt that followed it, is a surprisingly oblique, morally ambiguous movie from a typically straightforward filmmaker. Patriots Day takes an unexpectedly cynical view of the chaos, rash an..."
1,53294,73472,A Break in the Search for the Origin of Complex Life,Atlantic,Ed Yong,2017-01-11,2017.0,1.0,,"In Norse mythology, humans and our world were created by a pantheon of gods who lived in the realm of Asgard. As it turns out, these stories have a grain of truth to them. Thanks to a team of scientists led by Thijs Ettema, Asgard is now also the name of a large clan of microbes. Its members, wh..."
2,53295,73474,Obama’s Ingenious Mention of Atticus Finch,Atlantic,Spencer Kornhaber,2017-01-11,2017.0,1.0,,"“If our democracy is to work in this increasingly diverse nation,” Barack Obama said in his farewell address last night, “each one of us must try to heed the advice of one of the great characters in American fiction, Atticus Finch. ” He then quoted Finch: “You never really understand a person un..."
3,53296,73475,"Donald Trump Meets, and Assails, the Press",Atlantic,David A. Graham,2017-01-11,2017.0,1.0,,"Updated on January 11 at 5:05 p. m. In his first press conference since July 2016, Donald Trump took only a few questions but made news on several fronts, saying he accepted the conclusion that Russia conducted hacks on top Democrats, bashing the press, and refusing once again to release his t..."
4,53297,73476,Trump: ’I Think’ Hacking Was Russian,Atlantic,Kaveh Waddell,2017-01-11,2017.0,1.0,,"Updated at 12:25 p. m. After months of equivocating on the origin of cyberattacks that targeted Democrats before the election, Donald Trump said Wednesday that he thinks Russia was behind the intrusions. “As for hacking, I think it was Russian,” Trump said at a press conference in New York. “B..."


#### let's select the first 50 new papers

In [107]:
new_titles = df_2['title'][:50].array
new_papers = df_2['content'][:50].array

In [108]:
new_titles[34]

'Trump Thanks L.L. Bean, the Latest Retailer Caught Between Him and His Critics'

In [109]:
import pickle
import nltk
import gensim

def _unpickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: pickle.load can execute arbitrary code; only point this at files you created yourself.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Saved gensim dictionary, later used to turn tokens into bag-of-words ids.
dictionary = gensim.corpora.Dictionary.load('models/dictionary.gensim')

# The files below carry a .txt extension but hold pickled Python objects.
bow_corpus = _unpickle("lists/bow_corpus.txt")
norm_corpus_bigrams = _unpickle("lists/norm_corpus_bigrams.txt")
norm_papers = _unpickle("lists/norm_papers.txt")
pre_papers = _unpickle("lists/pre_papers.txt")
pre_titles = _unpickle("lists/pre_titles.txt")

### PREPROCESS NEW PAPERS

First, preprocess these new papers and extract features using the same sequence of steps we followed when building the topic models.


In [110]:
%%time
import nltk

# English stop-word list from NLTK; normalise_corpus drops every token found in it.
stop_words = nltk.corpus.stopwords.words('english')
# Regex tokenizer built on the pattern \w+ (runs of word characters).
wtk = nltk.tokenize.RegexpTokenizer(r'\w+')
# WordNet lemmatizer applied to each non-numeric token in normalise_corpus.
wnl = nltk.stem.wordnet.WordNetLemmatizer()

def normalise_corpus(papers, titles):
    """Tokenise, lemmatise and filter raw papers while keeping titles aligned.

    Each paper is lower-cased and split with the module-level tokenizer ``wtk``.
    Purely numeric tokens are dropped, the remaining tokens are lemmatised with
    ``wnl``, and tokens that are one character long or listed in ``stop_words``
    are removed.

    Parameters
    ----------
    papers : sequence of str
        Raw document texts. Entries that are not strings (for example the
        float NaN pandas produces for a missing CSV cell) are skipped.
    titles : sequence
        Titles, positionally aligned with ``papers``.

    Returns
    -------
    tuple of (list, list, list)
        ``norm_papers``  - token lists, one per kept paper;
        ``pre_papers``   - the lower-cased text of each kept paper;
        ``pre_titles``   - the title of each kept paper.
        Papers that yield no tokens are left out of all three lists, so the
        lists always have the same length and stay index-aligned.
    """
    # Set membership is O(1); scanning the stop-word list per token is not.
    stop_set = set(stop_words)

    norm_papers = []
    pre_papers = []
    pre_titles = []

    for idx, paper in enumerate(papers):
        title = titles[idx]

        # Missing content would crash on .lower(); treat it like an empty paper.
        if not isinstance(paper, str):
            continue

        paper = paper.lower()
        paper_tokens = []
        for token in wtk.tokenize(paper):
            token = token.strip()
            if token.isnumeric():
                continue
            token = wnl.lemmatize(token)
            # Length and stop-word checks run on the lemmatised form.
            if len(token) > 1 and token not in stop_set:
                paper_tokens.append(token)

        if paper_tokens:
            norm_papers.append(paper_tokens)
            pre_papers.append(paper)
            pre_titles.append(title)

    return norm_papers, pre_papers, pre_titles

# normalise_corpus drops papers that end up with no tokens, so it also returns pre_papers and pre_titles:
# the texts and titles of exactly the papers that were kept, aligned with the token lists we run LDA on.

CPU times: user 810 µs, sys: 1.42 ms, total: 2.23 ms
Wall time: 2.2 ms


#### let's create a text wrangling and feature engineering pipeline, which should match the same steps we followed when training our topic model.


In [111]:
bigram_model = gensim.models.phrases.Phraser.load('models/bigram_model.gensim')

In [112]:
def text_preprocessing_pipeline(documents, normaliser_fn, bigram_model, titles):
    """Normalise documents, then pass the token lists through the bigram model.

    ``normaliser_fn(documents, titles)`` must return a 3-tuple of
    (token lists, kept texts, kept titles). The token lists are transformed
    with ``bigram_model[...]``; the other two items are returned unchanged.
    """
    token_docs, kept_papers, kept_titles = normaliser_fn(documents, titles)
    phrased_docs = bigram_model[token_docs]
    return phrased_docs, kept_papers, kept_titles

def bow_features_pipeline(tokenized_docs, dictionary):
    """Return one ``dictionary.doc2bow(...)`` result per tokenised document, in order."""
    bow_vectors = []
    for doc_tokens in tokenized_docs:
        bow_vectors.append(dictionary.doc2bow(doc_tokens))
    return bow_vectors

# Normalise the new papers and apply the loaded bigram model to their tokens.
(norm_new_papers,
 new_pre_papers,
 new_pre_titles) = text_preprocessing_pipeline(
    documents=new_papers,
    normaliser_fn=normalise_corpus,
    bigram_model=bigram_model,
    titles=new_titles,
)

# Convert every tokenised paper to bag-of-words form with the loaded dictionary.
norm_bow_features = bow_features_pipeline(
    tokenized_docs=norm_new_papers,
    dictionary=dictionary,
)

In [113]:
print(norm_new_papers[0][:30])

['patriot', 'day', 'peter', 'berg', 'new', 'thriller', 'recreates', 'boston_marathon', 'bombing', 'ensuing', 'manhunt', 'followed', 'surprisingly', 'oblique', 'morally', 'ambiguous', 'movie', 'typically', 'straightforward', 'filmmaker', 'patriot', 'day', 'take', 'unexpectedly', 'cynical', 'view', 'chaos', 'rash', 'bureaucratic', 'infighting']


In [114]:
print(norm_bow_features[0][:30])

[(22, 1), (26, 1), (31, 3), (36, 2), (39, 2), (42, 1), (57, 1), (69, 1), (85, 2), (96, 1), (97, 1), (108, 1), (140, 1), (144, 1), (149, 1), (157, 1), (167, 1), (168, 2), (174, 2), (177, 2), (183, 1), (230, 1), (236, 1), (237, 1), (240, 1), (253, 1), (261, 2), (266, 2), (280, 1), (287, 1)]


### LOAD LDA MODEL

In [115]:
# Topic count of the saved model to load; it is interpolated into the file name below.
TOPICS = 25

# Load the saved gensim LdaModel stored under models/gensim/ for that topic count.
load_lda_model = gensim.models.ldamodel.LdaModel.load('models/gensim/model_'+str(TOPICS)+'.gensim')

In [116]:
# For every topic of the loaded model: its 20 top terms with weights rounded to 3 decimals.
topics = [[(term, round(wt, 3))
               for term, wt in load_lda_model.show_topic(n, topn=20)]
                   for n in range(0, load_lda_model.num_topics)]

# None switches off column-width truncation. The old -1 sentinel was deprecated
# in pandas 1.0 and raises a ValueError on pandas 2.x.
pd.set_option('display.max_colwidth', None)

# One row per topic ("Topic1" .. "TopicN") holding its terms as a comma-separated string.
topics_df = pd.DataFrame([', '.join([term for term, wt in topic])
                              for topic in topics],
                         columns = ['Terms per Topic'],
                         index=['Topic'+str(t) for t in range(1, load_lda_model.num_topics+1)]
                         )
topics_df

Unnamed: 0,Terms per Topic
Topic1,"people, even, american, many, right, would, world, political, america, like, way, country, also, issue, power, time, new, make, fact, history"
Topic2,"isi, military, syria, russia, government, iran, country, force, war, russian, group, united_state, turkey, iraq, also, putin, syrian, official, leader, obama"
Topic3,"book, cuban, abortion, cuba, king, life, castro, india, lewis, lee, indian, canada, death, canadian, woman, planned_parenthood, jackson, hurricane, baby, american"
Topic4,"official, investigation, email, would, russia, president, fbi, russian, report, information, white_house, former, government, told_cnn, committee, intelligence, administration, comey, also, statement"
Topic5,"company, business, million, apple, tesla, new, year, market, also, product, google, sale, investor, billion, money, customer, according, employee, car, technology"
Topic6,"trump, clinton, president, donald_trump, campaign, obama, hillary_clinton, white_house, election, former, mr, speech, also, candidate, say, first, would, american, country, day"
Topic7,"facebook, news, twitter, video, medium, story, show, cnn, time, social_medium, post, tweet, user, online, also, report, website, network, ad, site"
Topic8,"attack, muslim, people, killed, isi, terrorist, airline, security, airport, terrorism, two, bomb, attacker, bombing, according, pakistan, group, paris, threat, terrorist_attack"
Topic9,"car, water, could, space, area, plane, fire, flight, cnn, storm, vehicle, home, foot, air, hour, first, system, passenger, driver, road"
Topic10,"game, team, first, player, sport, win, season, two, year, final, second, play, time, world, three, fan, point, football, ball, back"


### PREDICT NEW TOPICS OF PAPERS

In [117]:
def get_topic_predictions(topic_model, corpus, topn=3):
    """Return the ``topn`` highest-weighted topics for every document.

    ``topic_model[corpus]`` must give an indexable, sized collection whose
    items are iterables of ``(topic, weight)`` pairs. The result is a list
    with one entry per document; each entry is a list of
    ``(topic, weight rounded to 3 decimals)`` pairs in descending weight order.
    """
    doc_topic_dists = topic_model[corpus]
    best_topics = []
    for doc_idx in range(len(doc_topic_dists)):
        # Negated weight as the key gives a descending sort that keeps ties in input order.
        ranked = sorted(doc_topic_dists[doc_idx], key=lambda pair: -pair[1])
        best_topics.append([(topic_id, round(weight, 3))
                            for topic_id, weight in ranked[:topn]])
    return best_topics

In [118]:
topic_preds = get_topic_predictions(topic_model=load_lda_model, 
                                    corpus=norm_bow_features, topn=2)

#### building a results df

In [119]:
# One row per (paper, predicted topic) pair, built in a single pass.
# Paper and topic numbers are shown 1-based. Building flat records avoids the
# apply(pd.Series).stack() explode, which turns topic numbers into floats (and
# breaks the positional topics_df lookup) when papers return different numbers
# of topics.
result_records = [
    {
        'Papers': paper_idx + 1,
        'Dominant Topics': topic_num + 1,
        'Contribution %': round(wt*100, 2),
        'Topic Desc': topics_df.iloc[topic_num]['Terms per Topic'],
        'Title': new_pre_titles[paper_idx][:200],       # first 200 characters only
        'Paper Desc': new_pre_papers[paper_idx][:200],  # first 200 characters only
    }
    for paper_idx, doc_topics in enumerate(topic_preds)
    for topic_num, wt in doc_topics
]

results_df = pd.DataFrame(
    result_records,
    columns=['Papers', 'Dominant Topics', 'Contribution %',
             'Topic Desc', 'Title', 'Paper Desc'],
).set_index('Papers')

In [120]:
pd.set_option('display.max_colwidth', 300)

results_df.sort_values(by='Contribution %', ascending=False)

Unnamed: 0_level_0,Dominant Topics,Contribution %,Topic Desc,Title,Paper Desc
Papers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5,4,67.4,"official, investigation, email, would, russia, president, fbi, russian, report, information, white_house, former, government, told_cnn, committee, intelligence, administration, comey, also, statement",Trump: ’I Think’ Hacking Was Russian,"updated at 12:25 p. m. after months of equivocating on the origin of cyberattacks that targeted democrats before the election, donald trump said wednesday that he thinks russia was behind the intrus"
28,12,55.0,"study, drug, health, patient, also, case, research, may, found, doctor, food, researcher, year, people, according, medical, use, risk, dr, data",Could Cancer Drugs Treat Autism?,"five years ago, on charlie ryan’s second birthday, a big lump mysteriously formed on the side of his abdomen. at the emergency room his parents took him to, doctors suggested the lump was a hernia cau"
3,1,50.8,"people, even, american, many, right, would, world, political, america, like, way, country, also, issue, power, time, new, make, fact, history",Obama’s Ingenious Mention of Atticus Finch,"“if our democracy is to work in this increasingly diverse nation,” barack obama said in his farewell address last night, “each one of us must try to heed the advice of one of the great characters in a"
7,14,50.6,"would, year, american, plan, job, economy, policy, percent, cost, government, tax, new, million, people, could, also, worker, trade, economic, program",Obama Frames His Economic Legacy,"in barack obama’s final speech as president, he touted nearly a decade of economic improvement. “if i had told you eight years ago that america would reverse a great recession, reboot our auto industr"
45,24,46.6,"republican, state, vote, democrat, party, voter, election, would, candidate, cruz, support, poll, senate, bill, gop, conservative, house, sander, democratic, rubio",Obamacare Repeal Moves a Step Closer to Reality,republicans in congress can now repeal most of the affordable care act with a simple majority vote. the house on friday passed a budget resolution allowing congress to bypass a democratic filibuster i
...,...,...,...,...,...
5,6,13.3,"trump, clinton, president, donald_trump, campaign, obama, hillary_clinton, white_house, election, former, mr, speech, also, candidate, say, first, would, american, country, day",Trump: ’I Think’ Hacking Was Russian,"updated at 12:25 p. m. after months of equivocating on the origin of cyberattacks that targeted democrats before the election, donald trump said wednesday that he thinks russia was behind the intrus"
10,15,13.2,"say, people, like, get, think, going, know, would, thing, want, time, way, go, really, see, lot, make, back, could, look",The Atlantic Daily: Loose Ends and Legacy,"this article is part of a feature we also send out via email as the atlantic daily, a newsletter with stories, ideas, and images from the atlantic, written specially for subscribers. to sign u"
32,5,12.8,"company, business, million, apple, tesla, new, year, market, also, product, google, sale, investor, billion, money, customer, according, employee, car, technology",The Revolt of Working Parents,“there is no way you can be a good mother while achieving what i aspire.” “let’s face it. it’s a man’s world. the woman always stays home with the child. ” “it’s hard to do this job with two kids. ” t
46,18,12.1,"woman, black, white, men, female, music, song, sex, also, gay, girl, young, year, show, man, love, color, race, transgender, singer",How Victoria Aims to Connect With Young Women,"victoria, a new miniseries charting the famous queen’s early reign, premieres sunday on pbs’s masterpiece — nearly 180 years after the monarch ascended the british throne and five days before donald"


Looking at the generated topics for the new, previously unseen papers, I would say our model has done an excellent job!

### PREDICTING WITH MALLET

In [122]:
load_lda_model

<gensim.models.ldamodel.LdaModel at 0x7fa5506eac50>

In [126]:
TOPICS = 25

# NOTE(review): gensim.models.wrappers (LdaMallet) is only available in gensim < 4.0 - confirm the pinned version.
mallet_model = gensim.models.wrappers.LdaMallet.load('models/mallet/model_'+str(TOPICS)+'.gensim')


# Convert the LdaMallet wrapper to a native LdaModel. It was the only way to get some result with the loaded MALLET model.
# The wrapper keeps its own name so load_lda_model always refers to an LdaModel.
load_lda_model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(mallet_model)

# For every topic of the converted model: its 20 top terms with weights rounded to 3 decimals.
topics = [[(term, round(wt, 3))
               for term, wt in load_lda_model.show_topic(n, topn=20)]
                   for n in range(0, load_lda_model.num_topics)]

# None switches off column-width truncation. The old -1 sentinel was deprecated
# in pandas 1.0 and raises a ValueError on pandas 2.x.
pd.set_option('display.max_colwidth', None)

# One row per topic ("Topic1" .. "TopicN") holding its terms as a comma-separated string.
topics_df = pd.DataFrame([', '.join([term for term, wt in topic])
                              for topic in topics],
                         columns = ['Terms per Topic'],
                         index=['Topic'+str(t) for t in range(1, load_lda_model.num_topics+1)]
                         )

In [127]:
# NOTE(review): this cell repeats the get_topic_predictions definition from an earlier cell of this notebook.
def get_topic_predictions(topic_model, corpus, topn=3):
    """Return the ``topn`` highest-weighted topics for every document.

    ``topic_model[corpus]`` must give an indexable, sized collection whose
    items are iterables of ``(topic, weight)`` pairs. The result is a list
    with one entry per document; each entry is a list of
    ``(topic, weight rounded to 3 decimals)`` pairs in descending weight order.
    """
    per_doc_topics = topic_model[corpus]
    top_topics_per_doc = []
    for position in range(len(per_doc_topics)):
        # Negated weight as the key gives a descending sort that keeps ties in input order.
        by_weight_desc = sorted(per_doc_topics[position], key=lambda entry: -entry[1])
        trimmed = by_weight_desc[:topn]
        top_topics_per_doc.append([(topic_id, round(weight, 3))
                                   for topic_id, weight in trimmed])
    return top_topics_per_doc

In [128]:
topic_preds = get_topic_predictions(topic_model=load_lda_model, 
                                    corpus=norm_bow_features, topn=2)

In [129]:
# One row per (paper, predicted topic) pair, built in a single pass.
# Paper and topic numbers are shown 1-based. Building flat records avoids the
# apply(pd.Series).stack() explode, which turns topic numbers into floats (and
# breaks the positional topics_df lookup) when papers return different numbers
# of topics.
result_records = [
    {
        'Papers': paper_idx + 1,
        'Dominant Topics': topic_num + 1,
        'Contribution %': round(wt*100, 2),
        'Topic Desc': topics_df.iloc[topic_num]['Terms per Topic'],
        'Title': new_pre_titles[paper_idx][:200],       # first 200 characters only
        'Paper Desc': new_pre_papers[paper_idx][:200],  # first 200 characters only
    }
    for paper_idx, doc_topics in enumerate(topic_preds)
    for topic_num, wt in doc_topics
]

results_df = pd.DataFrame(
    result_records,
    columns=['Papers', 'Dominant Topics', 'Contribution %',
             'Topic Desc', 'Title', 'Paper Desc'],
).set_index('Papers')


pd.set_option('display.max_colwidth', 300)

results_df.sort_values(by='Contribution %', ascending=False)

Unnamed: 0_level_0,Dominant Topics,Contribution %,Topic Desc,Title,Paper Desc
Papers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
28,13,67.4,"study, health, people, dr, drug, case, doctor, patient, year, research, found, medical, risk, problem, hospital, treatment, researcher, test, time, result",Could Cancer Drugs Treat Autism?,"five years ago, on charlie ryan’s second birthday, a big lump mysteriously formed on the side of his abdomen. at the emergency room his parents took him to, doctors suggested the lump was a hernia cau"
45,12,58.8,"republican, bill, democrat, house, senate, congress, ryan, senator, vote, plan, session, conservative, member, committee, people, health_care, american, obamacare, support, president",Obamacare Repeal Moves a Step Closer to Reality,republicans in congress can now repeal most of the affordable care act with a simple majority vote. the house on friday passed a budget resolution allowing congress to bypass a democratic filibuster i
30,13,52.7,"study, health, people, dr, drug, case, doctor, patient, year, research, found, medical, risk, problem, hospital, treatment, researcher, test, time, result",Resistance to the Antibiotic of Last Resort Is Silently Spreading,"the alarm bells sounded on november 18, 2015. antibiotic resistance is usually a crisis, one of the reasons its danger can be hard to convey. one by one, over the years, the drugs used to fight the"
38,16,51.1,"show, film, star, movie, year, actor, character, song, music, series, love, play, hollywood, audience, book, time, fan, award, performance, story",The Atlantic’s Week in Culture,"don’t miss, la la land’s nostalgia — david sims analyzes the nature of time in damien chazelle’s film, which pays homage to old hollywood while portraying the limits of worshipping the past. filmpa"
5,2,47.2,"official, email, investigation, report, clinton, information, russian, fbi, intelligence, russia, campaign, comey, document, committee, evidence, government, source, time, agency, hillary_clinton",Trump: ’I Think’ Hacking Was Russian,"updated at 12:25 p. m. after months of equivocating on the origin of cyberattacks that targeted democrats before the election, donald trump said wednesday that he thinks russia was behind the intrus"
...,...,...,...,...,...
26,14,7.3,"people, medium, thing, news, time, story, breitbart_news, twitter, show, question, interview, lot, fact, breitbart, fox_news, asked, talk, reporter, tweet, guy",Is Obamacare Just Bad Branding?,"the senate began the process of dismantling obamacare early thursday morning, passing a budget blueprint that sets the stage for a formal repeal vote, potentially within a few weeks. the step is, for"
5,7,6.8,"trump, president, obama, white_house, donald_trump, administration, day, american, washington, meeting, office, policy, president_barack, national_security, presidency, friday, issue, election, told, statement",Trump: ’I Think’ Hacking Was Russian,"updated at 12:25 p. m. after months of equivocating on the origin of cyberattacks that targeted democrats before the election, donald trump said wednesday that he thinks russia was behind the intrus"
38,24,6.8,"people, black, america, american, white, history, church, world, christian, community, political, word, movement, nation, speech, country, life, group, gay, god",The Atlantic’s Week in Culture,"don’t miss, la la land’s nostalgia — david sims analyzes the nature of time in damien chazelle’s film, which pays homage to old hollywood while portraying the limits of worshipping the past. filmpa"
45,19,5.7,"american, job, people, worker, country, work, tax, year, policy, economy, economic, make, trade, america, good, plan, world, government, system, problem",Obamacare Repeal Moves a Step Closer to Reality,republicans in congress can now repeal most of the affordable care act with a simple majority vote. the house on friday passed a budget resolution allowing congress to bypass a democratic filibuster i
