In [2]:
import pandas as pd

In [3]:
# Load the NPR articles; the path is relative to the notebook's working directory.
npr = pd.read_csv('./data/npr.csv')

In [4]:
# Preview the first rows to confirm the frame has the 'Article' text column.
npr.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [5]:
# Row count of the frame (one article per row).
len(npr)

11992

## Latent Dirichlet Allocation

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts for LDA: drop English stop words, terms found in
# fewer than 2 articles, and terms found in more than 95% of articles.
cv = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)

In [7]:
# Learn the vocabulary from the articles and build the sparse
# document-by-term count matrix that LDA is fit on.
doc_term_matrix = cv.fit_transform(npr['Article'])
# Bare expression: display the matrix summary (shape, dtype, stored elements).
doc_term_matrix

<11992x54777 sparse matrix of type '<class 'numpy.int64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [8]:
from sklearn.decomposition import LatentDirichletAllocation

# Seven topics; the fixed seed keeps the fitted topics reproducible.
lda = LatentDirichletAllocation(
    random_state=42,
    n_components=7,
)

In [9]:
# Fit LDA on the term-count matrix; the cell output is the fitted estimator's repr.
lda.fit(doc_term_matrix)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=7, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=42, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [10]:
# components_ holds one row of word weights per topic, so this equals n_components.
len(lda.components_)

7

In [11]:
def print_topwords(n, model, vectorizer):
    """Print the ``n`` highest-weighted vocabulary words of every topic.

    Parameters
    ----------
    n : int
        Number of words to list per topic. Values larger than the
        vocabulary list every word; values <= 0 list none.
    model : object
        Fitted topic model exposing ``components_``, an iterable of
        per-topic weight arrays indexed by vocabulary position.
    vectorizer : object
        Fitted vectorizer exposing ``get_feature_names_out()`` (or the
        legacy ``get_feature_names()``) that maps positions to words.

    Returns
    -------
    None
        Output goes to stdout. Words within a topic are printed in
        ascending weight order, so the strongest word comes last.
    """
    print(f'The top {n} words\n----------------')

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back for older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    for i, topic in enumerate(model.components_):
        print(f"Topic: {i}")
        order = topic.argsort()  # ascending by weight
        # Slice from an explicit start index: order[-n:] with n == 0 is
        # order[-0:], which would return the entire vocabulary.
        count = max(0, min(n, len(order)))
        top_indices = order[len(order) - count:]
        print(', '.join([feature_names[index] for index in top_indices]), '\n')

In [12]:
# List the 15 strongest words of each LDA topic (strongest word last).
print_topwords(n=15, model=lda, vectorizer=cv)

The top 15 words
----------------
Topic: 0
companies, money, year, federal, 000, new, percent, government, company, million, care, people, health, said, says 

Topic: 1
military, house, security, russia, government, npr, reports, says, news, people, told, police, president, trump, said 

Topic: 2
way, world, family, home, day, time, water, city, new, years, food, just, people, like, says 

Topic: 3
time, new, don, years, medical, disease, patients, just, children, study, like, women, health, people, says 

Topic: 4
voters, vote, election, party, new, obama, court, republican, campaign, people, state, president, clinton, said, trump 

Topic: 5
years, going, ve, life, don, new, way, music, really, time, know, think, people, just, like 

Topic: 6
student, years, data, science, university, people, time, schools, just, education, new, like, students, school, says 



In [13]:
# Per-article topic weights from the fitted LDA model (one row per article).
topic_results = lda.transform(doc_term_matrix)

In [14]:
# Label each article with the index of its highest-weighted LDA topic.
npr['Topic LDA'] = topic_results.argmax(axis=1)

In [15]:
# Spot-check the first 10 articles alongside their assigned LDA topic.
npr.head(10)

Unnamed: 0,Article,Topic LDA
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1
4,"From photography, illustration and video, to d...",2
5,I did not want to join yoga class. I hated tho...,3
6,With a who has publicly supported the debunk...,3
7,"I was standing by the airport exit, debating w...",2
8,"If movies were trying to be more realistic, pe...",3
9,"Eighteen years ago, on New Year’s Eve, David F...",2


## Non-Negative Matrix Factorization

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights for NMF, with the same vocabulary filters as the count
# vectorizer: English stop words removed, terms kept only if they occur in
# at least 2 articles and in at most 95% of articles.
tfidf = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)

In [17]:
# Build the TF-IDF document-by-term matrix for NMF.
# NOTE(review): this rebinds doc_term_matrix, which held the raw counts used
# by LDA; re-running an LDA cell after this point would feed it TF-IDF values.
doc_term_matrix = tfidf.fit_transform(npr['Article'])

In [18]:
# Display the matrix summary (shape, dtype, stored elements).
doc_term_matrix

<11992x54777 sparse matrix of type '<class 'numpy.float64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [19]:
from sklearn.decomposition import NMF

# Seven components to match the LDA topic count; fixed seed for reproducibility.
nmf = NMF(
    random_state=42,
    n_components=7,
)

In [20]:
# Fit NMF on the TF-IDF matrix; the cell output is the fitted estimator's repr.
nmf.fit(doc_term_matrix)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=7, random_state=42, shuffle=False, solver='cd', tol=0.0001,
  verbose=0)

In [21]:
# List the 15 strongest words of each NMF topic (strongest word last).
print_topwords(n=15, model=nmf, vectorizer=tfidf)

The top 15 words
----------------
Topic: 0
new, research, like, patients, health, disease, percent, women, virus, study, water, food, people, zika, says 

Topic: 1
gop, pence, presidential, russia, administration, election, republican, obama, white, house, donald, campaign, said, president, trump 

Topic: 2
senate, house, people, act, law, tax, plan, republicans, affordable, obamacare, coverage, medicaid, insurance, care, health 

Topic: 3
officers, syria, security, department, law, isis, russia, government, state, attack, president, reports, court, said, police 

Topic: 4
primary, cruz, election, democrats, percent, party, delegates, vote, state, democratic, hillary, campaign, voters, sanders, clinton 

Topic: 5
love, ve, don, album, way, time, song, life, really, know, people, think, just, music, like 

Topic: 6
teacher, state, high, says, parents, devos, children, college, kids, teachers, student, education, schools, school, students 



In [22]:
# Per-article topic weights from the fitted NMF model.
# NOTE(review): rebinds topic_results, which held the LDA output from In [13];
# re-running the 'Topic LDA' assignment after this cell would store NMF labels.
topic_results = nmf.transform(doc_term_matrix)

In [23]:
# Label each article with the index of its highest-weighted NMF topic.
npr['Topic NMF']=topic_results.argmax(axis=1)

In [24]:
# Same LDA topic listing as In [12], repeated here next to the NMF listing.
print_topwords(n=15, model=lda, vectorizer=cv)

The top 15 words
----------------
Topic: 0
companies, money, year, federal, 000, new, percent, government, company, million, care, people, health, said, says 

Topic: 1
military, house, security, russia, government, npr, reports, says, news, people, told, police, president, trump, said 

Topic: 2
way, world, family, home, day, time, water, city, new, years, food, just, people, like, says 

Topic: 3
time, new, don, years, medical, disease, patients, just, children, study, like, women, health, people, says 

Topic: 4
voters, vote, election, party, new, obama, court, republican, campaign, people, state, president, clinton, said, trump 

Topic: 5
years, going, ve, life, don, new, way, music, really, time, know, think, people, just, like 

Topic: 6
student, years, data, science, university, people, time, schools, just, education, new, like, students, school, says 



In [25]:
# Same NMF topic listing as In [21], repeated here next to the LDA listing.
print_topwords(n=15, model=nmf, vectorizer=tfidf)

The top 15 words
----------------
Topic: 0
new, research, like, patients, health, disease, percent, women, virus, study, water, food, people, zika, says 

Topic: 1
gop, pence, presidential, russia, administration, election, republican, obama, white, house, donald, campaign, said, president, trump 

Topic: 2
senate, house, people, act, law, tax, plan, republicans, affordable, obamacare, coverage, medicaid, insurance, care, health 

Topic: 3
officers, syria, security, department, law, isis, russia, government, state, attack, president, reports, court, said, police 

Topic: 4
primary, cruz, election, democrats, percent, party, delegates, vote, state, democratic, hillary, campaign, voters, sanders, clinton 

Topic: 5
love, ve, don, album, way, time, song, life, really, know, people, think, just, music, like 

Topic: 6
teacher, state, high, says, parents, devos, children, college, kids, teachers, student, education, schools, school, students 



In [27]:
# Compare both models' labels per article. Topic ids are assigned independently
# by each model, so equal numbers in the two columns do not imply the same
# topic; use the word listings above to interpret each id.
npr.head(20)

Unnamed: 0,Article,Topic LDA,Topic NMF
0,"In the Washington of 2016, even when the polic...",1,1
1,Donald Trump has used Twitter — his prefe...,1,1
2,Donald Trump is unabashedly praising Russian...,1,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1,3
4,"From photography, illustration and video, to d...",2,6
5,I did not want to join yoga class. I hated tho...,3,5
6,With a who has publicly supported the debunk...,3,0
7,"I was standing by the airport exit, debating w...",2,0
8,"If movies were trying to be more realistic, pe...",3,0
9,"Eighteen years ago, on New Year’s Eve, David F...",2,5
