In [1]:
# non-negative matrix factorization

In [2]:
import pandas as pd

In [3]:
# Read the article dataset from 'npr.csv' (resolved relative to the working directory)
# and preview the first rows.
df = pd.read_csv('npr.csv')
df.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [4]:
df.shape # (number of rows, number of columns)

(11992, 1)

In [5]:
# Full text of the article stored under index label 0.
df.loc[0, 'Article']

'In the Washington of 2016, even when the policy can be bipartisan, the politics cannot. And in that sense, this year shows little sign of ending on Dec. 31. When President Obama moved to sanction Russia over its alleged interference in the U. S. election just concluded, some Republicans who had long called for similar or more severe measures could scarcely bring themselves to approve. House Speaker Paul Ryan called the Obama measures ”appropriate” but also ”overdue” and ”a prime example of this administration’s ineffective foreign policy that has left America weaker in the eyes of the world.” Other GOP leaders sounded much the same theme. ”[We have] been urging President Obama for years to take strong action to deter Russia’s worldwide aggression, including its   operations,” wrote Rep. Devin Nunes,  . chairman of the House Intelligence Committee. ”Now with just a few weeks left in office, the president has suddenly decided that some stronger measures are indeed warranted.” Appearing 

In [6]:
# preprocessing

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_df=0.90, min_df=2, stop_words='english')
# max_df=0.90: ignore terms that appear in more than 90% of the documents
# min_df=2: ignore terms that appear in fewer than 2 documents
# stop_words='english': drop scikit-learn's built-in English stop-word list
# (stop words are removed only because this argument is passed; the default keeps them)

In [8]:
# fit transform to the dataset

In [9]:
# Learn the vocabulary from the articles and encode each article as one TF-IDF row.
dtm = tfidf.fit_transform(df['Article']) # document-term matrix
dtm # no train/test split here because this is unsupervised learning

<11992x54777 sparse matrix of type '<class 'numpy.float64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [10]:
# nmf

In [11]:
from sklearn.decomposition import NMF

# Factorize the document-term matrix into 7 topics; random_state pins the seed
# so the fit is repeatable across runs.
nmf = NMF(n_components=7, random_state=42)
nmf.fit(dtm)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=7, random_state=42, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [12]:
# grab the vocabulary of words

In [13]:
# Size of the vocabulary kept after the max_df / min_df / stop-word filtering.
# vocabulary_ maps every kept term to its column index, so its length equals the
# number of feature names. Unlike get_feature_names() (removed in scikit-learn 1.2)
# it is available in every release and does not build a sorted list just to count it.
len(tfidf.vocabulary_)

54777

In [14]:
import random

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour
# of get_feature_names_out(); use whichever this installation provides, and fetch the
# vocabulary once instead of rebuilding it for every lookup.
get_feature_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
feature_names = get_feature_names()

print(feature_names[50000])

# random.randint includes its upper bound, so randint(0, 54777) could return 54777,
# one past the last valid index. randrange excludes the upper bound and tracks the
# real vocabulary size instead of a hard-coded number.
random_word_id = random.randrange(len(feature_names)) # pick any random word in the vocabulary
feature_names[random_word_id]

transcribe


'favoritism'

In [15]:
# grab the topics

In [16]:
# Show the number of rows in the fitted component matrix, then its full shape.
topic_term_weights = nmf.components_
display(len(topic_term_weights), topic_term_weights.shape)

7

(7, 54777)

In [17]:
# grab the highest-weighted words for a single topic

In [18]:
single_topic = nmf.components_[0] # first row of the component matrix: one weight per vocabulary term for topic 0
single_topic.argsort() # argsort returns the index positions that would sort this array in ascending order

array([    0, 27208, 27206, ..., 36283, 54692, 42993])

In [19]:
single_topic.argsort()[-10:] # the last 10 indices of the ascending sort are the 10 largest weights

array([14441, 36310, 53989, 52615, 47218, 53152, 19307, 36283, 54692,
       42993])

In [20]:
# Indices of the ten largest weights in the topic, in ascending order of weight.
top_ten_words = single_topic.argsort()[-10:]

# Fetch the vocabulary once, outside the loop: every call rebuilds the full term list.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(), so use whichever this installation provides.
get_feature_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
feature_names = get_feature_names()

for index in top_ten_words:
    print(feature_names[index])

# these 10 words carry the largest weights in this single topic
# (they are NMF weights, not probabilities; the last word printed is the strongest)

disease
percent
women
virus
study
water
food
people
zika
says


In [21]:
# grab the highest-weighted words per topic

In [22]:
# Fetch the vocabulary once: the original rebuilt the whole term list for every
# word of every topic (7 x 15 calls). get_feature_names() was removed in
# scikit-learn 1.2 in favour of get_feature_names_out(), so use whichever exists.
get_feature_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
feature_names = get_feature_names()

# For each topic, list its 15 largest-weight terms in ascending order of weight.
for i,topic in enumerate(nmf.components_):
    print(f"the top 15 words for the topic: #{i}")
    print([feature_names[index] for index in topic.argsort()[-15:]], '\n')

the top 15 words for the topic: #0
['new', 'research', 'like', 'patients', 'health', 'disease', 'percent', 'women', 'virus', 'study', 'water', 'food', 'people', 'zika', 'says'] 

the top 15 words for the topic: #1
['gop', 'pence', 'presidential', 'russia', 'administration', 'election', 'republican', 'obama', 'white', 'house', 'donald', 'campaign', 'said', 'president', 'trump'] 

the top 15 words for the topic: #2
['senate', 'house', 'people', 'act', 'law', 'tax', 'plan', 'republicans', 'affordable', 'obamacare', 'coverage', 'medicaid', 'insurance', 'care', 'health'] 

the top 15 words for the topic: #3
['officers', 'syria', 'security', 'department', 'law', 'isis', 'russia', 'government', 'state', 'attack', 'president', 'reports', 'court', 'said', 'police'] 

the top 15 words for the topic: #4
['primary', 'cruz', 'election', 'democrats', 'percent', 'party', 'delegates', 'vote', 'state', 'democratic', 'hillary', 'campaign', 'voters', 'sanders', 'clinton'] 

the top 15 words for the topic

In [23]:
# attach topic numbers to original article

In [24]:
# Project the document-term matrix onto the fitted topics:
# one row per article, one column per topic.
topic_results = nmf.transform(dtm)

In [25]:
topic_results[0].round(2) # weight of each topic in the first document
# These are NMF weights, not probabilities (they do not sum to 1).
# The largest value, 0.12 at index 1, means topic 1 is the strongest topic for document 0.

array([0.  , 0.12, 0.  , 0.06, 0.02, 0.  , 0.  ])

In [26]:
topic_results[0].argmax() # index of the largest weight, i.e. the dominant topic of the first document

1

In [27]:
# Label every article with the topic that has the largest weight in its row.
# NOTE: this adds a column to df in place.
df['Topic'] = topic_results.argmax(axis=1)

In [28]:
# Hand-picked names for the topics; a name's position in the list is its topic number.
topic_names = ['Health', 'Politics', 'Legis', 'Security', 'Election', 'Music', 'Education']
mytopic_dict = dict(enumerate(topic_names))

# Translate each article's numeric topic into its readable label and preview the result.
df['Topic Label'] = df['Topic'].map(mytopic_dict)
df.head()

Unnamed: 0,Article,Topic,Topic Label
0,"In the Washington of 2016, even when the polic...",1,Politics
1,Donald Trump has used Twitter — his prefe...,1,Politics
2,Donald Trump is unabashedly praising Russian...,1,Politics
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3,Security
4,"From photography, illustration and video, to d...",6,Education
