In [4]:
import numpy as np
import pandas as pd

In [5]:
# Read the article corpus from the CSV file in the working directory.
npr = pd.read_csv(filepath_or_buffer='npr.csv')

In [6]:
# Preview the first five rows of the loaded frame.
npr.head(n=5)

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
# TF-IDF vectorizer configuration used for the topic model below.
tfidf = TfidfVectorizer(
    max_df=0.95,           # ignore terms found in more than 95% of documents
    min_df=2,              # ignore terms found in fewer than 2 documents
    stop_words='english',  # use the built-in English stop-word list
)

In [9]:
# Learn the vocabulary from the 'Article' column and build the sparse
# document-term matrix in one step.
dtm = tfidf.fit_transform(raw_documents=npr['Article'])
dtm

<11992x54777 sparse matrix of type '<class 'numpy.float64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [10]:
from sklearn.decomposition import NMF

In [11]:
# Seven-component NMF; the fixed random_state keeps the factorization repeatable.
nmf_model = NMF(
    n_components=7,
    random_state=42,
)
nmf_model.fit(X=dtm)

In [12]:
# On the fitted NMF model this returns the generated component labels
# ('nmf0' ... 'nmf6'), not the vocabulary terms held by the vectorizer.
nmf_model.get_feature_names_out()

array(['nmf0', 'nmf1', 'nmf2', 'nmf3', 'nmf4', 'nmf5', 'nmf6'],
      dtype=object)

In [13]:
# Label of the first NMF component.
component_names = nmf_model.get_feature_names_out()
component_names[0]

'nmf0'

In [15]:
# Fetch the vocabulary array once. get_feature_names_out() builds a new array
# on each call, so calling it per word inside the loop repeats identical work.
feature_names = tfidf.get_feature_names_out()

for index, topic in enumerate(nmf_model.components_):
    # argsort is ascending, so the last 15 positions are the highest-weighted
    # terms for this topic, ordered from weakest to strongest.
    top_word_indices = topic.argsort()[-15:]
    print(f"THE TOP 15 WORDS FOR TOPIC # {index}")
    print([feature_names[i] for i in top_word_indices])
    print('\n')

THE TOP 15 WORDS FOR TOPIC # 0
['new', 'research', 'like', 'patients', 'health', 'disease', 'percent', 'women', 'virus', 'study', 'water', 'food', 'people', 'zika', 'says']


THE TOP 15 WORDS FOR TOPIC # 1
['gop', 'pence', 'presidential', 'russia', 'administration', 'election', 'republican', 'obama', 'white', 'house', 'donald', 'campaign', 'said', 'president', 'trump']


THE TOP 15 WORDS FOR TOPIC # 2
['senate', 'house', 'people', 'act', 'law', 'tax', 'plan', 'republicans', 'affordable', 'obamacare', 'coverage', 'medicaid', 'insurance', 'care', 'health']


THE TOP 15 WORDS FOR TOPIC # 3
['officers', 'syria', 'security', 'department', 'law', 'isis', 'russia', 'government', 'state', 'attack', 'president', 'reports', 'court', 'said', 'police']


THE TOP 15 WORDS FOR TOPIC # 4
['primary', 'cruz', 'election', 'democrats', 'percent', 'party', 'delegates', 'vote', 'state', 'democratic', 'hillary', 'campaign', 'voters', 'sanders', 'clinton']


THE TOP 15 WORDS FOR TOPIC # 5
['love', 've', 'don

In [17]:
# Project every document onto the fitted components: one row per article,
# one weight per topic. Show the weights of the first article.
topic_results = nmf_model.transform(X=dtm)
topic_results[0]

array([0.        , 0.12079653, 0.00139891, 0.05915242, 0.01519226,
       0.        , 0.        ])

In [19]:
# Index of the highest-weighted topic for each article (row-wise argmax).
np.argmax(topic_results, axis=1)

array([1, 1, 1, ..., 0, 4, 3], dtype=int64)

In [20]:
# Store each article's dominant topic index in a new 'Topic' column,
# then preview the result.
npr['Topic'] = np.argmax(topic_results, axis=1)
npr.head(n=5)

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3
4,"From photography, illustration and video, to d...",6


In [23]:
# Human-readable label for each NMF topic index, chosen by reading the top
# words printed for each topic.
mytopic_dict = {
    0: 'health',  # was misspelled 'heath', which leaked into the label column
    1: 'election1',
    2: 'legislation',
    3: 'politics',
    4: 'election2',
    5: 'music',
    6: 'education',
}
# Translate the numeric topic index of every article into its label.
npr['Topic Label'] = npr['Topic'].map(mytopic_dict)

In [24]:
# Display the frame, which now carries the 'Topic' and 'Topic Label' columns.
npr

Unnamed: 0,Article,Topic,Topic Label
0,"In the Washington of 2016, even when the polic...",1,election1
1,Donald Trump has used Twitter — his prefe...,1,election1
2,Donald Trump is unabashedly praising Russian...,1,election1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3,politics
4,"From photography, illustration and video, to d...",6,education
...,...,...,...
11987,The number of law enforcement officers shot an...,3,politics
11988,"Trump is busy these days with victory tours,...",1,election1
11989,It’s always interesting for the Goats and Soda...,0,heath
11990,The election of Donald Trump was a surprise to...,4,election2
