# Topic modeling using Non-Negative Matrix Factorization (NMF) on a dataset of news articles (presumably from NPR)

In [1]:
import zipfile

# Unpack the bundled archive into the current working directory
# (presumably it contains the npr.csv file read in the next cell).
with zipfile.ZipFile("npr.csv.zip") as npr_archive:
    npr_archive.extractall()

In [2]:
import pandas as pd

# Read the extracted articles into a DataFrame and preview the first five rows.
npr = pd.read_csv("npr.csv")
npr.head(n=5)

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the article text into a TF-IDF document-term matrix.
tfidf = TfidfVectorizer(
    stop_words="english",  # drop common English stop words
    max_df=0.95,           # ignore terms that occur in more than 95% of the articles
    min_df=2,              # ignore terms that occur in fewer than 2 articles
)
dtm = tfidf.fit_transform(npr["Article"])

In [4]:
from sklearn.decomposition import NMF

# Factorize the document-term matrix into 7 topics; the fixed random_state
# keeps the factorization (and therefore the topic numbering) repeatable.
nmf_model = NMF(random_state=42, n_components=7)
nmf_model.fit(dtm)

In [5]:
# Number of distinct terms (features) kept by the TF-IDF vectorizer.
tfidf.get_feature_names_out().shape[0]

54777

In [7]:
import random

# Fetch the vocabulary once: get_feature_names_out() recomputes the whole
# array on each call, so calling it inside the loop is wasted work.
feature_names = tfidf.get_feature_names_out()

# Print ten randomly chosen vocabulary terms as a quick sanity check of the
# TF-IDF features. The index range is derived from the vocabulary itself
# rather than a hard-coded size, so the cell keeps working if the data or
# the vectorizer settings change.
for i in range(10):
    random_word_id = random.randrange(len(feature_names))
    print(feature_names[random_word_id])

sentimental
bagley
saucer
ula
shampoos
amounted
dings
hungarians
spices
harnessed


In [8]:
# Number of rows in components_, i.e. one row per fitted topic.
nmf_model.components_.shape[0]

7

In [9]:
# Length of a single topic row in components_: one weight per vocabulary term.
# (The former bare `nmf_model.components_` statement was dropped: it was not
# the last expression of the cell, so its value was silently discarded.)
nmf_model.components_.shape[1]

54777

In [10]:
# Take the weight vector of the first topic (row 0 of components_).
single_topic = nmf_model.components_[0]

# argsort() returns the vocabulary indices ordered from the smallest weight
# to the largest, so the most important terms for this topic come last.
single_topic.argsort()

array([    0, 27208, 27206, ..., 36283, 54692, 42993])

In [11]:
# Number of term weights held by the first topic's vector.
single_topic.shape[0]

54777

In [12]:
# Weight that the first topic assigns to the vocabulary term at index 18302.
single_topic[18302]

0.0

In [13]:
# Weight of the term at index 42993, the last index returned by argsort()
# above, i.e. the highest-weighted term of the first topic.
single_topic[42993]

2.0162023252917702

In [14]:
# argsort() is ascending, so the last ten entries are the vocabulary indices
# of the ten largest weights (still ordered from smaller to larger).
single_topic.argsort()[-10:]

array([14441, 36310, 53989, 52615, 47218, 53152, 19307, 36283, 54692,
       42993])

In [15]:
# Fetch the vocabulary once instead of recomputing it for every printed word.
feature_names = tfidf.get_feature_names_out()

# Indices of the ten highest-weighted terms of the first topic (ascending order).
top_word_indices = single_topic.argsort()[-10:]

# Translate each index back into its vocabulary term.
for index in top_word_indices:
    print(feature_names[index])

disease
percent
women
virus
study
water
food
people
zika
says


In [16]:
# Fetch the vocabulary once; the original looked it up again for every single
# word of every topic, recomputing the full feature array each time.
feature_names = tfidf.get_feature_names_out()

# components_ holds one weight vector per topic; for each one, print the
# 15 highest-weighted vocabulary terms (ordered from lower to higher weight).
for index,topic in enumerate(nmf_model.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([feature_names[i] for i in topic.argsort()[-15:]])
    print('\n')

# Shape of the document-term matrix: (number of articles, vocabulary size).
dtm.shape

THE TOP 15 WORDS FOR TOPIC #0
['new', 'research', 'like', 'patients', 'health', 'disease', 'percent', 'women', 'virus', 'study', 'water', 'food', 'people', 'zika', 'says']


THE TOP 15 WORDS FOR TOPIC #1
['gop', 'pence', 'presidential', 'russia', 'administration', 'election', 'republican', 'obama', 'white', 'house', 'donald', 'campaign', 'said', 'president', 'trump']


THE TOP 15 WORDS FOR TOPIC #2
['senate', 'house', 'people', 'act', 'law', 'tax', 'plan', 'republicans', 'affordable', 'obamacare', 'coverage', 'medicaid', 'insurance', 'care', 'health']


THE TOP 15 WORDS FOR TOPIC #3
['officers', 'syria', 'security', 'department', 'law', 'isis', 'russia', 'government', 'state', 'attack', 'president', 'reports', 'court', 'said', 'police']


THE TOP 15 WORDS FOR TOPIC #4
['primary', 'cruz', 'election', 'democrats', 'percent', 'party', 'delegates', 'vote', 'state', 'democratic', 'hillary', 'campaign', 'voters', 'sanders', 'clinton']


THE TOP 15 WORDS FOR TOPIC #5
['love', 've', 'don', 'al

(11992, 54777)

In [17]:
# Number of articles (rows) in the DataFrame.
npr.shape[0]

11992

In [18]:
# Project every article onto the fitted topics: the result has one row per
# article and one column per topic.
topic_results = nmf_model.transform(dtm)

topic_results.shape

(11992, 7)

In [19]:
# Topic weights of the first article, rounded to two decimals for readability.
topic_results[0].round(2)

array([0.  , 0.12, 0.  , 0.06, 0.02, 0.  , 0.  ])

In [20]:
# Index of the topic with the largest weight for the first article.
topic_results[0].argmax()

1

In [21]:
# Preview the first articles again, to compare their text with the topic found above.
npr.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [22]:
# For every article (row), the index of its highest-weighted topic.
topic_results.argmax(axis=1)

array([1, 1, 1, ..., 0, 4, 3])

In [23]:
# Tag each article with its dominant topic: the column index of the largest
# weight in that article's row of topic_results.
npr['Topic'] = topic_results.argmax(axis=1)

# Hand-assigned, human-readable names for the 7 NMF topics (presumably chosen
# from the top words printed for each topic). The numbering is only meaningful
# for this particular fit (n_components=7, random_state=42).
# Fixed the misspelled label 'educaion' -> 'education'.
topicdict = {
    0: 'health',
    1: 'election',
    2: 'legis',
    3: 'policy',
    4: 'candidates',
    5: 'music',
    6: 'education',
}

# Translate the numeric topic id into its label.
npr['Topic Label'] = npr['Topic'].map(topicdict)

npr.head(10)

Unnamed: 0,Article,Topic,Topic Label
0,"In the Washington of 2016, even when the polic...",1,election
1,Donald Trump has used Twitter — his prefe...,1,election
2,Donald Trump is unabashedly praising Russian...,1,election
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3,policy
4,"From photography, illustration and video, to d...",6,educaion
5,I did not want to join yoga class. I hated tho...,5,music
6,With a who has publicly supported the debunk...,0,health
7,"I was standing by the airport exit, debating w...",0,health
8,"If movies were trying to be more realistic, pe...",0,health
9,"Eighteen years ago, on New Year’s Eve, David F...",5,music
