In [5]:
import pandas as pd

In [6]:
# Load the NPR articles from the local CSV and preview the first five rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# saved row index — consider index_col=0 if it is not needed as data.
df = pd.read_csv('npr.csv')
df.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [7]:
# data preprocessing

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
# Bag-of-words vectorizer: ignore terms found in more than 90% of the
# documents (max_df=0.9) or in fewer than 2 documents (min_df=2), and drop
# scikit-learn's built-in English stop words.
cv = CountVectorizer(max_df=0.9, min_df=2, stop_words='english')

In [10]:
# Learn the vocabulary from the 'Article' column and build the
# document-term count matrix in a single pass.
dtm = cv.fit_transform(df['Article'])

In [11]:
from sklearn.decomposition import LatentDirichletAllocation

# LDA topic model with 7 topics; random_state is pinned for reproducibility.
LDA = LatentDirichletAllocation(n_components=7, random_state=42)

In [12]:
# Fit the LDA model on the document-term matrix.
LDA.fit(dtm)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=7, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [13]:
# get topics
# components_ holds one row of per-word weights for each topic; take topic 0.
first_topic = LDA.components_[0]

In [14]:
# Number of word weights in the topic row (one per vocabulary term).
len(first_topic)

54777

In [15]:
# Resolve the vocabulary once instead of rebuilding it on every iteration.
# get_feature_names() was removed in scikit-learn 1.2, so prefer its
# replacement get_feature_names_out() when the installed version provides it.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# argsort() sorts ascending, so the last 10 indices are the 10 highest-weighted
# words of this topic (the strongest word is printed last).
for word_index in first_topic.argsort()[-10:]:
    print(feature_names[word_index])

new
percent
government
company
million
care
people
health
said
says


In [16]:
# Resolve the vocabulary once; the original rebuilt the full list for every
# single word printed. get_feature_names() was removed in scikit-learn 1.2,
# so prefer get_feature_names_out() when the installed version provides it.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# For every topic, list its 15 highest-weighted words (ascending, strongest last).
for index, topic in enumerate(LDA.components_):
    print(f'Top 15 words in topic:{index}')
    # Use a distinct name for the word position so it does not shadow the
    # topic number `index` from the enclosing loop.
    top_word_indices = topic.argsort()[-15:]
    print([feature_names[word_index] for word_index in top_word_indices])

Top 15 words in topic:0
['companies', 'money', 'year', 'federal', '000', 'new', 'percent', 'government', 'company', 'million', 'care', 'people', 'health', 'said', 'says']
Top 15 words in topic:1
['military', 'house', 'security', 'russia', 'government', 'npr', 'reports', 'says', 'news', 'people', 'told', 'police', 'president', 'trump', 'said']
Top 15 words in topic:2
['way', 'world', 'family', 'home', 'day', 'time', 'water', 'city', 'new', 'years', 'food', 'just', 'people', 'like', 'says']
Top 15 words in topic:3
['time', 'new', 'don', 'years', 'medical', 'disease', 'patients', 'just', 'children', 'study', 'like', 'women', 'health', 'people', 'says']
Top 15 words in topic:4
['voters', 'vote', 'election', 'party', 'new', 'obama', 'court', 'republican', 'campaign', 'people', 'state', 'president', 'clinton', 'said', 'trump']
Top 15 words in topic:5
['years', 'going', 've', 'life', 'don', 'new', 'way', 'music', 'really', 'time', 'know', 'think', 'people', 'just', 'like']
Top 15 words in top

In [17]:
# Per-document topic scores from the fitted LDA model
# (one row per article, one column per topic).
top_results = LDA.transform(dtm)

In [19]:
# Label each article with its highest-scoring topic (column index of the row maximum).
# NOTE: this adds a column to df in place, so earlier previews of df no longer
# reflect its current state.
df['Topic'] = top_results.argmax(axis=1)
df

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1
4,"From photography, illustration and video, to d...",2
...,...,...
11987,The number of law enforcement officers shot an...,1
11988,"Trump is busy these days with victory tours,...",4
11989,It’s always interesting for the Goats and Soda...,3
11990,The election of Donald Trump was a surprise to...,4


In [20]:
# NMF (Non-negative Matrix Factorization) for topic modelling

In [21]:
import pandas as pd

In [23]:
# Load the NPR articles for the NMF section.
# The previous call (`pd.readnpr.csvr.csvsv`) was a garbled attribute chain that
# raises AttributeError; pd.read_csv is the intended reader.
npr = pd.read_csv('npr.csv')

In [25]:
# Preview the first five rows to confirm the load.
npr.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer: ignore terms found in more than 90% of the documents
# (max_df=0.90) or in fewer than 2 documents (min_df=2), and drop
# scikit-learn's built-in English stop words.
tfidf = TfidfVectorizer(max_df=0.90, min_df=2, stop_words='english')

In [33]:
# Build the TF-IDF document-term matrix from the 'Article' column.
# NOTE: this rebinds `dtm`, which earlier held the CountVectorizer matrix used
# by the LDA cells; re-running those cells after this one would feed them
# TF-IDF weights instead of raw counts.
dtm = tfidf.fit_transform(npr['Article'])

In [34]:
from sklearn.decomposition import NMF

In [36]:
# NMF model with 7 components (topics); random_state is pinned for reproducibility.
nmf_model = NMF(n_components=7, random_state=42)

In [38]:
# Fit the NMF model on the TF-IDF document-term matrix.
nmf_model.fit(dtm)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=7, random_state=42, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [54]:
# Resolve the vocabulary once; the original rebuilt the full list for every
# single word printed. get_feature_names() was removed in scikit-learn 1.2,
# so prefer get_feature_names_out() when the installed version provides it.
if hasattr(tfidf, 'get_feature_names_out'):
    tfidf_feature_names = tfidf.get_feature_names_out()
else:
    tfidf_feature_names = tfidf.get_feature_names()

# Each row of components_ is one topic's word weights; show its 10
# highest-weighted words (ascending, strongest last), followed by blank lines.
for topic in nmf_model.components_:
    print([tfidf_feature_names[word_index] for word_index in topic.argsort()[-10:]])
    print('\n')

['disease', 'percent', 'women', 'virus', 'study', 'water', 'food', 'people', 'zika', 'says']


['election', 'republican', 'obama', 'white', 'house', 'donald', 'campaign', 'said', 'president', 'trump']


['tax', 'plan', 'republicans', 'affordable', 'obamacare', 'coverage', 'medicaid', 'insurance', 'care', 'health']


['isis', 'russia', 'government', 'state', 'attack', 'president', 'reports', 'court', 'said', 'police']


['party', 'delegates', 'vote', 'state', 'democratic', 'hillary', 'campaign', 'voters', 'sanders', 'clinton']


['time', 'song', 'life', 'really', 'know', 'people', 'think', 'just', 'music', 'like']


['devos', 'children', 'college', 'kids', 'teachers', 'student', 'education', 'schools', 'school', 'students']




In [64]:
# Per-document component weights from the fitted NMF model
# (one row per article, one column per topic).
topic_result = nmf_model.transform(dtm)

In [65]:
# Sanity check: expect (number of articles, number of topics).
topic_result.shape

(11992, 7)

In [74]:
# Index of the highest-weighted topic for the first article.
topic_result[0].argmax()

1