In [1]:
import pandas as pd

In [2]:
# Load the article corpus from npr.csv (path is relative to the working directory).
npr = pd.read_csv('npr.csv')

In [4]:
npr.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [7]:
len(npr['Article'])

11992

In [8]:
npr['Article'][4000]

'The headline shocked the   world of the surface Navy: Seven sailors aboard the destroyer USS Fitzgerald were killed, and other crew members injured, when the warship collided with a cargo vessel off Japan. As the Navy family grieves, both it and the wider world are asking the same question: How did this happen? The short answer is that no one knows  —   yet. Official inquiries into what led up to the encounter could take months or more. The Navy and the U. S. Coast Guard both likely will eventually issue reports that describe what happened and could make recommendations for preventing another such accident. ”I will not speculate on how long these investigations will last,” said Vice Adm. Joseph Aucoin, commander of the Navy’s 7th Fleet. The Fitzgerald and the other ships of Destroyer Squadron 15, based outside Tokyo, fall under his authority. There are clues, however, that explain how something like the Fitzgerald’s collision could happen, including photographs of the ships involved, 

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
cv = CountVectorizer(
    max_df=0.9, # drop terms that appear in more than 90% of the documents (too common to separate topics)
    min_df=2, # keep a term only if it appears in at least 2 documents
    stop_words='english' # remove English stop words (a, the, for, ...)
)

In [12]:
dtm = cv.fit_transform(npr['Article']) # learn the vocabulary and build the sparse document-term count matrix

In [13]:
dtm # document x word

<11992x54777 sparse matrix of type '<class 'numpy.int64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [14]:
from sklearn.decomposition import LatentDirichletAllocation

In [15]:
# Model with 7 topics; random_state is fixed so repeated fits give the same result.
LDA = LatentDirichletAllocation(n_components=7, random_state=42)

In [16]:
LDA.fit(dtm) # LDA model

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=7, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [17]:
# Grab the vocabulary of words

In [18]:
len(cv.get_feature_names())

54777

In [20]:
type(cv.get_feature_names())

list

In [23]:
import random

# Fetch the vocabulary once and derive the bound from it instead of hard-coding 54777.
# NOTE: scikit-learn 1.0+ deprecates get_feature_names() in favour of get_feature_names_out().
feature_names = cv.get_feature_names()

# random.randint(a, b) includes b, so randint(0, 54777) could return an index one past
# the end of the 54777-word list; randrange(n) yields 0..n-1.
random_word_id = random.randrange(len(feature_names))

feature_names[random_word_id]

'irs'

In [None]:
# Grab the topics

In [24]:
len(LDA.components_)

7

In [25]:
type(LDA.components_)

numpy.ndarray

In [27]:
LDA.components_.shape #(topics, words)

(7, 54777)

In [26]:
LDA.components_ # all the topics and words

array([[8.64332806e+00, 2.38014333e+03, 1.42900522e-01, ...,
        1.43006821e-01, 1.42902042e-01, 1.42861626e-01],
       [2.76191749e+01, 5.36394437e+02, 1.42857148e-01, ...,
        1.42861973e-01, 1.42857147e-01, 1.42906875e-01],
       [7.22783888e+00, 8.24033986e+02, 1.42857148e-01, ...,
        6.14236247e+00, 2.14061364e+00, 1.42923753e-01],
       ...,
       [3.11488651e+00, 3.50409655e+02, 1.42857147e-01, ...,
        1.42859912e-01, 1.42857146e-01, 1.42866614e-01],
       [4.61486388e+01, 5.14408600e+01, 3.14281373e+00, ...,
        1.43107628e-01, 1.43902481e-01, 2.14271779e+00],
       [4.93991422e-01, 4.18841042e+02, 1.42857151e-01, ...,
        1.42857146e-01, 1.43760101e-01, 1.42866201e-01]])

In [29]:
single_topic = LDA.components_[0] # grab the first topic's row of per-word weights

In [30]:
single_topic.argsort() # indexes from lowest to highest value

array([ 2475, 18302, 35285, ..., 22673, 42561, 42993])

### Example

In [31]:
import numpy as np

In [32]:
arr = np.array([10, 200, 1])

In [33]:
arr

array([ 10, 200,   1])

In [34]:
arr.argsort() # indexes ordered from lowest to highest value

array([2, 0, 1])

In [35]:
# argsort() returns index positions ordered from the smallest to the largest value,
# so the last 10 entries are the indices of the 10 largest weights in this topic.
single_topic.argsort()[-10:] # last 10 values of .argsort()

array([33390, 36310, 21228, 10425, 31464,  8149, 36283, 22673, 42561,
       42993])

In [39]:
top_twenty_words = single_topic.argsort()[-20:]

In [40]:
# Print the twenty highest-weighted words of the selected topic (ascending weight).
# cv.get_feature_names() returns the whole vocabulary list, so fetch it once
# instead of once per printed word.
feature_names = cv.get_feature_names()
for index in top_twenty_words:
    print(feature_names[index])

president
state
tax
insurance
trump
companies
money
year
federal
000
new
percent
government
company
million
care
people
health
said
says


In [42]:
# Show the 15 highest-weighted words for every topic.
# The vocabulary lookup is loop-invariant: fetch the list once rather than
# calling cv.get_feature_names() for each of the 15 words of each topic.
feature_names = cv.get_feature_names()
for i, topic in enumerate(LDA.components_):
    print(f"THE TOP 15 WORDS FOR TOPIC #{i}")
    # argsort() is ascending, so the last 15 indices are the largest weights.
    print([feature_names[index] for index in topic.argsort()[-15:]])
    print('\n')
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['companies', 'money', 'year', 'federal', '000', 'new', 'percent', 'government', 'company', 'million', 'care', 'people', 'health', 'said', 'says']




THE TOP 15 WORDS FOR TOPIC #1
['military', 'house', 'security', 'russia', 'government', 'npr', 'reports', 'says', 'news', 'people', 'told', 'police', 'president', 'trump', 'said']




THE TOP 15 WORDS FOR TOPIC #2
['way', 'world', 'family', 'home', 'day', 'time', 'water', 'city', 'new', 'years', 'food', 'just', 'people', 'like', 'says']




THE TOP 15 WORDS FOR TOPIC #3
['time', 'new', 'don', 'years', 'medical', 'disease', 'patients', 'just', 'children', 'study', 'like', 'women', 'health', 'people', 'says']




THE TOP 15 WORDS FOR TOPIC #4
['voters', 'vote', 'election', 'party', 'new', 'obama', 'court', 'republican', 'campaign', 'people', 'state', 'president', 'clinton', 'said', 'trump']




THE TOP 15 WORDS FOR TOPIC #5
['years', 'going', 've', 'life', 'don', 'new', 'way', 'music', 'really', 'time', 'know'

In [43]:
# Score every document against every topic: one row per document, one column per topic.
topic_results = LDA.transform(dtm)

In [45]:
topic_results.shape

(11992, 7)

In [46]:
topic_results[0]

array([1.61040465e-02, 6.83341493e-01, 2.25376318e-04, 2.25369288e-04,
       2.99652737e-01, 2.25479379e-04, 2.25497980e-04])

In [47]:
# Show how strongly the first document relates to each topic, rounded to two decimals.
topic_results[0].round(2)

array([0.02, 0.68, 0.  , 0.  , 0.3 , 0.  , 0.  ])

In [48]:
# Index of the topic with the highest probability for the first document.
topic_results[0].argmax()

1

In [49]:
npr['Topic'] = topic_results.argmax(axis=1) # most probable topic for each document

In [50]:
npr

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1
4,"From photography, illustration and video, to d...",2
...,...,...
11987,The number of law enforcement officers shot an...,1
11988,"Trump is busy these days with victory tours,...",4
11989,It’s always interesting for the Goats and Soda...,3
11990,The election of Donald Trump was a surprise to...,4
