# Latent Dirichlet Allocation

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('ds_cat_head_descr.csv')

In [3]:
df.head()

Unnamed: 0,category,head_descr
0,CRIME,"there were 2 mass shootings in teas last week,..."
1,ENTERTAINMENT,hugh grant marries for the first time at age 5...
2,ENTERTAINMENT,jim carrey blasts 'castrato' adam schiff and d...
3,ENTERTAINMENT,julianna margulies uses donald trump poop bags...
4,ENTERTAINMENT,morgan freeman 'devastated' that seual harassm...


Notice how we don't have the topic of the articles! Let's use LDA to attempt to figure out clusters of the articles.

## Preprocessing

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

**`max_df`**` : float in range [0.0, 1.0] or int, default=1.0`<br>
When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words). If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None.

**`min_df`**` : float in range [0.0, 1.0] or int, default=1`<br>
When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None.

In [5]:
cv = CountVectorizer(max_df=0.95, min_df=1, stop_words='english')

In [6]:
dtm = cv.fit_transform(df['head_descr'])

In [7]:
dtm

<148982x76968 sparse matrix of type '<class 'numpy.int64'>'
	with 2393412 stored elements in Compressed Sparse Row format>

## LDA

In [8]:
from sklearn.decomposition import LatentDirichletAllocation

In [9]:
LDA = LatentDirichletAllocation(n_components=41,random_state=42)

In [10]:
# This can take awhile, we're dealing with a large amount of documents!
LDA.fit(dtm)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=41, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

## Showing Stored Words

In [11]:
len(cv.get_feature_names())

76968

In [12]:
import random

In [13]:
for i in range(10):
    random_word_id = random.randint(0,54776)
    print(cv.get_feature_names()[random_word_id])

inslee
conscious
neonatologist
lemonades
malecon
jiang
distressed
debora
chun
diverge


In [14]:
for i in range(10):
    random_word_id = random.randint(0,54776)
    print(cv.get_feature_names()[random_word_id])

enemigo
proprioception
kiana
basler
endangers
kitted
marsha
archived
confiscated
gratuitous


### Showing Top Words Per Topic

In [15]:
len(LDA.components_)

41

In [16]:
LDA.components_

array([[2.51540203e-02, 3.91471015e+01, 2.43902439e-02, ...,
        2.43902439e-02, 2.43902439e-02, 2.43902439e-02],
       [2.43902439e-02, 5.05418079e+01, 2.43902439e-02, ...,
        2.43902439e-02, 2.43902439e-02, 2.43902439e-02],
       [2.43902439e-02, 8.24580262e-02, 2.43902439e-02, ...,
        2.43902439e-02, 2.43902439e-02, 2.43902439e-02],
       ...,
       [2.43902439e-02, 1.42752259e+02, 2.43902439e-02, ...,
        2.43902439e-02, 2.43902439e-02, 2.43902439e-02],
       [2.43902439e-02, 8.56778182e+01, 2.43902439e-02, ...,
        2.43902439e-02, 2.43902439e-02, 2.43902439e-02],
       [2.43902439e-02, 1.80892149e+00, 2.43902439e-02, ...,
        2.43902439e-02, 2.43902439e-02, 2.43902439e-02]])

In [17]:
len(LDA.components_[0])

76968

In [18]:
single_topic = LDA.components_[0]

In [19]:
# Returns the indices that would sort this array.
single_topic.argsort()

array([38483, 47491, 47492, ..., 73587, 25824, 25396], dtype=int64)

In [21]:
# Word most representative of this topic
single_topic[42993]

0.024390243902439025

In [22]:
# Top 10 words for this topic:
single_topic.argsort()[-10:]

array([68881, 13880, 52334, 59006, 73456, 74655, 32236, 73587, 25824,
       25396], dtype=int64)

In [23]:
top_word_indices = single_topic.argsort()[-10:]

In [24]:
for index in top_word_indices:
    print(cv.get_feature_names()[index])

thousands
clothing
plane
roundup
videos
weekly
home
vintage
flight
finds


These look like business articles perhaps... Let's confirm by using .transform() on our vectorized articles to attach a label number. But first, let's view all the 10 topics found.

In [27]:
topWords = []
for index,topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    topWords.append([cv.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print(topWords[index])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['travel', 'cuba', 'ebay', 'photos', 'french', 'thousands', 'clothing', 'plane', 'roundup', 'videos', 'weekly', 'home', 'vintage', 'flight', 'finds']


THE TOP 15 WORDS FOR TOPIC #1
['francis', 'comey', 'asian', 'fbi', 'chinese', 'countries', 'country', 'trade', 'refugees', 'war', 'pope', 'america', 'american', 'china', 'world']


THE TOP 15 WORDS FOR TOPIC #2
['dog', 'cnn', 'rape', 'pence', 'kelly', 'says', 'steve', 'fo', 'mike', 'assault', 'news', 'trump', 'seual', 'debate', 'women']


THE TOP 15 WORDS FOR TOPIC #3
['host', 'james', 'hate', 'said', 'colbert', 'hill', 'says', 'stephen', 'news', 'like', 'black', 'women', 'men', 'house', 'white']


THE TOP 15 WORDS FOR TOPIC #4
['abuse', 'stone', 'accused', 'wage', 'carpet', 'harassment', 'jennifer', 'boys', 'actress', 'black', 'golden', 'hollywood', 'seual', 'brown', 'red']


THE TOP 15 WORDS FOR TOPIC #5
['year', 'crisis', 'states', 'valentine', 'korea', 'global', 'united', 'water', 'north', 'war', 'world

In [28]:
topWords

[['travel',
  'cuba',
  'ebay',
  'photos',
  'french',
  'thousands',
  'clothing',
  'plane',
  'roundup',
  'videos',
  'weekly',
  'home',
  'vintage',
  'flight',
  'finds'],
 ['francis',
  'comey',
  'asian',
  'fbi',
  'chinese',
  'countries',
  'country',
  'trade',
  'refugees',
  'war',
  'pope',
  'america',
  'american',
  'china',
  'world'],
 ['dog',
  'cnn',
  'rape',
  'pence',
  'kelly',
  'says',
  'steve',
  'fo',
  'mike',
  'assault',
  'news',
  'trump',
  'seual',
  'debate',
  'women'],
 ['host',
  'james',
  'hate',
  'said',
  'colbert',
  'hill',
  'says',
  'stephen',
  'news',
  'like',
  'black',
  'women',
  'men',
  'house',
  'white'],
 ['abuse',
  'stone',
  'accused',
  'wage',
  'carpet',
  'harassment',
  'jennifer',
  'boys',
  'actress',
  'black',
  'golden',
  'hollywood',
  'seual',
  'brown',
  'red'],
 ['year',
  'crisis',
  'states',
  'valentine',
  'korea',
  'global',
  'united',
  'water',
  'north',
  'war',
  'world',
  'women',
  'cl

In [29]:
dictOfWords = { i : topWords[i] for i in range(0, len(topWords) ) }

In [30]:
dictOfWords

{0: ['travel',
  'cuba',
  'ebay',
  'photos',
  'french',
  'thousands',
  'clothing',
  'plane',
  'roundup',
  'videos',
  'weekly',
  'home',
  'vintage',
  'flight',
  'finds'],
 1: ['francis',
  'comey',
  'asian',
  'fbi',
  'chinese',
  'countries',
  'country',
  'trade',
  'refugees',
  'war',
  'pope',
  'america',
  'american',
  'china',
  'world'],
 2: ['dog',
  'cnn',
  'rape',
  'pence',
  'kelly',
  'says',
  'steve',
  'fo',
  'mike',
  'assault',
  'news',
  'trump',
  'seual',
  'debate',
  'women'],
 3: ['host',
  'james',
  'hate',
  'said',
  'colbert',
  'hill',
  'says',
  'stephen',
  'news',
  'like',
  'black',
  'women',
  'men',
  'house',
  'white'],
 4: ['abuse',
  'stone',
  'accused',
  'wage',
  'carpet',
  'harassment',
  'jennifer',
  'boys',
  'actress',
  'black',
  'golden',
  'hollywood',
  'seual',
  'brown',
  'red'],
 5: ['year',
  'crisis',
  'states',
  'valentine',
  'korea',
  'global',
  'united',
  'water',
  'north',
  'war',
  'world'

### Attaching Discovered Topic Labels to Original Articles

In [31]:
dtm

<148982x76968 sparse matrix of type '<class 'numpy.int64'>'
	with 2393412 stored elements in Compressed Sparse Row format>

In [32]:
dtm.shape

(148982, 76968)

In [33]:
len(df)

148982

In [34]:
topic_results = LDA.transform(dtm)

In [35]:
topic_results.shape

(148982, 41)

In [36]:
topic_results[0]

array([0.00187617, 0.00187617, 0.00187617, 0.00187617, 0.00187617,
       0.00187617, 0.00187617, 0.00187617, 0.00187617, 0.00187617,
       0.00187617, 0.00187617, 0.00187617, 0.00187617, 0.00187617,
       0.00187617, 0.00187617, 0.10334108, 0.00187617, 0.00187617,
       0.00187617, 0.00187617, 0.00187617, 0.00187617, 0.00187617,
       0.00187617, 0.00187617, 0.00187617, 0.31574099, 0.00187617,
       0.00187617, 0.12864809, 0.00187617, 0.00187617, 0.00187617,
       0.00187617, 0.00187617, 0.00187617, 0.38285145, 0.00187617,
       0.00187617])

In [37]:
topic_results[0].round(2)

array([0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.1 , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.32, 0.  , 0.  , 0.13, 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.38, 0.  , 0.  ])

In [38]:
topic_results[0].argmax()

38

This means that our model thinks that the first article belongs to topic #1.

### Combining with Original Data

In [39]:
df.head()

Unnamed: 0,category,head_descr
0,CRIME,"there were 2 mass shootings in teas last week,..."
1,ENTERTAINMENT,hugh grant marries for the first time at age 5...
2,ENTERTAINMENT,jim carrey blasts 'castrato' adam schiff and d...
3,ENTERTAINMENT,julianna margulies uses donald trump poop bags...
4,ENTERTAINMENT,morgan freeman 'devastated' that seual harassm...


In [40]:
topic_results.argmax(axis=1)

array([38,  8, 40, ..., 10,  0, 37], dtype=int64)

In [41]:
df['Topic'] = topic_results.argmax(axis=1)

In [42]:
df.head(10)

Unnamed: 0,category,head_descr,Topic
0,CRIME,"there were 2 mass shootings in teas last week,...",38
1,ENTERTAINMENT,hugh grant marries for the first time at age 5...,8
2,ENTERTAINMENT,jim carrey blasts 'castrato' adam schiff and d...,40
3,ENTERTAINMENT,julianna margulies uses donald trump poop bags...,37
4,ENTERTAINMENT,morgan freeman 'devastated' that seual harassm...,3
5,ENTERTAINMENT,donald trump is lovin' new mcdonald's jingle i...,14
6,ENTERTAINMENT,what to watch on amazon prime that’s new this ...,31
7,ENTERTAINMENT,mike myers reveals he'd 'like to' do a fourth ...,8
8,ENTERTAINMENT,what to watch on hulu that’s new this week you...,8
9,ENTERTAINMENT,justin timberlake visits teas school shooting ...,35


In [43]:
df['Top Words'] = df.Topic.map(dictOfWords)

In [46]:
df.head(20)

Unnamed: 0,category,head_descr,Topic,Top Words
0,CRIME,"there were 2 mass shootings in teas last week,...",38,"[violence, man, teas, death, people, says, ant..."
1,ENTERTAINMENT,hugh grant marries for the first time at age 5...,8,"[year, actor, night, comedy, movies, names, tv..."
2,ENTERTAINMENT,jim carrey blasts 'castrato' adam schiff and d...,40,"[vote, democrats, house, republicans, republic..."
3,ENTERTAINMENT,julianna margulies uses donald trump poop bags...,37,"[case, uber, fda, said, noah, black, francisco..."
4,ENTERTAINMENT,morgan freeman 'devastated' that seual harassm...,3,"[host, james, hate, said, colbert, hill, says,..."
5,ENTERTAINMENT,donald trump is lovin' new mcdonald's jingle i...,14,"[homeless, tweets, lgbt, st, jimmy, civil, 000..."
6,ENTERTAINMENT,what to watch on amazon prime that’s new this ...,31,"[paris, york, photos, year, 2012, best, 2017, ..."
7,ENTERTAINMENT,mike myers reveals he'd 'like to' do a fourth ...,8,"[year, actor, night, comedy, movies, names, tv..."
8,ENTERTAINMENT,what to watch on hulu that’s new this week you...,8,"[year, actor, night, comedy, movies, names, tv..."
9,ENTERTAINMENT,justin timberlake visits teas school shooting ...,35,"[huffington, breath, sandy, coast, beach, skin..."


In [45]:
df.head_descr[4]

'morgan freeman \'devastated\' that seual harassment claims could undermine legacy "it is not right to equate horrific incidents of seual assault with misplaced compliments or humor," he said in a statement.'