In [1]:
from sklearn.datasets import fetch_20newsgroups
# More info: https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html

# 18K newsgroups posts
# 20 topics (train/test) - split based upon date of posting

In [2]:
data = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))
docs = data["data"] # list
targets = data["target"]
target_names = data["target_names"]
topics = [data["target_names"][i] for i in data["target"]] # list

In [3]:
print(len(docs))

18846


In [4]:
from random import randint
i = randint(0,18846)
print('Document')
print(docs[i])
print('\nTopic')
print(topics[i])

Document
For Sale:

OS/2 2.0 Extended Services -

        * Extended Database support
        * Extended Networking Support
        * Remote Host support
        * Extended Communication Support

PLUS! A copy of OS/2 2.0.  The ES package is brand new and uninstalled, all
manuals, disks, etc. are included.  The ES package retails for $495 with OS/2
2.0 selling for $79 or something like that.

I'll let both of them go for $200.  My needs changed thus eliminating my
need for the package once I bought it.

If Interested, please Email me at:

Topic
misc.forsale


In [5]:
import pandas as pd
import numpy as np 

df = pd.DataFrame(data=zip(docs,topics), columns=["Document","Topic"])

In [6]:
df.sample(5)

Unnamed: 0,Document,Topic
13518,"\nHaving lived, played, and worked on and near...",talk.politics.guns
2136,I am trying to get my system to work with a Ta...,comp.sys.ibm.pc.hardware
5556,I just bought a little gizmo that is supposed ...,sci.electronics
9589,STOP! STOP! STOP! STOP! This argument is getti...,comp.graphics
2776,"\n\n\t\n\n\tHey, Bosio threw a no-no what the ...",rec.sport.baseball


### We need a list of documents

In [7]:
docs[0]

"\n\nI am sure some bashers of Pens fans are pretty confused about the lack\nof any kind of posts about the recent Pens massacre of the Devils. Actually,\nI am  bit puzzled too and a bit relieved. However, I am going to put an end\nto non-PIttsburghers' relief with a bit of praise for the Pens. Man, they\nare killing those Devils worse than I thought. Jagr just showed you why\nhe is much better than his regular season stats. He is also a lot\nfo fun to watch in the playoffs. Bowman should let JAgr have a lot of\nfun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final\nregular season game.          PENS RULE!!!\n\n"

# Top2Vec

In [8]:
from top2vec import Top2Vec

  from .autonotebook import tqdm as notebook_tqdm


### Creating the model

In [10]:
model = Top2Vec(documents = docs)

2022-07-14 23:36:47,238 - top2vec - INFO - Pre-processing documents for training
2022-07-14 23:36:52,471 - top2vec - INFO - Creating joint document/word embedding
2022-07-14 23:40:34,033 - top2vec - INFO - Creating lower dimension embedding of documents
2022-07-14 23:40:40,399 - top2vec - INFO - Finding dense areas of documents
2022-07-14 23:40:41,264 - top2vec - INFO - Finding topics


### Number of docs per topic

In [11]:
topic_sizes, topic_nums = model.get_topic_sizes()
print(topic_sizes)

[1501 1100  885  728  654  602  560  394  378  376  372  356  335  309
  308  291  287  269  266  244  238  231  229  218  215  214  214  208
  198  189  182  176  176  171  167  158  154  142  140  139  125  118
  114  114  112  111  108  106  105  104  104  100  100   99   96   92
   90   86   83   80   78   77   76   76   75   73   71   71   70   69
   69   67   67   67   65   64   64   64   64   62   59   58   57   56
   56   56   55   54   53   53   52   52   51   51   51   50   49   49
   49   49   48   47   45   45   44   40   39   34   34]


### The model idetifies 112 topics

In [22]:
print(topic_nums[-1])

111


In [23]:
topic_words, word_scores, topic_nums = model.get_topics(10)

In [24]:
for words, scores, num in zip(topic_words, word_scores, topic_nums):
    print(num)
    print(f'words: {words}')

0
words: ['bike' 'car' 'ride' 'bikes' 'honda' 'cars' 'tires' 'brakes' 'mph'
 'engine' 'rear' 'wheel' 'gear' 'riding' 'wheels' 'motorcycle' 'brake'
 'suspension' 'throttle' 'passenger' 'tire' 'miles' 'toyota' 'ford'
 'driving' 'front' 'steering' 'seat' 'bmw' 'shaft' 'valve' 'torque'
 'cylinder' 'parking' 'clutch' 'dealer' 'pedal' 'rpm' 'stock' 'bought'
 'motorcycles' 'abs' 'highway' 'mileage' 'cage' 'fun' 'exhaust' 'owner'
 'buy' 'expensive']
1
words: ['really' 'think' 'don' 'guess' 'know' 're' 'wouldn' 'you' 'maybe' 'ever'
 'just' 'me' 'anything' 'like' 'say' 'why' 'people' 'do' 'something'
 'stupid' 'flame' 'someone' 'everyone' 'believe' 'going' 'oh' 'isn' 'tell'
 'admit' 'didn' 'feel' 'lot' 'yeah' 'shit' 'things' 'wrong' 'sorry'
 'nothing' 'again' 'what' 'even' 'here' 'exactly' 'too' 'thought' 'happen'
 'take' 'we' 'about' 'being']
2
words: ['pitching' 'hitter' 'pitchers' 'pitcher' 'rbi' 'inning' 'innings' 'pitch'
 'hitting' 'braves' 'batting' 'pitches' 'pitched' 'hit' 'team' 'teams'

In [25]:
documents, document_scores, document_ids = model.search_documents_by_topic(topic_num=0, num_docs=10)

for doc, score, doc_id in zip(documents, document_scores, document_ids):
    print(f'Document:{doc_id}, Score:{score}')
    print('------------------')
    print(doc)
    print('------------------')
    print()

Document:14048, Score:0.6952275037765503
------------------
Well my last two motorcycles have been shaft driven and they will wheelie.
The rear gear does climb the ring gear and lift the rear which gives an
odd feel, but it still wheelies.

------------------

Document:16034, Score:0.6860707998275757
------------------

	Not to start *another* shaft effect discussion, but the twist you
	feel when revving sitting still is due to the larger fly-wheels that
	the BMW R-bikes (maybe K's too, dunno) use. If you whack the throttle
	at stop lights, it'll really rock the bike over (to the right).

	<snip> 	<snip>

	Please post if you come to any conclusion on this. I am thinking
	of putting a light on each cylinder guard on my R100S, and was
	wondering whether I was going to have to switch lights off every
	time I was under 5,000 RPM :-)



------------------

Document:8981, Score:0.6725623607635498
------------------
]Is it possible to do a "wheelie" on a motorcycle with shaft-drive?

yes.

--

### Playing with hyperparameters

### min_cluster_size = 50

In [12]:
from umap import UMAP
from hdbscan import HDBSCAN

hdbscan_args = dict({'min_cluster_size':50, 'min_samples':None, 'metric':'euclidean', 'cluster_selection_method': 'eom'})
model = Top2Vec(documents = docs, hdbscan_args = hdbscan_args)

2022-07-14 23:57:14,609 - top2vec - INFO - Pre-processing documents for training
2022-07-14 23:57:19,848 - top2vec - INFO - Creating joint document/word embedding
2022-07-15 00:01:09,550 - top2vec - INFO - Creating lower dimension embedding of documents
2022-07-15 00:01:16,022 - top2vec - INFO - Finding dense areas of documents
2022-07-15 00:01:16,894 - top2vec - INFO - Finding topics


In [18]:
topic_sizes, topic_nums = model.get_topic_sizes()
print(f'Number of topics: {topic_nums[-1]}'); print(); print()
topic_words, word_scores, topic_nums = model.get_topics(2)
for words, scores, num in zip(topic_words, word_scores, topic_nums):
    print(num)
    print(f'words: {words}')

Number of topics: 1


0
words: ['but' 'anyone' 'some' 'quite' 'any' 'very' 'this' 'sort' 'probably'
 'someone' 'few' 'idea' 'really' 'guess' 'be' 'please' 'think' 'don'
 'pretty' 'ideas' 'more' 'enough' 'might' 'to' 'thanks' 'if' 'rather' 'am'
 'sure' 'would' 'get' 'make' 'making' 'you' 'something' 'your' 'things'
 'point' 'bike' 'just' 'that' 'like' 'doing' 'little' 'good' 'lot' 'real'
 'interesting' 'one' 'makes']
1
words: ['shameful' 'dsl' 'jxp' 'geb' 'chastity' 'intellect' 'skepticism' 'cadre'
 'pitt' 'gordon' 'surrender' 'banks' 'soon' 'too' 'patients' 'lyme'
 'effective' 'patient' 'it' 'rare' 'sometimes' 'heart' 'treatment'
 'physician' 'much' 'is' 'weight' 'practical' 'disease' 'drugs'
 'difficult' 'medicine' 'physicians' 'doctor' 'hurt' 'pressure' 'hope'
 'blood' 'mean' 'gant' 'isn' 'safe' 'wouldn' 'afraid' 'breath' 'gps'
 'dose' 'citizen' 'especially' 'walks']


### min_cluster_size = 20

In [19]:
from umap import UMAP
from hdbscan import HDBSCAN

hdbscan_args = dict({'min_cluster_size':20, 'min_samples':None, 'metric':'euclidean', 'cluster_selection_method': 'eom'})
model = Top2Vec(documents = docs, hdbscan_args = hdbscan_args)

2022-07-15 01:06:54,332 - top2vec - INFO - Pre-processing documents for training
2022-07-15 01:06:59,746 - top2vec - INFO - Creating joint document/word embedding
2022-07-15 01:10:43,272 - top2vec - INFO - Creating lower dimension embedding of documents
2022-07-15 01:10:49,766 - top2vec - INFO - Finding dense areas of documents
2022-07-15 01:10:50,618 - top2vec - INFO - Finding topics


In [20]:
topic_sizes, topic_nums = model.get_topic_sizes()
print(f'Number of topics: {topic_nums[-1]}'); print(); print()
topic_words, word_scores, topic_nums = model.get_topics(2)
for words, scores, num in zip(topic_words, word_scores, topic_nums):
    print(num)
    print(f'words: {words}')

Number of topics: 85


0
words: ['god' 'jesus' 'christ' 'faith' 'bible' 'christians' 'scripture'
 'christianity' 'truth' 'scriptures' 'heaven' 'belief' 'life' 'christian'
 'holy' 'believers' 'spirit' 'sins' 'eternal' 'gospel' 'revelation'
 'spiritual' 'luke' 'doctrines' 'love' 'salvation' 'church' 'contradict'
 'theology' 'testament' 'himself' 'prophets' 'doctrine' 'sin' 'religions'
 'evil' 'interpretation' 'believing' 'biblical' 'beliefs' 'lord' 'divine'
 'nature' 'religion' 'believe' 'resurrection' 'existence' 'romans' 'psalm'
 'worship']
1
words: ['bike' 'car' 'ride' 'cars' 'honda' 'brakes' 'rear' 'bikes' 'tires'
 'engine' 'wheel' 'riding' 'mph' 'wheels' 'miles' 'gear' 'throttle'
 'suspension' 'brake' 'motorcycle' 'toyota' 'tire' 'ford' 'passenger'
 'driving' 'steering' 'seat' 'bmw' 'front' 'valve' 'cylinder' 'torque'
 'shaft' 'mileage' 'pedal' 'stock' 'parking' 'abs' 'clutch' 'highway'
 'buy' 'rpm' 'bought' 'fun' 'exhaust' 'buying' 'dealer' 'lights' 'mile'
 'sho']


### min_cluster_size = 25

In [21]:
from umap import UMAP
from hdbscan import HDBSCAN

hdbscan_args = dict({'min_cluster_size':25, 'min_samples':None, 'metric':'euclidean', 'cluster_selection_method': 'eom'})
model = Top2Vec(documents = docs, hdbscan_args = hdbscan_args)

2022-07-15 01:12:04,788 - top2vec - INFO - Pre-processing documents for training
2022-07-15 01:12:10,184 - top2vec - INFO - Creating joint document/word embedding
2022-07-15 01:15:49,133 - top2vec - INFO - Creating lower dimension embedding of documents
2022-07-15 01:15:55,608 - top2vec - INFO - Finding dense areas of documents
2022-07-15 01:15:56,489 - top2vec - INFO - Finding topics


In [24]:
topic_sizes, topic_nums = model.get_topic_sizes()
print(f'Number of topics: {topic_nums[-1]}'); print(); print()
topic_words, word_scores, topic_nums = model.get_topics(2)
for words, scores, num in zip(topic_words, word_scores, topic_nums):
    print(num)
    print(f'words: {words}')

Number of topics: 1


0
words: ['few' 'sort' 'guess' 'but' 'very' 'more' 'ideas' 'any' 'some' 'lot'
 'appreciate' 'enough' 'just' 'really' 'way' 'those' 'like' 'time' 'long'
 'it' 'probably' 'anything' 'either' 'your' 'someone' 'almost' 'idea'
 'much' 'make' 'quite' 'having' 'where' 'anyone' 'thanks' 'little' 'folks'
 'for' 'something' 'could' 'can' 'important' 'well' 'you' 'actually' 'get'
 'pretty' 'here' 'even' 'serious' 'find']
1
words: ['jxp' 'shameful' 'skepticism' 'dsl' 'chastity' 'intellect' 'geb' 'cadre'
 'pitt' 'surrender' 'gordon' 'banks' 'soon' 'too' 'lyme' 'patients'
 'patient' 'afraid' 'dose' 'disease' 'effective' 'it' 'physician'
 'physicians' 'meant' 'weight' 'treated' 'enough' 'doing' 'sometimes'
 'medicine' 'pressure' 'few' 'hurt' 'blood' 'bread' 'edu' 'gant' 'doctor'
 'meat' 'rare' 'emergency' 'breath' 'fault' 'citizen' 'probably' 'county'
 'liver' 'recommendations' 'become']


### min_cluster_size = 23

In [25]:
from umap import UMAP
from hdbscan import HDBSCAN

hdbscan_args = dict({'min_cluster_size':23, 'min_samples':None, 'metric':'euclidean', 'cluster_selection_method': 'eom'})
model = Top2Vec(documents = docs, hdbscan_args = hdbscan_args)

2022-07-15 01:17:03,073 - top2vec - INFO - Pre-processing documents for training
2022-07-15 01:17:08,431 - top2vec - INFO - Creating joint document/word embedding
2022-07-15 01:20:48,648 - top2vec - INFO - Creating lower dimension embedding of documents
2022-07-15 01:20:55,452 - top2vec - INFO - Finding dense areas of documents
2022-07-15 01:20:55,767 - top2vec - INFO - Finding topics


In [26]:
topic_sizes, topic_nums = model.get_topic_sizes()
print(f'Number of topics: {topic_nums[-1]}'); print(); print()
topic_words, word_scores, topic_nums = model.get_topics(2)
for words, scores, num in zip(topic_words, word_scores, topic_nums):
    print(num)
    print(f'words: {words}')

Number of topics: 61


0
words: ['god' 'jesus' 'christ' 'faith' 'christianity' 'scripture' 'bible'
 'christians' 'belief' 'truth' 'scriptures' 'heaven' 'holy' 'gospel'
 'believers' 'eternal' 'christian' 'spirit' 'revelation' 'church' 'life'
 'sins' 'luke' 'theology' 'divine' 'lord' 'doctrines' 'contradict' 'sin'
 'salvation' 'spiritual' 'believe' 'nature' 'doctrine' 'prophets'
 'biblical' 'accept' 'testament' 'resurrection' 'corinthians' 'evil'
 'meaning' 'matthew' 'religion' 'beliefs' 'judgement' 'notion' 'passage'
 'sense' 'believing']
1
words: ['car' 'bike' 'cars' 'ride' 'tires' 'honda' 'brakes' 'bikes' 'rear'
 'engine' 'mph' 'riding' 'wheel' 'gear' 'wheels' 'throttle' 'brake'
 'motorcycle' 'suspension' 'tire' 'passenger' 'miles' 'steering' 'toyota'
 'driving' 'front' 'seat' 'ford' 'torque' 'cylinder' 'valve' 'bmw'
 'clutch' 'pedal' 'shaft' 'fun' 'abs' 'parking' 'dealer' 'mileage' 'cage'
 'highway' 'stock' 'buy' 'oil' 'buying' 'rpm' 'lights' 'exhaust' 'tank']


### min_cluster_size = 24

In [27]:
from umap import UMAP
from hdbscan import HDBSCAN

hdbscan_args = dict({'min_cluster_size':24, 'min_samples':None, 'metric':'euclidean', 'cluster_selection_method': 'eom'})
model = Top2Vec(documents = docs, hdbscan_args = hdbscan_args)

2022-07-15 01:21:32,079 - top2vec - INFO - Pre-processing documents for training
2022-07-15 01:21:37,759 - top2vec - INFO - Creating joint document/word embedding
2022-07-15 01:25:16,108 - top2vec - INFO - Creating lower dimension embedding of documents
2022-07-15 01:25:23,889 - top2vec - INFO - Finding dense areas of documents
2022-07-15 01:25:24,205 - top2vec - INFO - Finding topics


In [29]:
topic_sizes, topic_nums = model.get_topic_sizes()
print(f'Number of topics: {topic_nums[-1]}'); print(); print()
topic_words, word_scores, topic_nums = model.get_topics(5)
for words, scores, num in zip(topic_words, word_scores, topic_nums):
    print(num)
    print(f'words: {words}')

Number of topics: 67


0
words: ['god' 'jesus' 'faith' 'christ' 'christians' 'christianity' 'truth'
 'bible' 'scripture' 'life' 'eternal' 'believers' 'belief' 'gospel'
 'scriptures' 'christian' 'church' 'heaven' 'lord' 'holy' 'divine' 'evil'
 'sins' 'believe' 'revelation' 'spiritual' 'luke' 'doctrine' 'biblical'
 'contradict' 'nature' 'believing' 'spirit' 'accept' 'religions' 'love'
 'theology' 'salvation' 'resurrection' 'testament' 'judgement' 'doctrines'
 'beliefs' 'prophets' 'atheists' 'corinthians' 'religion' 'meaning'
 'teachings' 'sin']
1
words: ['car' 'bike' 'cars' 'ride' 'honda' 'bikes' 'tires' 'engine' 'riding'
 'rear' 'brakes' 'mph' 'wheel' 'wheels' 'gear' 'suspension' 'motorcycle'
 'brake' 'tire' 'throttle' 'miles' 'bmw' 'driving' 'toyota' 'front' 'ford'
 'passenger' 'seat' 'steering' 'cylinder' 'torque' 'valve' 'shaft'
 'clutch' 'pedal' 'buy' 'parking' 'stock' 'highway' 'mileage' 'road' 'abs'
 'fun' 'dealer' 'rpm' 'motorcycles' 'garage' 'exhaust' 'truck' 'owner']
2
words: [

### 'min_samples':5

In [30]:
from umap import UMAP
from hdbscan import HDBSCAN

hdbscan_args = dict({'min_cluster_size':24, 'min_samples':5, 'metric':'euclidean', 'cluster_selection_method': 'eom'})
model = Top2Vec(documents = docs, hdbscan_args = hdbscan_args)  

2022-07-15 01:28:45,986 - top2vec - INFO - Pre-processing documents for training
2022-07-15 01:28:51,343 - top2vec - INFO - Creating joint document/word embedding
2022-07-15 01:32:31,180 - top2vec - INFO - Creating lower dimension embedding of documents
2022-07-15 01:32:37,736 - top2vec - INFO - Finding dense areas of documents
2022-07-15 01:32:38,591 - top2vec - INFO - Finding topics


In [31]:
topic_sizes, topic_nums = model.get_topic_sizes()
print(f'Number of topics: {topic_nums[-1]}'); print(); print()
topic_words, word_scores, topic_nums = model.get_topics(5)
for words, scores, num in zip(topic_words, word_scores, topic_nums):
    print(num)
    print(f'words: {words}')

Number of topics: 103


0
words: ['bike' 'car' 'ride' 'cars' 'honda' 'brakes' 'bikes' 'engine' 'mph' 'rear'
 'tires' 'riding' 'wheel' 'gear' 'wheels' 'motorcycle' 'brake' 'miles'
 'suspension' 'throttle' 'front' 'tire' 'steering' 'driving' 'toyota'
 'ford' 'passenger' 'seat' 'bmw' 'valve' 'pedal' 'cylinder' 'torque'
 'shaft' 'abs' 'clutch' 'parking' 'mileage' 'dealer' 'stock' 'buying'
 'cage' 'bought' 'exhaust' 'lights' 'oil' 'rpm' 'tank' 'highway'
 'motorcycles']
1
words: ['god' 'jesus' 'christ' 'faith' 'christians' 'scripture' 'bible'
 'christianity' 'heaven' 'scriptures' 'truth' 'gospel' 'eternal' 'holy'
 'christian' 'salvation' 'revelation' 'spirit' 'luke' 'lord' 'sins'
 'belief' 'church' 'believers' 'doctrine' 'sin' 'life' 'prophets' 'divine'
 'theology' 'resurrection' 'himself' 'romans' 'doctrines' 'isaiah'
 'passages' 'biblical' 'psalm' 'psalms' 'spiritual' 'testament'
 'corinthians' 'apostles' 'beliefs' 'matthew' 'love' 'contradict' 'verses'
 'worship' 'religions']
2
words: ['t

### 'min_samples':40

In [40]:
from umap import UMAP
from hdbscan import HDBSCAN

hdbscan_args = dict({'min_cluster_size':24, 'min_samples':25, 'metric':'euclidean', 'cluster_selection_method': 'eom'})
model = Top2Vec(documents = docs, hdbscan_args = hdbscan_args)  

2022-07-15 01:47:14,008 - top2vec - INFO - Pre-processing documents for training
2022-07-15 01:47:19,185 - top2vec - INFO - Creating joint document/word embedding
2022-07-15 01:50:59,122 - top2vec - INFO - Creating lower dimension embedding of documents
2022-07-15 01:51:05,829 - top2vec - INFO - Finding dense areas of documents
2022-07-15 01:51:06,148 - top2vec - INFO - Finding topics


In [41]:
topic_sizes, topic_nums = model.get_topic_sizes()
print(f'Number of topics: {topic_nums[-1]}'); print(); print()
topic_words, word_scores, topic_nums = model.get_topics(2)
for words, scores, num in zip(topic_words, word_scores, topic_nums):
    print(num)
    print(f'words: {words}')

Number of topics: 60


0
words: ['god' 'jesus' 'christianity' 'christ' 'faith' 'bible' 'christians'
 'scripture' 'scriptures' 'truth' 'eternal' 'heaven' 'belief' 'believers'
 'christian' 'life' 'church' 'revelation' 'gospel' 'lord' 'believe'
 'salvation' 'luke' 'prophets' 'sins' 'contradict' 'holy' 'doctrines'
 'divine' 'beliefs' 'testament' 'doctrine' 'spirit' 'resurrection'
 'spiritual' 'religions' 'evil' 'religion' 'theology' 'himself' 'sin'
 'atheists' 'nature' 'biblical' 'love' 'meaning' 'corinthians' 'matthew'
 'say' 'believing']
1
words: ['bike' 'car' 'ride' 'honda' 'cars' 'riding' 'brakes' 'bikes' 'tires'
 'engine' 'mph' 'rear' 'wheel' 'gear' 'suspension' 'motorcycle' 'tire'
 'throttle' 'brake' 'wheels' 'passenger' 'toyota' 'miles' 'ford' 'driving'
 'steering' 'bmw' 'seat' 'torque' 'front' 'valve' 'shaft' 'cylinder'
 'bought' 'buying' 'parking' 'abs' 'pedal' 'clutch' 'rpm' 'road' 'stock'
 'exhaust' 'fun' 'cage' 'motorcycles' 'dealer' 'highway' 'mileage' 'truck']
