## Imports

In [3]:
import pandas as pd
from top2vec import Top2Vec

## Data

In [4]:
df = pd.read_csv("..\input\ingested_data\cleaned_train_documents.csv")
df.head(3)

Unnamed: 0,document,clean_document
0,My husband is a born shopper. He loves to look...,husband born shopper love look thing touch lik...
1,Tea drinking was common in China for nearly on...,tea drinking common china nearly one thousand ...
2,"Once upon a time, there was a scholar who want...",upon time scholar wanted gain knowledge day ev...


## Top2Vec Model on Clean Documents

In [4]:
docs = df.clean_document.tolist()
docs[10]

'american spend free time various way america country sport hunting fishing swimming team sport like baseball football million american watch favorite sport television also like play community orchestra make film recording go camping visit museum attend lecture travel garden read join hundred activity people also enjoy building thing home sewing clothes even making photograph thing fun well economy much american enjoy free time country time self improvement country million adult continue education chiefly going school evening free time expense added time spent personal activity american devote great amount time varied need community many hospital school library museum park community center organization assist poor depend many hour citizen devote activity often without pay several answer idea cooperating sharing responsibility one another benefit old country country first founded necessary settler work together live crossed dangerous sea risked struggle political religious freedom remai

In [5]:
print(Top2Vec.__doc__)


    Top2Vec

    Creates jointly embedded topic, document and word vectors.


    Parameters
    ----------
    documents: List of str
        Input corpus, should be a list of strings.

    min_count: int (Optional, default 50)
        Ignores all words with total frequency lower than this. For smaller
        corpora a smaller min_count will be necessary.

    topic_merge_delta: float (default 0.1)
        Merges topic vectors which have a cosine distance smaller than
        topic_merge_delta using dbscan. The epsilon parameter of dbscan is
        set to the topic_merge_delta.

    ngram_vocab: bool (Optional, default False)
        Add phrases to topic descriptions.

        Uses gensim phrases to find common phrases in the corpus and adds them
        to the vocabulary.

        For more information visit:
        https://radimrehurek.com/gensim/models/phrases.html

    ngram_vocab_args: dict (Optional, default None)
        Pass custom arguments to gensim phrases.

        For 

In [7]:
model = Top2Vec(documents=docs, 
                speed='deep-learn',
                workers=8) # embedding_model = 'all-MiniLM-L6-v2' or 'universal-sentence-encoder'

2023-07-04 12:24:39,552 - top2vec - INFO - Pre-processing documents for training


2023-07-04 12:25:07,332 - top2vec - INFO - Creating joint document/word embedding
2023-07-04 13:01:37,974 - top2vec - INFO - Creating lower dimension embedding of documents
2023-07-04 13:02:50,938 - top2vec - INFO - Finding dense areas of documents
2023-07-04 13:03:01,280 - top2vec - INFO - Finding topics


In [8]:
model.get_num_topics()

224

In [9]:
topic_sizes, topic_nums = model.get_topic_sizes()
print(topic_sizes)

[953 749 576 555 474 423 379 338 320 313 298 297 272 271 266 263 262 252
 250 241 220 220 216 214 211 208 206 200 199 196 193 193 188 168 167 163
 160 156 154 153 152 152 151 150 148 147 144 140 139 137 136 136 134 133
 133 131 131 128 128 126 125 121 120 120 120 118 117 116 111 109 109 108
 108 107 107 103 103 102 102 101 100  99  96  96  95  95  95  94  94  93
  92  92  91  91  88  87  87  86  86  85  83  83  80  79  79  79  78  78
  78  78  77  77  77  77  77  76  76  76  76  75  74  73  73  72  71  70
  69  68  68  67  67  67  66  66  66  64  63  62  62  61  61  61  60  60
  59  59  59  58  58  58  57  57  56  55  55  55  54  54  54  54  53  52
  52  52  51  51  51  50  50  50  49  49  48  48  48  47  47  47  47  46
  46  46  46  45  45  45  45  44  44  44  44  44  43  43  43  42  42  42
  42  41  41  41  41  41  40  40  39  38  38  36  36  35  34  33  33  33
  33  32  32  31  30  29  28  27]


In [10]:
sum(topic_sizes), df.shape[0]

(25139, 25139)

In [11]:
print(topic_nums)

[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 220 221 222 223]


In [26]:
# topic_words, word_scores, topic_nums = model.get_topics()

# for words, scores, topic_num in zip(topic_words, word_scores, topic_nums):
#     print(f"Topic No. {topic_num}")
#     print(f"Words: {words[:15]}\n") # top 15 words in each topic

In [41]:
topic_words, word_scores, topic_nums = model.get_topics(2)

for words, scores, topic_num in zip(topic_words, word_scores, topic_nums):
    print(f"Topic No. {topic_num}")
    print(f"Words: {words}")

Topic No. 0
Words: ['policeman' 'sir' 'angrily' 'man' 'madam' 'taxi' 'drove' 'answered'
 'driver' 'sorry' 'parked' 'sank' 'officer' 'paid' 'thief' 'stanley'
 'waited' 'arrest' 'cane' 'wallet' 'walked' 'assured' 'banker' 'door'
 'merlin' 'buck' 'woke' 'swallowed' 'annie' 'doorway' 'waiter' 'bos'
 'opened' 'wife' 'hurry' 'napoleon' 'burst' 'stopped' 'repaired' 'license'
 'servant' 'faith' 'prayed' 'judging' 'crossing' 'looked' 'pedestrian'
 'beggar' 'stood' 'merry']
Topic No. 1
Words: ['happiness' 'friendship' 'desire' 'positive' 'negative' 'happier'
 'attitude' 'relationship' 'feeling' 'unhappy' 'self' 'confidence'
 'esteem' 'feel' 'accept' 'accomplishment' 'respect' 'honest' 'fail'
 'failure' 'success' 'achieve' 'goal' 'meaningful' 'honesty' 'realize'
 'truly' 'succeed' 'shyness' 'others' 'accepting' 'arise' 'achieving'
 'person' 'understanding' 'pleasure' 'trusted' 'satisfaction' 'accomplish'
 'loyalty' 'unhappiness' 'overcome' 'guilt' 'personality' 'emotion'
 'admit' 'psychologist' '

array(['dinosaur', 'dna', 'fossil', 'evolution', 'evolutionary',
       'extinct', 'skeleton', 'evolved', 'specie', 'ancestor',
       'preserved', 'extinction', 'genetic', 'ape', 'discovery',
       'scientist', 'bone', 'human', 'creature', 'biologist',
       'scientific', 'evidence', 'mankind', 'distinct', 'gene', 'biology',
       'existed', 'asteroid', 'animal', 'theory', 'organism', 'journal',
       'mammal', 'biological', 'origin', 'ancient', 'darwin', 'existence',
       'structure', 'discovered', 'previously', 'ecosystem', 'shark',
       'being', 'proof', 'identical', 'earliest', 'leap', 'observation',
       'reproduce'], dtype='<U15')

In [43]:
TOPIC_NUM = 100
print(f"Topic No.: {TOPIC_NUM}")

print(f"Topic Keywords:\n{model.get_topics(TOPIC_NUM)[0][-1]}\n")
print("---------------")

documents, document_scores, document_ids = model.search_documents_by_topic(topic_num=TOPIC_NUM, num_docs=5)

for doc, score, doc_id in zip(documents, document_scores, document_ids):
    print(f"Document: {doc_id}, Score: {score}")
    print("-----------")
    print(doc)
    print("-----------")
    print()

Topic No.: 100
Topic Keywords:
['dinosaur' 'dna' 'fossil' 'evolution' 'evolutionary' 'extinct' 'skeleton'
 'evolved' 'specie' 'ancestor' 'preserved' 'extinction' 'genetic' 'ape'
 'discovery' 'scientist' 'bone' 'human' 'creature' 'biologist'
 'scientific' 'evidence' 'mankind' 'distinct' 'gene' 'biology' 'existed'
 'asteroid' 'animal' 'theory' 'organism' 'journal' 'mammal' 'biological'
 'origin' 'ancient' 'darwin' 'existence' 'structure' 'discovered'
 'previously' 'ecosystem' 'shark' 'being' 'proof' 'identical' 'earliest'
 'leap' 'observation' 'reproduce']

---------------
Document: 21370, Score: 0.7427197694778442
-----------
walt disney born created mickey mouse made famous died work dream die people world enjoy mickey mouse cartoon walt disney man easily one summer wanted job post office told young went home drew line face put father suit hat went back office told got job finally later life mr disney dream wanted build new kind amusement park would clean beautiful child could play hap

In [47]:
print(df.iloc[10309, 0])

The Magic Kingdom was the first theme park at Walt Disney World, opening in 1971. All Disney World theme parks are open 365 days a year, although opening and closing times for each park are different. If you are traveling without kids, try to visit on a school day to avoid the largest crowds. If you need to visit during a school vacation, try to avoid the week between Christmas and New Year's and the Fourth of July.
If you are not staying at a Disney World hotel, avoid visiting the Magic Kingdom on its Extra Magic Hours days, as Disney's hotel guests get into the park early on those days,  _ wait times for visitors who arrive at the Magic Kingdom's normal opening time.
Buy your Walt Disney World tickets online at Disney World's website. For advice on picking the right ticket, see our guide to Disney World tickets.
You will also need to call in advance to make lunch and/or dinner reservations . Disney accepts reservations, through 1-407-WDW-DINE, up to 180 days in advance. Times do go q

In [48]:
model.save("../output/saved_models/top2vec_model_on_clean_documents")

## Top2Vec Model on Original Documents

In [4]:
docs = df.document.tolist()
print(docs[100])

Over the last 30 years, Bangkok, once a small fishing village, has transformed into a rich, concrete, high-rise city that it is today. The spreading metropolis and its population of 12 million now produces 35 per cent of Thailand's economic wealth.
As a magnet for foreign companies, Bangkok attracts many overseas managers and business people from different fields, including tourism, automobiles and electronics. The city's population of foreigners is in the high hundreds of thousands, with tens of thousands of Japanese, Chinese and western employees working alongside hundreds of thousands of Burmese who mostly do unskilled jobs shunned by Thais.
For those used to the good life, the variety and quality of the city's food is a key attraction, says one US manager, before listing many of his favourite Italian, Mexican and, of course, Thai restaurants. Most offer quality meals for less than the cost of a takeaway sandwich in London.
Great choice and value can be found in Bangkok's other attr

In [5]:
print(Top2Vec.__doc__)


    Top2Vec

    Creates jointly embedded topic, document and word vectors.


    Parameters
    ----------
    documents: List of str
        Input corpus, should be a list of strings.

    min_count: int (Optional, default 50)
        Ignores all words with total frequency lower than this. For smaller
        corpora a smaller min_count will be necessary.

    topic_merge_delta: float (default 0.1)
        Merges topic vectors which have a cosine distance smaller than
        topic_merge_delta using dbscan. The epsilon parameter of dbscan is
        set to the topic_merge_delta.

    ngram_vocab: bool (Optional, default False)
        Add phrases to topic descriptions.

        Uses gensim phrases to find common phrases in the corpus and adds them
        to the vocabulary.

        For more information visit:
        https://radimrehurek.com/gensim/models/phrases.html

    ngram_vocab_args: dict (Optional, default None)
        Pass custom arguments to gensim phrases.

        For 

In [56]:
model = Top2Vec(documents=docs, 
                speed='deep-learn', ngram_vocab=(1,2),
                workers=-1)

2023-07-04 15:07:10,359 - top2vec - INFO - Pre-processing documents for training
2023-07-04 15:08:57,842 - top2vec - INFO - Creating joint document/word embedding
2023-07-04 15:09:54,017 - top2vec - INFO - Creating lower dimension embedding of documents
2023-07-04 15:11:01,804 - top2vec - INFO - Finding dense areas of documents
2023-07-04 15:11:08,477 - top2vec - INFO - Finding topics


In [58]:
len(model.vocab)

17963

In [59]:
model.get_num_topics()

59

In [60]:
topic_sizes, topic_nums = model.get_topic_sizes()
print(topic_sizes)

[503 484 472 460 459 459 458 458 457 452 451 450 449 448 444 444 444 441
 439 437 436 435 435 435 434 433 430 428 426 422 422 420 419 417 415 413
 413 412 411 410 410 409 408 406 405 405 405 403 398 398 397 396 393 393
 392 390 389 388 379]


In [61]:
topic_words, word_scores, topic_nums = model.get_topics(2)

for words, scores, topic_num in zip(topic_words, word_scores, topic_nums):
    print(f"Topic No. {topic_num}")
    print(f"Words: {words}")

Topic No. 0
Words: ['jurassic park' 'head injury' 'street corner' 'packaging'
 'saturday sunday' 'kopi lowak' 'cognitive function' 'mini riser' 'turner'
 'outstanding' 'jet lag' 'appreciation' 'hunger' 'chooses' 'frenchman'
 'river delta' 'different worlds' 'ship captain' 'sailing' 'hot coals'
 'admitted' 'craft' 'golden' 'non identical' 'express feelings'
 'having trouble' 'deserted' 'mothers' 'illegal immigrants' 'feeling sad'
 'steven' 'co emissions' 'higher scores' 'diet products' 'lowest point'
 'inches taller' 'ms' 'solving math' 'eggs' 'hopes' 'prize' 'centered'
 'paired with' 'most powerful' 'your' 'electricity' 'boiling water'
 'moods' 'restrictions' 'wild koalas']
Topic No. 1
Words: ['drawer' 'hat' 'red yellow' 'pointing out' 'stepped' 'taken aback'
 'brown color' 'final analysis' 'uniform' 'tickets yuan' 'pill'
 'drug addiction' 'sydney' 'autism' 'keep warm' 'very sad'
 'asking permission' 'gaining' 'shut down' 'spoken english'
 'waiting outside' 'motion picture' 'carnegie' 

In [62]:
model.get_num_topics()

59

### See Random Topics and Related Documents

In [71]:
import random

NUM_DOCS = 5 # No. of documents you wanna see for the Topic_Num
TOPIC_NUM = random.randint(0, model.get_num_topics()-1)
print(f"Topic No.: {TOPIC_NUM}")

print(f"Topic Keywords:\n{model.get_topics(TOPIC_NUM+1)[0][-1]}\n")
print("---------------")

documents, document_scores, document_ids = model.search_documents_by_topic(topic_num=TOPIC_NUM, num_docs=NUM_DOCS)

for doc, score, doc_id in zip(documents, document_scores, document_ids):
    print(f"Document: {doc_id}, Score: {score}")
    print("-----------")
    print(doc)
    print("-----------")
    print()

Topic No.: 31
Topic Keywords:
['italians' 'tight' 'tv viewing' 'before bedtime' 'ride bicycle'
 'scientific journal' 'relationship' 'whole family' 'mail addresses'
 'allowed' 'doctors nurses' 'determine how' 'tropical rain' 'curled up'
 'tv' 'american indians' 'experiencing' 'group members' 'most cases'
 'binge drinkers' 'newspapers magazines' 'tom cruise' 'henderson island'
 'properly understood' 'ms runkle' 'discipline' 'mauna loa' 'go shopping'
 'ask questions' 'reading material' 'single sex' 'blows' 'firmly' 'saves'
 'chinese netizens' 'dr smith' 'idiot savant' 'cartoon' 'main purpose'
 'brain cancer' 'announced plans' 'weekend' 'best choice' 'au'
 'specially designed' 'habitual loneliness' 'came back' 'should'
 'presidents' 'magnifying glass']

---------------
Document: 3129, Score: 0.3379296660423279
-----------
Tokyo: The world's oldest man, retired Japanese silkworm breeder Yukichi Chuganji, died in his home at the age of 114, on Monday. Family members found him dead on his mat

In [70]:
model.save(f"../output/saved_models/top2vec_model_on_original_docs_{model.get_num_topics()}Topics")

## Top2Vec Model Using Embedding Models

In [3]:
docs = df.document.tolist()
print(docs[100])

Over the last 30 years, Bangkok, once a small fishing village, has transformed into a rich, concrete, high-rise city that it is today. The spreading metropolis and its population of 12 million now produces 35 per cent of Thailand's economic wealth.
As a magnet for foreign companies, Bangkok attracts many overseas managers and business people from different fields, including tourism, automobiles and electronics. The city's population of foreigners is in the high hundreds of thousands, with tens of thousands of Japanese, Chinese and western employees working alongside hundreds of thousands of Burmese who mostly do unskilled jobs shunned by Thais.
For those used to the good life, the variety and quality of the city's food is a key attraction, says one US manager, before listing many of his favourite Italian, Mexican and, of course, Thai restaurants. Most offer quality meals for less than the cost of a takeaway sandwich in London.
Great choice and value can be found in Bangkok's other attr

In [5]:
# print(Top2Vec.__doc__)

In [6]:
model = Top2Vec(documents=docs, 
                speed='deep-learn',
                embedding_model='all-MiniLM-L6-v2',
                workers=-1) 

# embedding_model = 'all-MiniLM-L6-v2' or 'universal-sentence-encoder'

2023-07-04 15:20:05,573 - top2vec - INFO - Pre-processing documents for training
2023-07-04 15:20:20,230 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model
2023-07-04 15:20:20,842 - top2vec - INFO - Creating joint document/word embedding
2023-07-04 16:48:22,466 - top2vec - INFO - Creating lower dimension embedding of documents
2023-07-04 16:48:57,965 - top2vec - INFO - Finding dense areas of documents
2023-07-04 16:48:59,764 - top2vec - INFO - Finding topics


In [7]:
model.get_num_topics()

223

In [8]:
topic_sizes, topic_nums = model.get_topic_sizes()
print(topic_sizes)

[601 554 418 349 347 322 321 302 296 267 265 257 256 250 247 240 237 236
 235 233 229 228 223 222 219 217 208 205 201 199 199 198 197 196 192 191
 189 189 185 183 177 176 167 167 160 157 156 156 155 148 146 145 145 144
 142 137 133 131 128 128 127 127 127 127 127 126 126 125 124 124 123 121
 121 121 121 117 116 115 113 112 111 111 110 110 109 109 108 107 107 106
 105 105 104 103 102 102 102 100  99  97  97  93  92  91  89  89  89  89
  88  88  87  86  85  85  85  84  81  81  81  80  79  79  79  79  79  79
  78  77  77  76  76  76  76  75  74  74  74  73  72  71  71  71  71  69
  68  68  68  67  67  66  66  62  62  62  61  61  60  60  59  59  59  58
  57  57  57  57  56  56  56  56  55  55  54  52  52  52  52  51  51  51
  49  49  48  47  47  46  46  46  46  46  45  44  44  43  42  42  42  41
  41  41  39  39  39  37  37  36  36  36  36  35  34  34  33  33  31  29
  27  26  26  26  26  24  22]


In [9]:
topic_words, word_scores, topic_nums = model.get_topics(2)

for words, scores, topic_num in zip(topic_words, word_scores, topic_nums):
    print(f"Topic No. {topic_num}")
    print(f"Words: {words}")

Topic No. 0
Words: ['policeman' 'robber' 'incident' 'jokes' 'detective' 'sentences' 'puzzle'
 'situation' 'funny' 'taxi' 'dickens' 'emergency' 'situations' 'robbed'
 'accident' 'ambulance' 'policemen' 'troubles' 'passenger' 'neighbour'
 'passengers' 'customer' 'police' 'mr' 'laughed' 'stole' 'patient'
 'comedy' 'happened' 'sentence' 'story' 'acted' 'tense' 'suspect' 'ha'
 'travelled' 'thief' 'laugh' 'clerk' 'car' 'circumstances' 'puzzles' 'mrs'
 'joked' 'dialogue' 'behaviour' 'joke' 'waiter' 'beggar' 'stories']
Topic No. 1
Words: ['diet' 'meals' 'diets' 'eating' 'eat' 'nutrition' 'meal' 'obesity'
 'foods' 'eats' 'food' 'nutritional' 'obese' 'lunches' 'healthy'
 'healthier' 'consuming' 'consume' 'hunger' 'unhealthy' 'dinners' 'snack'
 'lunch' 'consumption' 'dining' 'groceries' 'overweight' 'breakfast'
 'feeding' 'fat' 'hungry' 'health' 'recipes' 'nutritious' 'snacks'
 'appetite' 'nutrients' 'lunchtime' 'habits' 'lifestyles' 'fitness' 'fats'
 'consumed' 'vegetables' 'cooking' 'fatty' 'di

In [40]:
import random

NUM_DOCS = 5 # No. of documents you wanna see for the Topic_Num
TOPIC_NUM = random.randint(0, model.get_num_topics()-1)
print(f"Topic No.: {TOPIC_NUM}")

print(f"Topic Keywords:\n{model.get_topics(TOPIC_NUM+1)[0][-1]}\n")
print("---------------")

documents, document_scores, document_ids = model.search_documents_by_topic(topic_num=TOPIC_NUM, num_docs=NUM_DOCS)

for doc, score, doc_id in zip(documents, document_scores, document_ids):
    print(f"Document: {doc_id}, Score: {score}")
    print("-----------")
    print(doc)
    print("-----------")
    print()

Topic No.: 205
Topic Keywords:
['cloning' 'clone' 'reproduce' 'cloned' 'genetically' 'genetic' 'dna'
 'biological' 'donor' 'adopt' 'adoption' 'copying' 'biology' 'transplant'
 'breeding' 'creation' 'genes' 'copy' 'scientific' 'copies' 'darwin'
 'biologist' 'recycling' 'copied' 'chimpanzees' 'gene' 'generation'
 'evolution' 'scientist' 'donors' 'conservation' 'generations' 'debate'
 'organisms' 'artificial' 'adopted' 'endangered' 'invention' 'donate'
 'imitate' 'orphanage' 'animals' 'scientists' 'patent' 'extinction'
 'flesh' 'mice' 'mammals' 'risks' 'innovation']

---------------
Document: 11756, Score: 0.8521753549575806
-----------
The scientific world continues to be amazed by the speed of the development of cloning. Some scientists now suggest that the cloning of humans could occur in the near future. Despite the benefits of cloning, however, certain ethical   questions concerning the possible abuse   of cloning have been raised. At the heart of these questions is the idea of human

In [41]:
model.save(f"../output/saved_models/top2vec_model_on_original_docs_all-MiniLM-L6-v2_{model.get_num_topics()}Topics")