# https://medium.com/geekculture/nlp-tutorial-topic-modeling-in-python-with-bertopic-da760e1d03aa

In [8]:
# ! pip install bertopic

In [9]:
# ! pip install bertopic[visualization]

In [41]:
import pandas as pd
from bertopic import BERTopic
import nltk  
nltk.download('stopwords')
from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings
from collections import defaultdict
import string                              # for string operations

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/caiomiyashiro/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Read the tab-separated tweet sample into a DataFrame and preview its first rows.
df = pd.read_csv(
    'webscraper/scrape_data/tweets_sample.tsv',
    sep='\t',
)
df.head()

Unnamed: 0,Tweet ID,Country,Date,Tweet,Definitely English,Ambiguous,Definitely Not English,Code-Switched,Ambiguous due to Named Entities,Automatically Generated Tweets
0,434215992731136000,TR,2014-02-14,Bugün bulusmami lazimdiii,0,0,1,0,0,0
1,285903159434563584,TR,2013-01-01,Volkan konak adami tribe sokar yemin ederim :D,0,0,1,0,0,0
2,285948076496142336,NL,2013-01-01,Bed,1,0,0,0,0,0
3,285965965118824448,US,2013-01-01,I felt my first flash of violence at some fool...,1,0,0,0,0,0
4,286057979831275520,US,2013-01-01,Ladies drink and get in free till 10:30,1,0,0,0,0,0


In [74]:
# Keep only the rows whose 'Definitely English' flag equals 1, then lowercase
# the tweet text and collect it into a plain Python list.
df_english = df[df['Definitely English'].eq(1)]
X = df_english['Tweet'].str.lower().tolist()

In [78]:
# Tokenize every lowercased tweet with NLTK's TweetTokenizer.
tweet_tokenizer = TweetTokenizer()

tweets_tokens = [tweet_tokenizer.tokenize(tweet) for tweet in X]

# Preview the first 10 tweets next to their tokens. The preview is bounded by
# the tweet list X itself (not by the length of any single tweet string), so
# up to 10 tweets are always shown regardless of how long the last tweet is.
for original, tokens in zip(X[:10], tweets_tokens[:10]):
    print(f'original: {original}')
    print(f'tokenized {tokens}')
    print('\n')

original: bed
tokenized ['bed']


original: i felt my first flash of violence at some fool who bumped into me.... i pity the fool.
tokenized ['i', 'felt', 'my', 'first', 'flash', 'of', 'violence', 'at', 'some', 'fool', 'who', 'bumped', 'into', 'me', '...', 'i', 'pity', 'the', 'fool', '.']


original: ladies drink and get in free till 10:30
tokenized ['ladies', 'drink', 'and', 'get', 'in', 'free', 'till', '10:30']


original: watching #miranda on bbc1!!! @mermhart u r hilarious ❤💋
tokenized ['watching', '#miranda', 'on', 'bbc', '1', '!', '!', '!', '@mermhart', 'u', 'r', 'hilarious', '❤', '💋']


original: shopping! (@ kohl's) http://t.co/i8zkqht9
tokenized ['shopping', '!', '(', '@', "kohl's", ')', 'http://t.co/i8zkqht9']


original: yessss ^_^
tokenized ['yessss', '^', '_', '^']


original: @dennycrowe all over twitter because you and your friends cant stick up for yourselves.
tokenized ['@dennycrowe', 'all', 'over', 'twitter', 'because', 'you', 'and', 'your', 'friends', 'cant', 'stick'

In [79]:
# English stop words from the NLTK corpus, held in a set.
stopwords_english = set(stopwords.words('english'))

# Each call prints its heading first and the value on the following line.
print('Stop words\n', stopwords_english, sep='\n')
print('\nPunctuation\n', string.punctuation, sep='\n')

Stop words

{'but', "needn't", 'where', 'them', 'over', 'being', 'can', 'yours', 'am', 'shouldn', 'this', 'themselves', 'which', 'too', "you'd", "doesn't", 'been', 'm', "she's", 'couldn', 'into', "couldn't", 'the', 'more', 'you', 'these', 'yourselves', 'an', 'shan', 'it', 'between', 'on', "shouldn't", 'how', 'wasn', 'won', 'needn', 'out', "shan't", 'those', 'what', 'if', 'nor', 'do', 'did', "won't", "weren't", 'aren', 'some', 'before', 'to', 'most', 'he', 'above', 'same', 'didn', 'y', 'again', 'had', 'does', 'with', 'just', 'o', 'each', 'hasn', 'are', "that'll", 'i', 'that', "hasn't", 'herself', "it's", 'd', 'of', 'ain', 'doesn', 're', 'a', "you'll", 'ourselves', 'our', 'after', 'have', "aren't", 'while', "mustn't", 'why', "mightn't", 'is', 'me', 'than', 'about', 'own', 'so', 'should', 'very', 'my', 'few', 'such', "should've", 'was', "wasn't", 'haven', 'not', 'll', 've', "didn't", 'down', "haven't", 'up', 'hers', 'off', 'there', 'don', 'itself', 'we', 'yourself', 'your', 'doing', 'him'

In [80]:
# Characters counted as punctuation. A set gives exact per-token membership:
# `token in string.punctuation` is a substring test, so it also matches
# multi-character tokens that happen to appear contiguously in the constant
# (e.g. '=>' or '()') while missing similar ones (e.g. '->').
punctuation_chars = set(string.punctuation)

# For every tweet, keep the tokens that are neither an English stop word nor a
# single punctuation character. Multi-character tokens such as '...' are kept.
tweets_tokens_filtered = [
    [
        token
        for token in tweet
        if token not in stopwords_english and token not in punctuation_chars
    ]
    for tweet in tweets_tokens
]

# Re-join each tweet's remaining tokens into one space-separated string.
joined_tokens = [' '.join(token_list) for token_list in tweets_tokens_filtered]

In [89]:
from collections import Counter

# Flatten the per-tweet token lists into one list, preserving tweet order,
# then preview the first 10 tokens.
all_tokens = [
    token
    for token_list in tweets_tokens_filtered
    for token in token_list
]
all_tokens[:10]

['bed',
 'felt',
 'first',
 'flash',
 'violence',
 'fool',
 'bumped',
 '...',
 'pity',
 'fool']

In [91]:
# Count how often each token occurs across all filtered tweets.
# NOTE(review): this displays the entire Counter; Counter(all_tokens).most_common(n)
# would limit the output to the n most frequent tokens.
Counter(all_tokens)

Counter({'bed': 16,
         'felt': 5,
         'first': 38,
         'flash': 3,
         'violence': 1,
         'fool': 5,
         'bumped': 1,
         '...': 254,
         'pity': 1,
         'ladies': 6,
         'drink': 10,
         'get': 130,
         'free': 24,
         'till': 13,
         '10:30': 1,
         'watching': 29,
         '#miranda': 1,
         'bbc': 1,
         '1': 33,
         '@mermhart': 1,
         'u': 88,
         'r': 18,
         'hilarious': 4,
         '❤': 62,
         '💋': 13,
         'shopping': 17,
         "kohl's": 1,
         'http://t.co/i8zkqht9': 1,
         'yessss': 1,
         '@dennycrowe': 1,
         'twitter': 12,
         'friends': 18,
         'cant': 6,
         'stick': 3,
         "i'm": 1018,
         'falling': 18,
         'apart': 3,
         'broken': 2,
         'heart': 21,
         'barely': 4,
         'breathing': 1,
         '@cp_udashit': 1,
         'oh': 42,
         'god': 24,
         'go': 105,
         

In [83]:
# Create a BERTopic model with verbose logging enabled.
model = BERTopic(verbose=True)
# Fit the model on the cleaned, re-joined tweet strings and keep the two
# returned values (topic assignments and probabilities).
# NOTE(review): no random seed is configured anywhere in this notebook, so
# topic assignments may differ between runs — confirm whether reproducibility
# is required.
topics, probabilities = model.fit_transform(joined_tokens)

Batches:   0%|          | 0/159 [00:00<?, ?it/s]

2023-04-10 17:58:57,781 - BERTopic - Transformed documents to Embeddings
2023-04-10 17:59:03,259 - BERTopic - Reduced dimensionality
2023-04-10 17:59:03,514 - BERTopic - Clustered reduced embeddings


In [84]:
# Show the first 11 rows of the topic frequency table (Topic and Count columns).
model.get_topic_freq().head(11)

Unnamed: 0,Topic,Count
0,-1,1884
1,0,849
2,1,329
3,2,104
4,3,103
5,4,85
6,5,84
7,6,78
8,7,78
9,8,64


In [85]:
# List the terms of topic 0 together with their scores, as (term, score) pairs.
model.get_topic(0)

[('lol', 0.018472163989407184),
 ('haha', 0.018183742849385585),
 ('love', 0.014563644726876034),
 ('follow', 0.013890040907755342),
 ('ill', 0.011107293875689855),
 ('get', 0.01106124873401852),
 ('like', 0.010932284215015148),
 ('good', 0.01074667564153429),
 ('please', 0.009694467292400928),
 ('never', 0.008885835100551883)]

In [86]:
# Display BERTopic's built-in topic visualization for the fitted model.
model.visualize_topics()

In [87]:
# Display BERTopic's built-in bar chart visualization for the fitted model.
model.visualize_barchart()

In [88]:
# Display BERTopic's built-in heatmap visualization for the fitted model.
# NOTE(review): presumably a topic-to-topic similarity matrix — confirm against the BERTopic docs.
model.visualize_heatmap()

# Getting word, sentence and documents embeddings

https://www.analyticsvidhya.com/blog/2019/02/flair-nlp-library-python/ <-- note: this tutorial contains deprecated flair methods
* Up-to-date usage of transformer embeddings: https://github.com/flairNLP/flair/blob/master/resources/docs/embeddings/TRANSFORMER_EMBEDDINGS.md

## Word Embedding

In [15]:
# from flair.embeddings import BertEmbeddings # had to update to numpy 1.22 (< 1.23)
from flair.embeddings import TransformerWordEmbeddings, TransformerDocumentEmbeddings, Transformer
from flair.data import Sentence


In [8]:
# Example input: one short, pre-tokenized sentence.
sentence = Sentence('The grass is green .')

# Word-level embedder backed by the 'bert-base-uncased' transformer.
embedding = TransformerWordEmbeddings('bert-base-uncased')

# Attach an embedding vector to every token of the sentence.
embedding.embed(sentence)

[Sentence[5]: "The grass is green ."]

In [11]:
# Print each token, followed on the next line by its embedding tensor.
for token in sentence:
    print(token, token.embedding, sep='\n')

Token[0]: "The"
tensor([-3.2318e-02, -3.9037e-01, -1.1946e+00,  1.2959e-01,  5.8057e-01,
        -8.4713e-02, -4.5202e-01,  1.3699e+00,  3.8501e-01, -6.1318e-01,
        -3.2455e-01, -9.8989e-01, -6.8972e-01,  2.7537e-01, -5.8666e-01,
         2.3986e-01,  5.9567e-02,  1.7474e-01, -2.6166e-03,  2.0810e-01,
        -3.0275e-01, -5.1784e-01, -7.6839e-01,  5.9653e-01,  2.2556e-01,
         2.1899e-01,  9.6028e-01,  3.9233e-01,  6.9992e-01, -6.3882e-02,
        -5.0272e-01,  1.8365e-01, -9.1203e-01, -7.9365e-02, -3.0147e-01,
         5.0364e-02, -4.4785e-01,  3.6136e-01, -3.9016e-01,  3.3766e-01,
        -2.9918e-01, -1.4879e-01,  8.0483e-01,  2.2012e-02,  8.1105e-01,
        -1.3799e+00,  2.8989e-01, -4.8946e-01, -3.1791e-01, -5.1490e-01,
         5.3774e-02,  1.5167e-01,  1.2746e-01,  6.4179e-01,  5.3434e-01,
         1.3347e+00, -4.8003e-04,  9.2838e-01,  5.0098e-01,  3.4710e-01,
        -2.4986e-01,  2.4073e-02,  7.8260e-01, -7.3836e-01, -6.3832e-04,
         3.6023e-01, -4.5780e-01,  

In [14]:
# Shape of the embedding vector attached to the first token.
sentence[0].embedding.shape

torch.Size([768])

## Sentence Embedding

In [17]:
# init a document-level embedder (TransformerDocumentEmbeddings), not a word-level one
embedding = TransformerDocumentEmbeddings('bert-base-uncased')

# create a Sentence object holding a two-sentence text
sentence = Sentence("The grass is green. The neighbor's grass is greener.")

# embed the text held by the Sentence object
# NOTE(review): this is the same code as the cell under "Document Embedding" —
# confirm a different embedding class was not intended for this section
embedding.embed(sentence)

[Sentence[12]: "The grass is green. The neighbor's grass is greener."]

## Document Embedding

In [17]:
# init a document-level embedder backed by 'bert-base-uncased'
embedding = TransformerDocumentEmbeddings('bert-base-uncased')

# create a Sentence object holding a two-sentence text
sentence = Sentence("The grass is green. The neighbor's grass is greener.")

# embed the text as a whole; the resulting vector is read from sentence.embedding
embedding.embed(sentence)

[Sentence[12]: "The grass is green. The neighbor's grass is greener."]

In [21]:
# Shape of the single embedding vector stored on the Sentence object itself.
sentence.embedding.shape

torch.Size([768])