In [2]:
import os
import pandas as pd
import numpy as np
from bertopic import BERTopic
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired
from bertopic.representation import MaximalMarginalRelevance
from transformers import pipeline
from bertopic.representation import TextGeneration
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN



  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read the English-language tweet export from the raw data folder and preview it.
raw_tweets_path = "../data/raw/ukraine_tweets_en.parquet"
df = pd.read_parquet(raw_tweets_path)
df.head()

Unnamed: 0,id,author_id,created_at,lang,in_reply_to_user_id,conversation_id,text,reply_settings,possibly_sensitive,retweeted_id,...,expanded_url,mention_name,hashtags,retweet_count,reply_count,like_count,quote_count,username,individual_or_org,category
0,1546962210194325505,1082604507454939137,2022-07-12T20:58:26.000Z,en,,1546962210194325505,RT @RALee85: Video of a damaged Ukrainian M777...,everyone,False,1.5469452227389071e+18,...,https://t.me/wargonzo/7517 https://twitter.com...,RALee85,,113,0,0,0,Archer83Able,Organization,Media
1,1546962286513881090,3028079651,2022-07-12T20:58:44.000Z,en,1.4625489773673595e+18,1546960688899395585,@KyivIndependent Russia just accidentally mana...,everyone,False,,...,,KyivIndependent,,1,1,63,0,mdfzeh,Organization,Other
2,1546962379870801920,60698597,2022-07-12T20:59:07.000Z,en,,1546962379870801920,RT @aaronjmate: For people who miss the old @d...,everyone,False,1.5469067455487263e+18,...,,aaronjmate democracynow unjoe juangon68,,59,0,0,0,Consortiumnews,Organization,Media
3,1546962441573203971,888449203688677377,2022-07-12T20:59:21.000Z,en,,1546962441573203971,"NOW: Separatist🇷🇺 chat in #Lugansk, occupied-U...",everyone,False,,...,https://twitter.com/officejjsmart/status/15469...,,Lugansk,71,19,443,4,officejjsmart,Individual,Media
4,1546962711879319552,556214449,2022-07-12T21:00:26.000Z,en,556214449.0,1546961828277788672,Location seems to be Industrial Area of Luhans...,everyone,False,,...,,,,1,0,15,0,aldin_aba,Individual,Private Individual


In [4]:
# Number of rows (tweets) for each value of the `category` column, largest first.
df['category'].value_counts()

category
Media                               310613
Private Individual                  263684
Other                               176778
Activist                             93173
Politics                             55541
International Organization / NGO     17166
Name: count, dtype: int64

In [6]:

# Restrict the corpus to rows that have text and belong to the 'Politics' category.
has_text = df['text'].notna()
is_politics = df['category'] == 'Politics'
df = df[has_text & is_politics]

documents = df['text'].tolist()
# The count printed here is the size of the full Politics subset, before truncation.
print(f"Number of documents: {len(documents)}")

# Work on the first 10,000 tweets only; the timestamps are cut to the same length so
# that both lists stay aligned element by element.
documents = documents[:10000]
timestamps = df['created_at'].tolist()[:len(documents)]


Number of documents: 55541


In [8]:

# Components handed to BERTopic: a uni-/bi-gram bag-of-words vectorizer with English
# stop words removed, and the sentence encoder used to embed the tweets.
ngram_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 2))
sentence_encoder = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize the BERTopic model.
# `umap_model` and `hdbscan_model` are passed as None here; tuned instances are
# assigned to the model right before fitting.
topic_model = BERTopic(
    language='english',
    embedding_model=sentence_encoder,
    umap_model=None,
    hdbscan_model=None,
    vectorizer_model=ngram_vectorizer,
    ctfidf_model=ClassTfidfTransformer(),
    calculate_probabilities=True,
    verbose=True,
)


In [9]:
# Embed every document once with the model's sentence encoder, so later steps can
# reuse the vectors instead of re-encoding the text.
encoder = topic_model.embedding_model
embeddings = encoder.encode(documents, show_progress_bar=True)

# Persist the vectors to disk next to the notebook.
np.save("embeddings_test.npy", embeddings)

Batches: 100%|██████████| 313/313 [00:09<00:00, 32.07it/s]


In [None]:
# Load precomputed embeddings
embeddings = np.load("embeddings_test.npy")

# Fail fast if the cached file does not match the current document list: the model is
# fitted with `documents` and `embeddings` side by side, so row i of `embeddings`
# must belong to documents[i]. A stale file would otherwise only surface after the
# (long) parameter search.
if len(embeddings) != len(documents):
    raise ValueError(
        f"embeddings_test.npy holds {len(embeddings)} rows but there are "
        f"{len(documents)} documents; recompute the embeddings."
    )

2025-06-24 18:04:13,598 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


2025-06-24 18:04:16,475 - BERTopic - Dimensionality - Completed ✓
2025-06-24 18:04:16,476 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-24 18:04:16,848 - BERTopic - Cluster - Completed ✓
2025-06-24 18:04:16,850 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-06-24 18:04:17,226 - BERTopic - Representation - Completed ✓


In [10]:
# Pick the UMAP and HDBSCAN parameters that maximise HDBSCAN's `relative_validity_`.
#
# UMAP is stochastic, so one seed is shared by the search and by the final model:
# this makes the search repeatable and builds the final reducer with the same seed
# that was used when the winning configuration was scored.
# NOTE(review): per the UMAP docs a fixed random_state trades some speed for
# reproducibility - confirm the extra runtime is acceptable.
UMAP_RANDOM_STATE = 42

# Start from -inf rather than 0 so that a winner is still recorded when every
# configuration scores <= 0 (previously `best_params` stayed None and the print
# below failed with a TypeError).
max_relative_validity = -np.inf
best_params = None
for n_neighbors in [15, 50]:
    for n_components in [2, 5, 10, 20, 50]:
        # The projection depends only on the UMAP settings, so it is computed once
        # and reused for every HDBSCAN configuration below.
        umap_model = UMAP(
            n_neighbors=n_neighbors,
            n_components=n_components,
            min_dist=0.0,
            metric='cosine',
            random_state=UMAP_RANDOM_STATE,
        )
        reduced_embeddings = umap_model.fit_transform(embeddings)
        for min_cluster_size in [10, 20, 50, 100, 200, 500]:
            for cluster_selection_method in ['eom', 'leaf']:
                # gen_min_span_tree=True is kept because `relative_validity_` is
                # computed from the minimum spanning tree.
                hdbscan_model = HDBSCAN(
                    min_cluster_size=min_cluster_size,
                    metric='euclidean',
                    cluster_selection_method=cluster_selection_method,
                    prediction_data=True,
                    gen_min_span_tree=True,
                )
                relative_validity = hdbscan_model.fit(reduced_embeddings).relative_validity_
                # A NaN score compares False here and is therefore never selected.
                if relative_validity > max_relative_validity:
                    max_relative_validity = relative_validity
                    best_params = (n_neighbors, n_components, min_cluster_size, cluster_selection_method,)

if best_params is None:
    raise RuntimeError("No UMAP/HDBSCAN configuration produced a comparable relative_validity_ score.")

print(f"Best parameters: n_neighbors={best_params[0]}, n_components={best_params[1]}, min_cluster_size={best_params[2]}, cluster_selection_method={best_params[3]}, max_relative_validity={max_relative_validity}")

# Fresh, unfitted models built from the winning configuration; these are the ones
# handed to BERTopic.
umap_model = UMAP(
    n_neighbors=best_params[0],
    n_components=best_params[1],
    min_dist=0.0,
    metric='cosine',
    random_state=UMAP_RANDOM_STATE,
)
hdbscan_model = HDBSCAN(
    min_cluster_size=best_params[2],
    metric='euclidean',
    cluster_selection_method=best_params[3],
    prediction_data=True,
    gen_min_span_tree=True,
)

Best parameters: n_neighbors=15, n_components=50, min_cluster_size=20, cluster_selection_method=eom, max_relative_validity=0.3250034743468451


In [11]:

# Attach the tuned reducer and clusterer to the model, then fit it on the documents
# using the precomputed embeddings instead of re-encoding the text.
topic_model.umap_model, topic_model.hdbscan_model = umap_model, hdbscan_model
topics, probabilities = topic_model.fit_transform(documents, embeddings=embeddings)

2025-06-24 19:28:54,998 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-24 19:28:58,430 - BERTopic - Dimensionality - Completed ✓
2025-06-24 19:28:58,433 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-24 19:29:01,294 - BERTopic - Cluster - Completed ✓
2025-06-24 19:29:01,296 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-06-24 19:29:01,659 - BERTopic - Representation - Completed ✓


In [12]:
# Render BERTopic's interactive overview of the fitted topics.
topic_model.visualize_topics()

In [13]:
# Bar charts of the topic terms, limited to 10 topics via `top_n_topics`.
topic_model.visualize_barchart(top_n_topics=10)

In [14]:
# Tabulate the fitted topics: one row per topic with its id, document count, name,
# term representation and representative documents (topic -1 is listed first).
topics_df = topic_model.get_topic_info()
topics_df

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4630,-1_rt_ukraine_https_russia,"[rt, ukraine, https, russia, russian, ukrainia...",[President Biden of Russian President Vladimir...
1,0,449,0_sanctions_gas_oil_russia,"[sanctions, gas, oil, russia, energy, sanction...",[Putin continues to use energy as a weapon.\n\...
2,1,250,1_killed_children_injured_old,"[killed, children, injured, old, died, wounded...",[Prosecutor General's Office daily update: rus...
3,2,238,2_putin_rt_putin power_war,"[putin, rt, putin power, war, hope removing, s...",[@Independent My thoughts on how to penetrate ...
4,3,223,3_grain_food_global_port,"[grain, food, global, port, ports, global food...",[Russian missiles struck Odesa’s port on Satur...
...,...,...,...,...,...
63,62,23,62_losses_combat losses_estimates_indicative,"[losses, combat losses, estimates, indicative,...",[RT @KyivIndependent: These are the indicative...
64,63,23,63_russian navy_navy_sinking moskva_sinking,"[russian navy, navy, sinking moskva, sinking, ...",[@Conflicts Not surprising that the Russian Na...
65,64,23,64_rada_verkhovna rada_verkhovna_rada ukraine,"[rada, verkhovna rada, verkhovna, rada ukraine...",[Speaker of the Verkhovna Rada of #Ukraine @r_...
66,65,22,65_russian comrades_rebel russian_comrades_com...,"[russian comrades, rebel russian, comrades, co...",[@SamRamani2 I think only the Russian people c...


In [None]:

# Track how the fitted topics evolve over time: the tweet timestamps are grouped into
# 20 bins, with both global and evolutionary tuning switched on.
topics_over_time = topic_model.topics_over_time(
    documents,
    timestamps,
    nr_bins=20,
    evolution_tuning=True,
    global_tuning=True,
)


In [20]:
def count_months_passed(df, col_name):
    """Return the number of calendar months spanned by a date column.

    The result is the difference between the latest and the earliest value of
    ``df[col_name]`` counted in (year, month) steps; the day of the month is
    ignored, so 2022-01-31 -> 2022-02-01 counts as one month.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the date column. The frame is left untouched.
    col_name : str
        Name of a column holding datetimes or values that ``pd.to_datetime``
        can parse.

    Returns
    -------
    int
        Months between the earliest and the latest date.

    Raises
    ------
    ValueError
        If the column contains no valid date.
    """
    # Parse into a local Series instead of assigning back to df[col_name]:
    # a helper that only counts months should not rewrite the caller's frame.
    dates = pd.to_datetime(df[col_name])

    min_date = dates.min()
    max_date = dates.max()

    # An empty or all-missing column yields NaT here, which would otherwise turn
    # into a silent NaN result.
    if pd.isna(min_date) or pd.isna(max_date):
        raise ValueError(f"Column {col_name!r} contains no valid dates.")

    # Calculate months difference
    return (max_date.year - min_date.year) * 12 + (max_date.month - min_date.month)
# Report how many calendar months the `created_at` values of the current `df` span.
months_passed = count_months_passed(df, 'created_at')
print(f"Number of months passed: {months_passed}")


Number of months passed: 12


In [16]:
# Preview the first rows of the current `df`.
df.head()


Unnamed: 0,id,author_id,created_at,lang,in_reply_to_user_id,conversation_id,text,reply_settings,possibly_sensitive,retweeted_id,...,expanded_url,mention_name,hashtags,retweet_count,reply_count,like_count,quote_count,username,individual_or_org,category
5,1546962934294888449,1424639970,2022-07-12T21:01:19.000Z,en,,1546962934294888449,Selling drones to Russia would be a big win fo...,everyone,False,,...,https://edition.cnn.com/europe/live-news/russi...,,,6,0,33,0,IuliiaMendel,Individual,Politics
38,1546967009493139456,1424639970,2022-07-12T21:17:30.000Z,en,,1546967009493139456,"Last month, Ukrainian prosecutors launched the...",everyone,False,,...,,,,64,4,218,3,IuliiaMendel,Individual,Politics
50,1546968380602830850,1106777071,2022-07-12T21:22:57.000Z,en,1.4625489773673595e+18,1546960688899395585,@KyivIndependent My take on how Soviet hubris ...,everyone,False,,...,https://realcontextnews.com/moscows-1939-finla...,KyivIndependent,,13,4,36,0,bfry1981,Individual,Politics
53,1546968414098538499,1106777071,2022-07-12T21:23:05.000Z,en,,1546968414098538499,My take on how Soviet hubris in Finland in 193...,everyone,False,,...,https://realcontextnews.com/moscows-1939-finla...,,,0,0,1,0,bfry1981,Individual,Politics
74,1546969311130144768,1106777071,2022-07-12T21:26:39.000Z,en,4970411.0,1546966420046635008,@AJEnglish I've been saying for some time that...,everyone,False,,...,https://realcontextnews.com/how-ukraine-can-ta...,AJEnglish,,0,1,3,0,bfry1981,Individual,Politics
