In [1]:
import pandas as pd
import pickle
with open('world_cup_tweets.pkl', 'rb') as f:
    data = pickle.load(f)

In [2]:
data = data.Tweet_processed.to_list()

In [3]:
from umap import UMAP
from hdbscan import HDBSCAN

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
umap_model = UMAP(n_neighbors=3, n_components=3, min_dist=0.05)
hdbscan_model = HDBSCAN(min_cluster_size=80, min_samples=40,
                        gen_min_span_tree=True,
                        prediction_data=True)

In [30]:
import time
start_time = time.time()

from bertopic import BERTopic

model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    top_n_words=10,
    language='english',
    calculate_probabilities=True,
    verbose=True,
    n_gram_range=(1, 2)
)
topics_A, probs_A = model.fit_transform(data)

end_time = time.time()
print("Time taken: ", end_time - start_time, "seconds")

Batches: 100%|██████████| 313/313 [02:07<00:00,  2.45it/s]
2023-03-26 14:23:33,717 - BERTopic - Transformed documents to Embeddings
2023-03-26 14:23:39,021 - BERTopic - Reduced dimensionality
2023-03-26 14:23:39,915 - BERTopic - Clustered reduced embeddings


Time taken:  136.26601099967957 seconds


In [15]:
import time
start_time = time.time()

from bertopic import BERTopic
topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True, n_gram_range=(1, 2))
topics_B, probs_B = topic_model.fit_transform(data)

end_time = time.time()
print("Time taken: ", end_time - start_time, "seconds")

Batches: 100%|██████████| 313/313 [02:06<00:00,  2.47it/s]
2023-03-26 13:56:31,844 - BERTopic - Transformed documents to Embeddings
2023-03-26 13:56:42,529 - BERTopic - Reduced dimensionality
2023-03-26 13:56:50,661 - BERTopic - Clustered reduced embeddings


Time taken:  150.38280987739563 seconds


In [31]:
freq_A = model.get_topic_info()

In [41]:
# freq_A

In [33]:
freq_B = topic_model.get_topic_info()

In [42]:
# freq_B[:11]

In [43]:
# model.visualize_barchart(top_n_topics=10)

In [44]:
# topic_model.visualize_barchart(top_n_topics=10)

In [45]:
# model.visualize_hierarchy()

In [46]:
# topic_model.visualize_hierarchy()

In [49]:
df = pd.DataFrame({'topic': topics_A, 'document': data})
for topic in range(10):
    topic_info = model.get_topic(topic)
    representative_docs = df[df.topic == topic][:10]['document'].tolist()
    
    print("Topic: ", topic)
    print("Topic Information: ")
    print(topic_info)
    print("Representative Documents: ")
    print(representative_docs)
    print("\n")

IndentationError: unexpected indent (728242757.py, line 3)

In [50]:
df = pd.DataFrame({'topic': topics_B, 'document': data})
for topic in range(10):
    topic_info = topic_model.get_topic(topic)
    representative_docs = df[df.topic == topic][:10]['document'].tolist()
    
    print("Topic: ", topic)
    print("Topic Information: ")
    print(topic_info)
    print("Representative Documents: ")
    print(representative_docs)
    print("\n")

IndentationError: unexpected indent (2352917094.py, line 3)

In [40]:
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN

# Define a list of parameters to try for UMAP
umap_params = [
    {'n_neighbors': 15, 'n_components': 2, 'min_dist': 0.1},
    {'n_neighbors': 10, 'n_components': 2, 'min_dist': 0.01},
    {'n_neighbors': 3, 'n_components': 2, 'min_dist': 0.001}
]

# Define a list of parameters to try for HDBSCAN
hdbscan_params = [
    {'min_cluster_size': 100, 'min_samples': 100},
    {'min_cluster_size': 50, 'min_samples': 70},
    {'min_cluster_size': 5, 'min_samples': 50}
]

# Loop over the parameter combinations and fit BERTopic models
for umap_param in umap_params:
    for hdbscan_param in hdbscan_params:
        # Create UMAP and HDBSCAN models with the current parameter combination
        umap_model = UMAP(**umap_param)
        hdbscan_model = HDBSCAN(**hdbscan_param, gen_min_span_tree=True, prediction_data=True)

        # Fit a BERTopic model with the current parameter combination
        model = BERTopic(
            umap_model=umap_model,
            hdbscan_model=hdbscan_model,
            top_n_words=10,
            language='english',
            calculate_probabilities=True,
            verbose=True,
            n_gram_range=(1, 2)
        )
        topics, probs = model.fit_transform(data)

        # Visualize the hierarchy and save the figure to an HTML file
        fig = model.visualize_hierarchy()
        fig.write_html(f'model_umap_{umap_param}_hdbscan_{hdbscan_param}.html')


Batches: 100%|██████████| 313/313 [02:06<00:00,  2.48it/s]
2023-03-26 15:00:21,712 - BERTopic - Transformed documents to Embeddings
2023-03-26 15:00:31,458 - BERTopic - Reduced dimensionality
2023-03-26 15:00:32,787 - BERTopic - Clustered reduced embeddings
Batches: 100%|██████████| 313/313 [02:05<00:00,  2.49it/s]
2023-03-26 15:02:40,510 - BERTopic - Transformed documents to Embeddings
2023-03-26 15:02:49,648 - BERTopic - Reduced dimensionality
2023-03-26 15:02:50,331 - BERTopic - Clustered reduced embeddings
Batches: 100%|██████████| 313/313 [02:11<00:00,  2.38it/s]
2023-03-26 15:05:03,826 - BERTopic - Transformed documents to Embeddings
2023-03-26 15:05:12,959 - BERTopic - Reduced dimensionality
2023-03-26 15:05:13,728 - BERTopic - Clustered reduced embeddings
Batches: 100%|██████████| 313/313 [02:09<00:00,  2.42it/s]
2023-03-26 15:07:25,221 - BERTopic - Transformed documents to Embeddings
2023-03-26 15:07:34,801 - BERTopic - Reduced dimensionality
2023-03-26 15:07:35,680 - BERTopic