# **IMPORTS**

In [1]:
import pandas as pd
from bertopic import BERTopic
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')

  axis.set_ylabel('$\lambda$ value')
  $max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$.


# **Dataset**

In [52]:
# load data

df = pd.read_csv('clean_genai-education_2023-2025.csv')

In [53]:
# extract text data from dataframe and convert to list

documents = df['text_clean'].tolist()

# **BERTopic Initialization**

In [41]:
bertopic_model = BERTopic(
    vectorizer_model=TfidfVectorizer(
        max_features=1000,
        max_df=0.95,
        min_df=2,
        ngram_range=(1, 2),
        stop_words='english'),
    calculate_probabilities=True,
    min_topic_size=3,
    verbose=True,
)

topics, probabilities = bertopic_model.fit_transform(documents)

2025-09-20 13:41:49,230 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/14 [00:00<?, ?it/s]

2025-09-20 13:41:50,924 - BERTopic - Embedding - Completed ✓
2025-09-20 13:41:50,925 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-20 13:41:51,686 - BERTopic - Dimensionality - Completed ✓
2025-09-20 13:41:51,687 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-20 13:41:51,737 - BERTopic - Cluster - Completed ✓
2025-09-20 13:41:51,740 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-20 13:41:51,815 - BERTopic - Representation - Completed ✓


In [54]:
# Extract topics and their corresponding words
topics = bertopic_model.get_topics()
topic_words = [[word for word, _ in bertopic_model.get_topic(topic)] for topic in range(len(set(topics)) -1)]

# Preprocess documents for Gensim
preprocessed_docs = [doc.split() for doc in documents]

# Create a Gensim dictionary and corpus
dictionary = Dictionary(preprocessed_docs)
corpus = [dictionary.doc2bow(doc) for doc in preprocessed_docs]

# Calculate the coherence score
coherence_model = CoherenceModel(
    topics=topic_words,
    texts=preprocessed_docs,
    dictionary=dictionary,
    coherence='c_v'
)
coherence_score = coherence_model.get_coherence()

print(f"The number of topics is: {len(set(topics)) - 1}")
print(f"The coherence score is: {coherence_score}")

The number of topics is: 6
The coherence score is: 0.3387786840246121


In [55]:
# Define a range of min_topic_size values to test
min_topic_size_range = range(5, 25, 1)

results = []

for size in min_topic_size_range:
    # Create a new BERTopic model with the current min_topic_size
    bertopic_model = BERTopic(min_topic_size=size)

    # Fit the model and get the topics
    topics, probabilities = bertopic_model.fit_transform(documents)

    # Get topic words
    topic_words = [[word for word, _ in bertopic_model.get_topic(topic_id)] for topic_id in bertopic_model.get_topics()]

    # Prepare documents for Gensim
    preprocessed_docs = [doc.split() for doc in documents]
    dictionary = Dictionary(preprocessed_docs)

    # Calculate coherence score
    coherence_model = CoherenceModel(
        topics=topic_words,
        texts=preprocessed_docs,
        dictionary=dictionary,
        coherence='c_v'
    )
    coherence_score = coherence_model.get_coherence()

    num_topics = len(set(topics)) -1

    results.append({
        'min_topic_size': size,
        'num_topics': num_topics,
        'coherence_score': coherence_score
    })

    print(f"min_topic_size: {size}, Number of Topics: {num_topics}, Coherence Score: {coherence_score}")

# Find the best result
best_result = max(results, key=lambda x: x['coherence_score'])

print("\n--- Best Result ---")
print(f"Optimal min_topic_size: {best_result['min_topic_size']}")
print(f"Resulting Number of Topics: {best_result['num_topics']}")
print(f"Coherence Score: {best_result['coherence_score']:.4f}")

min_topic_size: 5, Number of Topics: 18, Coherence Score: 0.3745907318307412
min_topic_size: 6, Number of Topics: 14, Coherence Score: 0.3536679641439258
min_topic_size: 7, Number of Topics: 13, Coherence Score: 0.33360033506799314
min_topic_size: 8, Number of Topics: 11, Coherence Score: 0.36436182699114256
min_topic_size: 9, Number of Topics: 8, Coherence Score: 0.36332652563771656
min_topic_size: 10, Number of Topics: 7, Coherence Score: 0.37855623536284966
min_topic_size: 11, Number of Topics: 7, Coherence Score: 0.371755269973291
min_topic_size: 12, Number of Topics: 6, Coherence Score: 0.36594527281885075
min_topic_size: 13, Number of Topics: 5, Coherence Score: 0.38967159290700176
min_topic_size: 14, Number of Topics: 5, Coherence Score: 0.3860095316661953
min_topic_size: 15, Number of Topics: 5, Coherence Score: 0.3970073880777549
min_topic_size: 16, Number of Topics: 3, Coherence Score: 0.40598506716151195
min_topic_size: 17, Number of Topics: 5, Coherence Score: 0.38370699796

In [57]:
best_min_topic_size = best_result['min_topic_size']

# Initialize and fit the final, optimized model
bertopic_model = BERTopic(
    vectorizer_model=TfidfVectorizer(max_features=1000, max_df=0.95, min_df=2, ngram_range=(1, 2), stop_words='english'),
    calculate_probabilities=True,
    min_topic_size=best_result['min_topic_size'],
    verbose=True,
)

topics, probabilities = bertopic_model.fit_transform(documents)

print(f"Final model created with min_topic_size = {best_min_topic_size}")
print(f"Number of topics in the final model: {len(set(topics)) - 1}")

2025-09-20 13:51:50,759 - BERTopic - Embedding - Transforming documents to embeddings.
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].


Batches:   0%|          | 0/14 [00:00<?, ?it/s]

2025-09-20 13:51:53,585 - BERTopic - Embedding - Completed ✓
2025-09-20 13:51:53,587 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-20 13:51:54,366 - BERTopic - Dimensionality - Completed ✓
2025-09-20 13:51:54,367 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-20 13:51:54,392 - BERTopic - Cluster - Completed ✓
2025-09-20 13:51:54,396 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-20 13:51:54,454 - BERTopic - Representation - Completed ✓


Final model created with min_topic_size = 16
Number of topics in the final model: 5


# **BERTopic Visualization**

In [58]:
print("\nMost frequent topics:")
bertopic_model.get_topic_info()


Most frequent topics:


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,139,-1_ai_kids_think_teachers,"[ai, kids, think, teachers, use ai, generative...",[teaching kids how to use AI to better prepare...
1,0,136,0_ai_ai education_use ai_language models,"[ai, ai education, use ai, language models, la...",[I appreciate the potential of AI technology t...
2,1,79,1_generative ai_generative_ai_use generative,"[generative ai, generative, ai, use generative...",[G-7 education ministers confirm need to curb ...
3,2,40,2_chatgpt_gpt_essays_grammarly,"[chatgpt, gpt, essays, grammarly, essay, chat,...",[Think Uni sjould habe a harsh stance on Chat ...
4,3,33,3_ai_technologies_ban_university,"[ai, technologies, ban, university, think, way...",[I think it is an important matter as AI will ...
5,4,21,4_valuable_materials_working_great,"[valuable, materials, working, great, things, ...",[think it's a great tutoring tool. It can teac...


In [59]:
bertopic_model.visualize_barchart()

In [63]:
bertopic_model.visualize_heatmap()

In [64]:
bertopic_model.visualize_hierarchy()

In [65]:
bertopic_model.visualize_term_rank()