In [None]:
#Installing necessary libraries for OpenAlex API and BERTopic pipeline

%pip install bertopic #BERTopic:
%pip install sentence-transformers #Sentence transformers
%pip install spacy #Spacy
%pip install hdbscan #HDBSCAN
%pip install umap-learn #UMAP
%pip install sklearn #SKlearn

In [None]:
## Using BERTopic for topic modeling
# See the supporting documentation: https://maartengr.github.io/BERTopic/index.html

from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer


# --- BERTopic pipeline configuration ---------------------------------------
# Every stage is built as its own object first and then handed to BERTopic.

# 1) Embeddings: sentence-transformers model used to extract embeddings.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# 2) Dimensionality reduction of the embeddings.
umap_model = UMAP(
    n_neighbors=5,
    n_components=3,
    min_dist=0.2,
    metric="cosine",
    low_memory=False,
)

# 3) Clustering of the reduced embeddings. These settings are the lever for
#    adjusting the number of clusters / topics.
hdbscan_model = HDBSCAN(
    min_cluster_size=5,
    metric="euclidean",
    cluster_selection_method="leaf",
    prediction_data=True,
)

# 4) Tokenization of topics, with English stop words configured.
vectorizer_model = CountVectorizer(stop_words="english")

# 5) Topic representation via class-based TF-IDF.
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# 6) Optional fine-tuning of the topic representations with MMR, to decrease
#    redundancy and improve the diversity of keywords.
representation_model = MaximalMarginalRelevance(diversity=0.5)

# --- Assemble the full pipeline ---------------------------------------------
# Options currently left disabled (kept for reference):
#   min_topic_size=10
#   seed_topic_list=keywords_list   (keyword list based on KeyBERT output
#                                    & finetuning)
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    calculate_probabilities=True,
    verbose=True,
)

# --- Save the model ---------------------------------------------------------
# NOTE(review): no fit/fit_transform call appears between constructing the
# model and saving it in this cell -- confirm whether saving the model at this
# point is intended.
model_path = "BERT_AI_Policy_topic_model"
topic_model.save(model_path)