In [1]:
#Installing necessary libraries for OpenAlex API and BERTopic pipeline

%pip install bertopic #BERTopic:
%pip install sentence-transformers #Sentence transformers
%pip install spacy #Spacy
%pip install hdbscan #HDBSCAN
%pip install umap-learn #UMAP
%pip install sklearn #SKlearn

Collecting bertopic
  Using cached bertopic-0.16.0-py2.py3-none-any.whl (154 kB)
Collecting numpy>=1.20.0
  Using cached numpy-1.26.2-cp311-cp311-macosx_11_0_arm64.whl (14.0 MB)
Collecting hdbscan>=0.8.29
  Using cached hdbscan-0.8.33-cp311-cp311-macosx_10_9_universal2.whl
Collecting umap-learn>=0.5.0
  Using cached umap-learn-0.5.5.tar.gz (90 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting pandas>=1.1.5
  Using cached pandas-2.1.4-cp311-cp311-macosx_11_0_arm64.whl (10.8 MB)
Collecting scikit-learn>=0.22.2.post1
  Using cached scikit_learn-1.3.2-cp311-cp311-macosx_12_0_arm64.whl (9.4 MB)
Collecting tqdm>=4.41.1
  Using cached tqdm-4.66.1-py3-none-any.whl (78 kB)
Collecting sentence-transformers>=0.4.1
  Using cached sentence-transformers-2.2.2.tar.gz (85 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting plotly>=4.7.0
  Using cached plotly-5.18.0-py3-none-any.whl (15.6 MB)
Collecting cython<3,>=0.27
  Using cached Cython-0.29.37-py2.py3-none-any.whl

### The BERTopic model below was fine-tuned to produce topic representations for the **_abstract_list_clean_en_** corpus.

In [None]:
##Using BERTopic for Topic Modeling
#check supporting documentation: https://maartengr.github.io/BERTopic/index.html

from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer


# BERTopic pipeline configuration.
# Every stage is built explicitly and then handed to BERTopic, so each one can
# be tuned independently of the others.

# Step 1 - Extract embeddings: sentence-transformers model used to embed the
# documents.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality of the embeddings before clustering.
# UMAP is stochastic: without a fixed random_state two runs over the same
# documents can produce different topics. The seed makes the run repeatable
# (UMAP may run single-threaded when a seed is set).
umap_model = UMAP(
    n_neighbors=5,
    n_components=3,
    min_dist=0.2,
    metric='cosine',
    low_memory=False,
    random_state=42,
)

# Step 3 - Cluster reduced embeddings. min_cluster_size is the lever to adjust
# the number of clusters / topics.
hdbscan_model = HDBSCAN(
    min_cluster_size=5,
    metric='euclidean',
    cluster_selection_method='leaf',
    prediction_data=True,
)

# Step 4 - Tokenize topics, dropping English stop words.
vectorizer_model = CountVectorizer(stop_words="english")

# Step 5 - Create topic representation (c-TF-IDF with frequent words reduced).
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Step 6 - (Optional) Fine-tune topic representations with MMR. Decreases
# redundancy and improves diversity of keywords.
representation_model = MaximalMarginalRelevance(diversity=0.5)

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,            # Step 1 - Extract embeddings
    umap_model=umap_model,                      # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,                # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,          # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,                  # Step 5 - Create topic representation
    #seed_topic_list = keywords_list,           # Step 5a - Add keyword list based on KeyBERT output & finetuning
    representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic representation
    calculate_probabilities=True,
    verbose=True,
)

# Saving the model
# NOTE(review): no fit()/fit_transform() call appears in this cell before the
# save - confirm the model is fitted elsewhere, or that persisting the
# untrained configuration is intended.
model_path = "BERT_AI_Policy_topic_model"
topic_model.save(model_path)

### The BERTopic model below was fine-tuned to produce topic representations for the **_list_of_abs_** corpus.

In [16]:
##Using BERTopic for Topic Modeling
#check supporting documentation: https://maartengr.github.io/BERTopic/index.html

from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer


# BERTopic pipeline configuration for the "reduced" model.
# Every stage is built explicitly and then handed to BERTopic, so each one can
# be tuned independently of the others.
# NOTE: this cell rebinds the same names (embedding_model, umap_model, ...,
# topic_model, model_path) that the earlier configuration cell assigns.

# Step 1 - Extract embeddings: sentence-transformers model used to embed the
# documents.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality of the embeddings before clustering.
# UMAP is stochastic: without a fixed random_state two runs over the same
# documents can produce different topics. The seed makes the run repeatable
# (UMAP may run single-threaded when a seed is set).
umap_model = UMAP(
    n_neighbors=3,
    n_components=3,
    min_dist=0.1,
    metric='cosine',
    low_memory=False,
    random_state=42,
)

# Step 3 - Cluster reduced embeddings. min_cluster_size is the lever to adjust
# the number of clusters / topics.
hdbscan_model = HDBSCAN(
    min_cluster_size=3,
    metric='euclidean',
    cluster_selection_method='leaf',
    prediction_data=True,
)

# Step 4 - Tokenize topics, dropping English stop words.
vectorizer_model = CountVectorizer(stop_words="english")

# Step 5 - Create topic representation (c-TF-IDF with frequent words reduced).
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Step 6 - (Optional) Fine-tune topic representations with MMR. Decreases
# redundancy and improves diversity of keywords.
representation_model = MaximalMarginalRelevance(diversity=0.5)

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,            # Step 1 - Extract embeddings
    umap_model=umap_model,                      # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,                # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,          # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,                  # Step 5 - Create topic representation
    #seed_topic_list = keywords_list,           # Step 5a - Add keyword list based on KeyBERT output & finetuning
    representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic representation
    calculate_probabilities=True,
    verbose=True,
)

# Saving the model
# NOTE(review): no fit()/fit_transform() call appears in this cell before the
# save - confirm the model is fitted elsewhere, or that persisting the
# untrained configuration is intended.
model_path = "BERT_AI_Policy_topic_model_reduced"
topic_model.save(model_path)

