In [53]:
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from hdbscan import HDBSCAN
import numpy as np
import re

In [5]:
# Load the raw cases and carve out a small working sample: the first 10 rows,
# minus any row without a 'body', with every remaining missing value replaced
# by an empty string so the text columns can be joined safely later on.
cases_df = pd.read_csv('../cases.csv')
cases_sample = (
    cases_df
    .head(10)
    .dropna(subset=['body'])
    .fillna('')
)

In [7]:
def clean_text(text):
    """Return ``text`` with URLs and special characters removed, lowercased.

    Steps, in order:
      1. Delete every ``http://`` / ``https://`` URL (up to the next whitespace).
      2. Delete every character that is not an ASCII letter, a digit,
         whitespace, or one of ``.``, ``,``, ``?``.
      3. Trim leading/trailing whitespace and convert to lowercase.

    Removed characters are deleted outright (not replaced by a space), so
    interior whitespace is left exactly as it was.
    """
    without_urls = re.sub(r'https?://\S+', '', text)
    allowed_chars_only = re.sub(r'[^A-Za-z0-9\s.,?]', '', without_urls)
    return allowed_chars_only.strip().lower()

In [8]:
# One document per sampled case: title, description and body joined with
# single spaces, then normalised by clean_text.
documents = [
    clean_text(' '.join((case['title'], case['description'], case['body'])))
    for _, case in cases_sample.iterrows()
]

In [39]:
# Load the sentence-transformers model identified by "all-MiniLM-L6-v2".
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# Encode every cleaned document up front; the result is handed to BERTopic's
# fit_transform later in the notebook.
# NOTE(review): presumably one embedding vector per document — confirm shape.
embeddings = embedding_model.encode(documents)

In [43]:
# Dimensionality reduction applied to the document embeddings before
# clustering. random_state is pinned so repeated runs use the same seed.
# NOTE(review): n_neighbors=15 is larger than the sample (at most 10 rows are
# kept from cases.csv) — confirm this is intended before scaling up.
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

# Bag-of-words vectorizer used to build the topic representations.
# This cell previously assigned vectorizer_model twice; the first assignment
# (stop_words="english", min_df=2, ngram_range=(1, 2)) was overwritten on the
# very next line and never took effect, so only the effective one is kept.
# To try the stricter variant, replace the line below rather than adding a
# second assignment.
vectorizer_model = CountVectorizer(stop_words="english")

# Custom clustering model, currently disabled (not passed to BERTopic):
# hdbscan_model = HDBSCAN(min_cluster_size=150, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

In [44]:
# Assemble the topic model from the components configured above. No custom
# clustering model is supplied here.
topic_model = BERTopic(
    top_n_words=10,  # number of words requested per topic
    verbose=True,  # log each pipeline stage while fitting
    embedding_model=embedding_model,
    umap_model=umap_model,
    vectorizer_model=vectorizer_model,
)

In [56]:
# Fit the topic model on the cleaned documents, passing the embeddings that
# were already computed so the documents do not need to be encoded again.
topics, probs = topic_model.fit_transform(documents, embeddings)
# `embeddings` is deliberately left untouched here. It was previously
# recomputed through the private topic_model._extract_embeddings(documents),
# which re-encoded every document with the same embedding model and relied on
# a non-public BERTopic method. The precomputed array is what gets saved.

2025-02-17 16:46:29,739 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-02-17 16:46:29,755 - BERTopic - Dimensionality - Completed ✓
2025-02-17 16:46:29,755 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-02-17 16:46:29,757 - BERTopic - Cluster - Completed ✓
2025-02-17 16:46:29,759 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-02-17 16:46:29,777 - BERTopic - Representation - Completed ✓


In [57]:
# Persist the document embeddings to disk as a .npy file for later reuse.
np.save(file='sample_embeddings.npy', arr=embeddings)