In [52]:
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
import numpy as np
import re
import pickle

In [58]:
# Load the full case table, then keep a small working sample: the first 10
# rows, minus any row without a body, with remaining missing values set to ''.
cases_df = pd.read_csv('../cases.csv')
first_rows = cases_df.iloc[:10]
cases_sample = first_rows.dropna(subset=['body']).fillna('')
# Position within the sample -> row label carried over from cases_df.
index_to_case = dict(enumerate(cases_sample.index))

In [60]:
# with open("index_to_case.pkl", "wb") as f:
#     pickle.dump(index_to_case, f)

In [34]:
# Normalise raw case text: drop URLs and special characters, lower-case
def clean_text(text):
    """
    Strip URLs and special characters from ``text`` and lower-case it.

    URLs are matched case-insensitively (the text is lower-cased afterwards,
    so ``HTTP://...`` has to be caught as well) and scheme-less ``www.``
    links are removed too. Every character other than ASCII letters, digits,
    whitespace and ``. , ?`` is deleted (not replaced by a space).
    Leading/trailing whitespace is trimmed; interior whitespace is left
    untouched.

    :param text: str
    :return: str
    """

    # Remove links first so their punctuation-stripped remains do not end up
    # as junk tokens in the document.
    text = re.sub(r'https?://\S+|\bwww\.\S+', '', text, flags=re.IGNORECASE)
    text = re.sub(r'[^A-Za-z0-9\s.,?]', '', text)
    text = text.strip().lower()
    return text

In [35]:
# Build one cleaned document per case from its title, description and body.
# The fields are joined with a space rather than '-': clean_text() deletes
# hyphens, so a '-' separator would fuse the last word of one field onto the
# first word of the next (e.g. "... appeal-The court ..." -> "appealthe").
documents = [
    clean_text(' '.join(fields))
    for fields in cases_sample[['title', 'description', 'body']].itertuples(index=False, name=None)
]

In [39]:
# Encode every document once up front so the vectors can be reused later
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
embeddings = embedding_model.encode(documents)

In [47]:
# Dimensionality reduction applied to the embeddings before clustering.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,  # fixed seed
)
# Term extraction for topic representations: unigrams and bigrams,
# using scikit-learn's built-in English stop-word list.
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),
    stop_words="english",
)

In [48]:
# Assemble the topic model from the UMAP and vectorizer components above.
bertopic_kwargs = dict(
    umap_model=umap_model,
    vectorizer_model=vectorizer_model,
    top_n_words=10,
    verbose=True,
)
bertopic_model = BERTopic(**bertopic_kwargs)

In [49]:
# Fit the topic model on the cleaned documents, passing the precomputed
# embeddings alongside them.
topics, probs = bertopic_model.fit_transform(
    documents,
    embeddings=embeddings,
)

2025-02-17 20:52:45,152 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-02-17 20:52:45,170 - BERTopic - Dimensionality - Completed ✓
2025-02-17 20:52:45,170 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-02-17 20:52:45,172 - BERTopic - Cluster - Completed ✓
2025-02-17 20:52:45,174 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-02-17 20:52:45,211 - BERTopic - Representation - Completed ✓


In [51]:
# Persist the document embeddings to disk in NumPy's .npy format.
np.save(file='sample_case_embeddings.npy', arr=embeddings)