Copyright 2025 Province of British Columbia

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and limitations under the License.

In [None]:
# system stuff
import sys
import os

# the usual
import pandas as pd
import numpy as np

# cluster stuff 
from sentence_transformers import SentenceTransformer
import hdbscan # package for density based clustering 
from sklearn.preprocessing import StandardScaler

# human friendly topics
from transformers import pipeline
from keybert import KeyBERT

# dimensionality reduction
from sklearn.manifold import TSNE # good for visuals
import umap

# display stuff
import matplotlib.pyplot as plt
import seaborn as sns

# my stuff
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
from src.config import data_path_rvm, out_folder

In [None]:
# Read the 'Q07a' sheet, dropping the first four columns and the last one.
df = pd.read_excel(data_path_rvm, sheet_name='Q07a').iloc[:, 4:-1]

# Keep the untouched names of every column after the first, then give the
# first remaining column (the free-text answer) a fixed name.
labels_original = list(df.columns[1:])
df.columns = ['Response'] + labels_original

# snake_case-ish column names: lower-case, with ' ', '/' and ':' turned into '_'.
separators_to_underscore = str.maketrans(' /:', '___')
df.columns = [name.lower().translate(separators_to_underscore) for name in df.columns]

# Discard rows without a response and renumber the remaining rows from 0.
df = df.dropna(subset=['response']).reset_index(drop=True)

In [None]:
# quick look at the first rows of the cleaned response table
df.head()

In [None]:
# number of rows left after dropping missing responses
len(df)

In [None]:
# load an embedding model to translate the text to vectors
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# generate embeddings
# encode() is documented for a str or a list of str, so hand it a plain list of
# strings: this avoids depending on the Series' integer index for lookups and
# makes sure non-text cell values (e.g. numbers read from Excel) are embedded
# as text rather than reaching the tokenizer as non-strings.
response_texts = df['response'].astype(str).tolist()
embeddings = embedding_model.encode(response_texts, show_progress_bar=True)

In [None]:
# Standardize every embedding dimension with StandardScaler before the
# dimensionality reduction / clustering steps.
scaler = StandardScaler()
scaler.fit(embeddings)
normalized_embeddings = scaler.transform(embeddings)

In [None]:
# reduce dimensionality before clustering
# UMAP is stochastic: without a seed the reduced coordinates, and therefore the
# clusters and the exported CSVs, change on every run. Use the same seed as the
# t-SNE step (42) so the whole notebook is reproducible.
umap_reducer = umap.UMAP(
    n_components=10,
    n_neighbors=15,
    min_dist=0.1,
    metric='euclidean',
    random_state=42,
)

reduced_embeddings = umap_reducer.fit_transform(normalized_embeddings)

In [None]:
# sanity check on the UMAP output (requested with n_components=10)
reduced_embeddings.shape

In [None]:
# Density-based clustering (HDBSCAN) on the UMAP-reduced embeddings.
clusterer = hdbscan.HDBSCAN(
    metric='euclidean',
    cluster_selection_method='eom',
    cluster_selection_epsilon=0.3,
    min_cluster_size=10,
    min_samples=10,
    core_dist_n_jobs=-1,
)

# Fit once, then store each response's cluster assignment next to its text.
clusterer.fit(reduced_embeddings)
df['cluster'] = clusterer.labels_

In [None]:
# first rows again, now including the `cluster` column
df.head()

In [None]:
# Number of responses per cluster, ordered by cluster id.
# The columns produced by value_counts().reset_index() were renamed in
# pandas 2.0, so sort_values(by='cluster') ordered by cluster id on new versions
# but by count on old ones. Sorting the index and naming both columns
# explicitly gives the same (cluster, count) table on every pandas version.
(df['cluster']
 .value_counts()
 .sort_index()
 .rename_axis('cluster')
 .reset_index(name='count'))

In [None]:
# 2-D t-SNE projection, used only for plotting. It is computed from the
# standardized embeddings, not from the UMAP output that was clustered.
tsne = TSNE(n_components=2, random_state=42)
tsne.fit(normalized_embeddings)
reduced_embeddings_tsne = tsne.embedding_

In [None]:
# Plot the t-SNE projection, one point per response, coloured by cluster.
df_plot = pd.DataFrame(reduced_embeddings_tsne, columns=['x', 'y'])
df_plot['cats'] = df['cluster'].astype('category')

# `data` is passed by keyword so the call also works on seaborn releases where
# it is keyword-only. The legend is suppressed; to show one, drop legend=False
# and position it with sns.move_legend(ax, ...).
ax = sns.scatterplot(data=df_plot, x='x', y='y', hue='cats', alpha=0.7, palette='Set2', legend=False)

# os.path.join accepts both str and path-like folders, unlike `+` concatenation.
plt.savefig(os.path.join(out_folder, 'cluster_display.png'), bbox_inches='tight')
plt.show()

In [None]:
# Summarization pipeline used to turn each cluster's responses into a short,
# human-readable label.
summarizer = pipeline(task='summarization', model='facebook/bart-large-cnn')

In [None]:
def truncate_text(text, max_model_length=1023):
    """Return `text` cut down to what the summarizer's tokenizer keeps.

    The text is tokenized with truncation at `max_model_length` and the
    surviving tokens are decoded back to a string (special tokens removed).
    Relies on the module-level `summarizer` pipeline for its tokenizer.
    """
    tokenizer = summarizer.tokenizer
    encoded = tokenizer(
        text,
        max_length=max_model_length,
        truncation=True,
        return_tensors='pt',
    )
    # A single text was encoded, so its ids are the first (only) row.
    kept_token_ids = encoded['input_ids'][0]
    return tokenizer.decode(kept_token_ids, skip_special_tokens=True)

def create_cluster_labels_llm(
    df,
    chunk_size=50,
    max_model_length=1023 # truncate texts that are too long
):
    """Build a short summary label for every cluster using the summarizer.

    For each distinct value in ``df['cluster']`` the matching responses are
    split into chunks of ``chunk_size`` texts. Each chunk is joined, truncated
    with ``truncate_text`` and summarized; the chunk summaries are then joined,
    truncated and summarized once more to give the cluster's label.

    Args:
        df: DataFrame with a ``cluster`` column and a ``response`` column.
        chunk_size: Number of responses joined into one summarizer input.
        max_model_length: Token limit passed to ``truncate_text`` for both the
            chunk inputs and the final summary-of-summaries input.

    Returns:
        dict mapping each cluster value to its summary text.

    Relies on the module-level ``summarizer`` pipeline and ``truncate_text``.
    Progress is printed on a single line that is overwritten in place.
    """
    cluster_labels = {}
    clusters = df['cluster'].unique()
    n_clusters = len(clusters)
    for cluster_i, cluster in enumerate(clusters):
        # get only the responses associated with a given cluster; str() keeps
        # the join below from failing on non-text cell values
        cluster_texts = [str(text) for text in df.loc[df['cluster'] == cluster, 'response']]

        # The summarizer accepts a limited number of tokens per input, so each
        # cluster is summarized chunk by chunk and the summaries are summarized.
        # Ceiling division: int(n / size) + 1 over-counted by one whenever the
        # number of texts was an exact multiple of chunk_size.
        n_chunks = -(-len(cluster_texts) // chunk_size)

        chunk_summaries = []
        for chunk_i, start in enumerate(range(0, len(cluster_texts), chunk_size)):
            print(f'Summarizing Cluster {cluster_i+1:03,}/{n_clusters:03,} --- Completing Chunk {chunk_i+1:04,}/{n_chunks:04,}', end='\r')
            chunk = " ".join(cluster_texts[start:start + chunk_size])
            summary = summarizer(
                truncate_text(chunk, max_model_length),
                max_length=30, min_length=1, do_sample=False)[0]['summary_text']
            chunk_summaries.append(summary)

        # Honour the caller's max_model_length here as well (it was previously
        # ignored for the final pass, silently falling back to the default).
        final_summary = summarizer(
            truncate_text(" ".join(chunk_summaries), max_model_length),
            max_length=20, min_length=1, do_sample=False)[0]['summary_text']
        cluster_labels[cluster] = final_summary

    return cluster_labels

In [None]:
# summarize every cluster with the summarization pipeline: {cluster value -> summary text}
cluster_labels = create_cluster_labels_llm(df)

In [None]:
# attach each row's cluster summary as a new column
df['cluster_label'] = df['cluster'].map(cluster_labels)

In [None]:
# Alternative labelling approach: describe each cluster by its top keywords.

# KeyBERT is built on the same sentence-embedding model used to embed the responses.
kw_model = KeyBERT(model=embedding_model)

def create_cluster_labels_keyword(
    df,
    chunk_size=50,
    max_model_length=1023
):
    """Build a keyword label for every cluster using KeyBERT.

    For each distinct value in ``df['cluster']`` all matching responses are
    joined into one document, and the top five 1- to 2-word keyphrases
    (English stop words removed) are joined with ", " to form the label.

    Args:
        df: DataFrame with a ``cluster`` column and a ``response`` column.
        chunk_size: Unused; kept so the signature mirrors
            ``create_cluster_labels_llm``.
        max_model_length: Unused; kept for the same reason.

    Returns:
        dict mapping each cluster value to its comma-separated keyword string.

    Relies on the module-level ``kw_model``. Progress is printed on a single
    line that is overwritten in place.
    """
    cluster_labels = {}
    clusters = df['cluster'].unique()
    n_clusters = len(clusters)
    for cluster_i, cluster in enumerate(clusters):
        # 1-based counter so the last line reads N/N, matching the progress
        # output of create_cluster_labels_llm (it previously stopped at N-1/N)
        print(f'Completing Cluster {cluster_i+1:03,}/{n_clusters:03,}', end='\r')
        # get only the responses associated with a given cluster
        cluster_texts = df.loc[df['cluster'] == cluster, 'response']

        # str() keeps the join from failing on non-text cell values
        joined_text = " ".join(map(str, cluster_texts))
        keywords = kw_model.extract_keywords(joined_text, keyphrase_ngram_range=(1,2), stop_words='english', top_n=5)
        # each entry is indexed as (keyword, ...); only the keyword text is kept
        cluster_labels[cluster] = ", ".join(kw[0] for kw in keywords)

    return cluster_labels

In [None]:
# keyword labels per cluster: {cluster value -> "kw1, kw2, ..."}
# NOTE: this rebinds `cluster_labels`, replacing the summary-based mapping built
# earlier; re-running the `cluster_label` assignment after this cell would pick
# up the keyword labels instead.
cluster_labels = create_cluster_labels_keyword(df)

In [None]:
# attach each row's cluster keywords as a new column
df['cluster_keyword'] = df['cluster'].map(cluster_labels)

In [None]:
# Show up to two random example rows for every cluster / label / keyword group.
# sample(2) raises ValueError on a group with fewer than two rows, so the
# sample size is capped at the group size.
(df
 .groupby(['cluster', 'cluster_label', 'cluster_keyword'])
 .apply(lambda group: group.sample(n=min(2, len(group))))
)

In [None]:
# save cluster results for later viewing
# Paths are built with os.path.join, which accepts both str and path-like
# folders, instead of `+` concatenation (which fails for a pathlib.Path).

# every response with its cluster, summary label and keywords, ordered by cluster
results_path = os.path.join(out_folder, 'rbcm_q7_cluster_results.csv')
df.sort_values(by='cluster').to_csv(results_path, index=False)

# one row per cluster with its number of responses (stored in the `response`
# column), largest cluster first
counts_path = os.path.join(out_folder, 'rbcm_q7_cluster_counts.csv')
(df
 .groupby(['cluster', 'cluster_label', 'cluster_keyword'])
 .response.count()
 .reset_index()
 .sort_values(by='response', ascending=False)
 .to_csv(counts_path, index=False)
)