# Import dependencies

In [2]:
import pandas as pd
import ast
import re
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import umap
import hdbscan
import matplotlib.pyplot as plt
import joblib
from openai import OpenAI
from dotenv import load_dotenv
from os import environ
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from sklearn.feature_extraction.text import CountVectorizer

load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

# Read and clean data

In [3]:


def safe_literal_eval(val):
    """Parse a stringified Python literal, falling back to an empty list.

    Args:
        val: Usually a string holding a Python literal (e.g. a CSV cell such as
            "[{'type': 'review'}]"). Missing values (None/NaN) and values that
            are already lists are accepted as well.

    Returns:
        The parsed literal; `val` itself when it is already a list (so the
        helper can safely be re-applied to an already-parsed column); `[]`
        when the value is missing or cannot be parsed.
    """
    # Already parsed: pass through. pd.isna() on a list returns an array, whose
    # truth value is ambiguous, so lists must be handled before the NaN check.
    if isinstance(val, list):
        return val
    if pd.api.types.is_scalar(val) and pd.isna(val):
        return []
    try:
        return ast.literal_eval(val)
    # literal_eval may raise any of these on malformed input, e.g. TypeError
    # for a dict literal with an unhashable key.
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        return []
    
def clean_text(text):
    """Reduce a raw comment thread to plain prose.

    Processing order:
      1. drop markdown quote lines (lines whose first non-blank character is '>'),
      2. drop fenced code blocks and http(s) URLs,
      3. replace every character that is not an ASCII letter, digit or one of
         the kept punctuation marks with a space,
      4. collapse whitespace runs and trim both ends.

    Args:
        text: Raw thread text.

    Returns:
        The cleaned text; may be an empty string.
    """
    quoted_line = r"(?m)^\s*>.*(?:\r?\n|$)"
    # Idea for later: instead of discarding code snippets, summarise them (LLM)
    # or classify them with regexes (imports, function definitions,
    # added/removed lines) so the context they carry is not lost.
    code_fence_or_url = r"```.*?```|http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    disallowed_char = r"[^a-zA-Z0-9.,!?;:'\"(){}\[\]\-]"

    without_quotes = re.sub(quoted_line, "", text)
    # DOTALL lets a code fence span several lines.
    without_code_and_links = re.sub(code_fence_or_url, "", without_quotes, flags=re.DOTALL)
    allowed_chars_only = re.sub(disallowed_char, " ", without_code_and_links)

    return re.sub(r"\s+", " ", allowed_chars_only).strip()
    
def preprocess_text(comments_sequence):
    """Turn comment threads into cleaned, non-empty text documents.

    Args:
        comments_sequence: Iterable of thread dicts. Each one is read as
            thread['comment']['body'] (the opening comment) and
            thread['replies'] (an iterable of dicts with a 'body' entry).

    Returns:
        A list with one cleaned string per thread, in input order; threads
        that are empty after cleaning are left out.
    """
    cleaned_threads = []
    for entry in comments_sequence:
        opening_body = entry['comment']['body']
        reply_bodies = "\n".join(reply['body'] for reply in entry['replies'])

        # Opening comment first, then every reply on its own line.
        cleaned = clean_text(opening_body + "\n" + reply_bodies)
        if cleaned:
            cleaned_threads.append(cleaned)

    return cleaned_threads

# Load the raw pull-request export and parse the stringified `comments` column
# (missing or unparsable cells become empty lists).
df = pd.read_csv('data/pull_requests_filtered_raw.csv')
df['comments'] = df['comments'].apply(safe_literal_eval)

# Keep only the comment threads whose 'type' is 'review'; anything that is not
# a list is treated as "no review threads".
df['review_threads'] = df['comments'].apply(
    lambda comments: [item for item in comments if item['type'] == 'review']
    if isinstance(comments, list) else []
)

# Drop pull requests without review threads. The explicit .copy() makes the
# filtered frame independent, so adding a column below does not raise
# SettingWithCopyWarning.
df = df[df['review_threads'].apply(len) > 0].copy()
df['processed_review_threads'] = df['review_threads'].apply(preprocess_text)


# Extract sentence embeddings

In [4]:
# Flatten the per-PR lists of cleaned threads into a single corpus.
# (A nested comprehension is linear; sum(list_of_lists, []) copies the
# accumulated list on every step.)
threads = [thread for pr_threads in df['processed_review_threads'] for thread in pr_threads]
model = SentenceTransformer('all-MiniLM-L6-v2')

embeddings_cache = Path("cache/thread_embeddings.npy")

# Reuse cached embeddings only when they match the current corpus size;
# delete the cache file to force regeneration.
thread_embeddings = None
if embeddings_cache.is_file():
    thread_embeddings = np.load(embeddings_cache)
    if len(thread_embeddings) != len(threads):
        # Stale cache: the corpus changed since the embeddings were saved.
        thread_embeddings = None

if thread_embeddings is None:
    thread_embeddings = model.encode(threads, normalize_embeddings=True)
    # Create the cache directory first so the result of the expensive encode
    # is not lost to a FileNotFoundError.
    embeddings_cache.parent.mkdir(parents=True, exist_ok=True)
    np.save(embeddings_cache, thread_embeddings)

# Cluster using basic HDBSCAN

In [4]:
hdbscan_cache = Path("cache/hdbscan_model.joblib")

# Reuse the cached model only when its labels line up with the current
# embeddings. A cached model keeps the parameters it was fitted with, so
# delete the cache file after changing the HDBSCAN parameters below.
labels = None
if hdbscan_cache.is_file():
    # NOTE: joblib.load unpickles the file; only load caches you created yourself.
    hdb = joblib.load(hdbscan_cache)
    if len(hdb.labels_) == len(thread_embeddings):
        labels = hdb.labels_

if labels is None:
    hdb = hdbscan.HDBSCAN(min_cluster_size=20, min_samples=5, cluster_selection_epsilon=0.1)
    labels = hdb.fit_predict(thread_embeddings)
    # Make sure the cache directory exists before persisting the fitted model.
    hdbscan_cache.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(hdb, hdbscan_cache)

## General Topic Info

In [None]:
# Every thread needs exactly one label; zip() below would silently truncate otherwise.
assert len(threads) == len(labels), "labels are out of sync with threads"

# Cluster ids in ascending order, excluding the noise label (-1).
valid_clusters = sorted(set(labels[labels != -1]))
n_clusters = len(valid_clusters)
print(f"Number of cluster: {n_clusters}")
print(len(threads))

# Group the documents by cluster in a single pass over the corpus.
cluster_dict = {cluster_id: [] for cluster_id in valid_clusters}
for doc, label in zip(threads, labels):
    if label != -1:  # exclude noise points
        cluster_dict[label].append(doc)

# Show the size and a few sample documents of every cluster.
for cluster_id, cluster_docs in cluster_dict.items():
    print(f"\nCluster {cluster_id} ({len(cluster_docs)} documents)")
    print("Sample documents from cluster:")
    for doc in cluster_docs[:3]:
        print(f"- {doc[:100]}...")

# Reduce to 2D for visualization. UMAP is stochastic, so the seed is fixed to
# keep the plot identical across runs.
reducer = umap.UMAP(random_state=42)
embedding_2d = reducer.fit_transform(thread_embeddings)

fig, ax = plt.subplots()
points = ax.scatter(embedding_2d[:, 0], embedding_2d[:, 1], c=labels, cmap='Spectral')
fig.colorbar(points, ax=ax, label="Cluster label (-1 = noise)")
ax.set(title="HDBSCAN clusters of review threads (UMAP projection)", xlabel="UMAP 1", ylabel="UMAP 2")
plt.show()

## Generate Labels for Clusters with OpenAI API

In [None]:
# OpenAI client used by generate_cluster_labels. The API key is read from the
# OPENAI_API_KEY environment variable (load_dotenv() runs in the imports cell).
client = OpenAI(api_key=environ.get("OPENAI_API_KEY"))

def generate_cluster_labels(cluster_dict, llm_client=None):
    """Ask the chat model for a short label for every cluster.

    One request is sent containing up to five sample threads per cluster
    (each truncated to 200 characters); the reply is parsed line by line.

    Args:
        cluster_dict: Mapping of cluster id -> list of thread texts.
        llm_client: Optional object exposing `chat.completions.create(...)`.
            Defaults to the notebook-level `client`.

    Returns:
        Dict mapping integer cluster id -> label string. Clusters the model
        did not answer for (or answered in an unexpected format) are absent.
    """
    if not cluster_dict:
        # Nothing to label; avoid a pointless API call.
        return {}

    api = client if llm_client is None else llm_client

    prompt_parts = [
        "I have grouped discussion threads into clusters. Each cluster represents a set of similar topics. Please provide a short, descriptive label (3-5 words) for each cluster based on the sample threads.\n\n"
    ]
    for cluster_id, cluster_docs in cluster_dict.items():
        # First few threads per cluster
        samples = "\n".join(f"- {doc[:200]}" for doc in cluster_docs[:5])
        prompt_parts.append(f"Cluster {cluster_id}:\n{samples}\n\n")
    prompt_parts.append("Provide the labels in this format:\nCluster 0: [Label]\nCluster 1: [Label]\n...")
    prompt = "".join(prompt_parts)

    # A fixed cap truncates the reply once there are many clusters; scale the
    # budget with the cluster count (rough allowance of 20 tokens per line).
    token_budget = max(250, 20 * len(cluster_dict))

    response = api.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "system", "content": "You are an expert in topic classification."},
                  {"role": "user", "content": prompt}],
        max_completion_tokens=token_budget,
        temperature=0.2,
    )

    # The openai>=1.0 client returns response objects, not dicts, so the reply
    # is read through attributes; the content may be None.
    output = response.choices[0].message.content or ""

    cluster_labels = {}
    for line in output.splitlines():
        # Split on the first colon only, so labels that contain ": " are kept.
        match = re.match(r"\s*Cluster\s+(-?\d+)\s*:\s*(.+)", line)
        if match:
            cluster_labels[int(match.group(1))] = match.group(2).strip().strip("[]").strip()

    return cluster_labels

# Label all HDBSCAN clusters in one LLM request and show the {cluster_id: label} mapping.
cluster_labels = generate_cluster_labels(cluster_dict)
print(cluster_labels)

# Cluster using BERTopic

In [5]:
# References:
#   https://bertopic.readthedocs.io/en/latest/
#   https://maartengr.github.io/BERTopic/api/representation/keybert.html
#   https://maartengr.github.io/BERTopic/getting_started/vectorizers/vectorizers.html

# Components that shape the topic representations: KeyBERTInspired refines the
# keywords chosen for each topic, and the vectorizer ignores English stop words.
representation_model = KeyBERTInspired()
vectorizer_model = CountVectorizer(stop_words="english")

# UMAP is stochastic; a fixed random_state keeps the topics stable across runs.
# NOTE(review): BERTopic's built-in UMAP also sets min_dist=0.0 and
# metric='cosine' - confirm that leaving them out here is intentional.
umap_model = umap.UMAP(
    n_neighbors=15,
    n_components=5,
    random_state=42,
)

# No embedding_model or precomputed embeddings are passed, so BERTopic embeds
# `threads` itself. Alternative that reuses the cached embeddings:
#   topic_model = BERTopic(min_topic_size=20)
#   topics, probs = topic_model.fit_transform(threads, thread_embeddings)
topic_model = BERTopic(
    representation_model=representation_model,
    vectorizer_model=vectorizer_model,
    umap_model=umap_model,
    min_topic_size=20,
)
topics, probs = topic_model.fit_transform(threads)
hierarchical_topics = topic_model.hierarchical_topics(threads)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got f

## General Topic Info

In [6]:
# Per-topic summary table (topic id, document count, generated name, keyword representation).
print(topic_model.get_topic_info())
# print(topic_model.generate_topic_labels(nr_words=1))

# Last expression of the cell, so the returned figure is rendered as the cell output.
topic_model.visualize_barchart()

     Topic  Count                                               Name  \
0       -1   9054                   -1_entity_entities_async_devices   
1        0    694  0_sensor_sensors_sensorentitydescription_sensorpy   
2        1    432  1_entitydescription_entitydescriptions_entity_...   
3        2    416        2_reauth_reauthentication_oauth_invalidauth   
4        3    391                    3_dict_dicts_dictget_dictgetkey   
..     ...    ...                                                ...   
185    184     21                184_cache_caching_caches_cachetools   
186    185     21             185_code_commentedout_commented_copied   
187    186     20             186_tilt_positioning_vertical_position   
188    187     20    187_dhcp_homeassistantcomponentsdhcp_gateway_ip   
189    188     20                     188_abort_abortflow_end_reason   

                                        Representation  \
0    [entity, entities, async, devices, config, dev...   
1    [sensor, senso

## Topic Hierarchy

In [None]:
# https://maartengr.github.io/BERTopic/getting_started/hierarchicaltopics/hierarchicaltopics.html

# Print a text rendering of the topic hierarchy held in `hierarchical_topics`;
# the interactive plot is kept below as a commented-out alternative.
# topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
print(topic_model.get_topic_tree(hierarchical_topics))

# merge topics as necessary e.g.
# topics_to_merge = [63, 126, 86, 177, 55, 172, 156, 185]     # all about removing commented out code or stale comments/docstrings
# topic_model.merge_topics(threads, topics_to_merge)


.
├─stale_commentedout_code_commented_comments
│    ├─docstring_docstrings_docstr_stale_doc
│    │    ├─docstring_docstrings_docstr_doc_stale
│    │    │    ├─■──docstring_docstrings_docstr_doc_docblock ── Topic: 63
│    │    │    └─■──docstring_docstrings_stale_romybinarysensor_issue ── Topic: 126
│    │    └─stale_consistent_bad_changed_good
│    │         ├─■──stale_consistent_changed_bad_good ── Topic: 86
│    │         └─■──stale_comment_comments_explain_belong ── Topic: 177
│    └─commentedout_remove_code_commented_removes
│         ├─commentedout_remove_removes_commented_code
│         │    ├─■──commentedout_remove_code_commented_removes ── Topic: 55
│         │    └─stale_code_commented_comments_comment
│         │         ├─■──code_commented_comment_comments_feedback ── Topic: 172
│         │         └─■──stale_code_commented_comment_cleaned ── Topic: 156
│         └─■──code_commentedout_commented_copied_ ── Topic: 185
└─yaml_configuration_config_service_needs
     ├─logging_d

In [8]:
# Use this cell to inspect specific topics from the hierarchy: edit the list
# below, then re-run to print the representative documents of each topic.
topics_to_inspect = [63, 126, 86, 177, 55, 172, 156, 185]

for topic_id in topics_to_inspect:
    print(topic_model.get_representative_docs(topic_id))

['Please update the docstring.', 'docstring', 'docstring']
['stale docstring Done', 'stale docstring done in', 'Stale docstring?']
['Stale', 'stale name Done', 'stale']
['stale comment', 'Stale comment', 'Stale comment?']
['Please remove commented code', 'Please remove code that is commented out.', 'Please remove any commented out code Done']
['Commented out code', 'commented out code', 'Commented out code']
['Stale commented code', 'Stale commented code', 'Stale commented code']
['Commented-out code.', 'Commented code.', 'Commented code.']


In [12]:
# Number of documents assigned to a real topic, i.e. everything except the
# outlier topic (-1).
topic_counts = topic_model.get_topic_info().loc[
    lambda info: info["Topic"] != -1,
    ["Topic", "Count"],
]
print(topic_counts["Count"].sum())

13811
