# Import dependencies

In [38]:
import ast
import os
import re

import pandas as pd
import umap
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from dotenv import load_dotenv
from sklearn.feature_extraction.text import CountVectorizer

load_dotenv()

True

# Read and clean data

In [39]:


def safe_literal_eval(val):
    """Parse a stringified Python literal, falling back to an empty list.

    Args:
        val: Usually a string holding a Python literal (e.g. "[{'type': 'review'}]").
            Missing values (NaN/None) and already-parsed containers are also accepted.

    Returns:
        The parsed literal; `val` itself when it is already a list/tuple/dict/set;
        `[]` when `val` is missing or cannot be parsed.
    """
    # Already-parsed containers pass through unchanged. Without this guard,
    # pd.isna() returns an array for them, which cannot be used in a boolean test.
    if isinstance(val, (list, tuple, dict, set)):
        return val
    try:
        if pd.isna(val):
            return []
        return ast.literal_eval(val)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval raises these for malformed, non-literal or overly deep input
        return []
    
def clean_text(text):
    """Strip quotes, fenced code and links from a comment thread and normalise the rest.

    Steps, in order: drop lines that start with '>' (quoted text), remove
    triple-backtick fenced blocks and http(s) URLs, replace every character
    outside the allowed alphanumeric/punctuation set with a space, then
    collapse runs of whitespace.

    Returns the cleaned string, which is empty when nothing is left.
    """
    # quoted lines
    without_quotes = re.sub(r"(?m)^\s*>.*(?:\r?\n|$)", "", text)

    # fenced code blocks and links
    # open question: should an LLM summarise the code snippets so their context is not lost,
    # or should regexes classify them (import statements, function definitions, added/removed lines)?
    code_or_link = re.compile(
        r"```.*?```|http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
        flags=re.DOTALL,
    )
    without_code = code_or_link.sub("", without_quotes)

    # anything that is not alphanumeric or listed punctuation becomes a space
    disallowed = re.compile(r"[^a-zA-Z0-9.,!?;:'\"(){}\[\]\-]")
    spaced = disallowed.sub(" ", without_code)

    # collapse whitespace runs and trim the ends
    return re.sub(r"\s+", " ", spaced).strip()
    
def preprocess_text(comments_sequence):
    """Turn a sequence of comment threads into a list of cleaned thread texts.

    Each item must provide item['comment']['body'] and a list item['replies']
    whose elements each have a 'body'. The main comment and its replies are
    joined with newlines and passed through clean_text; threads that end up
    empty are dropped.
    """
    cleaned_threads = []
    for entry in comments_sequence:
        opening_body = entry['comment']['body']
        reply_bodies = "\n".join(reply['body'] for reply in entry['replies'])

        # main comment first, then every reply, newline-separated
        cleaned = clean_text("\n".join((opening_body, reply_bodies)))
        if cleaned:
            cleaned_threads.append(cleaned)

    return cleaned_threads

# Load the raw pull-request export; the 'comments' column is parsed from its
# stringified form into Python objects (unparseable/missing cells become []).
df = pd.read_csv('data/pull_requests_filtered_raw.csv')
df['comments'] = df['comments'].apply(safe_literal_eval)

# Keep only the comment threads of type 'review'. Anything that is not a list
# is treated as "no threads" instead of being special-cased on float/NaN.
df['review_threads'] = df['comments'].apply(
    lambda comments: [item for item in comments if item['type'] == 'review']
    if isinstance(comments, list)
    else []
)

# Drop pull requests without review threads. The explicit copy makes the column
# assignment below operate on an independent frame (no SettingWithCopyWarning).
has_review_threads = df['review_threads'].apply(lambda x: isinstance(x, list) and len(x) > 0)
df = df[has_review_threads].copy()
df['processed_review_threads'] = df['review_threads'].apply(preprocess_text)

# Flatten the per-PR lists into one list of documents (linear, unlike sum(lists, [])).
threads = [thread for pr_threads in df['processed_review_threads'] for thread in pr_threads]


# Cluster Using BERTopic

In [40]:
# https://bertopic.readthedocs.io/en/latest/
# https://maartengr.github.io/BERTopic/api/representation/keybert.html
# https://maartengr.github.io/BERTopic/getting_started/vectorizers/vectorizers.html

# Choose the tokenizers parallelism setting up front so huggingface/tokenizers does not
# print its "process just got forked" warning repeatedly during fitting.
# setdefault keeps any value already provided by the environment (e.g. through .env).
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# used to fine-tune topic representations
representation_model = KeyBERTInspired()
vectorizer_model = CountVectorizer(stop_words="english")
# fix topics across runs by setting random_state; otherwise UMAP is stochastic.
# NOTE: the hard-coded topic ids in the "Merge Topics" section were chosen for the topics
# this exact configuration produces; changing any parameter here invalidates them.
umap_model = umap.UMAP(n_neighbors=15, n_components=5, random_state=42)

# using pre-calculated embeddings
# topic_model = BERTopic(min_topic_size=20)
# topics, probs = topic_model.fit_transform(threads, thread_embeddings)

# No embeddings are passed to fit_transform, so BERTopic computes them itself;
# KeyBERTInspired is only the representation model for the topic keywords.
topic_model = BERTopic(
    representation_model=representation_model,
    vectorizer_model=vectorizer_model,
    umap_model=umap_model,
    min_topic_size=20,
)
topics, probs = topic_model.fit_transform(threads)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

## General Topic Info

In [4]:
# Pandas print formatting: wider text columns and no limit on the number of printed rows
for option_name, option_value in (('display.max_colwidth', 200), ('display.max_rows', None)):
    pd.set_option(option_name, option_value)

In [None]:
# Overview table with one row per topic
topic_info = topic_model.get_topic_info()
print(topic_info)

# topic_model.visualize_barchart()

## Topic Hierarchy

In [None]:
# https://maartengr.github.io/BERTopic/getting_started/hierarchicaltopics/hierarchicaltopics.html

# topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
# Build the topic hierarchy from the documents, then print it as a text tree
hierarchical_topics = topic_model.hierarchical_topics(threads)
topic_tree = topic_model.get_topic_tree(hierarchical_topics)
print(topic_tree)



In [None]:
# Topic to inspect, plus a frame pairing every document with its assigned topic
topic = 38
topics_df = pd.DataFrame({'topic': topics, 'document': threads})

# imports_cluster = [38, 75]

# Example from this topic:
# +from .. import ads (from reviewer) [2024-09-10T13:50:17Z] Don't do this, rather just import what you need    NOTE: should use relative within component code and
# importing private methods

# Summary row for the topic, followed by every document assigned to it
print(topic_model.get_topic_info(topic))
print(topics_df.loc[topics_df['topic'] == topic])


   Topic  Count                                  Name  \
0     38     90  38_import_importing_imports_imported   

                                                                              Representation  \
0  [import, importing, imports, imported, exported, unpack, package, unpacking, files, need]   

                                                              Representative_Docs  
0  [There is nothing to import., and please import those, Please import this one]  
       topic  \
42        38   
547       38   
1277      38   
1375      38   
1523      38   
1529      38   
1657      38   
1827      38   
2151      38   
2181      38   
2735      38   
2947      38   
3134      38   
3168      38   
3596      38   
3637      38   
3999      38   
4604      38   
5257      38   
5635      38   
5637      38   
5645      38   
5835      38   
5861      38   
5863      38   
5925      38   
5986      38   
6226      38   
7644      38   
7680      38   
8161      38   
8165      38

In [37]:
# use this cell to get info about specific topics in hierarchy
# print(topic_model.get_representative_docs(65))

for option_name, option_value in (('display.max_colwidth', 200), ('display.max_rows', None)):
    pd.set_option(option_name, option_value)

topics_df = pd.DataFrame({'topic': topics, 'document': threads})

# select the documents of topic 13 once, then report their number and contents
topic_13_documents = topics_df.loc[topics_df['topic'] == 13]
print(len(topic_13_documents))
print(topic_13_documents)

194
       topic  \
24        13   
179       13   
358       13   
380       13   
415       13   
655       13   
896       13   
1131      13   
1189      13   
1221      13   
1232      13   
1457      13   
1462      13   
1463      13   
1544      13   
1613      13   
1615      13   
1757      13   
1882      13   
1970      13   
1976      13   
2179      13   
2202      13   
2820      13   
3073      13   
3496      13   
3501      13   
3555      13   
3909      13   
4038      13   
4082      13   
4205      13   
4570      13   
4640      13   
4671      13   
4714      13   
5333      13   
5580      13   
5603      13   
5622      13   
5626      13   
5690      13   
5726      13   
5800      13   
5826      13   
5912      13   
5913      13   
5943      13   
6039      13   
6061      13   
6116      13   
6236      13   
6331      13   
6377      13   
6637      13   
6638      13   
6813      13   
7073      13   
7075      13   
7078      13   
7151      13   
7236

## Merge Topics

In [None]:
# Manual grouping of the fitted topics into broader clusters.
# NOTE: the ids below are the topic ids of the model as originally fitted. Merging re-maps
# topic ids, so run this cell once per fitted model (re-fit before running it again).
# NOTE(review): topic ids 50 and 108 appear neither in a cluster nor in the notes below - confirm
# they are meant to stay on their own.
typing_casting_cluster = [139, 36, 123, 107, 53, 105, 87]
redundant_code_comments_cluster = [85, 153, 113, 167, 15, 106, 181, 148, 47, 138, 63, 126, 86, 177, 55, 172, 156, 185, 96]
fixed_cluster = [132, 180]      # 'Fixed in ...'
code_style_cluster = [128, 176, 130, 110, 52, 91, 19, 98, 30, 103, 109, 13, 42, 70, 89, 127, 114, 99, 170, 79, 144, 166, 168, 71, 76, 21, 90, 59]       # indentation, if statement styles, moving checks, single-use variables, renaming, using const, linting
thanks_acknowledgements_cluster = [39, 46]
process_cluster = [162, 12, 45, 73, 56]      # dependency bumps in different PR, "do in another PR", "revert change/not needed", rebasing, limit to one platform per PR
misc_cluster = [152, 175, 82, 173, 97, 74, 37]  # "same here", "done", "same as above" etc (duplicate 152 removed)
imports_cluster = [38, 75]  # e.g. https://github.com/home-assistant/core/pull/77091#discussion_r950871893 and https://github.com/home-assistant/core/pull/49116#discussion_r611703022
config_entries_cluster = [145, 149, 77, 83, 101, 140, 188, 187, 17, 9, 88, 163, 44, 35]     # including move away from YAML
async_eventloop_cluster = [81, 48, 64, 125, 34]
error_handling_cluster = [104, 72, 151, 40, 24, 150, 124]
logging_cluster = [122, 60, 31, 131]
measurements_cluster = [29, 129, 157]    # unit of measurements, sensor measurements, power/watts/energy
icon_name_device_translations_cluster = [14, 67, 4]
time_timezone_dates_duration_cluster = [16, 95]
classes_inheritance_attributes_cluster = [23, 171, 142]
dictionary_dictkeys_cluster = [43, 3, 158]
update_coordinator_interval_polling_cluster = [18, 116, 93, 6]
testing_cluster = [119, 174, 32, 51, 11, 160, 94, 178, 111, 102, 182]
external_library_cluster = [136, 133, 57]   # this is also present partially in other clusters
unique_ids_cluster = [65, 33]
assigning_attributes_cluster = [143, 27, 92, 80, 179]
entity_cluster = [1, 164, 137, 118]
protocols_cluster = [165, 66, 115, 134, 146, 61, 68]
device_cluster = [25, 49]
validation_schemas_cluster = [155, 78, 28]
blueprints_cluster = [135]      # use a select selector for blueprint
state_cluster = [7, 169, 100]
domain_cluster = [141]
service_cluster = [54]
authentication_cluster = [2]
voice_assistant_conversation_cluster = [154, 159]
questions_cluster = [121]
# fixed, misc and blueprints are folded into "other" (duplicate 147 removed)
other_cluster = [186, 120, 161, 147, 117, 20, 41, 184, 183, *fixed_cluster, *misc_cluster, *blueprints_cluster]

# remaining topics included in other:
# 183 = Tuya (brand) devices
# 184 = caching, cache decorator
# 186 = comments about tilt, cover, ... of blinds - basically all from https://github.com/home-assistant/core/pull/48625
# 120 = random number string noise
# 147 = minValue, maxValue, min max
# 161 = mixins
# 117 = general "update" stuff
# 20 = bunch of mixed things involving Home Assistant
# 41 = mixed bag of integration related items

# remaining topics in own cluster
# 8 = light, rgb, brightness
# 5 = temperature, hvac, fans, climate
# 10 = defaults
# 0 = sensor entities, sensors
# 112 = humidity, humidifier integration
# 62 = locks, alarms integrations
# 84 = images, cameras
# 22 = hass object, hass.data etc
# 26 = media players
# 58 = notifications, notification integration
# 69 = buttons, button integration


# every cluster is listed exactly once (validation_schemas_cluster used to appear twice)
topics_to_merge = [
    typing_casting_cluster,
    redundant_code_comments_cluster,
    code_style_cluster,
    thanks_acknowledgements_cluster,
    process_cluster,
    imports_cluster,
    config_entries_cluster,
    async_eventloop_cluster,
    error_handling_cluster,
    logging_cluster,
    measurements_cluster,
    icon_name_device_translations_cluster,
    time_timezone_dates_duration_cluster,
    classes_inheritance_attributes_cluster,
    validation_schemas_cluster,
    dictionary_dictkeys_cluster,
    update_coordinator_interval_polling_cluster,
    testing_cluster,
    external_library_cluster,
    unique_ids_cluster,
    assigning_attributes_cluster,
    entity_cluster,
    protocols_cluster,
    device_cluster,
    state_cluster,
    domain_cluster,
    service_cluster,
    authentication_cluster,
    voice_assistant_conversation_cluster,
    questions_cluster,
    other_cluster,
]

# guard against a topic id being assigned to more than one cluster (or listed twice)
_merged_topic_ids = [topic_id for cluster in topics_to_merge for topic_id in cluster]
assert len(_merged_topic_ids) == len(set(_merged_topic_ids)), "a topic id is listed in more than one cluster"

# merge topics and re-assign topics to input data
topic_model.merge_topics(threads, topics_to_merge)
# Take the updated assignments of the fitted documents from the model itself rather than
# re-predicting them with transform(): a fresh prediction re-embeds every document and is
# not guaranteed to reproduce the assignments the model reports after merging.
topics, probs = list(topic_model.topics_), topic_model.probabilities_


# Updated Topic Info and Hierarchy

In [None]:
# Overview table of the topics after merging
merged_topic_info = topic_model.get_topic_info()
print(merged_topic_info)

In [None]:
# Rebuild the hierarchy for the merged topics and print it as a text tree
hierarchical_topics = topic_model.hierarchical_topics(threads)
merged_topic_tree = topic_model.get_topic_tree(hierarchical_topics)
print(merged_topic_tree)


In [None]:
# get updated info about specific clusters
inspected_topic = 61
print(topic_model.get_representative_docs(inspected_topic))

# pair each document with its current topic and list those of the inspected topic
topics_df = pd.DataFrame({'topic': topics, 'document': threads})
print(topics_df.loc[topics_df['topic'] == inspected_topic])

In [None]:
# the number of noise comments and not-noise
topic_counts = topic_model.get_topic_info()
noise_counts = topic_counts[topic_counts["Topic"] == -1][["Topic", "Count"]]
valid_counts = topic_counts[topic_counts["Topic"] != -1][["Topic", "Count"]]

print('Noise: ', sum(noise_counts['Count']))
print('Classified comments: ', sum(valid_counts['Count']))