# Import dependencies

In [16]:
import pandas as pd
import ast
import re
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import umap
import hdbscan
import matplotlib.pyplot as plt
import joblib
from openai import OpenAI
from dotenv import load_dotenv
from os import environ
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from sklearn.feature_extraction.text import CountVectorizer

load_dotenv()

True

# Read and clean data

In [10]:


def safe_literal_eval(val):
    if pd.isna(val):  
        return []     
    try:
        return ast.literal_eval(val)
    except (ValueError, SyntaxError):
        return [] 
    
def clean_text(text):
    # quotes
    text = re.sub(r"(?m)^\s*>.*(?:\r?\n|$)", "", text)

    # links and code
    # should we consider using LLM to generate short summary of the code snippets so we don't lose any context it could provide
    # or could use regex to detect patterns in the code and classify them e.g. detecting import statements, function definitions, or added/removed lines.
    pattern = r"```.*?```|http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    cleaned_text = re.sub(pattern, "", text, flags=re.DOTALL)

    # keep only alphanumeric characters and punctuation
    cleaned_text = re.sub(r"[^a-zA-Z0-9.,!?;:'\"(){}\[\]\-]", " ", cleaned_text)

    # remove extra spaces
    cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip()

    return cleaned_text
    
def preprocess_text(comments_sequence):
    all_comments = []
    for issue_comment in comments_sequence:
        body = clean_text(issue_comment['comment']['body'])
        all_comments.append(body)
    return all_comments

df= pd.read_csv('data/pull_requests_filtered_raw.csv')
df['comments'] = df['comments'].apply(safe_literal_eval)

df['issue_comments'] = df['comments'].apply(lambda comments: [item for item in comments if item['type'] == 'issue'] if type(comments) is not float else comments)
df = df[df['issue_comments'].apply(lambda x: isinstance(x, list) and len(x) > 0)]
df['processed_issue_comments'] = df['issue_comments'].apply(preprocess_text)

issue_comments = df['processed_issue_comments'].explode().tolist()

# Cluster Using Bertopic

In [11]:
# https://bertopic.readthedocs.io/en/latest/
# https://maartengr.github.io/BERTopic/api/representation/keybert.html
# https://maartengr.github.io/BERTopic/getting_started/vectorizers/vectorizers.html

# used to fine-tune topic representations
representation_model = KeyBERTInspired()
vectorizer_model = CountVectorizer(stop_words="english")
umap_model = umap.UMAP(n_neighbors=15, n_components=5, random_state=42)     # fix topics across runs by setting random_state; otherwise UMAP is stochastic

# using pre-calculated embeddings
# topic_model = BERTopic(min_topic_size=20)
# topics, probs = topic_model.fit_transform(threads, thread_embeddings)

# using KeyBERTInspired to generate embeddings
topic_model = BERTopic(representation_model=representation_model, vectorizer_model=vectorizer_model, umap_model=umap_model, min_topic_size=20)
topics, probs = topic_model.fit_transform(issue_comments)
hierarchical_topics = topic_model.hierarchical_topics(issue_comments)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got f

## General Topic Info

In [17]:
print(topic_model.get_topic_info())
# print(topic_model.generate_topic_labels(nr_words=1))

topic_model.visualize_barchart()

     Topic  Count                                               Name  \
0       -1  10004                        -1_api_entities_entity_core   
1        0    545  0_sensors_sensor_sensorentitydescription_sensorpy   
2        1    489                      1_cla_pull_signed_cyclops1982   
3        2    474                   2_rebase_branch_rebased_branches   
4        3    401             3_assistant_homekit_homeassistant_home   
..     ...    ...                                                ...   
159    158     21      158_ready_engrbm87_muppet3000_congratulations   
160    159     20                159_zcl_zonnsmarttv01_zha_zonnsmart   
161    160     20         160_constants_constant_deprecated_literals   
162    161     20                        161_merged_pr_hotfix_branch   
163    162     20                 162_rytilahti_merged_merge_pending   

                                        Representation  \
0    [api, entities, entity, core, service, data, t...   
1    [sensors, sens

## Topic Hierarchy

In [13]:
# https://maartengr.github.io/BERTopic/getting_started/hierarchicaltopics/hierarchicaltopics.html

# topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
print(topic_model.get_topic_tree(hierarchical_topics))

# merge topics as necessary e.g.
# topics_to_merge = [63, 126, 86, 177, 55, 172, 156, 185]     # all about removing commented out code or stale comments/docstrings
# topic_model.merge_topics(threads, topics_to_merge)


.
├─devices_config_sensor_configuration_yaml
│    ├─homekit_assistant_merged_update_dev
│    │    ├─homekit_integration_integrate_integrations_component
│    │    │    ├─beta_release_upcoming_releases_milestone
│    │    │    │    ├─■──progress_status_waiting_updates_soon ── Topic: 61
│    │    │    │    └─beta_release_releases_released_upcoming
│    │    │    │         ├─■──merged_pr_hotfix_branch_release ── Topic: 161
│    │    │    │         └─■──beta_release_releases_released_upcoming ── Topic: 14
│    │    │    └─merged_homekit_hacs_assistant_integrated
│    │    │         ├─integrated_integration_homekit_assistant_integrate
│    │    │         │    ├─assistant_homekit_homeassistant_home_merged
│    │    │         │    │    ├─■──assistant_homekit_homeassistant_home_api ── Topic: 3
│    │    │         │    │    └─■──integration_integrate_integrations_integrated_merged ── Topic: 4
│    │    │         │    └─■──hacs_repository_meross_integrated_src ── Topic: 10
│    │    │         └─

In [15]:
# use this cell to get info about specific topics in hierarchy

topic_model.get_representative_docs(61)

['Still no progress?',
 'Will this progress further and if so when?',
 'Still no progress?']