# Import dependencies

In [1]:
import pandas as pd
import ast
import re
import umap
from dotenv import load_dotenv
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from sklearn.feature_extraction.text import CountVectorizer

load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

# Read and clean data

In [2]:


def safe_literal_eval(val):
    if pd.isna(val):  
        return []     
    try:
        return ast.literal_eval(val)
    except (ValueError, SyntaxError):
        return [] 
    
def clean_text(text):
    # quotes
    text = re.sub(r"(?m)^\s*>.*(?:\r?\n|$)", "", text)

    # links and code
    # should we consider using LLM to generate short summary of the code snippets so we don't lose any context it could provide
    # or could use regex to detect patterns in the code and classify them e.g. detecting import statements, function definitions, or added/removed lines.
    pattern = r"```.*?```|http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    cleaned_text = re.sub(pattern, "", text, flags=re.DOTALL)

    # keep only alphanumeric characters and punctuation
    cleaned_text = re.sub(r"[^a-zA-Z0-9.,!?;:'\"(){}\[\]\-]", " ", cleaned_text)

    # remove extra spaces
    cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip()

    return cleaned_text
    
def preprocess_text(comments_sequence):
    all_comments = []
    for issue_comment in comments_sequence:
        body = clean_text(issue_comment['comment']['body'])
        all_comments.append(body)
    return all_comments

df= pd.read_csv('data/pull_requests_filtered_raw.csv')
df['comments'] = df['comments'].apply(safe_literal_eval)

df['issue_comments'] = df['comments'].apply(lambda comments: [item for item in comments if item['type'] == 'issue'] if type(comments) is not float else comments)
df = df[df['issue_comments'].apply(lambda x: isinstance(x, list) and len(x) > 0)]
df['processed_issue_comments'] = df['issue_comments'].apply(preprocess_text)

issue_comments = df['processed_issue_comments'].explode().tolist()

# Cluster Using Bertopic

In [None]:
# https://bertopic.readthedocs.io/en/latest/
# https://maartengr.github.io/BERTopic/api/representation/keybert.html
# https://maartengr.github.io/BERTopic/getting_started/vectorizers/vectorizers.html

# used to fine-tune topic representations
representation_model = KeyBERTInspired()
vectorizer_model = CountVectorizer(stop_words="english")
umap_model = umap.UMAP(n_neighbors=15, n_components=5, random_state=42)     # fix topics across runs by setting random_state; otherwise UMAP is stochastic

# using pre-calculated embeddings
# topic_model = BERTopic(min_topic_size=20)
# topics, probs = topic_model.fit_transform(threads, thread_embeddings)

# using KeyBERTInspired to generate embeddings
topic_model = BERTopic(representation_model=representation_model, vectorizer_model=vectorizer_model, umap_model=umap_model, min_topic_size=20)
topics, probs = topic_model.fit_transform(issue_comments)
hierarchical_topics = topic_model.hierarchical_topics(issue_comments)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got f

## General Topic Info

In [6]:
# set Pandas formatting for printing
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_rows', None)

In [7]:
print(topic_model.get_topic_info())
# print(topic_model.generate_topic_labels(nr_words=1))

topic_model.visualize_barchart()

     Topic  Count  \
0       -1  10004   
1        0    545   
2        1    489   
3        2    474   
4        3    401   
5        4    328   
6        5    240   
7        6    234   
8        7    234   
9        8    225   
10       9    201   
11      10    199   
12      11    183   
13      12    173   
14      13    173   
15      14    168   
16      15    165   
17      16    164   
18      17    159   
19      18    158   
20      19    153   
21      20    150   
22      21    143   
23      22    141   
24      23    141   
25      24    141   
26      25    137   
27      26    137   
28      27    131   
29      28    130   
30      29    129   
31      30    129   
32      31    121   
33      32    120   
34      33    119   
35      34    114   
36      35    113   
37      36    111   
38      37    105   
39      38    105   
40      39    104   
41      40    101   
42      41     98   
43      42     95   
44      43     90   
45      44     89   
46      45   

## Topic Hierarchy

In [None]:
# https://maartengr.github.io/BERTopic/getting_started/hierarchicaltopics/hierarchicaltopics.html

# topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
print(topic_model.get_topic_tree(hierarchical_topics))


In [9]:
# use this cell to get info about specific topics in hierarchy
print(topic_model.get_representative_docs(0))

topics_df = pd.DataFrame({'topic': topics, 'document': issue_comments})
print(topics_df[topics_df.topic == 0])

['Oh no I mean, you can maybe already start looking at a second platform PR like binary sensor or cover or something like that. We can only add more sensors after this one has been merged', 'I guess I can add the new port sensors to the existing sensors section... sensors', 'Converted 2 state sensors to binary sensors']
       topic  \
70         0   
158        0   
182        0   
197        0   
198        0   
202        0   
313        0   
328        0   
354        0   
355        0   
357        0   
362        0   
364        0   
402        0   
424        0   
444        0   
456        0   
472        0   
473        0   
482        0   
483        0   
488        0   
492        0   
494        0   
506        0   
507        0   
536        0   
537        0   
538        0   
541        0   
656        0   
661        0   
669        0   
700        0   
739        0   
740        0   
741        0   
911        0   
924        0   
1010       0   
1053       0   
1109  