In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pickle
import re
from umap import UMAP
from hdbscan import HDBSCAN
import torch
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel
from collections.abc import Iterable
from transformers import AutoTokenizer, AutoModel
from bertopic import BERTopic
import torch
from collections import deque
from bertopic.representation import KeyBERTInspired



In [3]:
# 파일 불러오기
with open('../../data/bert_src_df.pkl', 'rb') as f:
    bert_src_df = pickle.load(f)


In [4]:
bert_src_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5223203 entries, 0 to 5223202
Data columns (total 5 columns):
 #   Column          Dtype         
---  ------          -----         
 0   q_id            int64         
 1   a_id            float64       
 2   q_creationdate  datetime64[ns]
 3   tags            object        
 4   body            object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 199.2+ MB


In [5]:
cond1 = bert_src_df['q_creationdate']<='2022-11-30'
cond2 = bert_src_df['q_creationdate']>='2021-11-30'
cond3 = bert_src_df['a_id'].isna()
cond4 = bert_src_df['tags'].str.contains('python')
cond5 = bert_src_df['q_creationdate']>='2022-12-01'

In [6]:
bert_src_bf = bert_src_df.loc[cond1 & cond2 & cond3 & cond4, :]
bert_src_af = bert_src_df.loc[cond1 & cond3 & cond4 & cond5, :]

In [7]:
bert_src_bf.head()

Unnamed: 0,q_id,a_id,q_creationdate,tags,body
1277,74605388,74660649.0,2022-11-28 19:03:09.213,<python><list><discord>,<p>I wanted to do a discord command scraper in...
1278,74601289,74601433.0,2022-11-28 13:26:01.467,<python><tensorflow><keras>,<p>I am trying to randomly generate timeseries...
1279,74601457,74601499.0,2022-11-28 13:39:25.053,<python><pandas><dataframe><group-by><data-sci...,<p>Ok so this is more of a question about how ...
1284,74598689,74695040.0,2022-11-28 09:44:13.533,<python><apache-spark><pyspark>,"<p>Given a <a href=""https://spark.apache.org/d..."
1285,74598689,74703354.0,2022-11-28 09:44:13.533,<python><apache-spark><pyspark>,"<p>Given a <a href=""https://spark.apache.org/d..."


In [8]:
def cleanhtml(raw_html):
  # 1.Source code in python language is hard to understand, so replace all the <code> tag first
  cleantext_1 = re.findall(r'(?<=\<code>)(.*?)(?=<\/code>)', raw_html.replace('\n', '_**_'))
  cleantext_1 = [x.replace('_**_', '\n') for x in cleantext_1]
  # 2. replace html tags
  # <p>
  tag_re = re.compile('<.*?>')
  cleantext_2 = [re.sub(tag_re, '', x) for x in cleantext_1]
  return cleantext_2

In [9]:
# apply the function, cleanhtml to the question and body text
bert_src_bf.loc[:, 'q_prep_text'] = bert_src_bf['body'].apply(cleanhtml)
bert_src_af.loc[:, 'q_prep_text'] = bert_src_bf['body'].apply(cleanhtml)

In [10]:
bert_src_bf = bert_src_bf.reset_index(drop=True)
bert_src_af = bert_src_af.reset_index(drop=True)

In [11]:
bert_src_bf = bert_src_bf[['q_id', 'q_prep_text']].apply(pd.Series.explode)
bert_src_af = bert_src_af[['q_id', 'q_prep_text']].apply(pd.Series.explode)

In [12]:
bert_src_bf.dropna(inplace=True)
bert_src_af.dropna(inplace=True)

In [13]:
# PREPROCESSING FOR CODE SCRIPT
def preprocess_script(script):
    new_script = deque()
    old_script = script.split('\n')
    for line in old_script:
        if line.lstrip().startswith('#'): # 주석으로 시작되는 행 skip
            continue
        line = line.rstrip()
        if '#' in line:
            line = line[:line.index('#')] # 주석 전까지 코드만 저장
        line = line.replace('\n','') # 개행 문자를 모두 삭제함
        line = line.replace('    ','\t') # 공백 4칸을 tab으로 변환
        
        if line == '': # 전처리 후 빈 라인은 skip
            continue
        
        new_script.append(line)

        
    new_script = '\n'.join(new_script) # 개행 문자로 합침
    new_script = re.sub('("""[\w\W]*?""")', '<str>', new_script)
    new_script = re.sub("('''[\w\W]*?''')", '<str>', new_script)
    new_script = re.sub('/^(http?|https?):\/\/([a-z0-9-]+\.)+[a-z0-9]{2,4}.*$/', '', new_script)
    
    return new_script


In [14]:
bert_src_bf['q_prep_text_non'] = bert_src_bf['q_prep_text'].apply(preprocess_script)
bert_src_af['q_prep_text_non'] = bert_src_af['q_prep_text'].apply(preprocess_script)

In [15]:
src = bert_src_bf['q_prep_text_non'].tolist()
# df['sentiments'].values.tolist()


In [16]:
print(type(src))
print(type(src[0]))
print(len(src))

<class 'list'>
<class 'str'>
811999


In [17]:
src = src[:10000]

In [18]:
# 데이터 로드
data = src

In [19]:
data = data

In [20]:
# CodeBERT 모델과 토크나이저 로드
model_name = "microsoft/codebert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [21]:
# 임베딩 함수 정의
def embed_text(text):
    encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
    return model_output.last_hidden_state[:, 0, :]

In [22]:
# 모든 텍스트에 대한 임베딩 계산
embeddings = torch.cat([embed_text(text) for text in data], dim=0).numpy()  # 텐서로 연결 후 numpy 배열로 변환

In [None]:
embeddings.shape

In [None]:
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

In [None]:
hdbscan_model = HDBSCAN(min_cluster_size=50, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

In [None]:
representation_model = KeyBERTInspired()

In [None]:
# BERTopic 모델 초기화 및 훈련
topic_model = BERTopic( embedding_model=model,
                        umap_model=umap_model,
                        hdbscan_model=hdbscan_model,
                        representation_model=representation_model)  # 임베딩 모델 사용을 비활성화
topics, probabilities = topic_model.fit_transform(data, embeddings)


In [None]:
# 결과 출력
print(topic_model.get_topic_info())  # 토픽 정보 출력

In [None]:
topic_model.get_topic_info()

In [None]:
topic_model.get_topic(0)

In [None]:
topic_model.get_topic(1)

In [None]:
topic_model.get_topic(6, full=True)

In [None]:
# # Label the topics yourself
# topic_model.set_topic_labels({1: "Space Travel", 7: "Religion"})

# # or use one of the other topic representations, like KeyBERTInspired
# keybert_topic_labels = {topic: " | ".join(list(zip(*values))[0][:3]) for topic, values in topic_model.topic_aspects_["KeyBERT"].items()}
# topic_model.set_topic_labels(keybert_topic_labels)

# # or ChatGPT's labels
# chatgpt_topic_labels = {topic: " | ".join(list(zip(*values))[0]) for topic, values in topic_model.topic_aspects_["OpenAI"].items()}
# chatgpt_topic_labels[-1] = "Outlier Topic"
# topic_model.set_topic_labels(chatgpt_topic_labels)

In [None]:
# topic_distr, _ = topic_model.approximate_distribution(data, window=8, stride=4)

In [None]:
# pip install nbformat>=4.2.0

In [None]:
# Visualize the topic-document distribution for a single document
topic_model.visualize_topics()

In [None]:
topic_model.visualize_hierarchy()

In [None]:
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

In [None]:
topic_model.visualize_documents(data, embeddings=embeddings)

In [None]:
topic_model.visualize_documents(data, reduced_embeddings=reduced_embeddings)