In [39]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pickle
import re
from umap import UMAP
from hdbscan import HDBSCAN
import torch
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel
from collections.abc import Iterable
from transformers import AutoTokenizer, AutoModel
from bertopic import BERTopic
import torch
from collections import deque


In [40]:
# 파일 불러오기
with open('../../data/bert_df.pkl', 'rb') as f:
    df = pickle.load(f)


In [41]:
# 파일 불러오기
with open('../../data/bert_src_df.pkl', 'rb') as f:
    bert_src_df = pickle.load(f)


In [42]:
bert_src_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5223203 entries, 0 to 5223202
Data columns (total 5 columns):
 #   Column          Dtype         
---  ------          -----         
 0   q_id            int64         
 1   a_id            float64       
 2   q_creationdate  datetime64[ns]
 3   tags            object        
 4   body            object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 199.2+ MB


In [43]:
cond1 = bert_src_df['q_creationdate']<='2022-11-30'
cond2 = bert_src_df['q_creationdate']>='2021-11-30'
cond3 = bert_src_df['a_id'].isna()
cond4 = bert_src_df['tags'].str.contains('python')
cond5 = bert_src_df['q_creationdate']>='2022-12-01'

In [44]:
bert_src_bf = bert_src_df.loc[cond1 & cond2 & cond3 & cond4, :]
bert_src_af = bert_src_df.loc[cond3 & cond4 & cond5, :]

In [45]:
bert_src_af

Unnamed: 0,q_id,a_id,q_creationdate,tags,body
38,77077227,,2023-09-10 17:31:31.600,<python><opengl><pyqt5><pyopengl>,<p>I want to display 3d object on top of PyQT5...
65,77060888,,2023-09-07 15:17:24.040,<python><django><postgresql><sleep><pytest-dja...,<p>I need to make a pause in my Django project...
73,77052025,,2023-09-06 12:43:17.777,<python>,<p>I am trying to get a list of all compartmen...
107,76937634,,2023-08-20 01:18:19.203,<python><django><asynchronous><celery><telegram>,<p>I'm trying to make a parser like a web appl...
115,76934523,,2023-08-19 09:57:53.073,<python><user-interface><pyqt5>,"<p>guys I have a question, i have a code and I..."
...,...,...,...,...,...
5223137,77545627,,2023-11-24 20:54:56.123,<python><huggingface><language-translation>,<h2>Setup</h2>\n<p>I've created a HF Inference...
5223163,77442875,,2023-11-08 04:40:57.380,<python><python-imaging-library><height><width>,<p>We were given a image(The Traffic stop sign...
5223179,77374676,,2023-10-27 13:47:11.517,<python><tensorflow><keras>,<p>I am having an issue I don't understand how...
5223196,77336446,,2023-10-21 14:33:34.623,<python><animation><sequence><vtk>,<p>I have succesfully managed to make a 3d ima...


In [46]:
def cleanhtml(raw_html):
  # 1.Source code in python language is hard to understand, so replace all the <code> tag first
  cleantext_1 = re.findall(r'(?<=\<code>)(.*?)(?=<\/code>)', raw_html.replace('\n', '_**_'))
  cleantext_1 = [x.replace('_**_', '\n') for x in cleantext_1]
  # 2. replace html tags
  # <p>
  tag_re = re.compile('<.*?>')
  cleantext_2 = [re.sub(tag_re, '', x) for x in cleantext_1]
  return cleantext_2

In [47]:
# apply the function, cleanhtml to the question and body text
bert_src_bf.loc[:, 'q_prep_text'] = bert_src_bf['body'].apply(cleanhtml)
bert_src_af.loc[:, 'q_prep_text'] = bert_src_af['body'].apply(cleanhtml)

In [48]:
bert_src_af

Unnamed: 0,q_id,a_id,q_creationdate,tags,body,q_prep_text
38,77077227,,2023-09-10 17:31:31.600,<python><opengl><pyqt5><pyopengl>,<p>I want to display 3d object on top of PyQT5...,[import sys\nfrom OpenGL.GL import *\nfrom Ope...
65,77060888,,2023-09-07 15:17:24.040,<python><django><postgresql><sleep><pytest-dja...,<p>I need to make a pause in my Django project...,"[from pytest import mark, fixture, raises\nfro..."
73,77052025,,2023-09-06 12:43:17.777,<python>,<p>I am trying to get a list of all compartmen...,[import oci\n\nconfig = oci.config.from_file()...
107,76937634,,2023-08-20 01:18:19.203,<python><django><asynchronous><celery><telegram>,<p>I'm trying to make a parser like a web appl...,[from telethon.sync import TelegramClient\nimp...
115,76934523,,2023-08-19 09:57:53.073,<python><user-interface><pyqt5>,"<p>guys I have a question, i have a code and I...",[1\.when moving the icon instances the X and Y...
...,...,...,...,...,...,...
5223137,77545627,,2023-11-24 20:54:56.123,<python><huggingface><language-translation>,<h2>Setup</h2>\n<p>I've created a HF Inference...,"[request, concurrent, def translate_text(text)..."
5223163,77442875,,2023-11-08 04:40:57.380,<python><python-imaging-library><height><width>,<p>We were given a image(The Traffic stop sign...,[ def fix_middle(picture):\n picture...
5223179,77374676,,2023-10-27 13:47:11.517,<python><tensorflow><keras>,<p>I am having an issue I don't understand how...,"[ def create_teacher_model(img_size, model_..."
5223196,77336446,,2023-10-21 14:33:34.623,<python><animation><sequence><vtk>,<p>I have succesfully managed to make a 3d ima...,[import vtkmodules.all as vtk\n\n#Create a ren...


In [49]:
bert_src_bf = bert_src_bf.reset_index(drop=True)
bert_src_af = bert_src_af.reset_index(drop=True)

In [53]:
bert_src_bf = bert_src_bf[['q_id', 'q_prep_text']].apply(pd.Series.explode)
bert_src_af = bert_src_af[['q_id', 'q_prep_text']].apply(pd.Series.explode)

In [None]:
bert_src_bf.dropna(inplace=True)
bert_src_af.dropna(inplace=True)

In [54]:
# PREPROCESSING FOR CODE SCRIPT
def preprocess_script(script):
    new_script = deque()
    old_script = script.split('\n')
    for line in old_script:
        if line.lstrip().startswith('#'): # 주석으로 시작되는 행 skip
            continue
        line = line.rstrip()
        if '#' in line:
            line = line[:line.index('#')] # 주석 전까지 코드만 저장
        line = line.replace('\n','') # 개행 문자를 모두 삭제함
        line = line.replace('    ','\t') # 공백 4칸을 tab으로 변환
        
        if line == '': # 전처리 후 빈 라인은 skip
            continue
        
        new_script.append(line)

        
    new_script = '\n'.join(new_script) # 개행 문자로 합침
    new_script = re.sub('("""[\w\W]*?""")', '<str>', new_script)
    new_script = re.sub("('''[\w\W]*?''')", '<str>', new_script)
    new_script = re.sub('/^(http?|https?):\/\/([a-z0-9-]+\.)+[a-z0-9]{2,4}.*$/', '', new_script)
    
    return new_script


In [55]:
bert_src_af

Unnamed: 0,q_id,q_prep_text
0,77077227,import sys\nfrom OpenGL.GL import *\nfrom Open...
1,77060888,"from pytest import mark, fixture, raises\nfrom..."
1,77060888,========================================= test...
2,77052025,import oci\n\nconfig = oci.config.from_file()\...
3,76937634,from telethon.sync import TelegramClient\nimpo...
...,...,...
63443,77336446,import time\nimport vtkmodules.all\n\nclass vt...
63444,77336843,apiVersion: networking.k8s.io/v1\nkind: Ingres...
63444,77336843,apiVersion: networking.k8s.io/v1\nkind: Ingres...
63444,77336843,apiVersion: v1\nkind: Service\nmetadata:\n na...


In [16]:
bert_src_bf['q_prep_text_non'] = bert_src_bf['q_prep_text'].apply(preprocess_script)
bert_src_af['q_prep_text_non'] = bert_src_af['q_prep_text'].apply(preprocess_script)

In [17]:
src = bert_src_af['q_prep_text_non'].tolist()
# df['sentiments'].values.tolist()


In [20]:
src

[]

In [18]:
print(type(src))
print(type(src[0]))
print(len(src))

<class 'list'>


IndexError: list index out of range

In [None]:
src = src[:10000]

In [None]:
# 데이터 로드
data = src

In [None]:
data = data

In [None]:
# CodeBERT 모델과 토크나이저 로드
model_name = "microsoft/codebert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [None]:
# 임베딩 함수 정의
def embed_text(text):
    encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
    return model_output.last_hidden_state[:, 0, :]

In [None]:
# 모든 텍스트에 대한 임베딩 계산
embeddings = torch.cat([embed_text(text) for text in data], dim=0).numpy()  # 텐서로 연결 후 numpy 배열로 변환

In [None]:
embeddings.shape

In [None]:
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

In [None]:
hdbscan_model = HDBSCAN(min_cluster_size=50, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

In [None]:
# BERTopic 모델 초기화 및 훈련
topic_model = BERTopic( embedding_model=model,
                        umap_model=umap_model,
                        hdbscan_model=hdbscan_model)  # 임베딩 모델 사용을 비활성화
topics, probabilities = topic_model.fit_transform(data, embeddings)


In [None]:
# 결과 출력
print(topic_model.get_topic_info())  # 토픽 정보 출력

In [None]:
topic_model.get_topic_info()

In [None]:
topic_model.get_topic(0)

In [None]:
topic_model.get_topic(1)

In [None]:
topic_model.get_topic(6, full=True)

In [None]:
# # Label the topics yourself
# topic_model.set_topic_labels({1: "Space Travel", 7: "Religion"})

# # or use one of the other topic representations, like KeyBERTInspired
# keybert_topic_labels = {topic: " | ".join(list(zip(*values))[0][:3]) for topic, values in topic_model.topic_aspects_["KeyBERT"].items()}
# topic_model.set_topic_labels(keybert_topic_labels)

# # or ChatGPT's labels
# chatgpt_topic_labels = {topic: " | ".join(list(zip(*values))[0]) for topic, values in topic_model.topic_aspects_["OpenAI"].items()}
# chatgpt_topic_labels[-1] = "Outlier Topic"
# topic_model.set_topic_labels(chatgpt_topic_labels)

In [None]:
# topic_distr, _ = topic_model.approximate_distribution(data, window=8, stride=4)

In [None]:
# pip install nbformat>=4.2.0

In [None]:
# Visualize the topic-document distribution for a single document
topic_model.visualize_topics()

In [None]:
topic_model.visualize_hierarchy()

In [None]:
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

In [None]:
topic_model.visualize_documents(data, embeddings=embeddings)

In [None]:
topic_model.visualize_documents(data, reduced_embeddings=reduced_embeddings)