In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pickle
import re
import umap.umap_ as umap
import torch
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel
from collections.abc import Iterable
from transformers import AutoTokenizer, AutoModel
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
import torch




In [2]:
# 파일 불러오기
with open('../../data/bert_df.pkl', 'rb') as f:
    df = pickle.load(f)


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3487 entries, 0 to 3486
Data columns (total 28 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   q_id                3487 non-null   int64         
 1   q_posttypeid        3487 non-null   object        
 2   q_acceptedanswerid  1488 non-null   float64       
 3   q_parentid          0 non-null      object        
 4   q_creationdate      3487 non-null   datetime64[ns]
 5   q_score             3487 non-null   int64         
 6   q_viewcount         3487 non-null   int64         
 7   q_owneruserid       3487 non-null   int64         
 8   q_title             3487 non-null   object        
 9   q_tags              3487 non-null   object        
 10  q_answercount       3487 non-null   int64         
 11  q_commentcount      3487 non-null   int64         
 12  q_reputation        3487 non-null   int64         
 13  q_text              3487 non-null   object      

In [4]:
# extract the  columns for text analysis
## q_body : question body
## a_body : answer body
df_qna = df[['q_id','a_id','q_text', 'a_text']]

In [5]:
df_qna.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3487 entries, 0 to 3486
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   q_id    3487 non-null   int64 
 1   a_id    3487 non-null   int64 
 2   q_text  3487 non-null   object
 3   a_text  3487 non-null   object
dtypes: int64(2), object(2)
memory usage: 109.1+ KB


In [6]:
def cleanhtml(raw_html):
  # 1.Source code in python language is hard to understand, so replace all the <code> tag first
  cleantext_1 = re.findall(r'(?<=\<code>)(.*?)(?=<\/code>)', raw_html.replace('\n', '_**_'))
  cleantext_1 = [x.replace('_**_', '\n') for x in cleantext_1]
  # 2. replace html tags
  # <p>
  tag_re = re.compile('<.*?>')
  cleantext_2 = [re.sub(tag_re, '', x) for x in cleantext_1]
  return cleantext_2

In [7]:
# apply the function, cleanhtml to the question and body text
df_qna.loc[:, 'q_prep_text'] = df_qna['q_text'].apply(cleanhtml)
df_qna.loc[:, 'a_prep_text'] = df_qna['a_text'].apply(cleanhtml)

In [8]:
df_qna.loc[:, [ 'q_id','q_prep_text', 'a_prep_text']] 

Unnamed: 0,q_id,q_prep_text,a_prep_text
0,77593805,[],[n = 20000\nsum_of_numbers = (n * (n + 1)) // ...
1,77593717,[import hashlib\n\nuser_hash_dict = {}\n\nwith...,[from hashlib import sha256 as SHA256\n\ncpd =...
2,77591118,[London:Alpha\nLondon\nLondon:Beta\nLondon:Del...,"[s = pd.Series(['London:Alpha', 'London', 'Lon..."
3,77590853,"[-v, -o, -A, --script, -Pn, -IL]","[pip install python-nmap\n, import nmap\n\nde..."
4,77591142,[],[]
...,...,...,...
3482,77581497,[df\n\nDevice int In Out Bw_in Bw_out\n...,"[duckdb, import duckdb\n\nconn = duckdb.connec..."
3483,77580251,[55297173-0087-1 \n56397873-0186 \n57885358-...,[ df=pd.DataFrame([x.strip() for x in '''552...
3484,77567490,"[onnxruntime-silicon, onnxruntime, Flask, # Se...","[load_model, post_worker_init, sess = None\n\n..."
3485,77582066,[],"[Day, Store ID, Worker ID, # create datetime o..."


In [9]:
df_q_src = df_qna[['q_id', 'q_prep_text']].apply(pd.Series.explode)
df_a_src = df_qna[['a_id', 'a_prep_text']].apply(pd.Series.explode)

In [10]:
df_q_src = df_q_src.reset_index(drop=True)
df_a_src = df_a_src.reset_index(drop=True)

In [11]:
df_q_src.dropna(inplace=True)
df_q_src['q_prep_text_non'] = df_q_src['q_prep_text'].str.replace('\n', ' ')

In [12]:
df_q_src['q_prep_text_non']

1        import hashlib  user_hash_dict = {}  with open...
2        London:Alpha London London:Beta London:Delta P...
3        London_sub:Alpha London_sub London_sub:Beta Lo...
4        names_df[0] = names_df[0] \         .str.split...
5                                     L:o:n:d:o:n:_:s:u:b 
                               ...                        
11467    class GunicornApplication(BaseApplication):   ...
11468    if __name__ == '__main__':     options = {'bin...
11469    &gt;&gt;&gt; [ERROR] Worker (pid:10517) was se...
11470    &gt;&gt;&gt; requests.exceptions.ConnectionErr...
11471                                             Gunicorn
Name: q_prep_text_non, Length: 11171, dtype: object

In [13]:
src = df_q_src['q_prep_text_non'].tolist()
# df['sentiments'].values.tolist()


In [14]:
print(type(src))
print(type(src[0]))
print(len(src))

<class 'list'>
<class 'str'>
11171


In [15]:
src = src[:100]

In [16]:
# 데이터 로드
data = src

In [17]:
data = data

In [18]:
# CodeBERT 모델과 토크나이저 로드
model_name = "microsoft/codebert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [19]:
# 임베딩 함수 정의
def embed_text(text):
    encoded_input = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
    return model_output.last_hidden_state[:, 0, :]

In [20]:
# 모든 텍스트에 대한 임베딩 계산
embeddings = torch.cat([embed_text(text) for text in data], dim=0).numpy()  # 텐서로 연결 후 numpy 배열로 변환

In [21]:

# BERTopic 모델 초기화 및 훈련
topic_model = BERTopic()  # 임베딩 모델 사용을 비활성화
topics, probabilities = topic_model.fit_transform(data, embeddings)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [22]:
# 결과 출력
print(topic_model.get_topic_info())  # 토픽 정보 출력

   Topic  Count                         Name  \
0     -1     10  -1_namesdf0_hash_for_import   

                                      Representation  \
0  [namesdf0, hash, for, import, columninteger, i...   

                                 Representative_Docs  
0  [-v, -o, -A, --script, -Pn, import hashlib  us...  


In [23]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,10,-1_namesdf0_hash_for_import,"[namesdf0, hash, for, import, columninteger, i...","[-v, -o, -A, --script, -Pn, import hashlib us..."
