In [67]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel
from collections.abc import Iterable
from transformers import AutoTokenizer, AutoModel
import torch
from collections import deque
from tqdm import tqdm
import pickle
import re
from itertools import product

임베딩 만드는 소스 코드 (챗지피티 전후로의 임베딩)

In [68]:
# 파일 불러오기
with open('../../data/bert_src_df.pkl', 'rb') as f:
    bert_src_df = pickle.load(f)


In [69]:
bert_src_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5223203 entries, 0 to 5223202
Data columns (total 5 columns):
 #   Column          Dtype         
---  ------          -----         
 0   q_id            int64         
 1   a_id            float64       
 2   q_creationdate  datetime64[ns]
 3   tags            object        
 4   body            object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 199.2+ MB


In [70]:
cond1 = bert_src_df['q_creationdate']<='2022-11-30'
cond2 = bert_src_df['q_creationdate']>='2021-11-30'
cond3 = bert_src_df['a_id'].isna()
cond4 = bert_src_df['tags'].str.contains('python')
cond5 = bert_src_df['q_creationdate']>='2022-12-01'

In [71]:
bert_src = bert_src_df.loc[cond2 & cond4, :]

In [72]:
bert_src.head()

Unnamed: 0,q_id,a_id,q_creationdate,tags,body
1,77042177,77042771.0,2023-09-05 06:31:45.120,<python><r><websocket><discord>,<p>I am using the package (websocket) for R to...
8,77040069,77040480.0,2023-09-04 19:01:20.293,<python>,<p>I have a text file like this:</p>\n<pre><co...
9,77040069,77040243.0,2023-09-04 19:01:20.293,<python>,<p>I have a text file like this:</p>\n<pre><co...
10,77040321,77165037.0,2023-09-04 19:57:43.350,<python><indexing>,<p>I am confused by the differences in my code...
38,77077227,,2023-09-10 17:31:31.600,<python><opengl><pyqt5><pyopengl>,<p>I want to display 3d object on top of PyQT5...


In [73]:
def cleanhtml(raw_html):
  # 1.Source code in python language is hard to understand, so replace all the <code> tag first
  cleantext_1 = re.findall(r'(?<=\<code>)(.*?)(?=<\/code>)', raw_html.replace('\n', '_**_'))
  cleantext_1 = [x.replace('_**_', '\n') for x in cleantext_1]
  # 2. replace html tags
  # <p>
  tag_re = re.compile('<.*?>')
  cleantext_2 = [re.sub(tag_re, '', x) for x in cleantext_1]
  return cleantext_2

In [74]:
# apply the function, cleanhtml to the question and body text
bert_src.loc[:, 'q_prep_text'] = bert_src['body'].apply(cleanhtml)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bert_src.loc[:, 'q_prep_text'] = bert_src['body'].apply(cleanhtml)


In [75]:
bert_src = bert_src.reset_index(drop=True)

In [76]:
bert_src_ex = bert_src[['q_id', 'q_prep_text']].apply(pd.Series.explode)

In [77]:
bert_src_ex.dropna(inplace=True)

In [78]:
bert_src = pd.merge(bert_src_ex, bert_src[['q_id', 'q_creationdate']], how ='left', left_on = 'q_id', right_on = 'q_id')

In [79]:
bert_src['q_year'] = pd.to_datetime(bert_src['q_creationdate']).dt.year
bert_src['q_month'] = pd.to_datetime(bert_src['q_creationdate']).dt.month

In [80]:
# PREPROCESSING FOR CODE SCRIPT
def preprocess_script(script):
    new_script = deque()
    old_script = script.split('\n')
    for line in old_script:
        if line.lstrip().startswith('#'): # 주석으로 시작되는 행 skip
            continue
        line = line.rstrip()
        if '#' in line:
            line = line[:line.index('#')] # 주석 전까지 코드만 저장
        line = line.replace('\n','') # 개행 문자를 모두 삭제함
        line = line.replace('    ','\t') # 공백 4칸을 tab으로 변환
        
        if line == '': # 전처리 후 빈 라인은 skip
            continue
        
        new_script.append(line)

        
    new_script = '\n'.join(new_script) # 개행 문자로 합침
    new_script = re.sub('("""[\w\W]*?""")', '<str>', new_script)
    new_script = re.sub("('''[\w\W]*?''')", '<str>', new_script)
    new_script = re.sub('/^(http?|https?):\/\/([a-z0-9-]+\.)+[a-z0-9]{2,4}.*$/', '', new_script)
    
    return new_script


In [81]:
bert_src['q_prep_text_non'] = bert_src['q_prep_text'].apply(preprocess_script)

In [82]:
years = [2022, 2023]
months = [x for x in range(1, 13)]

In [83]:
ym = list(product(years, months))

In [84]:
# CodeBERT 모델과 토크나이저 로드
model_name = "microsoft/codebert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [85]:
# 임베딩 함수 정의
def embed_text(text):
    encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
    return model_output.last_hidden_state[:, 0, :]

In [86]:
for year, month in ym : 
    cond1 = (bert_src['q_year'] ==year)
    cond2 = (bert_src['q_month'] ==month)
    src = bert_src.loc[cond1&cond2, 'q_prep_text_non'].tolist()

    embed_text_list = []
    for i in tqdm(range(len(src))) : 
        embed_text_list.append(embed_text(src[i]))

    file_nm = 'embed_text_list'+str(year)+str(month)+'.pkl'
    # 전체 데이터셋 
    with open('../../data/src/embeding/'+file_nm, 'wb') as f:
        pickle.dump(embed_text_list, f)

  1%|▏         | 2056/148244 [01:58<2:20:51, 17.30it/s]


KeyboardInterrupt: 