https://zerojsh00.github.io/posts/BERTopic/
https://towardsdatascience.com/topic-modeling-with-bert-779f7db187e6
https://arxiv.org/pdf/2203.05794.pdf

https://dacon.io/competitions/official/235900/codeshare?page=1&dtype=recent&ptype=pub&keyword=

In [43]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pickle
import re
import umap.umap_ as umap
import torch
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel
from collections.abc import Iterable

In [44]:
# 파일 불러오기
with open('../../data/bert_df.pkl', 'rb') as f:
    df = pickle.load(f)


In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3487 entries, 0 to 3486
Data columns (total 28 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   q_id                3487 non-null   int64         
 1   q_posttypeid        3487 non-null   object        
 2   q_acceptedanswerid  1488 non-null   float64       
 3   q_parentid          0 non-null      object        
 4   q_creationdate      3487 non-null   datetime64[ns]
 5   q_score             3487 non-null   int64         
 6   q_viewcount         3487 non-null   int64         
 7   q_owneruserid       3487 non-null   int64         
 8   q_title             3487 non-null   object        
 9   q_tags              3487 non-null   object        
 10  q_answercount       3487 non-null   int64         
 11  q_commentcount      3487 non-null   int64         
 12  q_reputation        3487 non-null   int64         
 13  q_text              3487 non-null   object      

In [46]:
# extract the  columns for text analysis
## q_body : question body
## a_body : answer body
df_qna = df[['q_id','a_id','q_text', 'a_text']]

In [47]:
df_qna.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3487 entries, 0 to 3486
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   q_id    3487 non-null   int64 
 1   a_id    3487 non-null   int64 
 2   q_text  3487 non-null   object
 3   a_text  3487 non-null   object
dtypes: int64(2), object(2)
memory usage: 109.1+ KB


In [48]:
def cleanhtml(raw_html):
  # 1.Source code in python language is hard to understand, so replace all the <code> tag first
  cleantext_1 = re.findall(r'(?<=\<code>)(.*?)(?=<\/code>)', raw_html.replace('\n', '_**_'))
  cleantext_1 = [x.replace('_**_', '\n') for x in cleantext_1]
  # 2. replace html tags
  # <p>
  tag_re = re.compile('<.*?>')
  cleantext_2 = [re.sub(tag_re, '', x) for x in cleantext_1]
  return cleantext_2

In [49]:
# apply the function, cleanhtml to the question and body text
df_qna.loc[:, 'q_prep_text'] = df_qna['q_text'].apply(cleanhtml)
df_qna.loc[:, 'a_prep_text'] = df_qna['a_text'].apply(cleanhtml)

In [50]:
df_qna.loc[:, [ 'q_id','q_prep_text', 'a_prep_text']] 

Unnamed: 0,q_id,q_prep_text,a_prep_text
0,77593805,[],[n = 20000\nsum_of_numbers = (n * (n + 1)) // ...
1,77593717,[import hashlib\n\nuser_hash_dict = {}\n\nwith...,[from hashlib import sha256 as SHA256\n\ncpd =...
2,77591118,[London:Alpha\nLondon\nLondon:Beta\nLondon:Del...,"[s = pd.Series(['London:Alpha', 'London', 'Lon..."
3,77590853,"[-v, -o, -A, --script, -Pn, -IL]","[pip install python-nmap\n, import nmap\n\nde..."
4,77591142,[],[]
...,...,...,...
3482,77581497,[df\n\nDevice int In Out Bw_in Bw_out\n...,"[duckdb, import duckdb\n\nconn = duckdb.connec..."
3483,77580251,[55297173-0087-1 \n56397873-0186 \n57885358-...,[ df=pd.DataFrame([x.strip() for x in '''552...
3484,77567490,"[onnxruntime-silicon, onnxruntime, Flask, # Se...","[load_model, post_worker_init, sess = None\n\n..."
3485,77582066,[],"[Day, Store ID, Worker ID, # create datetime o..."


In [51]:
df_q_src = df_qna[['q_id', 'q_prep_text']].apply(pd.Series.explode)
df_a_src = df_qna[['a_id', 'a_prep_text']].apply(pd.Series.explode)

In [52]:
df_q_src = df_q_src.reset_index(drop=True)
df_a_src = df_a_src.reset_index(drop=True)

In [53]:
df_q_src.dropna(inplace=True)
df_q_src['q_prep_text_non'] = df_q_src['q_prep_text'].str.replace('\n', ' ')

In [84]:
df_q_src['q_prep_text_non']

1        import hashlib  user_hash_dict = {}  with open...
2        London:Alpha London London:Beta London:Delta P...
3        London_sub:Alpha London_sub London_sub:Beta Lo...
4        names_df[0] = names_df[0] \         .str.split...
5                                     L:o:n:d:o:n:_:s:u:b 
                               ...                        
11467    class GunicornApplication(BaseApplication):   ...
11468    if __name__ == '__main__':     options = {'bin...
11469    &gt;&gt;&gt; [ERROR] Worker (pid:10517) was se...
11470    &gt;&gt;&gt; requests.exceptions.ConnectionErr...
11471                                             Gunicorn
Name: q_prep_text_non, Length: 11171, dtype: object

In [87]:
src = df_q_src['q_prep_text_non'].tolist()
# df['sentiments'].values.tolist()


In [88]:
print(type(src))
print(type(src[0]))
print(len(src))

<class 'list'>
<class 'str'>
11171


In [91]:
src = src[:10]

Pre-calculate Embeddings

In [77]:
from transformers import AutoTokenizer, AutoModel
import torch
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
embedding_model = AutoModel.from_pretrained("microsoft/codebert-base")

In [92]:
# https://stackoverflow.com/questions/63517293/valueerror-textencodeinput-must-be-uniontextinputsequence-tupleinputsequence/63870843#63870843
code_tokens=tokenizer(src, truncation=True, padding=True)
tokens_ids=tokenizer.convert_tokens_to_ids(code_tokens)
embeddings=embedding_model(torch.tensor(tokens_ids)[None,:])[0]
# embeddings = embeddings.reshape(-1,768)

In [59]:
# tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
# model = AutoModel.from_pretrained("microsoft/codebert-base")
# code_tokens=tokenizer.tokenize(src[0])

In [60]:
embeddings.shape

torch.Size([1, 255, 768])

Train

In [61]:
from bertopic import BERTopic

topic_model = BERTopic(

  # Pipeline models
  embedding_model=embedding_model,
  # umap_model=umap_model,
  # hdbscan_model=hdbscan_model,
  # vectorizer_model=vectorizer_model,
  # representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True
)



In [62]:
type(embeddings)

# Convert the tensor to a NumPy array
embeddings_numpy = embeddings.detach().numpy()
# embeddings_numpy = embeddings_numpy[0]
print(type(embeddings_numpy))

<class 'numpy.ndarray'>


In [63]:
test = 'The problem of statistical learning is to construct a predictor of a random\nvariable $Y$ as a function of a related random variable $X$ on the basis of an\ni.i.d. training sample from the joint distribution of $(X,Y)$. Allowable\npredictors are drawn from some specified class, and the goal is to approach\nasymptotically the performance (expected loss) of the best predictor in the\nclass. We consider the setting in which one has perfect observation of the\n$X$-part of the sample, while the $Y$-part has to be communicated at some\nfinite bit rate. The encoding of the $Y$-values is allowed to depend on the\n$X$-values. Under suitable regularity conditions on the admissible predictors,\nthe underlying family of probability distributions and the loss function, we\ngive an information-theoretic characterization of achievable predictor\nperformance in terms of conditional distortion-rate functions. The ideas are\nillustrated on the example of nonparametric regression in Gaussian noise.\n'

In [64]:
test

'The problem of statistical learning is to construct a predictor of a random\nvariable $Y$ as a function of a related random variable $X$ on the basis of an\ni.i.d. training sample from the joint distribution of $(X,Y)$. Allowable\npredictors are drawn from some specified class, and the goal is to approach\nasymptotically the performance (expected loss) of the best predictor in the\nclass. We consider the setting in which one has perfect observation of the\n$X$-part of the sample, while the $Y$-part has to be communicated at some\nfinite bit rate. The encoding of the $Y$-values is allowed to depend on the\n$X$-values. Under suitable regularity conditions on the admissible predictors,\nthe underlying family of probability distributions and the loss function, we\ngive an information-theoretic characterization of achievable predictor\nperformance in terms of conditional distortion-rate functions. The ideas are\nillustrated on the example of nonparametric regression in Gaussian noise.\n'

In [66]:
topics, probs = topic_model.fit_transform(src[0:10], embeddings_numpy)

ValueError: Make sure that the embeddings are a numpy array with shape: (len(docs), vector_dim) where vector_dim is the dimensionality of the vector embeddings. 

In [65]:
type(embeddings_numpy)

if isinstance(src[0:2], pd.DataFrame):
        raise TypeError("Make sure to supply a list of strings, not a dataframe.")
elif isinstance(src[0:2], Iterable) and not isinstance(src[0:2], str):
    if not any([isinstance(doc, str) for doc in src[0:2]]):
        raise TypeError("Make sure that the iterable only contains strings.")
else:
    raise TypeError("Make sure that the documents variable is an iterable containing strings only.")


In [None]:
test

if isinstance(test, pd.DataFrame):
        raise TypeError("Make sure to supply a list of strings, not a dataframe.")
elif isinstance(test, Iterable) and not isinstance(test, str):
    if not any([isinstance(doc, str) for doc in test]):
        raise TypeError("Make sure that the iterable only contains strings.")
else:
    raise TypeError("Make sure that the documents variable is an iterable containing strings only.")



TypeError: Make sure that the documents variable is an iterable containing strings only.

In [None]:
if not isinstance(src[0], str):
    print("aaa")
else :
    print("bbb")

bbb


In [None]:
isinstance(src[0], str)


True

In [None]:
embeddings_numpy.shape

(1, 255, 768)

In [None]:
df_q_src

In [None]:
df_q_src= df_q_src['q_prep_text_non'].apply(tokenizer.tokenize)