In [50]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pickle
import re
from umap import UMAP
from hdbscan import HDBSCAN
import torch
from transformers import RobertaTokenizer, RobertaConfig, RobertaModel
from collections.abc import Iterable
from transformers import AutoTokenizer, AutoModel
from bertopic import BERTopic
import torch
from collections import deque
from bertopic.representation import KeyBERTInspired
from tqdm import tqdm
from sklearn.cluster import KMeans
import calendar
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import openai
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech

In [51]:
# 파일 불러오기
with open('../../data/embed_text_list.pkl', 'rb') as f:
    embed_text_list = pickle.load(f)


In [52]:
# 파일 불러오기
with open('../../data/data.pkl', 'rb') as f:
    data = pickle.load(f)


In [53]:
# 파일 불러오기
with open('../../data/reserved_list.pkl', 'rb') as f:
    reserved_list = pickle.load(f)


In [54]:
type(reserved_list)

list

In [55]:
# CodeBERT 모델과 토크나이저 로드
model_name = "microsoft/codebert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [56]:
# 모든 텍스트에 대한 임베딩 계산
embeddings = torch.cat(embed_text_list, dim=0).numpy()  # 텐서로 연결 후 numpy 배열로 변환

In [57]:
embeddings.shape

(41900, 768)

In [58]:
umap_model = UMAP(n_neighbors=10, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

In [59]:
hdbscan_model = HDBSCAN(min_cluster_size=20, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
cluster_model = KMeans(n_clusters=30)

In [69]:
# KeyBERT
keybert_model = KeyBERTInspired()

# Part-of-Speech
# pos_model = PartOfSpeech("en_core_web_sm")

# MMR
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# GPT-3.5
prompt = """
I have a topic that contains the documents: [REPRESENTATIVE_DOCUMENT]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
topic: <topic label>
"""
client = openai.OpenAI(api_key="sk-proj-Mm0FIOmjCUgXc2T40TarT3BlbkFJ7YO5SE9kJokyhFrRxEiB")
openai_model = OpenAI(client, model="gpt-3.5-turbo", exponential_backoff=True, chat=True, prompt=prompt)

# All representation models
representation_model = {
    "KeyBERT": keybert_model,
    "OpenAI": openai_model,  # Uncomment if you will use OpenAI
    "MMR": mmr_model
    # ,
    # "POS": pos_model
}

In [70]:
vectorizer_model = CountVectorizer(stop_words=reserved_list, min_df=2, ngram_range=(1, 2))

In [71]:
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)


In [72]:
# BERTopic 모델 초기화 및 훈련
topic_model = BERTopic( embedding_model=model,
                        umap_model=umap_model,
                        # hdbscan_model=hdbscan_model,
                        hdbscan_model=cluster_model,
                        ctfidf_model=ctfidf_model,
                        representation_model=representation_model)  # 임베딩 모델 사용을 비활성화
topics, probabilities = topic_model.fit_transform(data, embeddings)


In [73]:
# 결과 출력
print(topic_model.get_topic_info())  # 토픽 정보 출력

    Topic  Count                                           Name  \
0       0   3755                               0_df_foo_not_amp   
1       1   3555                     1_public_new_string_return   
2       2   3219                         2_select_unknown_df_as   
3       3   3041                     3_const_classname_this_new   
4       4   2771                   4_serve_fiscal_code_nls_yarn   
5       5   2535                            5_self_def_print_as   
6       6   2408              6_public_background_border_margin   
7       7   2404              7_scrollbar_spanid_webview_tempid   
8       8   2253                  8_elemnt_typeerror_not_cannot   
9       9   2142                              9_at_java_lib_org   
10     10   1963                        10_install_npm_git_sudo   
11     11   1688               11_localhost_install_8080_docker   
12     12   1687                         12_x00_taxrate_amp_txt   
13     13   1658                        13_feature_ger_enum_bp

In [74]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,OpenAI,MMR,Representative_Docs
0,0,3755,0_df_foo_not_amp,"[df, foo, not, amp, is, to, 000, name, type, s...","[error, bin, csv, object, values, 16, name, se...",[Data Selection and Filtering],"[df, foo, not, amp, is, to, 000, name, type, s...",[download failed: s3://greenwichhr-covidjobimp...
1,1,3555,1_public_new_string_return,"[public, new, string, return, int, this, const...","[await, async, session, log, email, message, j...",[Programming Concepts with Keywords],"[public, new, string, return, int, this, const...","[function queryTestDb(query, pool) {\n\t// cre..."
2,2,3219,2_select_unknown_df_as,"[select, unknown, df, as, where, from, 12, not...","[table1, dataframe, table, join, csv, error, p...",[Data Selection and Filtering],"[select, unknown, df, as, where, from, 12, not...",[I have Django project of login and signup and...
3,3,3041,3_const_classname_this_new,"[const, classname, this, new, if, var, return,...","[react, usestate, error, require, button, cons...",[JavaScript OOP with React],"[const, classname, this, new, if, var, return,...",[\t * DataTables Basic\n\t */\n\t\t$(function ...
4,4,2771,4_serve_fiscal_code_nls_yarn,"[serve, fiscal_code, nls, yarn, ssm, dockerfil...","[yarn, bundle, lint, serve, pod, sock_raw, doc...",[Server-side Docker deployment],"[serve, fiscal_code, nls, yarn, ssm, dockerfil...","[yarn add ...., yarn server, pod spec lint]"
5,5,2535,5_self_def_print_as,"[self, def, print, as, np, na, 1l, range, impo...","[plot, numpy, row, axis, df, plt, keras, appen...",[Python data manipulation tutorial],"[self, def, print, as, np, na, 1l, range, impo...",[import numpy as np\nfrom keras.models import ...
6,6,2408,6_public_background_border_margin,"[public, background, border, margin, color, st...","[hover, background, display, margin, center, t...",[Web Design Styles],"[public, background, border, margin, color, st...","[:root {\n --background: rgba(50, 135, 242, 0..."
7,7,2404,7_scrollbar_spanid_webview_tempid,"[scrollbar, spanid, webview, tempid, quosure, ...","[scrollbar, page, tabpage, viewbag, contentvie...",[Web View Modifications],"[scrollbar, spanid, webview, tempid, quosure, ...","[Scrollbar, ScrollBar, ScrollBar]"
8,8,2253,8_elemnt_typeerror_not_cannot,"[elemnt, typeerror, not, cannot, undefined, is...","[undefined, typeerror, attributeerror, valueer...",[JavaScript Typeerror Handling],"[elemnt, typeerror, not, cannot, undefined, is...",[Error: Evaluation failed: TypeError: Cannot r...
9,9,2142,9_at_java_lib_org,"[at, java, lib, org, py, packages, users, line...","[failed, error, springframework, jar, spring, ...",[Library Packages and Users],"[at, java, lib, org, py, packages, users, line...",[\tjava.lang.IllegalStateException: Failed to ...


In [None]:
topic_model.get_topic(0)

In [None]:
topic_model.get_topic(1)

In [None]:
topic_model.get_topic(6, full=True)

In [None]:
# # Label the topics yourself
# topic_model.set_topic_labels({1: "Space Travel", 7: "Religion"})

# # or use one of the other topic representations, like KeyBERTInspired
# keybert_topic_labels = {topic: " | ".join(list(zip(*values))[0][:3]) for topic, values in topic_model.topic_aspects_["KeyBERT"].items()}
# topic_model.set_topic_labels(keybert_topic_labels)

# # or ChatGPT's labels
# chatgpt_topic_labels = {topic: " | ".join(list(zip(*values))[0]) for topic, values in topic_model.topic_aspects_["OpenAI"].items()}
# chatgpt_topic_labels[-1] = "Outlier Topic"
# topic_model.set_topic_labels(chatgpt_topic_labels)

In [None]:
# topic_distr, _ = topic_model.approximate_distribution(data, window=8, stride=4)

In [None]:
# pip install nbformat>=4.2.0

In [None]:
# Visualize the topic-document distribution for a single document
topic_model.visualize_topics()

In [None]:
topic_model.visualize_hierarchy()

In [None]:
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

In [None]:
topic_model.visualize_documents(data, embeddings=embeddings)

In [None]:
topic_model.visualize_documents(data, reduced_embeddings=reduced_embeddings)

In [None]:
topa_df = topic_model.get_document_info(data)

In [None]:
topa_df

In [None]:
sample_src_df[sample_src_df['q_id'] ==77394187]

In [None]:
topa_org_df = sample_src_df.copy()
topa_org_df.reset_index(drop=True, inplace=True)
topa_df.reset_index(drop=True, inplace=True)

In [None]:
tot_topa_df = pd.concat([topa_org_df, topa_df], axis = 1)

In [None]:
tot_topa_df

In [None]:
tot_topa_df[(tot_topa_df['Topic'] ==0)&(tot_topa_df['Representative_document'] ==True)]

In [None]:
sample_dropnull_df = sample_df[['q_id', 'q_creationdate', 'a_id']].groupby(['q_id', 'q_creationdate']).head(1)

In [None]:
tot_topa_df = pd.merge(tot_topa_df, sample_dropnull_df, how='left', left_on = 'q_id', right_on = 'q_id')

In [None]:
tot_topa_df[tot_topa_df['q_id'] == 77394187]

In [None]:
tot_topa_df['date'] = pd.to_datetime(tot_topa_df['q_creationdate']).dt.date

In [None]:
tot_topa_not_answered_df = tot_topa_df[tot_topa_df['a_id'].isna()]
tot_topa_answered_df = tot_topa_df[tot_topa_df['a_id'].isna()==False]

In [None]:
tot_topa_not_answered_rate = tot_topa_not_answered_df.groupby(['date', 'Topic']).count()[['q_id']].reset_index()
tot_topa_answered_rate = tot_topa_answered_df.groupby(['date', 'Topic']).count()[['q_id']].reset_index()

In [None]:
tot_topa_not_answered_sum = tot_topa_not_answered_rate.groupby(['date']).sum()[['q_id']].reset_index()
tot_topa_answered_sum = tot_topa_answered_rate.groupby(['date']).sum()[['q_id']].reset_index()

In [None]:
tot_topa_not_answered_rate = pd.merge(tot_topa_not_answered_rate, tot_topa_not_answered_sum, on = 'date')
tot_topa_answered_rate = pd.merge(tot_topa_answered_rate, tot_topa_answered_sum, on = 'date')

In [None]:
tot_topa_not_answered_rate

In [None]:
# topic_per_rate = tot_topa_not_answered_rate.rename({'q_id_x': 'cnt', 'q_id_y':'tot_cnt'})

In [None]:
tot_topa_not_answered_rate['rate'] = tot_topa_not_answered_rate['q_id_x']/tot_topa_not_answered_rate['q_id_y']*100
tot_topa_answered_rate['rate'] = tot_topa_answered_rate['q_id_x']/tot_topa_answered_rate['q_id_y']*100


In [None]:
tot_topa_not_answered_rate

In [None]:
tot_topa_not_answered_rate_pivot = pd.pivot_table(tot_topa_not_answered_rate, index = 'date', columns = 'Topic',values = 'rate').reset_index()
tot_topa_answered_rate_pivot = pd.pivot_table(tot_topa_answered_rate, index = 'date', columns = 'Topic',values = 'rate').reset_index()

In [None]:
tot_topa_not_answered_rate_pivot.fillna(0, inplace=True)
tot_topa_answered_rate_pivot.fillna(0, inplace=True)

In [None]:
tot_topa_not_answered_rate_pivot

In [None]:
tot_topa_not_answered_rate_pivot['sdate'] = tot_topa_not_answered_rate_pivot['date'].astype('str')
tot_topa_answered_rate_pivot['sdate'] = tot_topa_answered_rate_pivot['date'].astype('str')

In [None]:
tot_topa_not_answered_rate_pivot.to_csv('./chk_not_answered')

In [None]:
### Problem 2b: Reformat the data and generate the stacked bar plot above.


fig, ax = plt.subplots(figsize = (10, 5))

# ax.bar(topic_per_pivot['date'], topic_per_pivot[0], width = 0.5, color='blue', label = 'Party A')
ax.bar(tot_topa_answered_rate_pivot['sdate'], tot_topa_not_answered_rate_pivot[0], color='red', label = 'Party A')
ax.bar(tot_topa_answered_rate_pivot['sdate'], tot_topa_not_answered_rate_pivot[1], bottom = tot_topa_not_answered_rate_pivot[0], color = 'orange', label = 'Party B')
ax.bar(tot_topa_answered_rate_pivot['sdate'], tot_topa_not_answered_rate_pivot[2], bottom = tot_topa_not_answered_rate_pivot[1] + tot_topa_not_answered_rate_pivot[0], color = 'green', label = 'Party B')
ax.bar(tot_topa_answered_rate_pivot['sdate'], tot_topa_not_answered_rate_pivot[3], bottom = tot_topa_not_answered_rate_pivot[2] + tot_topa_not_answered_rate_pivot[1] + tot_topa_not_answered_rate_pivot[0], color = 'yellow', label = 'Party B')
ax.bar(tot_topa_answered_rate_pivot['sdate'], tot_topa_not_answered_rate_pivot[4], bottom = tot_topa_not_answered_rate_pivot[3] + tot_topa_not_answered_rate_pivot[2] + tot_topa_not_answered_rate_pivot[1] + tot_topa_not_answered_rate_pivot[0], color = 'blue', label = 'Party B')
ax.bar(tot_topa_answered_rate_pivot['sdate'], tot_topa_not_answered_rate_pivot[5], bottom = tot_topa_not_answered_rate_pivot[4] + tot_topa_not_answered_rate_pivot[3] + tot_topa_not_answered_rate_pivot[2] + tot_topa_not_answered_rate_pivot[1] + tot_topa_not_answered_rate_pivot[0], color = 'purple', label = 'Party B')


# ax.set_xlabel('Congress', fontsize = 5, labelpad = 20)
# ax.set_xlabel('Congress', fontsize = 5)
# ax.set_ylabel('Number of Seats', fontsize = 5, labelpad = 20)
# ax.set_title('Partisan Composition of Congress', fontsize = 17.5, pad = 20)

# ax.tick_params(axis = 'both', labelsize = 5)

# ax.legend(title = "Political Party",
#           title_fontsize = 15, fontsize = 5,
#           bbox_to_anchor = (1, 0.55))

plt.xticks(rotation = 45)
plt.show();

In [None]:
### Problem 2b: Reformat the data and generate the stacked bar plot above.


fig, ax = plt.subplots(figsize = (10, 5))

# ax.bar(topic_per_pivot['date'], topic_per_pivot[0], width = 0.5, color='blue', label = 'Party A')
ax.bar(tot_topa_answered_rate_pivot['sdate'], tot_topa_answered_rate_pivot[0], color='red', label = 'Party A')
ax.bar(tot_topa_answered_rate_pivot['sdate'], tot_topa_answered_rate_pivot[1], bottom = tot_topa_answered_rate_pivot[0], color = 'orange', label = 'Party B')
ax.bar(tot_topa_answered_rate_pivot['sdate'], tot_topa_answered_rate_pivot[2], bottom = tot_topa_answered_rate_pivot[1] + tot_topa_answered_rate_pivot[0], color = 'green', label = 'Party B')
ax.bar(tot_topa_answered_rate_pivot['sdate'], tot_topa_answered_rate_pivot[3], bottom = tot_topa_answered_rate_pivot[2] + tot_topa_answered_rate_pivot[1] + tot_topa_answered_rate_pivot[0], color = 'yellow', label = 'Party B')
ax.bar(tot_topa_answered_rate_pivot['sdate'], tot_topa_answered_rate_pivot[4], bottom = tot_topa_answered_rate_pivot[3] + tot_topa_answered_rate_pivot[2] + tot_topa_answered_rate_pivot[1] + tot_topa_answered_rate_pivot[0], color = 'blue', label = 'Party B')
ax.bar(tot_topa_answered_rate_pivot['sdate'], tot_topa_answered_rate_pivot[5], bottom = tot_topa_answered_rate_pivot[4] + tot_topa_answered_rate_pivot[3] + tot_topa_answered_rate_pivot[2] + tot_topa_answered_rate_pivot[1] + tot_topa_answered_rate_pivot[0], color = 'purple', label = 'Party B')


# ax.set_xlabel('Congress', fontsize = 5, labelpad = 20)
# ax.set_xlabel('Congress', fontsize = 5)
# ax.set_ylabel('Number of Seats', fontsize = 5, labelpad = 20)
# ax.set_title('Partisan Composition of Congress', fontsize = 17.5, pad = 20)

# ax.tick_params(axis = 'both', labelsize = 5)

# ax.legend(title = "Political Party",
#           title_fontsize = 15, fontsize = 5,
#           bbox_to_anchor = (1, 0.55))

plt.xticks(rotation = 45)
plt.show();

In [None]:
idx = np.arange(tot_topa_answered_rate_pivot.shape[0])
w = 0.15

plt.figure(figsize = (10, 5))
plt.title('user count who leaves question and answer per month ')
plt.xlabel('month(in 2023)')
plt.ylabel('user count')
plt.bar(idx - w * w, tot_topa_answered_rate_pivot[0], width = w, label = 'question count')
plt.bar(idx + w, tot_topa_not_answered_rate_pivot[0], width = w, label = 'answer count')
plt.xticks(idx, tot_topa_answered_rate_pivot['sdate'], rotation = 30)
plt.legend(ncol = 1)
plt.show()

In [None]:
idx = np.arange(tot_topa_answered_rate_pivot.shape[0])
w = 0.15

plt.figure(figsize = (10, 5))
plt.title('user count who leaves question and answer per month ')
plt.xlabel('month(in 2023)')
plt.ylabel('user count')
plt.bar(idx - w * w, tot_topa_answered_rate_pivot[1], width = w, label = 'question count')
plt.bar(idx + w, tot_topa_not_answered_rate_pivot[1], width = w, label = 'answer count')
plt.xticks(idx, tot_topa_answered_rate_pivot['sdate'], rotation = 30)
plt.legend(ncol = 1)
plt.show()

In [None]:
idx = np.arange(tot_topa_answered_rate_pivot.shape[0])
w = 0.15

plt.figure(figsize = (10, 5))
plt.title('user count who leaves question and answer per month ')
plt.xlabel('month(in 2023)')
plt.ylabel('user count')
plt.bar(idx - w * w, tot_topa_answered_rate_pivot[2], width = w, label = 'question count')
plt.bar(idx + w, tot_topa_not_answered_rate_pivot[2], width = w, label = 'answer count')
plt.xticks(idx, tot_topa_answered_rate_pivot['sdate'], rotation = 30)
plt.legend(ncol = 1)
plt.show()

In [None]:
idx = np.arange(tot_topa_answered_rate_pivot.shape[0])
w = 0.15

plt.figure(figsize = (10, 5))
plt.title('user count who leaves question and answer per month ')
plt.xlabel('month(in 2023)')
plt.ylabel('user count')
plt.bar(idx - w * w, tot_topa_answered_rate_pivot[3], width = w, label = 'question count')
plt.bar(idx + w, tot_topa_not_answered_rate_pivot[3], width = w, label = 'answer count')
plt.xticks(idx, tot_topa_answered_rate_pivot['sdate'], rotation = 30)
plt.legend(ncol = 1)
plt.show()

In [None]:
idx = np.arange(tot_topa_answered_rate_pivot.shape[0])
w = 0.15

plt.figure(figsize = (10, 5))
plt.title('user count who leaves question and answer per month ')
plt.xlabel('month(in 2023)')
plt.ylabel('user count')
plt.bar(idx - w * w, tot_topa_answered_rate_pivot[2], width = w, label = 'question count')
# plt.bar(idx + w, tot_topa_not_answered_rate_pivot[3], width = w, label = 'answer count')
plt.xticks(idx, tot_topa_answered_rate_pivot['sdate'], rotation = 30)
plt.legend(ncol = 1)
plt.show()

In [None]:
qna_topic_docu.groupby(['q_id']).count()[['Topic']]

In [None]:
qna_topic_docu[qna_topic_docu['q_id'] ==70162810]

In [None]:
sample_dropnull_df = sample_df[['q_id', 'q_creationdate', 'a_id']].groupby(['q_id', 'q_creationdate']).head(1)

In [None]:
sample_dropnull_df[sample_dropnull_df['q_id'] ==70162810]