In [1]:
import os

import pandas as pd
import torch
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from soynlp.noun import LRNounExtractor
from soynlp.tokenizer import LTokenizer
from soynlp.word import WordExtractor
from transformers import pipeline

In [3]:
# Input / output folders for this stage of the pipeline.
IN_DATA_PATH = './data/01_out'
OUT_DATA_PATH = './data/02_out'

in_data_files = []  # one DataFrame per input file, parallel to input_files
in_docs = []        # one list of documents per input file, parallel to input_files
total_docs = []     # every document of every input file, flattened

# Collect the CSV files of the input folder.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep positional access into in_docs / in_data_files deterministic between
# runs and machines. Non-CSV entries (e.g. .ipynb_checkpoints) are skipped
# because pd.read_csv would fail on them.
input_files = sorted(
    name for name in os.listdir(IN_DATA_PATH)
    if name.lower().endswith('.csv')
)

# Read every input file into a DataFrame and gather its 'generated_text' column.
for in_file in input_files:
    print("Loading file :%s ..." % in_file)
    df = pd.read_csv(os.path.join(IN_DATA_PATH, in_file), encoding='utf-8-sig')
    docs = df['generated_text'].tolist()
    in_data_files.append(df)
    in_docs.append(docs)
    total_docs.extend(docs)


Loading file :umsun_A_2_sborder.csv ...
Loading file :umsun_B_sborder.csv ...
Loading file :umsun_clean_C.csv ...


In [4]:
# User-defined stop words that the tokenizer wrapper filters out.
user_stop_word = ["거", "바", "뻥", "중", "눌"]

# Train the noun extractor on the whole corpus and pull out noun scores.
noun_extractor = LRNounExtractor()
nouns = noun_extractor.train_extract(total_docs)

# Train the word extractor on the same corpus (threshold values are examples).
word_extractor = WordExtractor(
    min_frequency=4,
    min_cohesion_forward=0.05,
    min_right_branching_entropy=0.0,
)
word_extractor.train(total_docs)
words = word_extractor.extract()

# Forward cohesion per extracted word, and noun score per extracted noun.
cohesion_score = {w: stats.cohesion_forward for w, stats in words.items()}
noun_scores = {n: stats.score for n, stats in nouns.items()}

# A noun's combined score is its noun score plus its cohesion (0 when the
# word extractor did not produce it).
combined_scores = {
    n: n_score + cohesion_score.get(n, 0)
    for n, n_score in noun_scores.items()
}
# Words that are not nouns keep their plain cohesion score.
for subword, cohesion in cohesion_score.items():
    combined_scores.setdefault(subword, cohesion)

# L-part tokenizer driven by the combined scores.
tokenizer = LTokenizer(scores=combined_scores)

class CustomTokenizer:
    """Callable tokenizer that wraps a tagger and removes stop words.

    The wrapped ``tagger`` must expose ``tokenize(text)`` returning an
    iterable of tokens. Calling the instance returns those tokens as a list,
    minus any token found in the stop-word collection. No part-of-speech or
    minimum-length filtering is applied.

    Args:
        tagger: Object providing a ``tokenize(text)`` method.
        stop_words: Optional iterable of tokens to drop. When omitted, the
            module-level ``user_stop_word`` list is used, which keeps the
            original one-argument construction working unchanged.
    """

    def __init__(self, tagger, stop_words=None):
        self.tagger = tagger
        # Freeze an explicit stop-word collection for fast membership tests;
        # None means "defer to the module-level list at call time".
        self.stop_words = None if stop_words is None else frozenset(stop_words)

    def __call__(self, text):
        """Tokenize ``text`` and return the tokens that are not stop words."""
        # The global is resolved lazily so later edits to user_stop_word are
        # still honoured by instances created without an explicit list.
        stop_words = user_stop_word if self.stop_words is None else self.stop_words
        return [word for word in self.tagger.tokenize(text) if word not in stop_words]

[Noun Extractor] used default noun predictor; Sejong corpus predictor
[Noun Extractor] used noun_predictor_sejong
[Noun Extractor] All 2398 r features was loaded
[Noun Extractor] scanning was done (L,R) has (10230, 4980) tokens
[Noun Extractor] building L-R graph was done
[Noun Extractor] 1534 nouns are extracted
training was done. used memory 0.754 Gbory 0.700 Gb
all cohesion probabilities was computed. # words = 17158
all branching entropies was computed # words = 20725
all accessor variety was computed # words = 20725


In [5]:
# Wrap the trained soynlp tokenizer so it also filters stop words.
custom_tokenizer = CustomTokenizer(tokenizer)
# Bag-of-words vectorizer driven by the custom tokenizer. token_pattern is
# ignored whenever a tokenizer callable is supplied; setting it to None makes
# that explicit and avoids scikit-learn's "token_pattern will not be used"
# warning.
vectorizer = CountVectorizer(tokenizer=custom_tokenizer, token_pattern=None)

In [6]:
# Sentence-embedding model used for the documents.
# Alternative tried earlier:
#   "sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens"
EMBEDDING_MODEL_NAME = "snunlp/KR-SBERT-V40K-klueNLI-augSTS"
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Pre-compute embeddings for every document of every input file.
# NOTE(review): in the cells visible here `embeddings` is not handed to
# BERTopic.fit_transform, which embeds `docs` again - confirm whether this
# pre-computation is still needed.
embeddings = embedding_model.encode(
    total_docs,
    show_progress_bar=True,
)

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

In [7]:
# Clustering model handed to BERTopic (HDBSCAN clusters the embeddings; it
# does not perform dimensionality reduction).
hdbscan_model = HDBSCAN(
    min_cluster_size=20,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

In [21]:
# Zero-shot classification pipeline.
# Use the first CUDA device when one is available and fall back to the CPU
# (device=-1) otherwise, so the cell also runs on machines without a GPU.
# A Korean hypothesis template was tried earlier:
#   hypothesis_template='구매하는 이유는 {} 이다.'
ko_classifier = pipeline(
    task='zero-shot-classification',
    model='MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli',
    device=0 if torch.cuda.is_available() else -1,
)


In [22]:
# Candidate topic labels used for zero-shot topic assignment.
CANDIDATE_LABELS = [
    '첨가물',  # additives
    '성분',    # ingredients
    '위험',    # risk
    '아기',    # baby
    '안심',    # reassurance
    '칼로리',  # calories
    '중위험',  # medium risk
    '아이',    # child
]

In [18]:
# Choose which input file to model.
# in_docs is parallel to input_files, so the corpus is looked up by file name
# instead of by position: the order of a directory listing is not guaranteed,
# and a positional index could silently pick a different file.
# Files seen when loading: umsun_A_2_sborder.csv (A), umsun_B_sborder.csv (B),
# umsun_clean_C.csv (C).
TARGET_FILE = 'umsun_clean_C.csv'

# Raises ValueError if TARGET_FILE was not among the loaded files.
docs = in_docs[input_files.index(TARGET_FILE)]

In [23]:
# Assemble the BERTopic pipeline from the components prepared above.
# Options tried earlier and currently disabled:
#   top_n_words=4
#   representation_model=ko_classifier
topic_model = BERTopic(
    # Pipeline components.
    embedding_model=embedding_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer,
    representation_model=KeyBERTInspired(),
    # Zero-shot assignment against the predefined labels.
    zeroshot_topic_list=CANDIDATE_LABELS,
    zeroshot_min_similarity=.5,
    # Number of topics representing the documents ("auto" leaves the choice
    # to BERTopic).
    nr_topics="auto",
    calculate_probabilities=True,
)

# Fit on the selected corpus, then show the first 30 rows of the topic overview.
topics, probs = topic_model.fit_transform(docs)
topic_model.get_topic_info().head(30)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,1003,첨가물,"[첨가물, 첨가, 나트륨, 성분, 먹었는데, 우유, 식품, 영양, 먹으면, 맛있]","[위험 첨가물이 없어서 좋아요., 첨가물이 없어서 좋아요., 첨가물이 없어서 좋아요.]"
1,1,624,성분,"[성분, 상큼, 크림, 향이, 생크림, 향료, 첨가물, 맛도, 젤리, 좋네요.]","[노브랜드 제품이 생각보다 괜찮은 것 같아요 성분도 마음에 드네요., 맛도 성분도 ..."
2,2,135,아기,"[아기, 아기들, 아가, 7개월, 아이, 임신, 먹어요, 먹어요., 먹네요, 있어요.]","[우리 아기가 너무 좋아해요., 아기가 좋아해요., 아기가 좋아해요.]"
3,3,125,안심,"[좋네요., 괜찮, 만족, 좋은, 안심, 다행, 무난, 맘에, 감사, 이네요.]","[괜찮네요., 괜찮네요., 괜찮네요.]"
4,4,38,중위험,"[중위험, 저위험, 중위험군, 고위험, 대비, 무난, 안, 가성비, 하나, 들어]","[중위험., 중위험., 중위험.]"
5,5,32,칼로리,"[칼로리, 다이어트, 섭취, 나트륨, 함량, 맛있어, 다이어터, 감미료, 만해요.,...",[그냥 맛있어서 칼로리는 전혀 안 보였고 생각도 못 했네요 칼로리가 장난 아니네요 ...
6,6,22,아이,"[귀여워요., 귀여운, 아이, 아가, 좋더라고요., 아이들, 부드럽고, 울아가, 너...","[아이보다 제가 더 좋아해요., 아이가 좋아해요., 아이가 좋아해요.]"
7,7,4,위험,"[위험, 중위험군, 저위험, 있어서, 있군요., 회산데, 하하., 아, 있네요, 하나]","[위험분이 있군요., 위험분이 하나 있네요 하하., 거버 유명한 회산데 중위험군 조..."
8,8,3074,-1_먹어요_첨가물_먹었는데_맛있어,"[먹어요, 첨가물, 먹었는데, 맛있어, 맛있, 맛이, 식품, 맛, 맛도, 먹으면]",[다른 과자들에 비해 첨가물이 적어서 많이 먹고 있어요 바삭바삭한 식감이 좋고 짭조...
9,9,1738,0_맛있어_먹어요_맛있_먹어요.,"[맛있어, 먹어요, 맛있, 먹어요., 맛도, 초콜릿, 맛, 좋네요., 맛이, 같아요]",[첨가물이 없는 과자라길래 한 번 구매해서 먹었는데 너무 달지도 않고 바삭바삭 고소...


In [None]:
# Build the topic hierarchy from the fitted model and the documents it was fitted on.
hierarchical_topics = topic_model.hierarchical_topics(docs)
# Turn the hierarchy into a tree representation and print it.
tree = topic_model.get_topic_tree(hierarchical_topics)
print(tree)

In [None]:
# Coherence
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

# Build one pseudo-document per topic for the coherence evaluation.
# Documents are joined with a space: joining with an empty string would glue
# the last token of one document to the first token of the next.
documents = pd.DataFrame({"Document": docs, "ID": range(len(docs)), "Topic": topics})
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

# Tokenize with the analyzer of the fitted model's vectorizer. The notebook
# globals `vectorizer` and `words` are deliberately not rebound here so that
# earlier cells can be re-run after this one.
analyzer = topic_model.vectorizer_model.build_analyzer()

# Extract features for the topic coherence evaluation.
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]

# Collect, for each real topic (outlier topic -1 excluded), the representation
# words that also exist in the dictionary. Iterating over the topic ids that
# actually occur avoids assuming they are contiguous, and topics for which
# get_topic returns nothing are skipped instead of raising.
topic_words = []
for topic_id in sorted(set(topics) - {-1}):
    topic_terms = topic_model.get_topic(topic_id)
    if not topic_terms:
        continue
    known_terms = [term for term, _ in topic_terms if term in dictionary.token2id]
    if known_terms:
        topic_words.append(known_terms)

# Evaluate coherence (c_v).
coherence_model = CoherenceModel(topics=topic_words, texts=tokens, corpus=corpus, dictionary=dictionary, coherence='c_v')
coherence = coherence_model.get_coherence()
print("Coherence Score: ", coherence)