BERT를 이용한 키워드 추출 : 키버트(KeyBERT)

In [1]:
import pandas as pd
import re
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import itertools

In [2]:
# Load the raw corpus. Each non-blank line of the file is expected to hold
# one document in the form "<name>:<content>".
file_path = 'english_data_yt.txt'
data = []
with open(file_path, 'r', encoding='utf-8') as file:
    for line_number, line in enumerate(file, start=1):
        # A blank line (e.g. a trailing newline at the end of the file)
        # carries no document; skip it instead of failing on the unpacking.
        if not line.strip():
            continue
        # A non-blank line without a colon is malformed; fail with a message
        # that points at the offending line rather than an unpacking error.
        if ':' not in line:
            raise ValueError(
                f"{file_path}, line {line_number}: expected '<name>:<content>'"
            )
        # Split on the first colon only so colons inside the content survive.
        name, content = line.split(':', 1)
        data.append({'name': name.strip(), 'content': content.strip()})


In [3]:
# Build the document table, discard the rows labelled 7, 60 and 74
# (the reason for excluding them is not recorded here), and renumber the
# remaining rows from 0.
df = pd.DataFrame(data).drop([7, 60, 74]).reset_index(drop=True)

In [4]:
# Character filter used for text cleaning: any run of characters that is NOT
# a space, '+', '|', an ASCII letter, or Hangul (jamo ㄱ-ㅣ, syllables 가-힣)
# matches and can be deleted with `compile.sub("", text)`.
# NOTE: inside a character class '|' is a literal pipe, not alternation, so
# pipe characters are kept by this filter.
# The pattern is a raw string so that "\+" is not treated as an (invalid)
# string escape sequence; the compiled pattern is unchanged.
# NOTE: the name shadows the builtin `compile`; it is kept because other
# cells refer to it by this name.
compile = re.compile(r"[^ \+|a-z|A-Z|ㄱ-ㅣ가-힣]+")

# Stop-word list (lower-case English function words and contraction pieces).
# NOTE: entries containing an apostrophe (e.g. "don't") cannot match text that
# has already been passed through the filter above, because the filter
# deletes apostrophes.
in_person_stopwords = [
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
    'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself',
    'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
    'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those',
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
    'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
    'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
    'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once',
    'here', 'there', 'when', 'where', 'why', 'how',
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
    'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
    's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now',
    'd', 'll', 'm', 'o', 're', 've', 'y',
    'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't",
    'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't",
    'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't",
    'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't",
    'won', "won't", 'wouldn', "wouldn't",
]


In [5]:
# Second name for the stop-word list; this binds the same list object, it
# does not make a copy.
stop_words = in_person_stopwords
# Bare expression: displays the list as the notebook cell's output.
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [6]:
def remove_stopwords_from_list(content_list, stopwords, ignore_case=False):
    """Remove stop words from every sentence in `content_list`.

    Each sentence is split on whitespace, the words found in `stopwords` are
    dropped, and the remaining words are re-joined with single spaces (so
    runs of whitespace are collapsed as a side effect).

    Args:
        content_list: Iterable of strings to filter.
        stopwords: Iterable of words to remove.
        ignore_case: When True, words are compared to the stop words
            case-insensitively (e.g. "The" is removed by "the"). The default
            False keeps the exact, case-sensitive comparison.

    Returns:
        A list of filtered strings, one per input sentence, in input order.
    """
    # Build the lookup set once: membership tests against a set are O(1),
    # whereas testing every word against a list rescans the whole list.
    if ignore_case:
        stopword_set = {stopword.lower() for stopword in stopwords}
    else:
        stopword_set = set(stopwords)

    filtered_list = []
    for sentence in content_list:
        kept_words = [
            word
            for word in sentence.split()
            if (word.lower() if ignore_case else word) not in stopword_set
        ]
        filtered_list.append(" ".join(kept_words))
    return filtered_list

In [7]:
# Delete every character the filter pattern rejects from each document ...
line = df['content'].apply(lambda text: compile.sub("", text))
# ... then drop the stop words from the cleaned documents.
processed_line = remove_stopwords_from_list(
    content_list=line,
    stopwords=in_person_stopwords,
)

In [8]:
# Candidate keywords: the unigram vocabulary of the cleaned documents,
# limited to at most 1024 features.
count = CountVectorizer(ngram_range=(1, 1), max_features=1024).fit(processed_line)
candidates = count.get_feature_names_out()

# Keyword extraction with a BERT sentence encoder: embed the documents and
# the candidate keywords with the same model.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
doc_embeddings = model.encode(processed_line)
candidate_embeddings = model.encode(candidates)

In [9]:
def max_sum_sim(doc_embedding, candidate_embeddings, words, top_n, nr_candidates):
    """Select `top_n` keywords with the Max Sum Similarity strategy.

    The `nr_candidates` words most similar to the document are shortlisted;
    among all `top_n`-sized subsets of that shortlist, the one whose members
    are least similar to each other (smallest sum of pairwise cosine
    similarities) is returned.

    Args:
        doc_embedding: Embedding of the document. Only row 0 of the
            document-to-word similarity matrix is used, so a single-row
            2-D array is expected.
        candidate_embeddings: One embedding row per entry of `words`.
        words: Candidate keywords, aligned with `candidate_embeddings`.
        top_n: Number of keywords to return.
        nr_candidates: Size of the shortlist; must be at least `top_n`.

    Returns:
        A list of `top_n` entries of `words`.

    Raises:
        ValueError: If `top_n` is negative, or if fewer than `top_n`
            candidates are available to choose from.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")
    if top_n == 0:
        return []
    if nr_candidates < top_n:
        raise ValueError(
            "nr_candidates must be greater than or equal to top_n"
        )

    # Similarity between the document and every candidate word
    distances = cosine_similarity(doc_embedding, candidate_embeddings)

    # Pairwise similarity between the candidate words
    distances_candidates = cosine_similarity(candidate_embeddings,
                                             candidate_embeddings)

    # Shortlist the nr_candidates words most similar to the document
    # (argsort is ascending, so the best matches are at the end).
    words_idx = list(distances.argsort()[0][-nr_candidates:])
    if len(words_idx) < top_n:
        raise ValueError(
            "top_n exceeds the number of available candidate words"
        )
    # Look the words up in the `words` argument, not in a module-level global.
    words_vals = [words[index] for index in words_idx]
    distances_candidates = distances_candidates[np.ix_(words_idx, words_idx)]

    # Exhaustively search for the combination of top_n shortlisted words
    # that are least similar to each other.
    min_sim = np.inf
    candidate = None
    for combination in itertools.combinations(range(len(words_idx)), top_n):
        sim = sum(distances_candidates[i][j] for i in combination for j in combination if i != j)
        if sim < min_sim:
            candidate = combination
            min_sim = sim

    return [words_vals[idx] for idx in candidate]

In [10]:
def mmr(doc_embedding, candidate_embeddings, words, top_n, diversity):
    """Select up to `top_n` keywords with Maximal Marginal Relevance (MMR).

    The word most similar to the document is picked first. Each further pick
    maximizes
    ``(1 - diversity) * sim(word, doc) - diversity * max sim(word, picked)``,
    trading relevance to the document against redundancy with the keywords
    already chosen.

    Args:
        doc_embedding: Embedding of the document (assumed to be a single-row
            2-D array; the first pick uses a flat argmax over the
            word-to-document similarities).
        candidate_embeddings: One embedding row per entry of `words`.
        words: Candidate keywords, aligned with `candidate_embeddings`.
        top_n: Maximum number of keywords to return.
        diversity: Weight of the redundancy penalty in the score above.

    Returns:
        A list of at most `top_n` entries of `words`, in selection order.
        Empty when `top_n <= 0` or `words` is empty; shorter than `top_n`
        when there are fewer than `top_n` candidate words.
    """
    # Nothing to select: avoid returning a keyword when none was requested
    # and avoid calling argmax on an empty array.
    if top_n <= 0 or len(words) == 0:
        return []

    # Similarity of every candidate word to the document
    word_doc_similarity = cosine_similarity(candidate_embeddings, doc_embedding)

    # Pairwise similarity between the candidate words
    word_similarity = cosine_similarity(candidate_embeddings)

    # Start with the word most similar to the document,
    # e.g. keywords_idx = [2] if word 2 scores highest.
    keywords_idx = [np.argmax(word_doc_similarity)]

    # Indices of all remaining (not yet selected) words,
    # e.g. candidates_idx = [0, 1, 3, 4, ...] in the example above.
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    # The best keyword is already chosen, so at most top_n - 1 more picks are
    # needed; never iterate more often than there are remaining candidates.
    for _ in range(min(top_n - 1, len(candidates_idx))):
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

        # MMR score of every remaining candidate
        mmr_scores = (1 - diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
        mmr_idx = candidates_idx[np.argmax(mmr_scores)]

        # Move the winner from the candidates to the selected keywords
        keywords_idx.append(mmr_idx)
        candidates_idx.remove(mmr_idx)

    return [words[idx] for idx in keywords_idx]

In [12]:
# Extract keywords for every document.
# The candidate vocabulary is identical for all documents, so it is embedded
# once here instead of being re-encoded on every loop iteration.
candidate_embeddings = model.encode(candidates)

name_to_keywords = {}
for index, row in df.iterrows():
    name = row['name']
    # processed_line is indexed by the row label, which relies on df having
    # a 0..n-1 index.
    content = processed_line[index]
    doc_embedding = model.encode([content])
    keywords = max_sum_sim(doc_embedding, candidate_embeddings, candidates, top_n=5, nr_candidates=10)
    # NOTE: keyed by document name, so documents sharing a name would all end
    # up with the keywords of the last one processed.
    name_to_keywords[name] = keywords

# Store the result as a new column
df['keywords'] = df['name'].map(name_to_keywords)

# Print the data frame
print(df[['name', 'keywords']])

                      name                                           keywords
0   astrology alchemy0.txt  [philosophy, spacecharge, beans, google, astro...
1   astrology alchemy1.txt  [chemistry, astrological, fermi, spacecharge, ...
2   astrology alchemy2.txt  [classical, recombination, spacecharge, astrol...
3   astrology alchemy3.txt  [astrologers, com, recombination, spacecharge,...
4   astrology alchemy4.txt  [library, spacecharge, academic, recombination...
..                     ...                                                ...
92      semiconductor5.txt     [berlin, cambridge, ieee, spacecharge, google]
93      semiconductor6.txt      [classical, beans, spacecharge, june, google]
94      semiconductor7.txt     [ieee, academic, spacecharge, physics, google]
95      semiconductor8.txt  [spectroscopy, astrologers, fermi, recombinati...
96      semiconductor9.txt  [recombination, ieee, fermi, chatgpt, microscopy]

[97 rows x 2 columns]


----

In [None]:
import pandas as pd
import re
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import itertools

# Load the raw corpus. Each non-blank line of the file is expected to hold
# one document in the form "<name>:<content>".
file_path = 'english_data_yt.txt'
data = []
with open(file_path, 'r', encoding='utf-8') as file:
    for line_number, line in enumerate(file, start=1):
        # A blank line (e.g. a trailing newline at the end of the file)
        # carries no document; skip it instead of failing on the unpacking.
        if not line.strip():
            continue
        # A non-blank line without a colon is malformed; fail with a message
        # that points at the offending line rather than an unpacking error.
        if ':' not in line:
            raise ValueError(
                f"{file_path}, line {line_number}: expected '<name>:<content>'"
            )
        # Split on the first colon only so colons inside the content survive.
        name, content = line.split(':', 1)
        data.append({'name': name.strip(), 'content': content.strip()})

# Build the document table, discard the rows labelled 7, 60 and 74
# (the reason for excluding them is not recorded here), and renumber the
# remaining rows from 0.
df = pd.DataFrame(data).drop([7, 60, 74]).reset_index(drop=True)

# Character filter used for text cleaning: any run of characters that is NOT
# a space, '+', '|', an ASCII letter, or Hangul (jamo ㄱ-ㅣ, syllables 가-힣)
# matches and can be deleted with `compile.sub("", text)`.
# NOTE: inside a character class '|' is a literal pipe, not alternation, so
# pipe characters are kept by this filter.
# The pattern is a raw string so that "\+" is not treated as an (invalid)
# string escape sequence; the compiled pattern is unchanged.
# NOTE: the name shadows the builtin `compile`; it is kept because the rest
# of the script refers to it by this name.
compile = re.compile(r"[^ \+|a-z|A-Z|ㄱ-ㅣ가-힣]+")

# Stop-word list (lower-case English function words and contraction pieces).
# The literal `[...]` placeholder evaluated to a list holding only Ellipsis,
# which removes no stop words at all, so the full list is spelled out here.
in_person_stopwords = [
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
    'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself',
    'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
    'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those',
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
    'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
    'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
    'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once',
    'here', 'there', 'when', 'where', 'why', 'how',
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
    'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
    's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now',
    'd', 'll', 'm', 'o', 're', 've', 'y',
    'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't",
    'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't",
    'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't",
    'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't",
    'won', "won't", 'wouldn', "wouldn't",
]

def remove_stopwords_from_list(content_list, stopwords, ignore_case=False):
    """Remove stop words from every sentence in `content_list`.

    Each sentence is split on whitespace, the words found in `stopwords` are
    dropped, and the remaining words are re-joined with single spaces (so
    runs of whitespace are collapsed as a side effect).

    Args:
        content_list: Iterable of strings to filter.
        stopwords: Iterable of words to remove.
        ignore_case: When True, words are compared to the stop words
            case-insensitively (e.g. "The" is removed by "the"). The default
            False keeps the exact, case-sensitive comparison.

    Returns:
        A list of filtered strings, one per input sentence, in input order.
    """
    # Build the lookup set once: membership tests against a set are O(1),
    # whereas testing every word against a list rescans the whole list.
    if ignore_case:
        stopword_set = {stopword.lower() for stopword in stopwords}
    else:
        stopword_set = set(stopwords)

    filtered_list = []
    for sentence in content_list:
        kept_words = [
            word
            for word in sentence.split()
            if (word.lower() if ignore_case else word) not in stopword_set
        ]
        filtered_list.append(" ".join(kept_words))
    return filtered_list

# Delete every character the filter pattern rejects from each document,
# then drop the stop words from the cleaned documents.
line = df['content'].apply(lambda text: compile.sub("", text))
processed_line = remove_stopwords_from_list(
    content_list=line,
    stopwords=in_person_stopwords,
)

# Candidate keywords: the unigram vocabulary of the cleaned documents,
# limited to at most 1024 features.
count = CountVectorizer(ngram_range=(1, 1), max_features=1024).fit(processed_line)
candidates = count.get_feature_names_out()

# Keyword extraction with a BERT sentence encoder: embed the documents and
# the candidate keywords with the same model.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
doc_embeddings = model.encode(processed_line)
candidate_embeddings = model.encode(candidates)

def max_sum_sim(doc_embedding, candidate_embeddings, words, top_n, nr_candidates):
    """Select `top_n` keywords with the Max Sum Similarity strategy.

    The `nr_candidates` words most similar to the document are shortlisted;
    among all `top_n`-sized subsets of that shortlist, the one whose members
    are least similar to each other (smallest sum of pairwise cosine
    similarities) is returned.

    Args:
        doc_embedding: Embedding of the document. Only row 0 of the
            document-to-word similarity matrix is used, so a single-row
            2-D array is expected.
        candidate_embeddings: One embedding row per entry of `words`.
        words: Candidate keywords, aligned with `candidate_embeddings`.
        top_n: Number of keywords to return.
        nr_candidates: Size of the shortlist; must be at least `top_n`.

    Returns:
        A list of `top_n` entries of `words`.

    Raises:
        ValueError: If `top_n` is negative, or if fewer than `top_n`
            candidates are available to choose from.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")
    if top_n == 0:
        return []
    if nr_candidates < top_n:
        raise ValueError(
            "nr_candidates must be greater than or equal to top_n"
        )

    # Similarity between the document and every candidate word
    distances = cosine_similarity(doc_embedding, candidate_embeddings)

    # Pairwise similarity between the candidate words
    distances_candidates = cosine_similarity(candidate_embeddings,
                                             candidate_embeddings)

    # Shortlist the nr_candidates words most similar to the document
    # (argsort is ascending, so the best matches are at the end).
    words_idx = list(distances.argsort()[0][-nr_candidates:])
    if len(words_idx) < top_n:
        raise ValueError(
            "top_n exceeds the number of available candidate words"
        )
    # Look the words up in the `words` argument, not in a module-level global.
    words_vals = [words[index] for index in words_idx]
    distances_candidates = distances_candidates[np.ix_(words_idx, words_idx)]

    # Exhaustively search for the combination of top_n shortlisted words
    # that are least similar to each other.
    min_sim = np.inf
    candidate = None
    for combination in itertools.combinations(range(len(words_idx)), top_n):
        sim = sum(distances_candidates[i][j] for i in combination for j in combination if i != j)
        if sim < min_sim:
            candidate = combination
            min_sim = sim

    return [words_vals[idx] for idx in candidate]


def mmr(doc_embedding, candidate_embeddings, words, top_n, diversity):
    """Select up to `top_n` keywords with Maximal Marginal Relevance (MMR).

    The word most similar to the document is picked first. Each further pick
    maximizes
    ``(1 - diversity) * sim(word, doc) - diversity * max sim(word, picked)``,
    trading relevance to the document against redundancy with the keywords
    already chosen.

    Args:
        doc_embedding: Embedding of the document (assumed to be a single-row
            2-D array; the first pick uses a flat argmax over the
            word-to-document similarities).
        candidate_embeddings: One embedding row per entry of `words`.
        words: Candidate keywords, aligned with `candidate_embeddings`.
        top_n: Maximum number of keywords to return.
        diversity: Weight of the redundancy penalty in the score above.

    Returns:
        A list of at most `top_n` entries of `words`, in selection order.
        Empty when `top_n <= 0` or `words` is empty; shorter than `top_n`
        when there are fewer than `top_n` candidate words.
    """
    # Nothing to select: avoid returning a keyword when none was requested
    # and avoid calling argmax on an empty array.
    if top_n <= 0 or len(words) == 0:
        return []

    # Similarity of every candidate word to the document
    word_doc_similarity = cosine_similarity(candidate_embeddings, doc_embedding)

    # Pairwise similarity between the candidate words
    word_similarity = cosine_similarity(candidate_embeddings)

    # Start with the word most similar to the document,
    # e.g. keywords_idx = [2] if word 2 scores highest.
    keywords_idx = [np.argmax(word_doc_similarity)]

    # Indices of all remaining (not yet selected) words,
    # e.g. candidates_idx = [0, 1, 3, 4, ...] in the example above.
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    # The best keyword is already chosen, so at most top_n - 1 more picks are
    # needed; never iterate more often than there are remaining candidates.
    for _ in range(min(top_n - 1, len(candidates_idx))):
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

        # MMR score of every remaining candidate
        mmr_scores = (1 - diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
        mmr_idx = candidates_idx[np.argmax(mmr_scores)]

        # Move the winner from the candidates to the selected keywords
        keywords_idx.append(mmr_idx)
        candidates_idx.remove(mmr_idx)

    return [words[idx] for idx in keywords_idx]

# Extract keywords for every document.
# The candidate vocabulary is identical for all documents, so it is embedded
# once here instead of being re-encoded on every loop iteration.
candidate_embeddings = model.encode(candidates)

name_to_keywords = {}
for index, row in df.iterrows():
    name = row['name']
    # processed_line is indexed by the row label, which relies on df having
    # a 0..n-1 index.
    content = processed_line[index]
    doc_embedding = model.encode([content])
    keywords = max_sum_sim(doc_embedding, candidate_embeddings, candidates, top_n=5, nr_candidates=10)
    # NOTE: keyed by document name, so documents sharing a name would all end
    # up with the keywords of the last one processed.
    name_to_keywords[name] = keywords

# Store the result as a new column
df['keywords'] = df['name'].map(name_to_keywords)

# Print the data frame
print(df[['name', 'keywords']])
