#### 필요 모듈 install and import

In [None]:
!pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 1.7 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 23.0 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 63.9 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 54.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 47.4 MB/s 
Building wheels for collected pa

In [None]:
import numpy as np
import itertools
import pickle

#from konlpy.tag import Okt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import pandas as pd

In [None]:
# Mount Google Drive at /content/drive; the data files read by the following
# cells live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the news CSV into a DataFrame. The 'utf-8-sig' codec also strips a
# leading byte-order mark if the file has one.
data = pd.read_csv("/content/drive/MyDrive/SKKU_ML/data/news_data.csv", encoding='utf-8-sig')

In [None]:
# Display the loaded DataFrame as the cell output.
data

In [None]:
# Load the pre-tokenized news data, one pickle file per variant.
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only load files from a trusted source.
data_list = ['nv', 'cs', 'cs_sum']
news_tokens = {}
for i in data_list:
    with open(f"/content/drive/MyDrive/SKKU_ML/data/news_{i}.pickle","rb") as fr:
        news_tokens[i] = pickle.load(fr)

# Bind the names explicitly instead of injecting them through globals(), so
# the variables used by later cells are visible to readers and linters.
news_nv = news_tokens['nv']
news_cs = news_tokens['cs']
news_cs_sum = news_tokens['cs_sum']

news_nv: 명사, 동사, 형용사로 토큰화되어 space(공백)로 구분된 데이터

news_cs: 위와 같지만 comma로 구분된 데이터

news_cs_sum: 위 데이터를 하나의 doc으로 합친 데이터

#### 통합 문서를 embedding 하기 위해 join

In [None]:
# Merge every entry of news_nv into one space-separated string and keep it as
# a single-element list, i.e. a corpus consisting of one combined document.
news_nv_join = [' '.join(news_nv)]

In [None]:
# Display the one-element list holding the joined token string.
news_nv_join

#### 통합문서의 임베딩을 위해 docs에 data.content를 더하고 join

In [None]:
# Collect every article body from the 'content' column as a plain list.
# Reading the column directly (rather than looking up labels 0..n-1 one by
# one) keeps the row order and also works when the DataFrame index is not
# the default 0..n-1 range.
docs = data['content'].tolist()

In [None]:
# Concatenate all article bodies into one space-separated string, wrapped in
# a single-element list so it is treated as one combined document.
docs_join = [' '.join(docs)]

#### KeyBERT candidate

In [None]:
# Keyword candidates are word n-grams of length 2 (bigrams) up to 3 (trigrams).
n_gram_range = (2,3)

In [None]:
# Fit a CountVectorizer on the joined token document to learn its n-gram
# vocabulary for the configured n-gram range.
count = CountVectorizer(ngram_range = n_gram_range).fit(news_nv_join)

In [None]:
# The n-grams in the fitted vocabulary are the keyword candidates.
candidates = count.get_feature_names_out()

In [None]:
# Report how many candidate n-grams (bigrams and trigrams) were extracted.
print('bigram과 trigram 개수:', len(candidates))

In [None]:
# Show the first five candidates. The vocabulary mixes bigrams and trigrams,
# so the label says "candidates" rather than "trigram".
print('candidates 5 : ', candidates[:5])

#### 다국어 sbert load

In [None]:
# Load the multilingual sentence-embedding model used to embed both the
# document and the keyword candidates in the cells that follow.
model = SentenceTransformer('sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens')

In [None]:
# Embed the combined document. docs_join is a one-element list, so this is
# presumably a single embedding row of shape (1, dim) -- TODO confirm.
doc_embedding = model.encode(docs_join)

In [None]:
# Embed every candidate n-gram with the same model as the document.
candidate_embeddings = model.encode(candidates)

### Cosine Similarity 기준 키워드 추출

In [None]:
# Number of keywords to keep.
top_n = 5
# Cosine similarity between the document and every candidate n-gram.
distances = cosine_similarity(doc_embedding, candidate_embeddings)
# argsort is ascending, so the last top_n positions of the first row are the
# most similar candidates (listed from less to more similar).
keywords = [candidates[rank] for rank in np.argsort(distances)[0][-top_n:]]

In [None]:
# Display the keywords selected by plain cosine similarity.
keywords

### Max Sum Similarity 알고리즘을 통한 키워드 추출

In [None]:
def max_sum_sim(doc_embedding, candidate_embeddings, words, top_n, nr_candidates):
    """Pick ``top_n`` mutually dissimilar keywords from the best candidates.

    The ``nr_candidates`` candidates most similar to the document form a
    shortlist. Every ``top_n``-sized combination of that shortlist is scored
    by the sum of its pairwise cosine similarities, and the combination with
    the smallest sum (the most diverse one) is returned.

    Args:
        doc_embedding: 2-D array holding the document embedding; only its
            first row is used to rank the candidates.
        candidate_embeddings: 2-D array with one embedding row per entry of
            ``words``.
        words: Candidate phrases, indexable by integer position.
        top_n: Number of keywords to return. Clamped to the shortlist size.
        nr_candidates: Size of the shortlist taken from the candidates most
            similar to the document.

    Returns:
        A list with the selected entries of ``words``; empty when ``top_n``
        or ``nr_candidates`` is not positive.

    Note:
        The search is exhaustive over all combinations of the shortlist, so
        the cost grows combinatorially with ``nr_candidates``.
    """
    # Nothing to select. This also avoids the ``[-0:]`` slice, which would
    # silently take every candidate instead of none.
    if top_n <= 0 or nr_candidates <= 0:
        return []

    # Similarity between the document and each candidate.
    distances = cosine_similarity(doc_embedding, candidate_embeddings)

    # Similarity between every pair of candidates.
    distances_candidates = cosine_similarity(candidate_embeddings,
                                            candidate_embeddings)

    # Shortlist the nr_candidates candidates closest to the document
    # (argsort is ascending, so they sit at the end of the first row).
    words_idx = list(distances.argsort()[0][-nr_candidates:])
    # Look phrases up in the ``words`` argument, not in a module-level global.
    words_vals = [words[index] for index in words_idx]
    distances_candidates = distances_candidates[np.ix_(words_idx, words_idx)]

    # A combination cannot be larger than the shortlist it is drawn from.
    top_n = min(top_n, len(words_idx))

    def _pairwise_similarity(combination):
        # Sum of similarities over all ordered pairs of distinct members.
        return sum(distances_candidates[i][j]
                   for i in combination for j in combination if i != j)

    # min() keeps the first combination with the lowest score, matching a
    # strict "<" scan, and always yields a combination.
    best = min(itertools.combinations(range(len(words_idx)), top_n),
               key=_pairwise_similarity)

    return [words_vals[idx] for idx in best]

#### nr_candidates 10

In [None]:
# Choose 5 keywords from a shortlist of 10 candidates.
max_sum_sim(doc_embedding, candidate_embeddings, candidates, top_n=5, nr_candidates=10)

#### nr_candidates 30
#### nr_candidates 를 높이면 더 다양한 키워드 추출 가능

In [None]:
# Choose 5 keywords from a larger shortlist of 30 candidates.
max_sum_sim(doc_embedding, candidate_embeddings, candidates, top_n=5, nr_candidates=30)

### Maximal Marginal Relevance 알고리즘을 통한 키워드 추출

In [None]:
def mmr(doc_embedding, candidate_embeddings, words, top_n, diversity):
    """Select keywords with Maximal Marginal Relevance (MMR).

    Starts from the candidate most similar to the document and then adds,
    one at a time, the candidate that maximizes

        (1 - diversity) * sim(candidate, document)
            - diversity * max(sim(candidate, already selected keyword))

    Args:
        doc_embedding: 2-D array holding the document embedding.
            NOTE(review): the argmax calls work on the flattened similarity
            matrix, so this assumes a single document row -- confirm before
            passing several documents.
        candidate_embeddings: 2-D array with one embedding row per entry of
            ``words``.
        words: Candidate phrases, indexable by integer position.
        top_n: Number of keywords to return. Clamped to ``len(words)``.
        diversity: Weight of the redundancy penalty; 0 ranks purely by
            similarity to the document, larger values favour keywords that
            differ from those already chosen.

    Returns:
        A list with the selected entries of ``words`` in selection order;
        empty when ``top_n`` is not positive or there are no candidates.
    """
    # Never ask for more keywords than there are candidates; otherwise the
    # loop below would run argmax on an empty array.
    top_n = min(top_n, len(words))
    if top_n <= 0:
        return []

    # Similarity between each candidate and the document.
    word_doc_similarity = cosine_similarity(candidate_embeddings, doc_embedding)

    # Similarity between every pair of candidates.
    word_similarity = cosine_similarity(candidate_embeddings)

    # Index of the candidate most similar to the document.
    keywords_idx = [int(np.argmax(word_doc_similarity))]

    # Indices of the candidates that have not been selected yet.
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    # The best keyword is already chosen, so pick top_n - 1 more.
    for _ in range(top_n - 1):
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        # For each remaining candidate, its similarity to the closest keyword
        # selected so far.
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

        # MMR score: relevance to the document minus the redundancy penalty.
        mmr_scores = (1-diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
        mmr_idx = candidates_idx[np.argmax(mmr_scores)]

        # Move the winner from the remaining pool to the selected keywords.
        keywords_idx.append(mmr_idx)
        candidates_idx.remove(mmr_idx)

    return [words[idx] for idx in keywords_idx]

#### diversity 0.2

In [None]:
# Select 5 keywords with a low diversity weight (0.2).
mmr(doc_embedding, candidate_embeddings, candidates, top_n=5, diversity=0.2)

#### diversity 0.7
#### diversity 를 높이면 다양한 키워드 추출 가능

In [None]:
# Select 5 keywords with a high diversity weight (0.7).
mmr(doc_embedding, candidate_embeddings, candidates, top_n=5, diversity=0.7)