In [1]:
import pandas as pd
import re
from transformers import AutoTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import string
from sklearn.cluster import KMeans
from collections import defaultdict
from kiwipiepy import Kiwi

## 토크나이저 정의

In [2]:
tokenizer = AutoTokenizer.from_pretrained('monologg/koelectra-small-v3-discriminator')
# tokenizer = AutoTokenizer.from_pretrained('klue/bert-base')

## 리뷰 불러오기 & 전처리

In [5]:
def clean_text(sent, remove_tag = True):
    """
    특수 문자, 문장 부호, 조건 태그 제거
    """
    if remove_tag:
        sent = re.sub("[^가-힣ㄱ-ㅎㅏ-ㅣ\\s0-9a-zA-Z]", " ", sent)
    else:
        sent = re.sub("[^가-힣ㄱ-ㅎㅏ-ㅣ\\s0-9a-zA-Z<>]", " ", sent)
    sent = re.sub("[ㄱ-ㅎㅏ-ㅣ]+", "", sent) # 초성체 제거
    sent = " ".join(sent.split()) # 공백 최소화
    sent = sent.strip()
    return sent

In [17]:
reviews = pd.read_csv("/opt/ml/input/data/reviews_ver2_1.csv")["context"]
reviews = [clean_text(text) for text in reviews]
len(reviews)

200

In [None]:
reviews = pd.read_csv("./review_떡볶이_ver2.1.csv")['review_content']

reviews = [re.sub(r"<[^>]+>\s+(?=<)|<[^>]+>", "", text) for text in reviews]
reviews = [re.sub(r"[^가-힣a-zA-Z0-9\n\s]", "", text) for text in reviews]

reviews = pd.read_csv("./review_떡볶이_ver2_pre.csv")['review_content']
reviews = [re.sub(r"<[^>]+>\s+(?=<)|<[^>]+>", "", text) for text in reviews]
reviews = [re.sub(r"[^가-힣a-zA-Z0-9\n\s]", "", text).strip() for text in reviews]

In [18]:
def filter_corpus(corpus):
    corpus = [text for text in corpus if text.find("배송") == -1]
    corpus = [text for text in corpus if text.find("도움") == -1]
    corpus = [text for text in corpus if len(text) > 10]
    corpus = [text for text in corpus if len(text) < 200]
    corpus = [text for text in corpus if text.find("년") == -1]
    corpus = [text for text in corpus if text.find("ml") == -1]
    corpus = [text for text in corpus if text.find("날짜") == -1]
    corpus = [text for text in corpus if text.find("22") == -1]
    corpus = [text for text in corpus if text.find("23") == -1]
    corpus = [text for text in corpus if text.find("기한") == -1]
    corpus = [text for text in corpus if text.find("미리") == -1]
    corpus = [text for text in corpus if text.find("감사") == -1]
    return corpus

## 문장 단위로 분리하기

In [26]:
kiwi = Kiwi()

split_texts = []
for review in reviews:
    sents = kiwi.split_into_sents(review)
    sents = [sent.text for sent in sents]
    split_texts.append(sents)

corpus = []
for idx, text in enumerate(split_texts):
    for t in text:
        corpus.append(str(idx) + " " + t)

pre_len = len(corpus)
corpus = filter_corpus(corpus)

## 클러스터링 진행하기

In [27]:
text_id = 1
clusters = ["평가", "조리"]
# data = clusters + filter_corpus(split_texts[text_id])
data = clusters + corpus
n = len(clusters)

# Vectorizer
remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)
tokenizer_func = lambda x: tokenizer.tokenize(x.translate(remove_punct_dict))
vectorizer = TfidfVectorizer(tokenizer=tokenizer_func, ngram_range=(1,2))

# Feature vectorize
feature_vect = vectorizer.fit_transform(data)

# 임의 클러스터로 클러스터 진행
km_cluster = KMeans(n_clusters=n, max_iter=10000, random_state=0)
km_cluster.fit(feature_vect[:n])

# 임의 클러스터로 새로운 클러스터링 초기화
kmeans_new = KMeans(init=km_cluster.cluster_centers_, n_clusters=n)
kmeans_new.fit(feature_vect[n:])

result = [[] for _ in range(n)]
for idx, label in enumerate(km_cluster.labels_):
    result[label].append(clusters[idx])

for idx, label in enumerate(kmeans_new.labels_):
    result[label].append(data[idx+n])

result_dict = {}
for r in result:
    r_dict = defaultdict(list)
    for text in r[1:]:
        index = text.split(" ")[0]
        t =" ".join(text.split(" ")[1:])
        r_dict[index].append(t)
    # result_dict[r[0]] = r[1:]
    result_dict[r[0]] = dict(r_dict)

  super()._check_params_vs_input(X, default_n_init=10)


## 클러스터별 + 리뷰별로 분리하기

In [32]:
def compress_score(before_len, after_len):
    print(f"{before_len} -> {after_len}, 삭제 개수: {before_len - after_len}, 압축 비율: {(1 - after_len/before_len)*100: .02f}%")
    return before_len / after_len

cur_len = sum([len(sents) for sents in result_dict["평가"].values()])
compress_score(pre_len, cur_len)

1571 -> 1210, 삭제 개수: 361, 압축 비율:  22.98%


1.2983471074380166

In [21]:
filtered_text = [[] for _ in range(len(split_texts))]

for idx in result_dict["평가"]:
    filtered_text[int(idx)].extend(result_dict["평가"][idx])

text_list = [ ". ".join(text) for text in filtered_text ]

## 상품별로 리뷰 분리하기

In [22]:
filtered_reviews = []
for i in range(10):
    filtered_reviews.append(" ".join(text_list[i * 20:i * 20 + 20]))

## 상품별 리뷰 Document 길이 확인하기

In [24]:
print(f"Orinigal text length: {[sum([len(re.sub(' ', '', k)) for k in reviews][i * 20: i * 20 + 20]) for i in range(10)]}")
print(f"Filtered text lengths: {[len(re.sub(' ', '', k)) for k in filtered_reviews]}")

Orinigal text length: [5459, 6334, 5376, 4733, 3508, 4579, 4813, 4655, 3609, 5376]
Filtered text lengths: [3383, 4039, 3773, 4079, 3005, 4081, 3347, 4097, 2944, 3788]


65613 -> 49242, 삭제 개수: 16371, 압축 비율:  24.95%
1.3324600950408187
7417 -> 4570, 삭제 개수: 2847, 압축 비율:  38.38%
1.6229759299781181
8571 -> 5449, 삭제 개수: 3122, 압축 비율:  36.43%
1.5729491649844007
7232 -> 5046, 삭제 개수: 2186, 압축 비율:  30.23%
1.433214427269124
6386 -> 5475, 삭제 개수: 911, 압축 비율:  14.27%
1.1663926940639269
4760 -> 4067, 삭제 개수: 693, 압축 비율:  14.56%
1.1703958691910499
6222 -> 5520, 삭제 개수: 702, 압축 비율:  11.28%
1.1271739130434784
6581 -> 4547, 삭제 개수: 2034, 압축 비율:  30.91%
1.4473279085111062
6332 -> 5542, 삭제 개수: 790, 압축 비율:  12.48%
1.1425478166726812
4880 -> 3962, 삭제 개수: 918, 압축 비율:  18.81%
1.231701161029783
7232 -> 5064, 삭제 개수: 2168, 압축 비율:  29.98%
1.4281200631911533
