# Pipeline Overview

1. Data loader (Preprocessed)
2. Document embedding
- Sentence Transformer
3. Dimension Reduction
- UMAP
4. Clustering
- HDBScan
5. Topic extraction
- LDA, C-TF-IDF

In [None]:
!python --version


Python 3.10.12


# Import Library

In [None]:
!pip install pyLDAvis
!pip install numpy==1.23.5
!pip install pandas==1.5.3
!pip install hdbscan
!pip install python-box
!pip install joblib
!pip install -U sentence-transformers
!pip install -e .
!pip install umap-learn
!pip install datashader bokeh holoviews scikit-image and colorcet

Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy>=1.24.2 (from pyLDAvis)
  Downloading numpy-1.26.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m88.5 MB/s[0m eta [36m0:00:00[0m
Collecting pandas>=2.0.0 (from pyLDAvis)
  Downloading pandas-2.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m95.6 MB/s[0m eta [36m0:00:00[0m
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Collecting tzdata>=2022.1 (from pandas>=2.0.0->pyLDAvis)
  Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m341.8/341.8 kB[0m 

Collecting pandas==1.5.3
  Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m67.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 2.1.1
    Uninstalling pandas-2.1.1:
      Successfully uninstalled pandas-2.1.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
lida 0.0.10 requires fastapi, which is not installed.
lida 0.0.10 requires kaleido, which is not installed.
lida 0.0.10 requires python-multipart, which is not installed.
lida 0.0.10 requires uvicorn, which is not installed.
pyldavis 3.4.1 requires numpy>=1.24.2, but you have numpy 1.23.5 which is incompatible.
pyldavis 3.4.1 requires pandas>=2.0.0, but you have pandas 1.5.3 which is i

In [None]:
import numpy as np
import pandas as pd

# Sklearn
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.decomposition import LatentDirichletAllocation
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
from sklearn import mixture
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_samples, silhouette_score, make_scorer
from sklearn.model_selection import RandomizedSearchCV

# Sentence Transformer
from sentence_transformers.readers import InputExample
from sentence_transformers import SentenceTransformer, models
from tqdm import tqdm

# UMAP
import umap.umap_ as umap
import umap.plot

# Clustering
import hdbscan
from scipy.spatial import distance # To calculate distances
import scipy.cluster.hierarchy as sch
from joblib import Memory

# Visualization
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

# Topic modeling
import gensim
from gensim import corpora
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

from box import Box


# Config

In [None]:
config = {
    "random_state": 42,
    "document_embedding": {
        "max_features": 6000 # 데이터셋 크기 축소에 따라 수정 10000 # 환경에 따라 메모리 부족할 수도 있음. 터지면 줄여서 사용 요망
    },
    "dim_reduction": {
        "n_components": 50,
    },
    "clustering": {
        "n_clusters": 3
    },
    "hdbscan" :{
      "min_cluster_size" : 40,
      "min_samples" : 40
    },
    "lda": {
        "n_components": 10,
        "passes": 10,
        "num_keywords": 30,
        "num_keywords_per_cluster": 5
    },
    "ctfidf" : {
        "num_topN": 5,
        "min_topic_size": 10,
    }
}

config = Box(config)
config # config.pca.n_components의 형태로 접근 가능


  and should_run_async(code)


Box({'random_state': 42, 'document_embedding': {'max_features': 6000}, 'dim_reduction': {'n_components': 50}, 'clustering': {'n_clusters': 3}, 'hdbscan': {'min_cluster_size': 40, 'min_samples': 40}, 'lda': {'n_components': 10, 'passes': 10, 'num_keywords': 30, 'num_keywords_per_cluster': 5}, 'ctfidf': {'num_topN': 5, 'min_topic_size': 10}})

# Load data

이 공유 드라이브에 대해 "정리->바로가기 추가"를 통해

내 드라이브 root에 공유 드라이브 바로가기를 만드셔야해요

제출할 때는 압축해서 낼 거니까 이 부분은 없애고 ./ 로 바꿔서 낼게요



In [None]:
# mount Google drive
from google.colab import drive
drive.mount('/content/drive')



  and should_run_async(code)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Set your workspace path

**workspace path 변경해주세요!!!!**

In [None]:
workspace_path = '/content/drive/MyDrive/Final_submit'  # Change this path!
#workspace_path = './'
filename = 'beauty.csv'
# filename = 'whole.csv'
print(f'Current Workspace: {workspace_path}')

try:
  import google.colab
  data_path = f'{workspace_path}/dataset/{filename}'
  cachedir = f'{workspace_path}/cache'
except:
  data_path = f'./dataset/{filename}'



Current Workspace: /content/drive/MyDrive/Final_submit


  and should_run_async(code)


In [None]:
data_path

  and should_run_async(code)


'/content/drive/MyDrive/Final_submit/dataset/beauty.csv'

In [None]:
df = pd.read_csv(data_path)
df.head()

  and should_run_async(code)


Unnamed: 0.1,Unnamed: 0,review,category,rawReview
0,60616,disregard claim hear commercial reduce stretch...,beauty,Disregard all the claims you've heard on comme...
1,72986,price pretty good find nozzle attachment fall ...,beauty,the price it's pretty good. I find the nozzle ...
2,15877,look like harden vasoline come tube smell like...,beauty,It looks like hardened Vasoline coming from th...
3,85030,absolutely love thing small face short eyebrow...,beauty,I absolutely love this the only thing is that ...
4,83073,love set brush two favorite brush set blush b...,beauty,I love this set of brushes! My two favorite br...


# 결측치 제거

In [None]:
df[df["review"].isnull()]


  and should_run_async(code)


Unnamed: 0.1,Unnamed: 0,review,category,rawReview


In [None]:
print(df["review"].isnull().sum())
df = df[df["review"].notnull()].reset_index(drop=True)
df.head()


0


  and should_run_async(code)


Unnamed: 0.1,Unnamed: 0,review,category,rawReview
0,60616,disregard claim hear commercial reduce stretch...,beauty,Disregard all the claims you've heard on comme...
1,72986,price pretty good find nozzle attachment fall ...,beauty,the price it's pretty good. I find the nozzle ...
2,15877,look like harden vasoline come tube smell like...,beauty,It looks like hardened Vasoline coming from th...
3,85030,absolutely love thing small face short eyebrow...,beauty,I absolutely love this the only thing is that ...
4,83073,love set brush two favorite brush set blush b...,beauty,I love this set of brushes! My two favorite br...


In [None]:
category2id = {v:id for id, v in enumerate(df['category'].unique())}
id2category = {id:v for id, v in enumerate(df['category'].unique())}


  and should_run_async(code)


In [None]:
df["category_id"] = df["category"].map(category2id)
df["category_id"].value_counts()


  and should_run_async(code)


0    30000
Name: category_id, dtype: int64

# Document embedding

## SentenceTransformer

In [None]:
# download pretrained model
sentence_model = SentenceTransformer('all-MiniLM-L6-v2') # model 에서 sentence_model 로 변수명 수정했어요. - 윤진

sbert = sentence_model.encode(df["review"])
df["sbert"] = list(sbert)
df.head()

  and should_run_async(code)


Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Unnamed: 0.1,Unnamed: 0,review,category,rawReview,category_id,sbert
0,60616,disregard claim hear commercial reduce stretch...,beauty,Disregard all the claims you've heard on comme...,0,"[-0.0021139544, 0.031628005, 0.053724263, 0.05..."
1,72986,price pretty good find nozzle attachment fall ...,beauty,the price it's pretty good. I find the nozzle ...,0,"[-0.084313385, -0.02356731, 0.07364636, 0.0112..."
2,15877,look like harden vasoline come tube smell like...,beauty,It looks like hardened Vasoline coming from th...,0,"[-0.046834804, -0.041300334, 0.03862773, 0.041..."
3,85030,absolutely love thing small face short eyebrow...,beauty,I absolutely love this the only thing is that ...,0,"[-0.009914572, 0.012472473, 0.012321725, -0.00..."
4,83073,love set brush two favorite brush set blush b...,beauty,I love this set of brushes! My two favorite br...,0,"[-0.12224284, -0.067056805, 0.024126083, -0.01..."


In [None]:
df["sbert"].iloc[0].shape

  and should_run_async(code)


(384,)

# Dimension Reduction

#### UMAP

In [None]:
config.dim_reduction.n_components #확인

  and should_run_async(code)


50

In [None]:
mapper = umap.UMAP(densmap=True, n_components=config.dim_reduction.n_components, random_state=config.random_state)
umap_emb = mapper.fit_transform(sbert)


  and should_run_async(code)
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [None]:
umap_emb.shape

  and should_run_async(code)


(30000, 50)

In [None]:
df["dim_reduced"] = list(umap_emb)

  and should_run_async(code)


In [None]:
df["dim_reduced"].iloc[0].shape

  and should_run_async(code)


(50,)

# Clustering

### HDBSCAN

In [None]:
# 빠른 실험을 위한 caching
memory = Memory(cachedir, verbose=0)
import warnings
warnings.filterwarnings('ignore')

  and should_run_async(code)


No tuning

In [None]:
dim_reduced = np.array(df["dim_reduced"].tolist())
clustering_model = hdbscan.HDBSCAN(gen_min_span_tree=True, min_cluster_size=config.hdbscan.min_cluster_size, min_samples=config.hdbscan.min_samples) #40, 40
clustered = clustering_model.fit_predict(dim_reduced)

In [None]:
df["clustered"] = clustered

In [None]:
print("num clusters:", len(df["clustered"].unique()))
df["clustered"].value_counts()

num clusters: 4


 2    16485
 0     7617
-1     3697
 1     2201
Name: clustered, dtype: int64

In [None]:
clustering_model.relative_validity_

0.23545746339357992

In [None]:
df[df["clustered"] == -1].iloc[-1]["review"]

'ive want buy product hold back cu product mar kay cost double price find product half price grab right away great product 60 percent cant go wrong grab today theyre go'

In [None]:
corpus = df["review"] # corpus 가 정의되어 있지 않길래, 추가했습니다. - 윤진
d={"review": corpus, "clustered": pd.Series(clustered)}
cluster_result = pd.DataFrame(data=d)

for i in range(len(pd.Series(clustered).unique())):
  print(f'{i}th cluster size: {len(cluster_result[cluster_result["clustered"] == i])}')

0th cluster size: 7617
1th cluster size: 2201
2th cluster size: 16485
3th cluster size: 0


# Topic Modeling (Using LDA)

In [None]:
df["review_tokenized"] = df["review"].apply(lambda x: x.split())
df.head()

Unnamed: 0.1,Unnamed: 0,review,category,rawReview,category_id,sbert,dim_reduced,clustered,review_tokenized
0,60616,disregard claim hear commercial reduce stretch...,beauty,Disregard all the claims you've heard on comme...,0,"[-0.0021139544, 0.031628005, 0.053724263, 0.05...","[0.44638398, 10.0415745, 4.4307847, 9.742665, ...",2,"[disregard, claim, hear, commercial, reduce, s..."
1,72986,price pretty good find nozzle attachment fall ...,beauty,the price it's pretty good. I find the nozzle ...,0,"[-0.084313385, -0.02356731, 0.07364636, 0.0112...","[8.751592, 6.6071258, 4.8410044, 9.5007305, 2....",0,"[price, pretty, good, find, nozzle, attachment..."
2,15877,look like harden vasoline come tube smell like...,beauty,It looks like hardened Vasoline coming from th...,0,"[-0.046834804, -0.041300334, 0.03862773, 0.041...","[1.8686926, 9.726857, 3.534628, 9.680706, 3.77...",2,"[look, like, harden, vasoline, come, tube, sme..."
3,85030,absolutely love thing small face short eyebrow...,beauty,I absolutely love this the only thing is that ...,0,"[-0.009914572, 0.012472473, 0.012321725, -0.00...","[1.5704919, 6.175264, 6.5739202, 9.528315, 1.2...",2,"[absolutely, love, thing, small, face, short, ..."
4,83073,love set brush two favorite brush set blush b...,beauty,I love this set of brushes! My two favorite br...,0,"[-0.12224284, -0.067056805, 0.024126083, -0.01...","[2.9159412, 6.8043437, 5.210582, 9.573075, 1.4...",2,"[love, set, brush, two, favorite, brush, set, ..."


In [None]:
def topic_modeling_lda(df):
    corpus = df["review_tokenized"]
    dictionary = corpora.Dictionary(corpus)
    corpus = [dictionary.doc2bow(text) for text in corpus]
    tokenized_text = df["review_tokenized"]
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=config.lda.n_components,
                                                passes=config.lda.passes,
                                                random_state=config.random_state)
    # Compute Coherence Score
    coherence_model_lda = gensim.models.CoherenceModel(model=lda_model, texts=tokenized_text.tolist(), coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()

    return lda_model, corpus, dictionary, coherence_score

## Topic Modeling without clustering

In [None]:
lda_model, corpus, dictionary, coherence_score = topic_modeling_lda(df)
for topic in lda_model.print_topics(num_words=10):
    print(topic)
print()


pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, dictionary)

num_keywords = config.lda.num_keywords
keywords = vis.topic_info["Term"].tolist()[:num_keywords]
print("keywords: ", keywords)
print("c_v: ", coherence_score)


(0, '0.027*"look" + 0.023*"skin" + 0.020*"foundation" + 0.019*"use" + 0.019*"makeup" + 0.018*"eye" + 0.017*"apply" + 0.016*"powder" + 0.012*"light" + 0.012*"dark"')
(1, '0.046*"ingredient" + 0.033*"mask" + 0.028*"%" + 0.018*"water" + 0.017*"acid" + 0.015*"contain" + 0.015*"vitamin" + 0.015*"alcohol" + 0.014*"list" + 0.013*"natural"')
(2, '0.024*"use" + 0.018*"get" + 0.013*"product" + 0.012*"one" + 0.012*"like" + 0.011*"try" + 0.010*"time" + 0.010*"work" + 0.010*"would" + 0.008*"go"')
(3, '0.046*"color" + 0.029*"like" + 0.026*"smell" + 0.016*"scent" + 0.016*"love" + 0.013*"look" + 0.011*"one" + 0.010*"really" + 0.009*"nice" + 0.009*"wear"')
(4, '0.046*"hair" + 0.031*"iron" + 0.029*"dryer" + 0.025*"heat" + 0.019*"curl" + 0.019*"hot" + 0.017*"flat" + 0.011*"hold" + 0.010*"set" + 0.009*"straightener"')
(5, '0.077*"nail" + 0.043*"polish" + 0.033*"coat" + 0.025*"use" + 0.019*"dry" + 0.018*"top" + 0.012*"shadow" + 0.011*"base" + 0.010*"gel" + 0.010*"eyeliner"')
(6, '0.117*"hair" + 0.033*"use"

In [None]:
df["keywords"] = df.apply(lambda x: [word for word in keywords if word in x["review_tokenized"]], axis=1)

In [None]:
save_path = f"{workspace_path}/results/results_nocluster_lda.csv" #f"/content/drive/MyDrive/results_nocluster_lda.csv"
output_df = df.sort_index()
output_df[["review", "rawReview", "clustered", "keywords"]].to_csv(save_path)

## Topic Modeling with clustering

In [None]:
df["clustered"].value_counts()

 2    16485
 0     7617
-1     3697
 1     2201
Name: clustered, dtype: int64

In [None]:
# Sort by cluster_num
df = df.sort_values(by='clustered').copy()
df

Unnamed: 0.1,Unnamed: 0,review,category,rawReview,category_id,sbert,dim_reduced,clustered,review_tokenized,keywords
19407,51391,obagi expensive total program search amazon be...,beauty,Obagi can be very expensive when you are doing...,0,"[-0.028409513, 0.015774677, -0.008788007, -0.0...","[1.3318702, 7.96046, 4.08623, 9.220188, 3.5224...",-1,"[obagi, expensive, total, program, search, ama...","[product, price]"
18894,95366,perfect size glass waste lot solution think us...,beauty,This is just the perfect size glass so you don...,0,"[-0.009471198, 0.0506283, 0.035999723, -0.0102...","[2.2803602, 5.3566923, 3.7887487, 9.289685, 3....",-1,"[perfect, size, glass, waste, lot, solution, t...",[use]
18895,14482,work greatand lot amounti love italso good pri...,beauty,Works greatAnd a lot of amountI love itAlso go...,0,"[-0.10554301, 0.0921953, 0.012891948, -0.03648...","[2.2972887, 6.8016005, 4.09537, 9.55347, 3.773...",-1,"[work, greatand, lot, amounti, love, italso, g...","[buy, love, good]"
18899,82512,buy store price want tale trip buy online love...,beauty,I bought this before at the store for the same...,0,"[-0.02053465, 0.046217334, 0.02266982, 0.01285...","[2.8735895, 7.0995092, 3.3644834, 9.75102, 4.7...",-1,"[buy, store, price, want, tale, trip, buy, onl...","[buy, price, love]"
4724,99370,awesome know marilyn monroe wear half lash co...,beauty,These are awesome. Did you know Marilyn Monro...,0,"[-0.061099887, -0.04258116, 0.024757992, -0.01...","[0.8221312, 7.1890125, 7.6376185, 9.655814, 0....",-1,"[awesome, know, marilyn, monroe, wear, half, l...","[love, look]"
...,...,...,...,...,...,...,...,...,...,...
16828,69685,satisfied natural shape eyebrow seem get bit ...,beauty,I am satisfied with the natural shape of my ey...,0,"[-0.07548773, -0.043068424, 0.031209582, -0.02...","[1.5585955, 6.346674, 6.4509315, 9.484528, 1.3...",2,"[satisfied, natural, shape, eyebrow, seem, get...","[brush, color, look, like, good]"
16824,93809,nice moisturizer heavy smell nice use daily ...,beauty,"This is a nice moisturizer.. not heavy, smells...",0,"[-0.10811445, 0.053995673, 0.07766143, 0.07239...","[1.2365701, 9.702916, 3.6363044, 9.685608, 3.5...",2,"[nice, moisturizer, heavy, smell, nice, use, d...","[product, use, buy, smell]"
16823,81922,combine colorstay make perfect shade one tell...,beauty,This combined with the colorstay makes my perf...,0,"[-0.008410755, -0.00843916, 0.04723039, -0.013...","[1.2870666, 7.481757, 4.583255, 9.666838, 1.46...",2,"[combine, colorstay, make, perfect, shade, one...",[one]
16711,64468,expect coverage concealer average loose powder...,beauty,I expected the coverage of this concealer to b...,0,"[-0.08217995, 0.051605124, 0.08166338, 0.05186...","[0.7726974, 7.872605, 5.1896505, 9.606297, 1.4...",2,"[expect, coverage, concealer, average, loose, ...",[skin]


In [None]:
keywords_by_cluster = []
coherence_scores = []
for cluster_num in sorted(df["clustered"].unique()):
    if cluster_num == -1:
        continue
    cluster_df = df[df["clustered"] == cluster_num]
    print(f"Cluster {cluster_num}({len(cluster_df)} documents):")
    lda_model, corpus, dictionary, coherence_score = topic_modeling_lda(cluster_df)


    for topic in lda_model.print_topics(num_words=10):
        print(topic)
    print()

    pyLDAvis.enable_notebook()
    vis = gensimvis.prepare(lda_model, corpus, dictionary)

    num_keywords = config.lda.num_keywords_per_cluster
    keywords = vis.topic_info["Term"].tolist()[:num_keywords]
    print("keywords: ", keywords)
    print("c_v: ", coherence_score)
    keywords_by_cluster.append(keywords)
    coherence_scores.append(coherence_score)

Cluster 0(7617 documents):
(0, '0.089*"hair" + 0.034*"use" + 0.026*"product" + 0.017*"dry" + 0.013*"leave" + 0.011*"conditioner" + 0.010*"oil" + 0.009*"soft" + 0.009*"love" + 0.009*"great"')
(1, '0.070*"color" + 0.044*"hair" + 0.017*"brown" + 0.017*"dye" + 0.015*"blonde" + 0.015*"red" + 0.014*"dark" + 0.013*"use" + 0.010*"bleach" + 0.010*"light"')
(2, '0.085*"oil" + 0.022*"scalp" + 0.021*"argan" + 0.015*"skin" + 0.013*"wax" + 0.012*"itch" + 0.011*"castor" + 0.010*"tea" + 0.010*"tree" + 0.008*"ingredient"')
(3, '0.054*"hair" + 0.043*"brush" + 0.015*"comb" + 0.012*"hold" + 0.012*"get" + 0.011*"great" + 0.011*"head" + 0.010*"use" + 0.010*"long" + 0.009*"one"')
(4, '0.013*"vinegar" + 0.009*"cider" + 0.006*"apple" + 0.005*"rash" + 0.005*"favorites" + 0.004*"content" + 0.004*"cancer" + 0.003*"consideration" + 0.003*"irritated" + 0.003*"laurel"')
(5, '0.100*"roller" + 0.078*"clip" + 0.041*"set" + 0.009*"these" + 0.008*"bun" + 0.008*"rollers" + 0.007*"roll" + 0.006*"snap" + 0.005*"blade" + 0.0

In [None]:
print(len(keywords_by_cluster))
np.mean(coherence_scores)

3


0.43243986635123965

In [None]:
for kwc in keywords_by_cluster:
    for kw in kwc[:10]:
        print(kw, end=' ')
    print()

hair color oil dryer dry 
coat color ~ cuticle use 
eye skin cream smell wash 


In [None]:
len(keywords_by_cluster[1] * 10)

50

In [None]:
df["keywords"] = df.apply(lambda x: [word for word in keywords_by_cluster[x["clustered"]] if word in x["review_tokenized"] and x["clustered"] != -1], axis=1)

In [None]:
df.sort_index()

Unnamed: 0.1,Unnamed: 0,review,category,rawReview,category_id,sbert,dim_reduced,clustered,review_tokenized,keywords
0,60616,disregard claim hear commercial reduce stretch...,beauty,Disregard all the claims you've heard on comme...,0,"[-0.0021139544, 0.031628005, 0.053724263, 0.05...","[0.44638398, 10.0415745, 4.4307847, 9.742665, ...",2,"[disregard, claim, hear, commercial, reduce, s...","[skin, smell]"
1,72986,price pretty good find nozzle attachment fall ...,beauty,the price it's pretty good. I find the nozzle ...,0,"[-0.084313385, -0.02356731, 0.07364636, 0.0112...","[8.751592, 6.6071258, 4.8410044, 9.5007305, 2....",0,"[price, pretty, good, find, nozzle, attachment...","[hair, dryer, dry]"
2,15877,look like harden vasoline come tube smell like...,beauty,It looks like hardened Vasoline coming from th...,0,"[-0.046834804, -0.041300334, 0.03862773, 0.041...","[1.8686926, 9.726857, 3.534628, 9.680706, 3.77...",2,"[look, like, harden, vasoline, come, tube, sme...",[smell]
3,85030,absolutely love thing small face short eyebrow...,beauty,I absolutely love this the only thing is that ...,0,"[-0.009914572, 0.012472473, 0.012321725, -0.00...","[1.5704919, 6.175264, 6.5739202, 9.528315, 1.2...",2,"[absolutely, love, thing, small, face, short, ...",[eye]
4,83073,love set brush two favorite brush set blush b...,beauty,I love this set of brushes! My two favorite br...,0,"[-0.12224284, -0.067056805, 0.024126083, -0.01...","[2.9159412, 6.8043437, 5.210582, 9.573075, 1.4...",2,"[love, set, brush, two, favorite, brush, set, ...",[]
...,...,...,...,...,...,...,...,...,...,...
29995,75758,try clean clear neutrogena biore product s...,beauty,"After trying Clean & Clear, Neutrogena, and Bi...",0,"[-0.022597803, -0.013498354, 0.07781014, 0.009...","[0.78196883, 10.365755, 4.7498198, 9.650059, 2...",2,"[try, clean, clear, neutrogena, biore, product...",[skin]
29996,89376,may look nice 8217 fooled soft fact stiff i...,beauty,They may look nice but don&#8217;t be fooled. ...,0,"[-0.09848054, 0.055580847, 0.12894471, -0.0191...","[4.0435786, 7.1127825, 5.1558385, 9.176137, 1....",-1,"[may, look, nice, 8217, fooled, soft, fact, st...",[]
29997,47962,strong sticky send back maybe good short male ...,beauty,SO STRONG AND STICKY. I had to send it back. ...,0,"[-0.08914643, -0.0042910976, 0.08933469, 0.012...","[8.173485, 6.5082984, 4.3910947, 9.491093, 3.6...",0,"[strong, sticky, send, back, maybe, good, shor...",[hair]
29998,21339,ive want buy product hold back cu product mar ...,beauty,ive been wanting to buy this product but I had...,0,"[-0.08350157, -0.012376309, 0.016681567, 0.000...","[2.0384207, 5.8280644, 3.2671359, 9.562258, 4....",-1,"[ive, want, buy, product, hold, back, cu, prod...",[]


In [None]:
save_path = f"{workspace_path}/results/results_hdbscan_lda.csv"# f"/content/drive/MyDrive/results_hdbscan_lda.csv"
output_df = df.sort_index()
output_df[["review", "rawReview", "clustered", "keywords"]].to_csv(save_path)

# Topic Modeling (Using C-TF-IDF;  Class-based TF-IDF)

C-TF-IDF를 통해서 각 묶어진 그룹(Topic 또는 Class)에 대해 해당 Topic을 잘 표현하는 단어를 찾는다.

In [None]:
%%capture
!pip install bertopic

In [None]:
# 열 추가
df["review_tokenized"] = df["review"].apply(lambda x: x.split())
df.head()

Unnamed: 0.1,Unnamed: 0,review,category,rawReview,category_id,sbert,dim_reduced,clustered,review_tokenized,keywords
19407,51391,obagi expensive total program search amazon be...,beauty,Obagi can be very expensive when you are doing...,0,"[-0.028409513, 0.015774677, -0.008788007, -0.0...","[1.3318702, 7.96046, 4.08623, 9.220188, 3.5224...",-1,"[obagi, expensive, total, program, search, ama...",[]
18894,95366,perfect size glass waste lot solution think us...,beauty,This is just the perfect size glass so you don...,0,"[-0.009471198, 0.0506283, 0.035999723, -0.0102...","[2.2803602, 5.3566923, 3.7887487, 9.289685, 3....",-1,"[perfect, size, glass, waste, lot, solution, t...",[]
18895,14482,work greatand lot amounti love italso good pri...,beauty,Works greatAnd a lot of amountI love itAlso go...,0,"[-0.10554301, 0.0921953, 0.012891948, -0.03648...","[2.2972887, 6.8016005, 4.09537, 9.55347, 3.773...",-1,"[work, greatand, lot, amounti, love, italso, g...",[]
18899,82512,buy store price want tale trip buy online love...,beauty,I bought this before at the store for the same...,0,"[-0.02053465, 0.046217334, 0.02266982, 0.01285...","[2.8735895, 7.0995092, 3.3644834, 9.75102, 4.7...",-1,"[buy, store, price, want, tale, trip, buy, onl...",[]
4724,99370,awesome know marilyn monroe wear half lash co...,beauty,These are awesome. Did you know Marilyn Monro...,0,"[-0.061099887, -0.04258116, 0.024757992, -0.01...","[0.8221312, 7.1890125, 7.6376185, 9.655814, 0....",-1,"[awesome, know, marilyn, monroe, wear, half, l...",[]


In [None]:
docs = df["review"].tolist()
# 실제 키워드 추출에 사용 (성능 향상을 위해 raw text 가 아닌 preprocessed text 사용)
# 단순히 키워드 빈도수만 체크되는 것이 아니기에, review_tokenized 가 아닌 (문장의 형태가 보존되어 있는) review 사용

docs[:3]

['obagi expensive total program search amazon best price product ',
 'perfect size glass waste lot solution think use shot glass  felt would waste almost twice solution get ',
 'work greatand lot amounti love italso good pricei wan na buy next time toogreat']

In [None]:
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
from bertopic import BERTopic

### Hyperparameter Tuning (10/20 Last Update)

**버토픽의 하이퍼파라미터 목록**

`top_n_words`: 각 토픽 별로 추출하고자 하는 단어의 개수 (확률분포값 기준, 상위 N개의 단어), 10과 20사이가 적절하고 값이 30을 넘기지 않는 것이 좋음. \
`n_gram_range`: 토픽 분포(representation)을 생성하는 CountVectorizer 에 반영되는 기준 단어 단위, 토픽별 추출되는 키워드를 구성하는 단어의 개수 \
`min_topic_size`: **(중요)** 하나의 토픽이 가져야 하는 최소 단어(키워드)의 개수, 이 값이 낮을수록 더 많은 토픽이 추출됨. (=제한조건이 낮으므로) / default 10  \
`nr_topics`: 토픽의 개수를 줄여서 결과적으로 남기고 싶은 토픽의 개수, 만약 "auto"로 설정한다면, 토픽의 개수를 HDBSCAN을 이용하여 자동적으로 특정 개수로 줄임. 너무 낮은 값으로 설정하면, 합쳐지지 말아야할 토픽들이 서로 합쳐져서 성능에 악영향을 미칠 수 있으니 주의. \


*`calculate_probabilities` 및 `low-memory` 하이퍼파라미터들은 일단 고려하지 않음.

각 하이퍼파라미터 세부설명 참고) \
 https://colab.research.google.com/drive/1ClTYut039t-LDtlcd-oQAdXWgcsSGTw9?usp=sharing#scrollTo=xLrIUdCGsgkf

### Extract Keywords

In [None]:
# 클러스터별 키워드 얻기

ctfidf_model = BERTopic(embedding_model=sentence_model, umap_model=mapper, hdbscan_model=clustering_model, top_n_words=5, min_topic_size=10).fit(docs) # min_topic_size 조절 필요 # 클러스터별 개수 문제
topics, _ = ctfidf_model.fit_transform(docs)

# Preprocess Documents
documents = pd.DataFrame({"Document": docs,
                              "ID": range(len(docs)),
                              "Topic": topics})
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = ctfidf_model._preprocess_text(documents_per_topic.Document.values)

# Extract vectorizer and tokenizer from BERTopic
vectorizer = ctfidf_model.vectorizer_model
tokenizer = vectorizer.build_analyzer()

# Extract features for Topic Coherence evaluation
words = vectorizer.get_feature_names_out()
tokens = [tokenizer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]
topic_words = [[words for words, _ in ctfidf_model.get_topic(topic)]
                  for topic in range(len(set(topics))-1)] # topic 이 하나일 경우 문제 발생
print(f"topic_words: {topic_words}")

# Evaluate
coherence_model = CoherenceModel(topics=topic_words, # topic_words 가 0 개인 경우 문제
                                    texts=tokens,
                                    corpus=corpus,
                                    dictionary=dictionary,
                                    coherence='c_v')

coherence_score = coherence_model.get_coherence()

# Print
print(f"\nTopic modeling is done.\n")


topic_words: [['skin', 'use', 'product', 'like', 'face'], ['hair', 'use', 'product', 'shampoo', 'like'], ['nail', 'polish', 'coat', 'color', 'use']]

Topic modeling is done.



In [None]:
keywords = []

# topid_id 별 상위 5개의 단어 추출 (본 topic_id 는 클러스별이 아닌, 전체 문서에 대해 추출된 topic_id 이다.)
for topic_id, words in enumerate(topic_words):
  print(f"({topic_id}, '{words[0]}' + '{words[1]}' + '{words[2]}' + '{words[3]}' + '{words[4]}')")
  keywords += words

  # 개수 조정
  if (topic_id) > 8:
    break

keywords = list(set(keywords))
print()
print("keywords: ", keywords) # keywords - list 자료형
print("c_v: ", coherence_score)
print("\n")


(0, 'skin' + 'use' + 'product' + 'like' + 'face')
(1, 'hair' + 'use' + 'product' + 'shampoo' + 'like')
(2, 'nail' + 'polish' + 'coat' + 'color' + 'use')

keywords:  ['hair', 'color', 'face', 'polish', 'product', 'like', 'skin', 'coat', 'shampoo', 'use', 'nail']
c_v:  0.5593826174690636




In [None]:
# 각 클러스터별 키워드 추출 (ref: https://towardsdatascience.com/topics-per-class-using-bertopic-252314f2640)
# ref: https://maartengr.github.io/BERTopic/getting_started/topicsperclass/topicsperclass.html

classes = [df["clustered"][i] for i in range(len(df))]

topics_per_class = ctfidf_model.topics_per_class(docs, classes=classes)

keywords_by_cluster = topics_per_class["Words"].tolist()


In [None]:
ctfidf_model.visualize_topics_per_class(topics_per_class, top_n_topics=5)

In [None]:
# keywords_by_cluster 전처리

result = []

for item in keywords_by_cluster:
  result.append(item.split(', '))

print(result)

# keywords_by_cluster 대체
keywords_by_cluster = result

[['use', 'product', 'one', 'get', 'good'], ['skin', 'use', 'product', 'like', 'face'], ['hair', 'use', 'product', 'shampoo', 'like'], ['nail', 'polish', 'coat', 'color', 'use'], ['product', 'use', 'good', 'like', 'one'], ['skin', 'use', 'product', 'like', 'face'], ['hair', 'use', 'product', 'shampoo', 'like'], ['nail', 'polish', 'coat', 'color', 'use'], ['use', 'product', 'get', 'like', 'one'], ['skin', 'use', 'product', 'like', 'face'], ['hair', 'use', 'product', 'shampoo', 'like'], ['nail', 'polish', 'coat', 'color', 'use'], ['use', 'product', 'like', 'get', 'one'], ['skin', 'use', 'product', 'like', 'face'], ['hair', 'use', 'product', 'shampoo', 'like'], ['nail', 'polish', 'coat', 'color', 'use']]


In [None]:
# 각 리뷰(인스턴스) 별 keywords 추출 및 df 에 'kewords' 열 추가

df['keywords'] = None

for idx, row in df.iterrows():
    cluster_num = row["clustered"]

    if cluster_num == -1:
      cluster_num = 3

    keywords = keywords_by_cluster[cluster_num]
    row_keywords = [kw for kw in keywords if kw in row["review_tokenized"]]

    # Assign the row_keywords to the "keywords" column in the DataFrame
    df.at[idx, "keywords"] = row_keywords

print(df.head())  # To verify the changes


       Unnamed: 0                                             review category  \
19407       51391  obagi expensive total program search amazon be...   beauty   
18894       95366  perfect size glass waste lot solution think us...   beauty   
18895       14482  work greatand lot amounti love italso good pri...   beauty   
18899       82512  buy store price want tale trip buy online love...   beauty   
4724        99370  awesome know marilyn monroe wear half lash  co...   beauty   

                                               rawReview  category_id  \
19407  Obagi can be very expensive when you are doing...            0   
18894  This is just the perfect size glass so you don...            0   
18895  Works greatAnd a lot of amountI love itAlso go...            0   
18899  I bought this before at the store for the same...            0   
4724   These are awesome.  Did you know Marilyn Monro...            0   

                                                   sbert  \
19407  [-0.028

In [None]:
# 중간 확인
df.head()

Unnamed: 0.1,Unnamed: 0,review,category,rawReview,category_id,sbert,dim_reduced,clustered,review_tokenized,keywords
19407,51391,obagi expensive total program search amazon be...,beauty,Obagi can be very expensive when you are doing...,0,"[-0.028409513, 0.015774677, -0.008788007, -0.0...","[1.3318702, 7.96046, 4.08623, 9.220188, 3.5224...",-1,"[obagi, expensive, total, program, search, ama...",[]
18894,95366,perfect size glass waste lot solution think us...,beauty,This is just the perfect size glass so you don...,0,"[-0.009471198, 0.0506283, 0.035999723, -0.0102...","[2.2803602, 5.3566923, 3.7887487, 9.289685, 3....",-1,"[perfect, size, glass, waste, lot, solution, t...",[use]
18895,14482,work greatand lot amounti love italso good pri...,beauty,Works greatAnd a lot of amountI love itAlso go...,0,"[-0.10554301, 0.0921953, 0.012891948, -0.03648...","[2.2972887, 6.8016005, 4.09537, 9.55347, 3.773...",-1,"[work, greatand, lot, amounti, love, italso, g...",[]
18899,82512,buy store price want tale trip buy online love...,beauty,I bought this before at the store for the same...,0,"[-0.02053465, 0.046217334, 0.02266982, 0.01285...","[2.8735895, 7.0995092, 3.3644834, 9.75102, 4.7...",-1,"[buy, store, price, want, tale, trip, buy, onl...",[]
4724,99370,awesome know marilyn monroe wear half lash co...,beauty,These are awesome. Did you know Marilyn Monro...,0,"[-0.061099887, -0.04258116, 0.024757992, -0.01...","[0.8221312, 7.1890125, 7.6376185, 9.655814, 0....",-1,"[awesome, know, marilyn, monroe, wear, half, l...",[]


In [None]:
df["keywords"] = df.apply(lambda x: [word for word in keywords_by_cluster[x["clustered"]] if word in x["review_tokenized"] and x["clustered"] != -1], axis=1)

In [None]:
df.sort_index()

Unnamed: 0.1,Unnamed: 0,review,category,rawReview,category_id,sbert,dim_reduced,clustered,review_tokenized,keywords
0,60616,disregard claim hear commercial reduce stretch...,beauty,Disregard all the claims you've heard on comme...,0,"[-0.0021139544, 0.031628005, 0.053724263, 0.05...","[0.44638398, 10.0415745, 4.4307847, 9.742665, ...",2,"[disregard, claim, hear, commercial, reduce, s...","[use, product]"
1,72986,price pretty good find nozzle attachment fall ...,beauty,the price it's pretty good. I find the nozzle ...,0,"[-0.084313385, -0.02356731, 0.07364636, 0.0112...","[8.751592, 6.6071258, 4.8410044, 9.5007305, 2....",0,"[price, pretty, good, find, nozzle, attachment...","[use, get, good]"
2,15877,look like harden vasoline come tube smell like...,beauty,It looks like hardened Vasoline coming from th...,0,"[-0.046834804, -0.041300334, 0.03862773, 0.041...","[1.8686926, 9.726857, 3.534628, 9.680706, 3.77...",2,"[look, like, harden, vasoline, come, tube, sme...",[like]
3,85030,absolutely love thing small face short eyebrow...,beauty,I absolutely love this the only thing is that ...,0,"[-0.009914572, 0.012472473, 0.012321725, -0.00...","[1.5704919, 6.175264, 6.5739202, 9.528315, 1.2...",2,"[absolutely, love, thing, small, face, short, ...","[use, like]"
4,83073,love set brush two favorite brush set blush b...,beauty,I love this set of brushes! My two favorite br...,0,"[-0.12224284, -0.067056805, 0.024126083, -0.01...","[2.9159412, 6.8043437, 5.210582, 9.573075, 1.4...",2,"[love, set, brush, two, favorite, brush, set, ...",[]
...,...,...,...,...,...,...,...,...,...,...
29995,75758,try clean clear neutrogena biore product s...,beauty,"After trying Clean & Clear, Neutrogena, and Bi...",0,"[-0.022597803, -0.013498354, 0.07781014, 0.009...","[0.78196883, 10.365755, 4.7498198, 9.650059, 2...",2,"[try, clean, clear, neutrogena, biore, product...",[product]
29996,89376,may look nice 8217 fooled soft fact stiff i...,beauty,They may look nice but don&#8217;t be fooled. ...,0,"[-0.09848054, 0.055580847, 0.12894471, -0.0191...","[4.0435786, 7.1127825, 5.1558385, 9.176137, 1....",-1,"[may, look, nice, 8217, fooled, soft, fact, st...",[]
29997,47962,strong sticky send back maybe good short male ...,beauty,SO STRONG AND STICKY. I had to send it back. ...,0,"[-0.08914643, -0.0042910976, 0.08933469, 0.012...","[8.173485, 6.5082984, 4.3910947, 9.491093, 3.6...",0,"[strong, sticky, send, back, maybe, good, shor...",[good]
29998,21339,ive want buy product hold back cu product mar ...,beauty,ive been wanting to buy this product but I had...,0,"[-0.08350157, -0.012376309, 0.016681567, 0.000...","[2.0384207, 5.8280644, 3.2671359, 9.562258, 4....",-1,"[ive, want, buy, product, hold, back, cu, prod...",[]


In [None]:
# 각 리뷰별 키워드 추출된 상태의 데이터프레임 저장

save_path = f"{workspace_path}/results/results_hdbscan_ctfidf.csv"# f"/content/results_hdbscan_ctfidf_5.csv"
output_df = df.sort_index()
output_df[["review", "rawReview", "clustered", "keywords"]].to_csv(save_path)

In [None]:
# 끝