# 8.7 문서 군집화(Document Clustering)
## 8.7.1 문서 군집화 개념
- 비슷한 텍스트 구성의 문서를 같은 카테고리 소속으로 분류
- 비지도 학습 기반
    - 문서의 결정 카테고리값이 없음
    - 기존 클러스터링 방법 적용

## 8.7.2 Opinion Review 데이터 세트를 이용한 문서 군집화 수행하기

* 데이터
    - UCI 머신러닝 레파지토리 내 Opinion Review 데이터
        - 51개 텍스트 파일(문서)
        - 각 문서는 Tripadvisor(호텔), Edmunds.com(자동차), Amazon.com(전자제품) 사이트에서 가져온 리뷰
        - 각 문서는 100개 정도의 문장을 가짐
        - 토픽 모델링에도 사용 가능
        - http://archive.ics.uci.edu/ml/datasets/Opinosis+Opinion+%26frasl%3B+Review
            - topics 폴더내 51개 파일 구성

In [18]:
!ls -al

total 296328
drwxr-xr-x  63 csg  staff      2016  3 20 15:49 .
drwxr-xr-x  13 csg  staff       416  3 10 00:11 ..
-rw-r--r--@  1 csg  staff      6148  3 20 15:48 .DS_Store
drwxr-xr-x   4 csg  staff       128  3 20 14:39 .ipynb_checkpoints
-rw-r--r--   1 csg  staff     60843  3 20 15:01 8.5 감성 분석_csg.ipynb
-rw-r--r--   1 csg  staff      6527 10 27 18:28 8.6 토픽 모델링(Topic Modeling) - 20 뉴스그룹.ipynb
-rw-r--r--   1 csg  staff     63748  3 20 15:49 8.7 문서 군집화 소개와 실습(Opinion Review 데이터 세트).ipynb
drwxr-xr-x@  4 csg  staff       128  3 20 15:48 OpinosisDataset1.0
-rw-r--r--   1 csg  staff      6553  3  6  2010 accuracy_garmin_nuvi_255W_gps.txt.data
-rw-r--r--@  1 csg  staff      8399  3  6  2010 bathroom_bestwestern_hotel_sfo.txt.data
-rw-r--r--   1 csg  staff      9753  3  6  2010 battery-life_amazon_kindle.txt.data
-rw-r--r--   1 csg  staff      6368  3  6  2010 battery-life_ipod_nano_8gb.txt.data
-rw-r--r--   1 csg  staf

In [49]:
import pandas as pd
import glob ,os

# Directory where the Opinosis archive was extracted; point this at your own
# local copy. Every file matching *.data under `path` is collected into a list.
# Example for a Windows layout:
# path = r'C:\Users\chkwon\Text\OpinosisDataset1.0\OpinosisDataset1.0\topics'                     
# all_files = glob.glob(os.path.join(path, "*.data"))  

path = '.'                     
file_list = os.path.join(path, "*.data")  # glob pattern selecting the .data files
all_files = glob.glob(file_list)   # paths of all matching files
#print(all_files)
#print(len(all_files))

In [58]:
# Collect each file's name into filename_list and each file's content
# (loaded as a DataFrame, then rendered back to a single string) into
# opinion_text.
filename_list = []
opinion_text = []
for file_ in all_files:
    # Read one review file into a DataFrame; header=0 makes the file's first
    # line the column header.
    df = pd.read_table(file_, index_col=None, header=0, encoding='latin1')

    # Derive the topic name from the path. os.path.basename drops the
    # directory part using the platform's own separator, so this works for any
    # `path` setting (the previous replace("./", "") only handled a literal
    # leading "./"). The trailing ".txt.data" extension is then removed.
    filename = os.path.basename(file_).replace(".txt.data", "")

    # Keep names and contents in parallel lists.
    filename_list.append(filename)
    opinion_text.append(df.to_string())

# Build one DataFrame with a row per document.
document_df = pd.DataFrame({'filename': filename_list, 'opinion_text': opinion_text})
document_df.head()  # the filename tells which product/service (topic) each text is about

Unnamed: 0,filename,opinion_text
0,battery-life_ipod_nano_8gb,short battery life I moved up from a...
1,gas_mileage_toyota_camry_2007,Ride seems comfortable and gas mileage fa...
2,room_holiday_inn_london,"We arrived at 23,30 hours and they could n..."
3,location_holiday_inn_london,Great location for tube and we crammed in...
4,staff_bestwestern_hotel_sfo,Staff are friendly and hel...


In [61]:
# Show the full opinion_text string stored under row label 0.
document_df['opinion_text'][0]

"         short battery life  I moved up from an 8gb .\n0      I love this ipod except for the battery life .\n1                     long battery  scratch resistant\n2             Battery drains even if I don't use it .\n3    I only wonder why the battery seems to drain ...\n4    great in the car, light, portable, good quali...\n5   5G lies a more mature iPod, many steps wiser a...\n6   5GB and the better battery life rated for up t...\n7    battery, not dummie proof  I love the color o...\n8    Battery life isn't amazing, Definitely need t...\n9    The battery doesn't last a long time especial...\n10   doesn't take pictures, battery life  It's my ...\n11   Also, the battery life wasn't that long enoug...\n12   When you use the radio it uses a lot of batte...\n13   good quality  light  long battery  portable  ...\n14   scratches easily, short battery life  I purch...\n15   Price battery replacement  The iPod basically...\n16   I do not like it that I will have to get Appl...\n17   I th

In [70]:
from nltk.stem import WordNetLemmatizer
import nltk
import string

# Translation table for str.translate(): every punctuation code point maps to
# None, which deletes that character.
remove_punct_dict = {ord(symbol): None for symbol in string.punctuation}
print(string.punctuation)

# Single WordNetLemmatizer instance reused by LemTokens for every token.
lemmar = WordNetLemmatizer()

def LemTokens(tokens):
    """Return a list holding the lemma of each token, in input order."""
    lemmas = []
    for word in tokens:
        lemmas.append(lemmar.lemmatize(word))
    return lemmas

def LemNormalize(text):
    """Turn raw text into a list of lemmatized tokens.

    Steps, in order: lowercase the text, delete punctuation via
    remove_punct_dict, split into word tokens with nltk.word_tokenize, then
    lemmatize each token with LemTokens.
    """
    cleaned = text.lower().translate(remove_punct_dict)
    words = nltk.word_tokenize(cleaned)
    return LemTokens(words)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [71]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorization of the documents, tokenized by LemNormalize.
tfidf_vect = TfidfVectorizer(
    tokenizer=LemNormalize,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.05,
    max_df=0.85,
)

# Fit on the opinion_text column and get back the vectorized feature matrix
# for the individual texts.
feature_vect = tfidf_vect.fit_transform(document_df['opinion_text'])
print(feature_vect.shape)  # (51, 2409) in the recorded run

  'stop_words.' % sorted(inconsistent))


(51, 2409)


* 방법
    - 데이터
        - 문서별 텍스트가 TF-IDF 변환된 피쳐 벡터화 행렬 데이터
        - 문서 유형: 전자제품(네비게이션, 아이팟, 킨들, 랩탑 컴퓨터...), 자동차, 호텔
    - 군집화 기법: k-means
      - 5개 중심(centroid)으로 시작
      - 최대 반복 횟수: 10000

In [82]:
from sklearn.cluster import KMeans

# Cluster into 5 groups. random_state=0 fixes the seed so that the example
# yields the same clustering on every run.
km_cluster = KMeans(n_clusters=5, max_iter=10000, random_state=0).fit(feature_vect)
cluster_label = km_cluster.labels_  # cluster label of each document
cluster_centers = km_cluster.cluster_centers_  # coordinates of each centroid
print(cluster_centers.shape)
print(cluster_centers)

(5, 2409)
[[0.         0.00686546 0.         ... 0.         0.         0.        ]
 [0.02332668 0.         0.         ... 0.         0.         0.00208647]
 [0.00425961 0.         0.         ... 0.         0.         0.        ]
 [0.         0.00160316 0.00240348 ... 0.00306654 0.00328859 0.        ]
 [0.01583394 0.         0.         ... 0.         0.         0.00644769]]


In [89]:
document_df['cluster_label'] = cluster_label
document_df.head()  # compare filename (topic) against cluster_label (clustering result)

Unnamed: 0,filename,opinion_text,cluster_label
0,battery-life_ipod_nano_8gb,short battery life I moved up from a...,1
1,gas_mileage_toyota_camry_2007,Ride seems comfortable and gas mileage fa...,0
2,room_holiday_inn_london,"We arrived at 23,30 hours and they could n...",3
3,location_holiday_inn_london,Great location for tube and we crammed in...,3
4,staff_bestwestern_hotel_sfo,Staff are friendly and hel...,3


In [90]:
document_df[document_df['cluster_label']==0].sort_values(by='filename') # rows labeled cluster 0, sorted by filename for comparison

Unnamed: 0,filename,opinion_text,cluster_label
1,gas_mileage_toyota_camry_2007,Ride seems comfortable and gas mileage fa...,0
35,mileage_honda_accord_2008,"It's quiet, get good gas mileage and look...",0


In [91]:
document_df[document_df['cluster_label']==1].sort_values(by='filename') # rows labeled cluster 1, sorted by filename for comparison

Unnamed: 0,filename,opinion_text,cluster_label
9,battery-life_amazon_kindle,After I plugged it in to my USB hub on my ...,1
0,battery-life_ipod_nano_8gb,short battery life I moved up from a...,1
11,battery-life_netbook_1005ha,"6GHz 533FSB cpu, glossy display, 3, Cell 2...",1
12,keyboard_netbook_1005ha,", I think the new keyboard rivals the gre...",1
15,performance_netbook_1005ha,The Eee Super Hybrid Engine utility lets u...,1
24,sound_ipod_nano_8gb,headphone jack i got a clear case for it a...,1
14,video_ipod_nano_8gb,"I bought the 8, gig Ipod Nano that has the...",1


In [14]:
document_df[document_df['cluster_label']==2].sort_values(by='filename') # rows labeled cluster 2, sorted by filename for comparison

Unnamed: 0,filename,opinion_text,cluster_label
0,accuracy_garmin_nuvi_255W_gps,", and is very, very acc...",2
5,buttons_amazon_kindle,I thought it would be fitting to christen ...,2
8,directions_garmin_nuvi_255W_gps,You also get upscale features like spoken ...,2
9,display_garmin_nuvi_255W_gps,3 quot widescreen display was a ...,2
10,eyesight-issues_amazon_kindle,It feels as easy to read as the K1 but doe...,2
11,features_windows7,"I had to uninstall anti, virus and selecte...",2
12,fonts_amazon_kindle,Being able to change the font sizes is aw...,2
23,navigation_amazon_kindle,"In fact, the entire navigation structure h...",2
33,satellite_garmin_nuvi_255W_gps,It's fast to acquire satel...,2
34,screen_garmin_nuvi_255W_gps,It is easy to read and when touching the...,2


In [92]:
document_df[document_df['cluster_label']==3].sort_values(by='filename') # rows labeled cluster 3, sorted by filename for comparison

Unnamed: 0,filename,opinion_text,cluster_label
31,bathroom_bestwestern_hotel_sfo,"The room was not overly big, but clean and...",3
17,food_holiday_inn_london,The room was packed to capacity with queu...,3
32,food_swissotel_chicago,The food for our event was deli...,3
49,free_bestwestern_hotel_sfo,The wine reception is a great idea as it i...,3
39,location_bestwestern_hotel_sfo,"Good Value good location , ideal ...",3
3,location_holiday_inn_london,Great location for tube and we crammed in...,3
50,parking_bestwestern_hotel_sfo,Parking was expensive but I think this is ...,3
41,price_amazon_kindle,"If a case was included, as with the Kindle...",3
28,price_holiday_inn_london,"All in all, a normal chain hotel on a nice...",3
2,room_holiday_inn_london,"We arrived at 23,30 hours and they could n...",3


In [93]:
document_df[document_df['cluster_label']==4].sort_values(by='filename') # rows labeled cluster 4, sorted by filename for comparison

Unnamed: 0,filename,opinion_text,cluster_label
33,accuracy_garmin_nuvi_255W_gps,", and is very, very acc...",4
26,buttons_amazon_kindle,I thought it would be fitting to christen ...,4
34,directions_garmin_nuvi_255W_gps,You also get upscale features like spoken ...,4
48,display_garmin_nuvi_255W_gps,3 quot widescreen display was a ...,4
36,eyesight-issues_amazon_kindle,It feels as easy to read as the K1 but doe...,4
21,features_windows7,"I had to uninstall anti, virus and selecte...",4
44,fonts_amazon_kindle,Being able to change the font sizes is aw...,4
38,navigation_amazon_kindle,"In fact, the entire navigation structure h...",4
10,satellite_garmin_nuvi_255W_gps,It's fast to acquire satel...,4
8,screen_garmin_nuvi_255W_gps,It is easy to read and when touching the...,4


In [95]:
from sklearn.cluster import KMeans

# Five clusters were judged too fine-grained (as in the textbook), so the
# documents are re-clustered with fewer centroids: 3 groups.
km_cluster = KMeans(n_clusters=3, max_iter=10000, random_state=0).fit(feature_vect)
cluster_label = km_cluster.labels_

# Store each document's cluster in the cluster_label column; uncomment the
# last line to view the frame sorted by that column.
document_df['cluster_label'] = cluster_label
#document_df.sort_values(by='cluster_label')

In [96]:
# Rows whose cluster_label is 0, sorted by filename.
document_df[document_df['cluster_label']==0].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
33,accuracy_garmin_nuvi_255W_gps,", and is very, very acc...",0
9,battery-life_amazon_kindle,After I plugged it in to my USB hub on my ...,0
0,battery-life_ipod_nano_8gb,short battery life I moved up from a...,0
11,battery-life_netbook_1005ha,"6GHz 533FSB cpu, glossy display, 3, Cell 2...",0
26,buttons_amazon_kindle,I thought it would be fitting to christen ...,0
34,directions_garmin_nuvi_255W_gps,You also get upscale features like spoken ...,0
48,display_garmin_nuvi_255W_gps,3 quot widescreen display was a ...,0
36,eyesight-issues_amazon_kindle,It feels as easy to read as the K1 but doe...,0
21,features_windows7,"I had to uninstall anti, virus and selecte...",0
44,fonts_amazon_kindle,Being able to change the font sizes is aw...,0


In [97]:
# Rows whose cluster_label is 1, sorted by filename.
document_df[document_df['cluster_label']==1].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
18,comfort_honda_accord_2008,"Drivers seat not comfortable, the car its...",1
43,comfort_toyota_camry_2007,Ride seems comfortable and gas mileage fa...,1
1,gas_mileage_toyota_camry_2007,Ride seems comfortable and gas mileage fa...,1
45,interior_honda_accord_2008,I love the new body style and the interior...,1
22,interior_toyota_camry_2007,"First of all, the interior has way too ma...",1
35,mileage_honda_accord_2008,"It's quiet, get good gas mileage and look...",1
47,performance_honda_accord_2008,"Very happy with my 08 Accord, performance i...",1
42,quality_toyota_camry_2007,I previously owned a Toyota 4Runner which ...,1
29,seats_honda_accord_2008,Front seats are very uncomfor...,1
23,transmission_toyota_camry_2007,"After slowing down, transmission has to b...",1


In [98]:
# Rows whose cluster_label is 2, sorted by filename.
document_df[document_df['cluster_label']==2].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
31,bathroom_bestwestern_hotel_sfo,"The room was not overly big, but clean and...",2
17,food_holiday_inn_london,The room was packed to capacity with queu...,2
32,food_swissotel_chicago,The food for our event was deli...,2
49,free_bestwestern_hotel_sfo,The wine reception is a great idea as it i...,2
39,location_bestwestern_hotel_sfo,"Good Value good location , ideal ...",2
3,location_holiday_inn_london,Great location for tube and we crammed in...,2
50,parking_bestwestern_hotel_sfo,Parking was expensive but I think this is ...,2
28,price_holiday_inn_london,"All in all, a normal chain hotel on a nice...",2
2,room_holiday_inn_london,"We arrived at 23,30 hours and they could n...",2
46,rooms_bestwestern_hotel_sfo,"Great Location , Nice Rooms , Helpless...",2


## 8.7.3 군집(Cluster)별 핵심 단어 추출하기
- 각 군집에 속한 문서는 핵심 단어를 주축으로 군집화 -> 군집의 핵심 단어 확인
- KMeans 객체 cluster_centers_ 속성
    - 각 군집을 구성하는 단어 피처가 군집의 중심(centroid)을 기준으로 얼마나 가깝게 위치하는지 나타냄
    - 행: 개별 군집
    - 열: 개별 피처
    - 각 배열값
        - 개별 군집 내 상대 위치를 숫자 값으로 표현한 일종의 좌표값 (예) c[0,1] = 0번 군집에서 2번째 피처의 위치값
        - 개별 중심과 얼마나 가까운지를 상댓값으로 나타냄. 0~1 사이의 값을 가지며, 1에 가까울수록 중심에 가까움

In [18]:
cluster_centers = km_cluster.cluster_centers_  # centroid array: one row per cluster, one column per feature
print('cluster_centers shape :',cluster_centers.shape)
print(cluster_centers)

cluster_centers shape : (3, 2409)
[[0.01819865 0.         0.         ... 0.         0.         0.00471073]
 [0.         0.00170335 0.0025537  ... 0.0032582  0.00349413 0.        ]
 [0.         0.00137309 0.         ... 0.         0.         0.        ]]


In [100]:
def get_cluster_details(cluster_model, cluster_data, feature_names, clusters_num, top_n_features=10):
    """Summarize each cluster by its top features and member file names.

    Args:
        cluster_model: fitted clustering model exposing a 2-D
            ``cluster_centers_`` array (rows = clusters, columns = features).
        cluster_data: DataFrame with ``cluster_label`` and ``filename`` columns.
        feature_names: sequence mapping a feature column index to its word.
        clusters_num: number of clusters to report (labels 0..clusters_num-1).
        top_n_features: how many of the largest centroid components to keep.

    Returns:
        dict keyed by cluster number; each value is a dict with the keys
        ``cluster``, ``top_features``, ``top_features_value`` and ``filenames``.
    """
    centers = cluster_model.cluster_centers_

    # Per centroid, feature column indices ordered from the largest component
    # value to the smallest.
    ranked_columns = centers.argsort()[:, ::-1]

    details = {}
    for label in range(clusters_num):
        # Columns of the top-n features for this cluster.
        best_columns = ranked_columns[label, :top_n_features]

        # File names of the documents assigned to this cluster.
        members = cluster_data.loc[cluster_data['cluster_label'] == label, 'filename']

        details[label] = {
            'cluster': label,
            'top_features': [feature_names[col] for col in best_columns],
            'top_features_value': centers[label, best_columns].tolist(),
            'filenames': members.values.tolist(),
        }

    return details


In [113]:
def print_cluster_details(cluster_details):
    """Print each cluster's top features and its member review file names.

    cluster_details maps a cluster number to a dict that provides at least
    the 'top_features' and 'filenames' entries.
    """
    for cluster_num in cluster_details:
        detail = cluster_details[cluster_num]
        print(f'####### Cluster {cluster_num}')
        print('Top features:', detail['top_features'])
        print('--------------------------')
        #print('Top features value:', detail['top_features_value'])
        member_files = detail['filenames']
        print('Reviews 파일명 개수 :', len(member_files))
        print('Reviews 파일명 :', member_files) #[:7]
        print('==================================================')


In [114]:
# TfidfVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer get_feature_names_out() and fall back to the old
# method only on versions that lack it. list() keeps feature_names a plain
# list, as the old method returned.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    feature_names = list(tfidf_vect.get_feature_names_out())
else:
    feature_names = tfidf_vect.get_feature_names()
#print("feature_names:", feature_names)

# Top-10 features and member file names for each of the 3 clusters.
cluster_details = get_cluster_details(cluster_model=km_cluster, cluster_data=document_df,
                                      feature_names=feature_names, clusters_num=3, top_n_features=10)

print_cluster_details(cluster_details)

####### Cluster 0
Top features: ['screen', 'battery', 'life', 'battery life', 'keyboard', 'kindle', 'size', 'button', 'easy', 'voice']
--------------------------
Reviews 파일명 개수 : 25
Reviews 파일명 : ['battery-life_ipod_nano_8gb', 'voice_garmin_nuvi_255W_gps', 'speed_garmin_nuvi_255W_gps', 'size_asus_netbook_1005ha', 'screen_garmin_nuvi_255W_gps', 'battery-life_amazon_kindle', 'satellite_garmin_nuvi_255W_gps', 'battery-life_netbook_1005ha', 'keyboard_netbook_1005ha', 'video_ipod_nano_8gb', 'performance_netbook_1005ha', 'updates_garmin_nuvi_255W_gps', 'features_windows7', 'sound_ipod_nano_8gb', 'screen_ipod_nano_8gb', 'buttons_amazon_kindle', 'accuracy_garmin_nuvi_255W_gps', 'directions_garmin_nuvi_255W_gps', 'eyesight-issues_amazon_kindle', 'screen_netbook_1005ha', 'navigation_amazon_kindle', 'speed_windows7', 'price_amazon_kindle', 'fonts_amazon_kindle', 'display_garmin_nuvi_255W_gps']
####### Cluster 1
Top features: ['interior', 'seat', 'mileage', 'comfortable', 'car', 'gas', 'transmissi