<a href="https://colab.research.google.com/github/kobemawu/www/blob/master/Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 課題１: 以下の要件を満たすプログラムを作成
- 100個以上の文書をクラスタリング
- K-means手法を利用

## 必要なライブラリ・データセットのインポート

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import collections

# Fetch the NLTK corpora this notebook relies on: the English stop-word list
# (used to build `en_stop`) and WordNet (used by `wn.morphy` for lemmatization).
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to /home/junya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/junya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### 百人一首のデータを読み込み

In [2]:
import glob
import os
import re

# Ogura Hyakunin Isshu
# Path (relative to the working directory) of the text file holding the poems;
# the loading code below treats every line of this file as one document.
DATA_PATH = './100poets/100poets.txt'

def cleaning_text(text):
    """Strip sentence punctuation, double hyphens and newlines from a string.

    The three removals run one after another (punctuation, then "--", then
    newline), so a "--" that only appears once punctuation is gone is removed
    as well.

    Args:
        text: The string to clean (e.g. one line of the corpus file).

    Returns:
        A new string with the characters , . ! ? ; : removed, every "--"
        removed and every newline character removed. Single hyphens and all
        other characters are kept.
    """
    # Raw string: the earlier non-raw pattern contained the invalid escape
    # sequence backslash-question-mark, which newer Python versions warn about.
    # Inside a character class "?" needs no escaping.
    text = re.sub(r'[,.!?;:]', '', text)
    # remove "--" (literal text, so no regex is needed)
    text = text.replace('--', '')
    # remove newline characters
    text = text.replace('\n', '')
    return text

# read file and set docs
# The file is read once inside a context manager, so the handle is closed even
# if cleaning a line raises. `org` keeps each cleaned line as a single string
# and `docs` is the whitespace-tokenized form of the same lines.
with open(DATA_PATH) as f:
    org = [cleaning_text(line) for line in f]
docs = [text.split() for text in org]

print("num of docs:", len(docs))

num of docs: 100


## 前処理

In [3]:
# Stop-word list used by the preprocessing functions below. It is built in one
# expression, in this order: extra punctuation tokens, the ten digit
# characters, then NLTK's English stop words.
en_stop = (
    ["``","/",",.",".,",";","--",":",")","(",'"','&',"'",'),',',"','-','.,','.,"','.-',"?",">","<"]
    + list("0123456789")
    + nltk.corpus.stopwords.words('english')
)

# create preprocessing function
from nltk.corpus import wordnet as wn # imported for the lemmatize step (wn.morphy) in preprocess_word

def preprocess_word(word, stopwordset):
    """Normalize a single token, or drop it.

    Steps, in order:
      1. lowercase the token (ex: Python => python);
      2. drop a bare "," or ".";
      3. drop the token if it is a stop word (ex: the => None);
      4. lemmatize with ``wn.morphy`` (ex: cooked => cook).

    Args:
        word: The token to normalize.
        stopwordset: Collection of stop words, queried with ``in``.

    Returns:
        None when the token is dropped; otherwise the lemma, or the lowercased
        token itself when ``wn.morphy`` returns None for it.
    """
    token = word.lower()

    # Steps 2 and 3: punctuation-only tokens and stop words are discarded.
    if token in (",", ".") or token in stopwordset:
        return None

    base_form = wn.morphy(token)
    if base_form is None:
        # No lemma available: keep the lowercased token as it is.
        return token

    # The lemmatized form may itself be a stop word.
    return None if base_form in stopwordset else base_form

def preprocess_document(document):
    """Preprocess every token of one document and discard the dropped ones.

    Args:
        document: Iterable of tokens (strings).

    Returns:
        List of the tokens returned by ``preprocess_word`` (called with the
        module-level ``en_stop`` list), without the None entries.
    """
    normalized = (preprocess_word(token, en_stop) for token in document)
    return [token for token in normalized if token is not None]

def preprocess_documents(documents):
    """Apply ``preprocess_document`` to each document and collect the results.

    Args:
        documents: Iterable of documents, each an iterable of tokens.

    Returns:
        List with one preprocessed token list per input document, in order.
    """
    cleaned = []
    for document in documents:
        cleaned.append(preprocess_document(document))
    return cleaned

# preprocessing
# Run lowercasing, stop-word removal and lemmatization over every tokenized
# document in `docs`; `pre_docs` holds one list of surviving tokens per document.
pre_docs = preprocess_documents(docs)

#print(pre_docs[0]) # for debug

## クラスタリング

### (a) BoW(Bag of Words)によるベクトル化

In [4]:
# define bow vectorizer
def bow_vectorizer(docs):
    """Build bag-of-words count vectors for a list of tokenized documents.

    Word ids are assigned in order of first appearance, starting at 0.

    Args:
        docs: Sequence of documents, each a sequence of tokens. It is iterated
            twice (once to build the vocabulary, once to count).

    Returns:
        A tuple ``(vectors, word2id)``: ``vectors`` is a list with one count
        list per document (length = vocabulary size) and ``word2id`` maps each
        token to its column index.
    """
    vocabulary = {}
    for tokens in docs:
        for token in tokens:
            # setdefault only inserts unseen tokens; the next free id is the
            # current vocabulary size.
            vocabulary.setdefault(token, len(vocabulary))

    vectors = []
    for tokens in docs:
        counts = [0] * len(vocabulary)
        for token in tokens:
            counts[vocabulary[token]] += 1
        vectors.append(counts)

    return vectors, vocabulary

# Vectorize the preprocessed documents with the hand-written BoW vectorizer
bow_vec, word2id = bow_vectorizer(pre_docs)

# Largest single-term count of each document, then the overall maximum and the
# vocabulary size
result = list(map(max, bow_vec))
print("max number of elemenet: ", max(result))
print("number of words: ", len(word2id))

max number of elemenet:  3
number of words:  669


### (b) TF-IDF(Term Frequency - Inverse Document Frequency)によるベクトル化
#### vectorizerの作成（ハイパーパラメーターの設定）

In [5]:
# create vectorizer, (https://gotutiyan.hatenablog.com/entry/2020/09/10/181919)
vectorizer = TfidfVectorizer(
    # limit the TF-IDF vocabulary to 80% of the BoW vocabulary size
    max_features=int(len(word2id)*0.80),
    # "\w+" accepts tokens of one or more word characters, so one-letter
    # tokens are kept as well
    token_pattern=u'(?u)\\b\\w+\\b',
)

#### 語彙の獲得、idfの計算、tf-idf行列の計算

In [6]:
# TfidfVectorizer expects one string per document, so join each token list
# with spaces. `pre_docs` is rebound here; the isinstance guard makes the cell
# safe to re-run: on a second run the items are already strings, and a bare
# " ".join(doc) would then put a space between every character.
pre_docs = [doc if isinstance(doc, str) else " ".join(doc) for doc in pre_docs]

# fit & transform
tf_idf = vectorizer.fit_transform(pre_docs)

# for debug
#print(tf_idf[0])
#print(tf_idf[0].toarray())

## K-means

### クラスタリングの性能評価:シルエット分析

In [7]:
import numpy as np

from sklearn import preprocessing
# Rescale the TF-IDF matrix with scikit-learn's `normalize` using its default
# settings (presumably unit L2 norm per document row - confirm in the sklearn
# docs). `tf_idf` is rebound to the normalized matrix, which every K-means and
# silhouette call below uses.
tf_idf = preprocessing.normalize(tf_idf)

# define kmeans method
def kmeans(n_clust):
    """Cluster the module-level ``tf_idf`` matrix with K-means.

    Args:
        n_clust: Number of clusters to request.

    Returns:
        The cluster label assigned to each row of ``tf_idf``, as returned by
        ``KMeans.fit_predict``.
    """
    # Fixed settings; random_state=0 pins the seed of the initialization.
    settings = dict(
        n_clusters=n_clust,
        init="k-means++",
        n_init=10,
        max_iter=300,
        random_state=0,
    )
    return KMeans(**settings).fit_predict(tf_idf)

def n_label(y_km):
    """Summarize which cluster labels occur in a label array.

    Args:
        y_km: Array-like of cluster labels, one per sample.

    Returns:
        A two-element list ``[labels, count]``: the sorted unique labels (as
        produced by ``np.unique``) and how many distinct labels there are.
    """
    labels = np.unique(y_km)
    return [labels, len(labels)]

#### 最適なクラスタ数を探索

In [8]:
from sklearn.metrics import silhouette_samples
import statistics

# A k is accepted when the largest and the smallest cluster differ by fewer
# than this many samples.
MAX_CLUSTER_SIZE_SPREAD = 7

best_number_of_clusters = 0

# search
# Every k in 2..14 is tried. Each accepted k overwrites the previous one, so
# the largest accepted k is the one kept in `best_number_of_clusters`.
for k in range(2,15):
    y_km = kmeans(k)
    cluster_labels, n_clusters = n_label(y_km)

    # calc. silhouette
    # NOTE: the per-sample silhouette values are only used to count the
    # members of each cluster; the acceptance rule below looks at cluster
    # sizes, not at the silhouette scores themselves.
    silhouette_vals=silhouette_samples(tf_idf,y_km,metric='cosine')

    sil = [] # number of samples in each cluster
    for c in cluster_labels:
        c_silhouette_vals=silhouette_vals[y_km==c]
        sil.append(len(c_silhouette_vals))

    # find best clusters
    # Report and store the k that was actually requested. The previous code
    # used `i+1`, an index leaked from the counting loop, which only matched k
    # by coincidence.
    is_balanced = max(sil)-min(sil) < MAX_CLUSTER_SIZE_SPREAD
    if is_balanced:
        best_number_of_clusters = k
    verdict = "    best" if is_balanced else "not best"
    print("%s of k:%d" % (verdict, k), "var:", '{:.2g}'.format(statistics.pvariance(sil)), sil)

# Fail here with a clear message instead of letting the next cell call KMeans
# with n_clusters=0.
if best_number_of_clusters == 0:
    raise ValueError(
        "no k in 2..14 produced clusters whose sizes differ by less than %d"
        % MAX_CLUSTER_SIZE_SPREAD
    )

not best of k:2 var: 3.6e+02 [69, 31]
not best of k:3 var: 1.6e+02 [51, 25, 24]
not best of k:4 var: 9 [30, 22, 24, 24]
not best of k:5 var: 50 [17, 28, 13, 29, 13]
not best of k:6 var: 26 [16, 26, 18, 10, 12, 18]
not best of k:7 var: 19 [19, 13, 16, 15, 10, 7, 20]
not best of k:8 var: 12 [13, 19, 15, 12, 13, 6, 10, 12]
not best of k:9 var: 9.2 [15, 8, 13, 8, 9, 11, 17, 10, 9]
    best of k:10 var: 4.4 [7, 7, 10, 12, 11, 9, 13, 9, 9, 13]
not best of k:11 var: 6.1 [8, 11, 7, 11, 13, 13, 9, 6, 9, 6, 7]
    best of k:12 var: 4.2 [6, 9, 11, 10, 6, 10, 11, 10, 5, 8, 6, 8]
not best of k:13 var: 9.6 [3, 7, 3, 8, 9, 14, 7, 13, 7, 9, 8, 5, 7]
not best of k:14 var: 11 [9, 9, 8, 7, 7, 8, 12, 4, 3, 3, 5, 15, 7, 3]


#### クラスタ毎に表示

In [9]:
# Fit K-means again with the selected number of clusters
km = KMeans(
    n_clusters=best_number_of_clusters,
    init="k-means++",
    n_init=10,
    max_iter=300,
    random_state=0,
)
y_km = km.fit_predict(tf_idf)

# Print the documents of each cluster, prefixed with their 1-based position
# in `org`
for cluster in range(best_number_of_clusters):
    print('---------- Cluster', cluster, ' ----------')
    for index, poem in enumerate(org):
        if y_km[index] == cluster:
            print(index+1, ':', poem)

---------- Cluster 0  ----------
17 : Even when the gods Held sway in the ancient days I have never heard That water gleamed with autumn red As it does in Tatta's stream
31 : At the break of day Just as though the morning moon Lightened the dim scene Yoshino's village lay In a haze of falling snow
37 : In the autumn fields When the heedless wind blows by Over the pure-white dew How the myriad unstrung gems Are scattered everywhere around
58 : As Mount Arima Sends its rustling winds across Ina's bamboo plains I will be just as steadfast And never will forget you
94 : From Mount Yoshino Blows a chill autumnal wind In the deepening night The ancient village shivers Sounds of beating cloth I hear
100 : In this ancient house Paved with a hundred stones Ferns grow in the eaves But numerous as they are My old memories are more
---------- Cluster 1  ----------
7 : When I look up at The wide-stretched plain of heaven Is the moon the same That rose on Mount Mikasa In the land of Kasuga
12 : Let 