# Japanese Character Clustering using Text Imaging

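The goal of this notebook is to test a topic-modeling-style approach that groups Japanese characters by their visual appearance: each character is rendered as a small image, and the images are clustered with PCA and K-Means. The idea is motivated by radicals, the recurring graphical components shared by many characters in Japanese text.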

## Setup

In [13]:
import string

import matplotlib
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image as im

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [None]:
# Font used to draw every character image.
# `fm` is matplotlib.font_manager; it has to be imported in the setup cell,
# otherwise this line raises NameError on a fresh kernel.
FONT_PATH = 'NotoSerifJP-Regular.otf'  # relative path: resolved against the current working directory
prop = fm.FontProperties(fname=FONT_PATH)

In [11]:
# Load the pickled frame of articles from the local file 'wiki'.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
wiki_df = pd.read_pickle('wiki')
# Show the first rows as a quick sanity check of the loaded frame.
wiki_df.head()

Unnamed: 0,article,views,text
0,メインページ,362562853,ようこそ\nウィキペディア - ウィキペディア日本語版 - 百科事典目次\n検索資料・ポータ...
1,星野源,10190763,星野 源（ほしの みなもと、1981年1月28日 - ）は、日本の音楽家、俳優、文筆家。埼玉...
2,真田信繁,9602104,真田 信繁（さなだ のぶしげ）は、安土桃山時代から江戸時代初期にかけての武将、大名。真田昌幸...
3,高橋一生,8571666,高橋 一生（たかはし いっせい、英字表記：Issey Takahashi、1980年12月9...
4,君の名は。,7788879,『君の名は。』（きみのなは、英: Your Name.）は、2016年に公開された新海誠監督...


In [45]:
# Stop list: common Japanese function words, Wikipedia boilerplate terms,
# punctuation and whitespace.
# Every entry must be followed by a comma: two adjacent string literals are
# silently concatenated by Python into a single entry.
stop = [
    'あそこ','あっ','あの','あのかた','あの人','あり','あります','ある','あれ','い',
    'いう','います','いる','う','うち','え','お','および','おり','おります',
    'か','かつて','から','が','き','ここ','こちら','こと','この','これ',
    'これら','さ','さらに','し','しかし','する','ず','せ','せる','そこ',
    'そして','その','その他','その後','それ','それぞれ','それで','た','ただし','たち',
    'ため','たり','だ','だっ','だれ','つ','て','で','でき','できる',
    'です','では','でも','と','という','といった','とき','ところ','として','とともに',
    'とも','と共に','どこ','どの','な','ない','なお','なかっ','ながら','なく',
    'なっ','など','なに','なら','なり','なる','なん','に','において','における',
    'について','にて','によって','により','による','に対して','に対する','に関する','の','ので',
    'のみ','は','ば','へ','ほか','ほとんど','ほど','ます','また','または',
    'まで','も','もの','ものの','や','よう','より','ら','られ','られる',
    'れ','れる','を','ん','何','及び','彼','彼女','我々','特に',
    '私','私達','貴方','貴方方',
    # Wikipedia main-page boilerplate and tokenizer category names
    'ようこそ','ウィキペディア','ウィキペディア日本語版','百科事典目次','検索資料','空白','補助記号',
    # Punctuation and whitespace
    '）', '（', '。', '、', '『', '』', '・', '！','：', '／', '＋', '→', '「', '」',
    '\n', ' ', '-', '/', '\\',
]

In [15]:
# Pull the 'text' column out of the frame as a NumPy array, one entry per row of wiki_df.
data = np.array(wiki_df['text'])

## Data Cleaning

In [46]:
# Data Cleaning - remove all stop characters
# The text is filtered one character at a time, so only the single-character
# entries of `stop` can ever match; multi-character stop words stay in the text.
# Collecting those entries in a set gives constant-time lookups instead of a
# scan of the whole list for every character.
stop_chars = {entry for entry in stop if len(entry) == 1}

# One cleaned string per entry of `data`, in the same order.
new_data = ["".join(ch for ch in text if ch not in stop_chars) for text in data]

## Generate character BMPs and save pixel values to data frame

In [None]:
# Render the leading characters of each document as tiny images and collect the
# raw bytes of the resulting BMP files, one row per character:
#   [doc_num, character_idx, byte_0, byte_1, ...]
N_DOCS = 50    # number of documents (from the start of new_data) to process
N_CHARS = 100  # number of leading characters rendered per document

all_bmps = []
skipped = []  # (doc_num, character_idx, error) for each character that could not be rendered

# Cycle through the first N_DOCS documents in the data
for doc_num, doc in enumerate(new_data[:N_DOCS]):
    # Slicing stops at the end of short documents, so no index can run past the text
    for character_idx, character in enumerate(doc[:N_CHARS]):
        # Create figure with given character
        fig = plt.figure(dpi=4)
        try:
            plt.imshow(np.zeros((1,1)),cmap='binary')
            plt.text(-0.45, 0.3, character, fontproperties=prop, fontsize=200)

            # Save the figure and convert to a BMP
            fig.savefig('tmp.PNG',dpi=4)
            with im.open("tmp.PNG") as png_image:
                png_image.save("tmp.bmp")

            # Read back the complete BMP file (header bytes included)
            with open('tmp.bmp','rb') as in_bmp:
                image_bytes = in_bmp.read()
        except Exception as err:
            # Best effort: remember the failure and move on to the next character
            # instead of hiding it (KeyboardInterrupt is no longer swallowed).
            skipped.append((doc_num, character_idx, repr(err)))
            continue
        finally:
            # Always release the figure, even when rendering failed
            plt.close(fig)

        # Keep track of which characters belong to each document;
        # iterating a bytes object yields its values as ints in 0..255
        all_bmps.append([doc_num, character_idx] + list(image_bytes))

if skipped:
    print("Skipped", len(skipped), "characters that could not be rendered; first error:", skipped[0][2])

# Create a data frame from the list of BMP bytes
df = pd.DataFrame(all_bmps)

In [26]:
# Load a previously saved frame of BMP bytes. This rebinds `df`, replacing
# whatever the generation cell above produced in this session.
# NOTE(review): presumably written by that generation cell with the same
# 50-document / 100-character limits (going by the file name) — confirm.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
df = pd.read_pickle('first50doc_first100char_bmps.pkl')
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1582,1583,1584,1585,1586,1587,1588,1589,1590,1591
0,0,0,66,77,54,6,0,0,0,0,...,255,0,255,255,255,0,255,255,255,0
1,0,1,66,77,54,6,0,0,0,0,...,255,0,255,255,255,0,255,255,255,0
2,0,2,66,77,54,6,0,0,0,0,...,255,0,255,255,255,0,255,255,255,0
3,0,3,66,77,54,6,0,0,0,0,...,255,0,255,255,255,0,255,255,255,0
4,0,4,66,77,54,6,0,0,0,0,...,255,0,255,255,255,0,255,255,255,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,49,95,66,77,54,6,0,0,0,0,...,255,0,255,255,255,0,255,255,255,0
4996,49,96,66,77,54,6,0,0,0,0,...,255,0,255,255,255,0,255,255,255,0
4997,49,97,66,77,54,6,0,0,0,0,...,255,0,255,255,255,0,255,255,255,0
4998,49,98,66,77,54,6,0,0,0,0,...,255,0,255,255,255,0,255,255,255,0


## PCA and K-Means Clustering

In [47]:
# Perform PCA to reduce number of components
SEED = 0         # fixed seed so the projection and the clustering are repeatable between runs
N_CLUSTERS = 24  # number of K-Means clusters

# Columns 0 and 1 hold the document / character indices; every remaining column
# is one byte of the character's BMP file. Selecting by position avoids
# hard-coding the number of bytes per image.
bmp_bytes = df.iloc[:, 2:]
pca = PCA(n_components=24, random_state=SEED)
reduced_data = pca.fit_transform(bmp_bytes)

# 240 radicals, so choosing 10% of that
# NOTE(review): the traditional Kangxi system lists 214 radicals — confirm where 240 comes from.
kmeans = KMeans(init="k-means++", n_clusters=N_CLUSTERS, random_state=SEED)
kmeans.fit(reduced_data)

KMeans(n_clusters=24)

In [48]:
# Predict labels
# One cluster index per row of reduced_data, in the same row order as df.
labels = kmeans.predict(reduced_data)

In [None]:
# Print some label samples
# For every cluster, list the characters whose images were assigned to it.
# Columns 0 and 1 of df give the document number and the character position,
# which are used to look the character up again in new_data.
# NOTE(review): this assumes new_data is the same cleaned text the rows of df
# were generated from — confirm when df comes from the pickle file.
doc_indices = df[0].to_numpy()
char_indices = df[1].to_numpy()

# Take the cluster count from the fitted model so it cannot drift from the clustering cell
for label in range(kmeans.n_clusters):
    print("Label: " + str(label))
    matching_chars = [
        new_data[doc_idx][char_idx]
        for doc_idx, char_idx, assigned in zip(doc_indices, char_indices, labels)
        if assigned == label
    ]
    print(matching_chars)
    print("\n")