# Multimodal Clustering Analysis

Sep 2019  
Dongmin Kim
[underkoo@kaist.ac.kr](mailto:underkoo@kaist.ac.kr)

**Expected input data:**
900-dimensional latent features of multimodal Instagram posts (one row per post).

**What this script does:**  
Exploratory data analysis on the multimodal latent features. Checks for feature correlation (features = 900 dimensions), applies and visualizes PCA, and applies t-SNE to different data (raw, PCA, etc.).


In [None]:
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import numpy as np
import pandas as pd
import os
import copy
import collections
import seaborn as sns; sns.set()
import sklearn
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from MulticoreTSNE import MulticoreTSNE
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import Birch, SpectralClustering, AffinityPropagation, AgglomerativeClustering, MiniBatchKMeans, KMeans, DBSCAN, OPTICS
import matplotlib.colors as colors
import math
import time
import config

In [None]:
# magic line to have in-notebook interactive plots
%matplotlib notebook

In [None]:
# To render Korean text in plot labels, uncomment the two lines below.
# path_gothic = './fonts/NanumBarunGothic.ttf'
# fontprop1 = fm.FontProperties(fname=path_gothic, size=3)

In [None]:
# Project settings object; the cells below read CSV_PATH and SVG_PATH from it.
CONFIG = config.Config

## load latent feature

In [None]:
# load raw data
# target_csv is the base file name (without extension); later cells reuse it
# to name every derived CSV/SVG file.
target_csv = 'text_doc2vec_seoul_subway'
# first CSV column -> index, first CSV row -> column names
df_data = pd.read_csv(os.path.join(CONFIG.CSV_PATH, target_csv + '.csv'), index_col=0, header=0, encoding='utf-8-sig')
print('complete csv shape: ' + str(df_data.shape))
# preview of (up to) the first 100 rows
print(df_data.iloc[:100])

In [None]:
# per-column summary statistics of the raw features
print(df_data.describe())

In [None]:
# data seems to be well normalized to mean or median = 0 and std = 1
# box plot of every column of the raw features, drawn into a new figure
plt.figure()
df_data.boxplot()

In [None]:
# Scale every row to unit L2 norm: each row is divided by the square root of
# its own sum of squares.
# NOTE(review): a row consisting only of zeros has norm 0 and would become
# NaN here - confirm that no such rows exist in the input.
df_normalized = df_data.div((np.sqrt(np.sum(np.square(df_data), axis=1))), axis=0)
# store the normalized features next to the raw CSV
df_normalized.to_csv(os.path.join(CONFIG.CSV_PATH, 'normalized_' + target_csv + '.csv'), encoding='utf-8-sig')
# alternative: reload the stored file instead of recomputing
#df_normalized = pd.read_csv(os.path.join(CONFIG.CSV_PATH, 'normalized_' + target_csv + '.csv'), index_col=0, header=0, encoding='utf-8-sig')
print('complete csv shape: ' + str(df_normalized.shape))
# preview of (up to) the first 100 rows
print(df_normalized.iloc[:100])

In [None]:
# sanity check: recompute the L2 norm of every normalized row
# (expected to be ~1 for each row after the division above)
print(np.sqrt(np.sum(np.square(df_normalized), axis=1)))

In [None]:
# per-column summary statistics of the L2-normalized features
print(df_normalized.describe())

In [None]:
# box plot of every column of the row-wise L2-normalized features
# NOTE(review): the remark originally attached to this cell ("data seems to be
# well normalized to mean or median = 0 and std = 1") is identical to the one
# on the raw-data box plot - re-check that it also holds for this plot.
plt.figure()
df_normalized.boxplot()

## PCA as dimensionality reduction

In [None]:
def scatterplot_pointlabels(df_twocols, markersize=None):
    """Draw the first two columns of ``df_twocols`` as an unconnected point cloud.

    Every call opens a new matplotlib figure. Column 0 supplies the x values
    and column 1 the y values; any further columns are ignored.

    Args:
        df_twocols: DataFrame whose first two columns hold the coordinates.
        markersize: Passed through to ``plt.plot`` as ``markersize``.
    """
    plt.figure()
    x_values = df_twocols.iloc[:, 0]
    y_values = df_twocols.iloc[:, 1]
    # points only: '.' markers with the connecting line switched off
    plt.plot(x_values, y_values, linestyle='None', marker='.', markersize=markersize)

In [None]:
# PCA on the raw features, keeping the first 100 principal components.
pca = PCA(n_components=100)
# scores: one row per sample, one column per component
df_pca_data = pd.DataFrame(pca.fit_transform(df_data))
# component names are zero-based: 'PC0', 'PC1', ...
df_pca_data.columns = ['PC' + str(i) for i in range(df_pca_data.shape[1])]
df_pca_data.index = df_data.index
print('singular_values_:\n' + str(pca.singular_values_))
print('components_:\n' + str(pca.components_))

# create PCA transformation DF (loadings): one row per original feature,
# one column per component
df_pca_transformation = pd.DataFrame(pca.components_).transpose()
# Reuse the score column names so that a component carries the same label in
# both frames (previously the loadings were numbered from 'PC1' while the
# scores started at 'PC0').
df_pca_transformation.columns = list(df_pca_data.columns)
df_pca_transformation.index = df_data.columns
#print(df_pca_transformation)

In [None]:
# PCA: plot variance explained
# share of the total variance captured by each fitted component
plt.figure()
plt.plot(pca.explained_variance_ratio_, marker='o', linestyle=':')
plt.title('PCA: explained variance')

# running total of the same shares over the components
plt.figure()
plt.plot(np.cumsum(pca.explained_variance_ratio_), marker='o', linestyle=':')
plt.title('PCA: cumulative explained variance')

In [None]:
# Scatter every pair among the first five principal components of the raw
# data (one new figure per pair).
for dimen1 in range(5):
    for dimen2 in range(dimen1 + 1, 5):
        # the two score columns that make up this pair
        df_tmp = df_pca_data.iloc[:, [dimen1, dimen2]]
        x_name, y_name = df_tmp.columns

        scatterplot_pointlabels(df_tmp)
        # axis labels are the component names of the selected columns
        plt.xlabel(x_name)
        plt.ylabel(y_name)

In [None]:
# PCA on the row-wise L2-normalized features, keeping the first 100
# principal components.
pca_normalized = PCA(n_components=100)
# scores: one row per sample, one column per component
df_pca_normalized = pd.DataFrame(pca_normalized.fit_transform(df_normalized))
# component names are zero-based: 'PC0', 'PC1', ...
df_pca_normalized.columns = ['PC' + str(i) for i in range(df_pca_normalized.shape[1])]
df_pca_normalized.index = df_normalized.index
print('singular_values_:\n' + str(pca_normalized.singular_values_))
print('components_:\n' + str(pca_normalized.components_))

# create PCA transformation DF (loadings): one row per original feature,
# one column per component
df_pca_transformation_normalized = pd.DataFrame(pca_normalized.components_).transpose()
# Reuse the score column names so that a component carries the same label in
# both frames (previously the loadings were numbered from 'PC1' while the
# scores started at 'PC0').
df_pca_transformation_normalized.columns = list(df_pca_normalized.columns)
df_pca_transformation_normalized.index = df_normalized.columns
#print(df_pca_transformation_normalized)

In [None]:
# PCA: plot variance explained (PCA fitted on the L2-normalized features)
# share of the total variance captured by each fitted component
plt.figure()
plt.plot(pca_normalized.explained_variance_ratio_, marker='o', linestyle=':')
plt.title('PCA: explained variance')

# running total of the same shares over the components
plt.figure()
plt.plot(np.cumsum(pca_normalized.explained_variance_ratio_), marker='o', linestyle=':')
plt.title('PCA: cumulative explained variance')

In [None]:
# Scatter every pair among the first five principal components of the
# L2-normalized data (one new figure per pair).
for dimen1 in range(5):
    for dimen2 in range(dimen1 + 1, 5):
        # the two score columns that make up this pair
        df_tmp = df_pca_normalized.iloc[:, [dimen1, dimen2]]
        x_name, y_name = df_tmp.columns

        scatterplot_pointlabels(df_tmp)
        # axis labels are the component names of the selected columns
        plt.xlabel(x_name)
        plt.ylabel(y_name)

In [None]:
# store both sets of PCA scores: 'pca_<target>.csv' (raw features) and
# 'pca_normalized_<target>.csv' (L2-normalized features)
df_pca_data.to_csv(os.path.join(CONFIG.CSV_PATH, 'pca_' + target_csv + '.csv'), encoding='utf-8-sig')
# alternative: reload the stored raw-feature scores instead of recomputing
#df_pca_data = pd.read_csv(os.path.join(CONFIG.CSV_PATH, 'pca_' + target_csv + '.csv'), index_col=0, header=0, encoding='utf-8-sig')
df_pca_normalized.to_csv(os.path.join(CONFIG.CSV_PATH, 'pca_normalized_' + target_csv + '.csv'), encoding='utf-8-sig')
# alternative: reload the stored normalized-feature scores instead of recomputing
#df_pca_normalized = pd.read_csv(os.path.join(CONFIG.CSV_PATH, 'pca_normalized_' + target_csv + '.csv'), index_col=0, header=0, encoding='utf-8-sig')
# preview of (up to) the first 100 rows of the raw-feature scores
print(df_pca_data.iloc[:100])

## Downsample

In [None]:
def downsampling(df_data, num_samples=100, random_state=None):
    """Return a random subset of at most ``num_samples`` rows of ``df_data``.

    Rows are drawn without replacement through ``DataFrame.sample``. When the
    frame holds fewer than ``num_samples`` rows, all of its rows are returned
    (in random order) instead of raising ``ValueError``.

    Args:
        df_data: DataFrame to draw rows from.
        num_samples: Upper bound on the number of rows to return.
        random_state: Optional seed forwarded to ``DataFrame.sample``; pass a
            fixed value to get the same subset on every run.

    Returns:
        A new DataFrame containing the sampled rows with their original index
        labels.
    """
    # never request more rows than exist; sampling without replacement
    # would fail otherwise
    sample_size = min(num_samples, len(df_data))
    return df_data.sample(sample_size, random_state=random_state)

In [None]:
# random subset of the raw features (10000 rows requested)
ds_data = downsampling(df_data, 10000)

In [None]:
# #ds_data.to_csv(os.path.join(CONFIG.CSV_PATH, 'ds_' + target_csv + '.csv'), encoding='utf-8-sig')
# ds_data = pd.read_csv(os.path.join(CONFIG.CSV_PATH, 'ds_' + target_csv + '.csv'), index_col=0, header=0, encoding='utf-8-sig')
# print(ds_data)

## t-SNE

In [None]:
def do_tsne(tsne_object, data_to_pass):
    """Embed ``data_to_pass`` with the given t-SNE estimator.

    Args:
        tsne_object: Object exposing ``fit_transform`` that returns a
            two-column array (e.g. a ``TSNE`` or ``MulticoreTSNE`` instance
            created with ``n_components=2``).
        data_to_pass: DataFrame of samples; its index is carried over to the
            result.

    Returns:
        DataFrame with the flat columns ``'tsne1'`` and ``'tsne2'`` and the
        same index as ``data_to_pass``.
    """
    data_tsne = pd.DataFrame(tsne_object.fit_transform(data_to_pass))
    data_tsne.index = data_to_pass.index
    # A flat list yields a plain column Index. The nested list used before
    # ([['tsne1', 'tsne2']]) created a one-level MultiIndex, which made
    # frame['tsne1'] return a DataFrame instead of a Series.
    data_tsne.columns = ['tsne1', 'tsne2']
    return data_tsne

In [None]:
# on pca
# Grid over learning rate x perplexity: one t-SNE run on the PCA scores of the
# L2-normalized features per combination, each result saved to its own CSV.
# TODO tune the learning rate and the iterations (rest should be ok i guess)
lr_list = [20, 100, 200]
perp_list = [50, 200, 500, 1000]
# single-combination settings for a quick run
# lr_list = [200]
# perp_list = [50]
for lr in lr_list:
    for perp in perp_list:     
        start_time = time.time()
        # sklearn alternative (single core), kept for reference
        #tsne_pca = do_tsne(TSNE(n_components=2, perplexity=50, early_exaggeration=12.0, learning_rate=100, n_iter=5000, random_state=42, verbose=1), df_pca_data)
        tsne_pca = do_tsne(MulticoreTSNE(n_components=2, perplexity=perp, learning_rate=lr, n_iter=2000, random_state=42, n_jobs=4, verbose=1), df_pca_normalized)
        print("time elapsed: " + str(time.time()-start_time) + " lr: " + str(lr) + " perp: " + str(perp))
        # plotting is done in the next cell from the saved CSV files
        #scatterplot_pointlabels(tsne_pca, 0.2)
        #plt.title('t-SNE on PCA data lr: ' + str(lr) + ' perp: ' + str(perp) )
        #plt.savefig(os.path.join(CONFIG.SVG_PATH, 'tsne_pca_' + target_csv + '_' + str(lr) + '_' + str(perp) + '.svg'))
        # file name pattern: tsne_pca_normalized_<target>_<lr>_<perp>.csv
        tsne_pca.to_csv(os.path.join(CONFIG.CSV_PATH, 'tsne_pca_normalized_' + target_csv + '_' + str(lr) + '_' + str(perp) + '.csv'), encoding='utf-8-sig')

In [None]:
# plot t-SNE with PCA
# Re-read every embedding written by the grid cell above and save one scatter
# plot (SVG) per learning-rate / perplexity combination.

for lr in lr_list:
    for perp in perp_list:     
        tsne_pca = pd.read_csv(os.path.join(CONFIG.CSV_PATH, 'tsne_pca_normalized_' + target_csv + '_' + str(lr) + '_' + str(perp) + '.csv'), index_col=0, header=0, encoding='utf-8-sig')
        # first two columns of the loaded frame are plotted, marker size 0.2
        scatterplot_pointlabels(tsne_pca, 0.2)
        plt.title('t-SNE on PCA data lr: ' + str(lr) + ' perp: ' + str(perp) )
        plt.savefig(os.path.join(CONFIG.SVG_PATH, 'tsne_pca_' + target_csv + '_' + str(lr) + '_' + str(perp) + '.svg'))


In [None]:
#tsne_pca.to_csv(os.path.join(CONFIG.CSV_PATH, 'tsne_' + target_dataset + '.csv'), encoding='utf-8-sig')
#tsne_pca = pd.read_csv(os.path.join(CONFIG.CSV_PATH, 'tsne_' + target_csv + '.csv'), index_col=0, header=0, encoding='utf-8-sig')
# Load one previously saved embedding; following the naming pattern used when
# saving, the trailing '_200_50' stands for learning rate 200 / perplexity 50.
# NOTE(review): the hard-coded name refers to 'text_latent_instagram0830',
# while target_csv is set to a different dataset - confirm that this file's
# index matches df_normalized before the two are combined for clustering.
tsne_pca = pd.read_csv(os.path.join(CONFIG.CSV_PATH, 'tsne_pca_normalized_text_latent_instagram0830_200_50.csv'), index_col=0, header=0, encoding='utf-8-sig')
# drop the first row of the loaded frame
# NOTE(review): presumably the first row of the saved file is not a data
# point - verify against the CSV, otherwise one sample is lost here.
tsne_pca = tsne_pca.iloc[1:]
# preview of (up to) the first 100 rows
print(tsne_pca.iloc[:100])

In [None]:
        # on raw data
# TODO tune the learning rate and the iterations (rest should be ok i guess)
# start_time = time.time()
# tsne_raw = do_tsne(MulticoreTSNE(n_components=2, perplexity=400, early_exaggeration=12.0, learning_rate=100, n_iter=10000, random_state=42, n_jobs=4), data)
# print("time elapsed: " + str(time.time()-start_time))

In [None]:
# plot t-SNE with raw data
# scatterplot_pointlabels(tsne_raw, plot_labels=False)
# plt.title('t-SNE on raw data')
# scatterplot_pointlabels(tsne_raw)
# plt.title('t-SNE on raw data')

## Spectral Clustering

In [None]:
#temp
# One-off timing check: fit KMeans with 24 clusters on the L2-normalized
# features, then compute the silhouette score with the cosine metric.
start_time = time.time()
temp_clustering = KMeans(n_clusters = 24, random_state=42).fit(df_normalized)
labels = temp_clustering.labels_
print("time elapsed for clustering: " + str(time.time()-start_time))
# restart the timer so the silhouette computation is timed separately
start_time = time.time()
print(silhouette_score(df_normalized, labels, metric = 'cosine'))
print("time elapsed for silhouette_score: " + str(time.time()-start_time))

In [None]:
def clustering_score(X, max_num_clusters=10):
    """Fit KMeans for every k from 2 to ``max_num_clusters`` and score each fit.

    Progress and the wall-clock time of both the fit and the silhouette
    computation are printed for every k.

    Args:
        X: Samples to cluster; passed unchanged to ``KMeans.fit`` and
            ``silhouette_score``.
        max_num_clusters: Largest number of clusters to try (inclusive).

    Returns:
        Tuple ``(sse, sil)`` of two lists ordered by k = 2, 3, ...: the KMeans
        inertia and the silhouette score (default metric) of each fit.
    """
    inertias, silhouettes = [], []
    for n_clusters in range(2, max_num_clusters + 1):
        print("current cluster: " + str(n_clusters))

        tic = time.time()
        model = KMeans(n_clusters=n_clusters, random_state=42).fit(X)
        print("time elapsed for clustering: " + str(time.time() - tic))
        inertias.append(model.inertia_)

        # timed separately from the fit
        tic = time.time()
        silhouettes.append(silhouette_score(X, model.labels_))
        print("time elapsed for silhouette_score: " + str(time.time() - tic))
    return inertias, silhouettes

In [None]:
def sse_distance(sse):
    """Distance of each interior SSE point from the chord of the SSE curve.

    The values are treated as points ``(i, sse[i])``. A straight line is drawn
    from the first point to the last one, and for every point strictly between
    them the perpendicular distance to that line is computed.

    Args:
        sse: Sequence of numbers (e.g. KMeans inertia per cluster count).

    Returns:
        List with ``len(sse) - 2`` distances, in the order of the interior
        points; an empty list when there are fewer than three values.
    """
    last = len(sse) - 1
    if last < 2:
        # no interior points between the two ends
        return []

    rise = sse[last] - sse[0]   # y2 - y1
    run = last - 0              # x2 - x1
    chord_length = math.sqrt(math.pow(rise, 2) + math.pow(run, 2))
    # x2*y1 - y2*x1 with x1 == 0, so only the first product remains
    cross_term = last * sse[0]

    return [
        abs(rise * i - run * sse[i] + cross_term) / chord_length
        for i in range(1, last)
    ]

In [None]:
# Try k = 2..max_num_clusters on the L2-normalized features and plot three
# diagnostics against the number of clusters.
max_num_clusters = 30
sse, sil = clustering_score(df_normalized, max_num_clusters)
# 1) KMeans inertia (SSE) per k
fig = plt.figure()
plt.plot(range(2, max_num_clusters + 1), sse, marker='o')
plt.xlabel('num of clusters')
plt.ylabel('SSE')
# 2) distance of every interior SSE point from the line joining the first and
#    the last SSE point
distance = sse_distance(sse)
fig = plt.figure()
# sse_distance leaves out the first and last k, hence x runs from 3 to
# max_num_clusters - 1
plt.plot(range(3, max_num_clusters), distance, marker='o')
plt.xlabel('num of clusters')
plt.ylabel('SSE-distance')
# 3) silhouette score per k
fig = plt.figure()
plt.plot(range(2, max_num_clusters + 1), sil, marker='o')
plt.xlabel('num of clusters')
plt.ylabel('SIL')

In [None]:
# number of clusters used by do_clustering and by the plot palette below
num_clusters = 8

In [None]:
#labeling points of tnse clusters
def label_point(x, y, val, ax, fontproperties=None):
    """Write a small text label next to every point on ``ax``.

    Args:
        x: Series of x coordinates.
        y: Series of y coordinates.
        val: Series of label values; each one is converted with ``str``.
        ax: Axes-like object whose ``text`` method is called once per point.
        fontproperties: Optional font properties for the labels. When omitted,
            the module-level ``fontprop1`` is used if it has been defined
            (the optional Korean-font cell); otherwise the default font of
            ``ax.text`` applies.
    """
    if fontproperties is None:
        # fontprop1 only exists when the optional font cell was run; looking
        # it up this way avoids a NameError when it was not
        fontproperties = globals().get('fontprop1')

    text_kwargs = {'fontsize': 3}
    if fontproperties is not None:
        text_kwargs['fontproperties'] = fontproperties

    # align the three series on their index, one row per point
    a = pd.concat({'x': x, 'y': y, 'val': val}, axis=1)
    for _, point in a.iterrows():
        # small x offset so the label does not sit on top of the marker
        ax.text(point['x'] + .02, point['y'], str(point['val']), **text_kwargs)

In [None]:
def count_percentage(cluster_labels):
    """Print, for every distinct label, its share of all labels as a percentage.

    Labels are reported in the order in which they first occur.

    Args:
        cluster_labels: Sized iterable of hashable cluster labels.
    """
    total = len(cluster_labels)
    template = "cluster {} : {:.2%}"
    for label, size in collections.Counter(cluster_labels).items():
        print(template.format(str(label), size / total))

In [None]:
def do_clustering(df, tsne):
    """Cluster the rows of ``df`` with KMeans and attach the labels to ``tsne``.

    The number of clusters is read from the module-level ``num_clusters``.
    The cluster sizes (via ``count_percentage``), the raw labels and the
    elapsed time are printed.

    Args:
        df: DataFrame of samples to cluster; the model is fitted on exactly
            this frame (previously the global ``df_normalized`` was fitted
            regardless of the argument).
        tsne: DataFrame of embedding coordinates that contains every index
            label of ``df``.

    Returns:
        A copy of the rows of ``tsne`` selected by ``df.index``, with an
        added ``'cluster'`` column holding the label of each row of ``df``.
        ``tsne`` itself is not modified.
    """
    start_time = time.time()
    # alternative algorithms that were tried; swap one in to experiment
    #clustering = Birch(n_clusters=21).fit(df)
    #clustering = DBSCAN(eps=3, min_samples=5).fit(df)
    #clustering = SpectralClustering(n_clusters=num_clusters, random_state=42).fit(df)
    #clustering = AgglomerativeClustering(n_clusters=num_clusters).fit(df)
    # NOTE(review): no random_state is set here (the other KMeans calls in
    # this notebook use 42), so labels may differ between runs - confirm
    # whether that is intended.
    clustering = KMeans(n_clusters=num_clusters).fit(df)
    count_percentage(clustering.labels_)
    print(clustering.labels_)
    print("time elapsed: " + str(time.time()-start_time))
    # select the embedding rows in the order of df so that labels line up
    # positionally; copy so the caller's frame stays untouched
    dist = tsne.loc[df.index].copy()
    dist['cluster'] = clustering.labels_
    return dist

In [None]:
#Spectral clustering
# NOTE: do_clustering currently fits KMeans; SpectralClustering is one of the
# commented-out alternatives inside that function.
#For using 'amg' you should install pyamg on your local computer.
# Check the component at
# https://scikit-learn.org/stable/modules/generated/sklearn.cluster.SpectralClustering.html
clustered_df = do_clustering(df_normalized, tsne_pca)

In [None]:
# keep only the cluster label of every sample and store it as CSV
result_df = pd.DataFrame(data=clustered_df['cluster'], index=clustered_df.index, columns=['cluster'])
result_df.to_csv(os.path.join(CONFIG.CSV_PATH, 'clustered_kmeans_' + target_csv + '.csv'), encoding='utf-8-sig')
# alternative: load an earlier clustering result instead
#clustered_df = pd.read_csv(os.path.join(CONFIG.CSV_PATH, 'clustered_agglo_complete_normalized_text_latent_instagram0830.csv'), index_col=0, header=0, encoding='utf-8-sig')
# preview of (up to) the first 100 rows of the frame returned by do_clustering
print(clustered_df.iloc[:100])

In [None]:
def plot_spectral_clustering(dataframe, markersize=None):
    """Scatter the clustered points, coloured by cluster, and save the figure.

    A new figure is opened, the points are drawn with seaborn and the result
    is written to ``clustering_<target_csv>.svg`` under ``CONFIG.SVG_PATH``.
    The palette size comes from the module-level ``num_clusters``.

    Args:
        dataframe: DataFrame with the columns ``'x'``, ``'y'`` and
            ``'cluster_id'``.
        markersize: Passed to ``sns.scatterplot`` as ``s``.
    """
    plt.figure()
    # one colour of the "Paired" palette per cluster id 0..num_clusters-1
    palette_by_cluster = dict(enumerate(sns.color_palette("Paired", num_clusters)))
    sns.scatterplot(
        data=dataframe,
        x="x",
        y="y",
        hue='cluster_id',
        palette=palette_by_cluster,
        marker='.',
        s=markersize,
    )
    output_path = os.path.join(CONFIG.SVG_PATH, 'clustering_' + target_csv + '.svg')
    plt.savefig(output_path)

In [None]:
# put clustering info into df
# One frame with the 2-D embedding coordinates and the cluster label of every
# sample; the three inputs are aligned on their index. squeeze() turns a
# one-column DataFrame into a Series and leaves a Series unchanged.
df_clustered_spectral = pd.DataFrame({'x': tsne_pca.tsne1.squeeze(), 
                                      'y': tsne_pca.tsne2.squeeze(), 
                                      'cluster_id': clustered_df.cluster.squeeze()})
# debug prints
#print(df_clustered_spectral.dtypes)
# draw and save the cluster scatter plot with marker size 30
plot_spectral_clustering(df_clustered_spectral, 30)       
# alternative: plot a 5000-point random subset with smaller markers
#ds_clustered_spectral = downsampling(df_clustered_spectral, 5000)
#plot_spectral_clustering(ds_clustered_spectral, 0.2)       
# print(df_clustered_spectral.iloc[:10, :])
# df_clustered_spectral['cluster_id'].describe() # 30 integer clusters