# Multimodal Clustering Analysis

Sep 2019  
Dongmin Kim
[underkoo@kaist.ac.kr](mailto:underkoo@kaist.ac.kr)

**Expected input data:**
900-dimensional latent features of multi-modal Instagram posts (one row per post, indexed by its short code).

**What this script does:**  
Exploratory data analysis on the multi-modal latent features. Checks for feature correlation (features = the 900 dimensions), applies and visualizes PCA, applies t-SNE to different data (raw, PCA, etc.), and runs spectral clustering on the latent features.


In [None]:
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import numpy as np
import pandas as pd
import os
import seaborn as sns; sns.set()
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import SpectralClustering, AffinityPropagation, AgglomerativeClustering, KMeans
import matplotlib.colors as colors
import math
import time
import config

In [None]:
# magic line to have in-notebook interactive plots
%matplotlib notebook

In [None]:
# To render Korean text in plot labels, uncomment the two lines below (label_point uses fontprop1).
# path_gothic = './fonts/NanumBarunGothic.ttf'
# fontprop1 = fm.FontProperties(fname=path_gothic, size=3)

In [None]:
# Project configuration; its CSV_PATH and SVG_PATH attributes are read further down
# when loading the latent CSV and when saving the clustered CSV / t-SNE figure.
CONFIG = config.Config

## load latent feature

In [None]:
# Read the latent-feature table for the selected dataset.
# The file has no header row; its first column becomes the index.
target_dataset = 'instagram0830'
df_data = pd.read_csv(
    os.path.join(CONFIG.CSV_PATH, 'latent_' + target_dataset + '.csv'),
    index_col=0,
    header=None,
    encoding='utf-8-sig',
)
df_data.index.name = 'short_code'
# earlier data source, kept for reference:
#data_all = pd.read_csv('data/2019_05_01_naver_word2vec/word2vecBLOG)_1000.csv.zip', index_col=0, header=None)
print('complete csv shape: ' + str(df_data.shape))
print(df_data)

In [None]:
# summary statistics (count, mean, std, min, quartiles, max) for every column of df_data
print(df_data.describe())

In [None]:
# data seems to be well normalized to mean or median = 0 and std = 1
# one box per column of df_data, drawn on a newly created figure
plt.figure()
df_data.boxplot()

## compute similarity between features (feature correlation, etc)

In [None]:
# Pairwise correlation between the columns (features) of df_data.
# DataFrame.corr() uses the Pearson coefficient by default.
df_corr = df_data.corr()

In [None]:
# plot CC: heatmap of the feature correlation matrix (df_corr) with a 7-step "coolwarm" palette
plt.figure()
# NOTE: sns.heatmap returns a matplotlib Axes, so `fig` is an Axes despite its name
fig = sns.heatmap(df_corr, cmap=sns.color_palette("coolwarm", 7))
# keep the y tick labels horizontal
fig.set_yticklabels(fig.get_yticklabels(), rotation=0)
plt.title('CC of features')

In [None]:
# Pairwise cosine similarity (not distance) between feature columns.
# The transpose turns every feature column into a row, which is the unit cosine_similarity compares.
df_cosine = cosine_similarity(df_data.transpose())

In [None]:
# plot heatmap: cosine similarity between feature columns (df_cosine)
plt.figure()
# NOTE: sns.heatmap returns a matplotlib Axes, so `fig` is an Axes despite its name
fig = sns.heatmap(df_cosine, cmap=sns.color_palette("coolwarm", 7))
# keep the y tick labels horizontal
fig.set_yticklabels(fig.get_yticklabels(), rotation=0)
# df_cosine comes from cosine_similarity, so the plot is labelled as similarity rather than distance
plt.title('pairwise cosine similarity of features')

In [None]:
# Mean element-wise product for every pair of feature columns of df_data.
# Unlike cosine similarity there is no normalization whatsoever: this might help
# in not making two small-magnitude features look too similar.
#
# Computed with one matrix product instead of a Python loop over all column pairs.
# Rows where either value of a pair is NaN are left out of that pair's mean,
# which matches what Series.mean() does with NaN.
feature_values = np.asarray(df_data, dtype=float)
is_valid = ~np.isnan(feature_values)
filled_values = np.where(is_valid, feature_values, 0.0)
pair_sums = filled_values.T @ filled_values
pair_counts = is_valid.T.astype(float) @ is_valid.astype(float)
with np.errstate(divide='ignore', invalid='ignore'):
    # a pair without any valid row yields NaN (0 / 0)
    pair_means = pair_sums / pair_counts
# Only the lower triangle (diagonal included) is kept; the upper triangle is left at 0.
# make DF out of it, labelled by the feature columns on both axes
df_dotproduct = pd.DataFrame(np.tril(pair_means))
df_dotproduct.columns = df_data.columns
df_dotproduct.index = df_data.columns

In [None]:
# plot heatmap: dotproduct (df_dotproduct, rows and columns labelled by feature column)
plt.figure()
# NOTE: sns.heatmap returns a matplotlib Axes, so `fig` is an Axes despite its name
fig = sns.heatmap(df_dotproduct, cmap=sns.color_palette("coolwarm", 7))
# keep the y tick labels horizontal
fig.set_yticklabels(fig.get_yticklabels(), rotation=0)
plt.title('pairwise dotproduct of features')

## PCA as dimensionality reduction

In [None]:
# PCA: project the latent features onto the leading principal components
pca = PCA(n_components=200) # 40 with instagram
df_pca_data = pd.DataFrame(pca.fit_transform(df_data))
# One shared, 1-based label list (PC1 .. PCn) so that the projected data and the
# transformation matrix name the same component identically.
pc_labels = ['PC' + str(i) for i in range(1, pca.n_components_ + 1)]
df_pca_data.columns = pc_labels
df_pca_data.index = df_data.index
print('singular_values_:\n' + str(pca.singular_values_))
print('components_:\n' + str(pca.components_))

# create PCA transformation DF: rows = original feature columns, columns = components
df_pca_transformation = pd.DataFrame(pca.components_).transpose()
df_pca_transformation.columns = pc_labels
df_pca_transformation.index = df_data.columns
#print(df_pca_transformation)

In [None]:
# PCA: plot variance explained
# first figure: fraction of the variance explained by each individual component
plt.figure()
plt.plot(pca.explained_variance_ratio_, marker='o', linestyle=':')
plt.title('PCA: explained variance')

# second figure: running total of those fractions over the components
plt.figure()
plt.plot(np.cumsum(pca.explained_variance_ratio_), marker='o', linestyle=':')
plt.title('PCA: cumulative explained variance')

In [None]:
# heatmap of the PCA-projected data (df_pca_data: indexed like df_data, one column per component)
plt.figure()
fig=sns.heatmap(df_pca_data, cmap=sns.color_palette("coolwarm", 7))
# y tick label rotation is left disabled for this plot
#fig.set_yticklabels(fig.get_yticklabels(), rotation=0)

In [None]:
# heatmap of df_pca_transformation (rows: original feature columns, columns: principal components)
plt.figure()
fig=sns.heatmap(df_pca_transformation, cmap=sns.color_palette("coolwarm", 7))
# keep the y tick labels horizontal
fig.set_yticklabels(fig.get_yticklabels(), rotation=0)

In [None]:
def scatterplot_pointlabels(df_twocols, plot_labels=True, markersize=None):
    """Scatter the first column of ``df_twocols`` against its second on a new figure.

    Parameters
    ----------
    df_twocols : pandas.DataFrame
        Column 0 supplies the x values, column 1 the y values (selected by position).
    plot_labels : bool, optional
        Accepted for interface compatibility; the body does not read it, so no
        point labels are drawn regardless of its value.
    markersize : float or None, optional
        Forwarded unchanged to ``plt.plot``.

    Nothing is returned; the new figure stays the current pyplot figure, so
    callers can add a title, axis labels or save it afterwards.
    """
    plt.figure()
    x_values = df_twocols.iloc[:, 0]
    y_values = df_twocols.iloc[:, 1]
    # dots only, no connecting line
    plt.plot(x_values, y_values, marker='.', linestyle='None', markersize=markersize)

In [None]:
# Scatter every unordered pair among the first five principal components,
# one figure per pair, with the component names as axis labels.
n_leading_components = 5
for first_pc in range(n_leading_components):
    for second_pc in range(first_pc + 1, n_leading_components):
        # keep only the two component columns of this pair
        pc_pair = df_pca_data.iloc[:, [first_pc, second_pc]]

        scatterplot_pointlabels(pc_pair, plot_labels=False)
        x_name, y_name = pc_pair.columns
        plt.xlabel(x_name)
        plt.ylabel(y_name)

## t-SNE

In [None]:
def do_tsne(tsne_object, data_to_pass):
    """Embed ``data_to_pass`` with ``tsne_object`` and return the result as a DataFrame.

    Parameters
    ----------
    tsne_object : object
        Anything exposing ``fit_transform(data)`` that returns a 2-D array-like.
    data_to_pass : pandas.DataFrame
        Input data; its index is reused as the index of the result.

    Returns
    -------
    pandas.DataFrame
        One row per input row, with flat column labels 'tsne1', 'tsne2', ...
        (one per embedding dimension).
    """
    data_tsne = pd.DataFrame(tsne_object.fit_transform(data_to_pass))
    data_tsne.index = data_to_pass.index
    # A flat list of labels is required here: a nested list would turn the columns
    # into a MultiIndex, and column access would then return a DataFrame, not a Series.
    data_tsne.columns = ['tsne' + str(i) for i in range(1, data_tsne.shape[1] + 1)]
    return data_tsne

In [None]:
# t-SNE on the PCA-reduced data; wall-clock time is reported afterwards
# TODO tune the learning rate and the iterations (rest should be ok i guess)
start_time = time.time()
tsne_pca = do_tsne(
    TSNE(
        n_components=2,
        perplexity=50,
        early_exaggeration=12.0,
        learning_rate=100,
        n_iter=5000,
        random_state=42,
    ),
    df_pca_data,
)
print("time elapsed: " + str(time.time()-start_time))

In [None]:
# plot t-SNE with PCA
# scatterplot_pointlabels opens a new figure; the title and savefig calls below act on it
scatterplot_pointlabels(tsne_pca)
plt.title('t-SNE on PCA data')
# written to <CONFIG.SVG_PATH>/tsne_pca_<target_dataset>.svg
plt.savefig(os.path.join(CONFIG.SVG_PATH, 'tsne_pca_' + target_dataset + '.svg'))


In [None]:
# on raw data
# TODO tune the learning rate and the iterations (rest should be ok i guess)
# start_time = time.time()
# tsne_raw = do_tsne(MulticoreTSNE(n_components=2, perplexity=400, early_exaggeration=12.0, learning_rate=100, n_iter=10000, random_state=42, n_jobs=4), data)
# print("time elapsed: " + str(time.time()-start_time))

In [None]:
# plot t-SNE with raw data
# scatterplot_pointlabels(tsne_raw, plot_labels=False)
# plt.title('t-SNE on raw data')
# scatterplot_pointlabels(tsne_raw)
# plt.title('t-SNE on raw data')

## Spectral Clustering

In [None]:
# Number of clusters: passed to SpectralClustering as n_clusters and used as the
# number of palette colours in plot_spectral_clustering.
num_clusters = 6

In [None]:
# label the points of the t-SNE clusters
def label_point(x, y, val, ax):
    a = pd.concat({'x': x, 'y': y, 'val': val}, axis=1)
    for i, point in a.iterrows():
        ax.text(point['x']+.02, point['y'], str(point['val']), fontsize=3, fontproperties=fontprop1)

In [None]:
# Spectral clustering on the raw latent features (df_data); elapsed time is printed.
# The 'amg' eigen solver requires the optional pyamg package to be installed locally.
# Parameter reference:
# https://scikit-learn.org/stable/modules/generated/sklearn.cluster.SpectralClustering.html
start_time = time.time()
clustering = SpectralClustering(
    n_clusters=num_clusters,
    eigen_solver='amg',
    assign_labels="discretize",
    affinity='nearest_neighbors',
    n_neighbors=6,
    random_state=42,
    n_jobs=4,
).fit(df_data)
# Alternative estimators kept for reference (`data` and `filtered_columns` are not
# defined in the visible part of this notebook):
#clustering = AgglomerativeClustering(n_clusters=num_clusters).fit(data.loc[filtered_columns.index,:])# data, df_pca_data
# clustering = KMeans(n_clusters=num_clusters).fit(data.loc[filtered_columns.index,:])# data, df_pca_data
print("time elapsed: " + str(time.time()-start_time))
# attach one cluster label per row to the t-SNE coordinates
cluster_list = np.asarray(clustering.labels_).tolist()
tsne_pca['cluster'] = cluster_list

In [None]:
# show the t-SNE frame (including the 'cluster' column added after clustering) and
# persist it as <CONFIG.CSV_PATH>/clustered_<target_dataset>.csv
print(tsne_pca)
tsne_pca.to_csv(os.path.join(CONFIG.CSV_PATH, 'clustered_' + target_dataset + '.csv'), encoding='utf-8-sig')

In [None]:
def plot_spectral_clustering(dataframe, is_selected=False):
    """Draw the clustered 2-D embedding twice: plain, then coloured by cluster.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide the columns 'x', 'y' and 'cluster_id'.
    is_selected : bool, optional
        Accepted for interface compatibility; the body does not read it.

    Reads the module-level ``num_clusters`` to size the colour palette. The
    coloured figure is saved as 'spectralclustering_wordlabel_colors.svg'
    in the current working directory. Nothing is returned.
    """
    # first figure: uncoloured scatter of the coordinates
    scatterplot_pointlabels(dataframe[['x', 'y']], plot_labels=False)

    # second figure: one "Paired" palette colour per cluster id 0 .. num_clusters-1
    plt.figure(figsize=(13.2, 9.6))
    cluster_colors = dict(enumerate(sns.color_palette("Paired", num_clusters)))
    sns.scatterplot(
        x="x",
        y="y",
        hue='cluster_id',
        palette=cluster_colors,
        data=dataframe,
        marker='.',
        s=128,
    )
    plt.savefig('spectralclustering_wordlabel_colors.svg')

In [None]:
# put clustering info into df
df_clustered_spectral = pd.DataFrame({'x': tsne_pca.tsne1.squeeze(), 
                                      'y': tsne_pca.tsne2.squeeze(), 
                                      'word': tsne_pca.index.tolist(), 
                                      'cluster_id': clustering.labels_})
# debug prints
# print(df_clustered_spectral.shape)
print(df_clustered_spectral.dtypes)
plot_spectral_clustering(df_clustered_spectral)       
# print(df_clustered_spectral.iloc[:10, :])
# df_clustered_spectral['cluster_id'].describe() # 30 integer clusters