### Imports

In [1]:
import pickle
import os, os.path
import numpy as np
import scipy.stats
import time
import matplotlib.pyplot as plt
import colorio
import colorsys
import matplotlib.colors as colors
from PIL import Image

import scipy
import pylab
import matplotlib.cm as cm
import seaborn as sns
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import squareform
from scipy.spatial import ConvexHull

from mpl_toolkits.mplot3d import Axes3D
# Sample three evenly spaced colours from the plasma colormap and register
# them (alpha channel dropped, converted to hex) as the link colours scipy
# cycles through when drawing dendrograms.
cmap = cm.plasma(np.linspace(0.0, 1, 3))
sch.set_link_color_palette([colors.rgb2hex(rgb[:3]) for rgb in cmap])

In [None]:
# Lookup table used by rgb_array_to_jzazbz_array, which indexes it with the
# (r, g, b) channel values of each pixel.
# NOTE(review): presumably a (256, 256, 256, 3) array holding the Jzazbz
# coordinates of every 8-bit RGB triple -- confirm against the file on disk.
jzazbz_map = np.load('jzazbz_array.npy', encoding = 'latin1')

def rgb_array_to_jzazbz_array(rgb_array):
    """Convert an RGB pixel array to Jzazbz through the ``jzazbz_map`` table.

    Parameters
    ----------
    rgb_array : numpy.ndarray
        Integer-valued array whose last axis holds the (r, g, b) channel
        values of each pixel, e.g. an image of shape ``(rows, cols, 3)``.
        The values are used directly as indices into ``jzazbz_map``.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``rgb_array`` in which every pixel
        has been replaced by ``jzazbz_map[r, g, b]``.
    """
    rgb_array = np.asarray(rgb_array)
    jzazbz_array = np.zeros(rgb_array.shape)
    # One fancy-indexing lookup replaces the per-pixel Python double loop;
    # the values written are exactly the ones the loop would have copied.
    jzazbz_array[...] = jzazbz_map[rgb_array[..., 0],
                                   rgb_array[..., 1],
                                   rgb_array[..., 2]]
    return jzazbz_array

In [None]:
def get_img_dict(words, path_to_words, compress=True, compress_dim=300, complexity_dim=3):
    """Load and resize the images stored in ``path_to_words/<word>/``.

    Parameters
    ----------
    words : iterable of str
        Names of the sub-folders of ``path_to_words`` to read.
    path_to_words : str
        Directory that contains one folder per word (a trailing separator is
        accepted but not required).
    compress : bool
        Images are only collected when this is true; with ``compress=False``
        every returned list stays empty (behaviour kept from the original).
    compress_dim : int
        Side length, in pixels, of the square analysis image.
    complexity_dim : int
        Side length, in pixels, of the tiny thumbnail stored alongside it.

    Returns
    -------
    tuple of dict
        ``(img_dict, img_array_dict, img_array_complexity_dict)``, each keyed
        by word: the resized PIL images, the same images as arrays, and the
        ``complexity_dim`` thumbnails as arrays.  Only images whose resized
        array has shape ``(compress_dim, compress_dim, 3)`` are kept, and the
        three lists of a word always have matching positions.
    """
    img_dict = {}
    img_array_dict = {}
    img_array_complexity_dict = {}
    target_shape = (compress_dim, compress_dim, 3)
    for word in words:
        img = []
        img_array = []
        img_array_complexity = []
        folder = os.path.join(path_to_words, '{}'.format(word))
        for pic in os.listdir(folder):
            # Open the file from the folder that was listed; the path used to
            # be rebuilt under a hard-coded 'downloads/' directory.
            filename = os.path.join(folder, pic)
            if not os.path.isfile(filename):
                continue
            if os.path.splitext(filename)[1] == '.svg':
                continue
            try:
                # The context manager closes the underlying file handle.
                with Image.open(filename) as img_raw:
                    if compress == True:
                        # Image.LANCZOS is the filter formerly spelled
                        # Image.ANTIALIAS, which recent Pillow no longer has.
                        img_compress = img_raw.resize((compress_dim, compress_dim), Image.LANCZOS)
                        compressed_array = np.array(img_compress)
                        if compressed_array.shape == target_shape:
                            img_compress_complexity = img_raw.resize(
                                (complexity_dim, complexity_dim), Image.LANCZOS)
                            complexity_array = np.array(img_compress_complexity)
                            # Append only once everything succeeded so the
                            # three lists never get out of step.
                            img.append(img_compress)
                            img_array.append(compressed_array)
                            img_array_complexity.append(complexity_array)
            except Exception:
                # Best effort: unreadable or corrupt downloads are skipped.
                # Unlike a bare ``except`` this lets KeyboardInterrupt and
                # SystemExit propagate.
                continue
        img_dict[word] = img
        img_array_dict[word] = img_array
        img_array_complexity_dict[word] = img_array_complexity
    return img_dict, img_array_dict, img_array_complexity_dict

In [None]:
def get_color_distributions(img_dict, jzazbz=True, hsv=True, rgb=True, spacing=36):
    """Compute per-image colour statistics for every key of ``img_dict``.

    Parameters
    ----------
    img_dict : dict
        Maps each label to a list of RGB images given as ``(rows, cols, 3)``
        arrays (anything ``numpy.asarray`` turns into such an array works).
    jzazbz : bool
        Convert every image with ``rgb_array_to_jzazbz_array`` and histogram
        the result on a 2 x 2 x 2 grid.
    hsv : bool
        Histogram the hue (in degrees) of every image.
    rgb : bool
        Compute the channel fractions and a 2 x 2 x 2 RGB histogram.
    spacing : int or float
        Width of the hue bins in degrees.

    Returns
    -------
    tuple of dict
        ``(jzazbz_dict, jzazbz_dict_dist, hsv_dict, rgb_dict, rgb_dict_dist)``,
        each keyed like ``img_dict`` and holding one entry per image.  A
        dictionary whose flag is false is returned empty.
    """
    jzazbz_dict = {}
    jzazbz_dict_dist = {}
    hsv_dict = {}
    # Created unconditionally so the return statement is valid with rgb=False.
    rgb_dict = {}
    rgb_dict_dist = {}

    if jzazbz:
        # Two bins per axis over the ranges used for Jz, az and bz.
        jzazbz_bins = (np.linspace(0, 0.167, 3),
                       np.linspace(-0.1, 0.11, 3),
                       np.linspace(-0.156, 0.115, 3))
        for key in img_dict:
            converted = []
            dist_array = []
            for image in img_dict[key]:
                jzazbz_temp = rgb_array_to_jzazbz_array(np.asarray(image))
                converted.append(jzazbz_temp)
                # reshape(-1, 3) flattens the pixels whatever the image size.
                hist = np.histogramdd(np.reshape(jzazbz_temp, (-1, 3)),
                                      bins=jzazbz_bins, density=True)[0]
                dist_array.append(np.ravel(hist))
            jzazbz_dict[key] = converted
            jzazbz_dict_dist[key] = dist_array

    if hsv:
        hue_bins = np.arange(0, 360 + spacing, spacing)
        for key in img_dict:
            dist_array = []
            for image in img_dict[key]:
                hsv_array = colors.rgb_to_hsv(np.asarray(image) / 255.)
                # Hue is scaled from [0, 1] to degrees before binning.
                dist = np.histogram(360. * np.ravel(hsv_array[:, :, 0]),
                                    bins=hue_bins, density=True)[0]
                dist_array.append(dist)
            hsv_dict[key] = dist_array

    if rgb:
        rgb_bins = (np.linspace(0, 255, 3),
                    np.linspace(0, 255, 3),
                    np.linspace(0, 255, 3))
        for key in img_dict:
            fractions = []
            dist_array = []
            for image in img_dict[key]:
                pixels = np.asarray(image)
                r = np.sum(pixels[:, :, 0])
                g = np.sum(pixels[:, :, 1])
                b = np.sum(pixels[:, :, 2])
                tot = 1. * r + g + b
                # Share of the total intensity carried by each channel.
                fractions.append([r / tot, g / tot, b / tot])
                hist = np.histogramdd(np.reshape(pixels, (-1, 3)),
                                      bins=rgb_bins, density=True)[0]
                dist_array.append(np.ravel(hist))
            rgb_dict[key] = fractions
            rgb_dict_dist[key] = dist_array

    return jzazbz_dict, jzazbz_dict_dist, hsv_dict, rgb_dict, rgb_dict_dist

In [None]:
def compress_img_array(words, img_array_dict, compress_dim=300):
    """Average the images of each word into a single mean image.

    Parameters
    ----------
    words : iterable of str
        Keys of ``img_array_dict`` to process.
    img_array_dict : dict
        Maps each word to a list of image arrays.
    compress_dim : int
        Expected side length; only images of shape
        ``(compress_dim, compress_dim, 3)`` contribute to the average.

    Returns
    -------
    dict
        Maps each word to a float array of shape
        ``(compress_dim, compress_dim, 3)``.  The divisor is the total number
        of images listed for the word, including any that were skipped for
        having a different shape; a word with no images yields all zeros.
    """
    expected_shape = (compress_dim, compress_dim, 3)
    compressed_img_array_dict = {}
    for word in words:
        images = img_array_dict[word]
        n_images = 1. * len(images)
        compressed_img_array = np.zeros(expected_shape)
        for image in images:
            if np.shape(image) == expected_shape:
                # Whole-array accumulation; element by element this is the
                # same sum the former per-pixel loops produced.
                compressed_img_array += np.asarray(image) / n_images
        compressed_img_array_dict[word] = compressed_img_array
    return compressed_img_array_dict

In [None]:
def cross_entropy_between_images(rgb_dict, symmetrized=True):
    """Compute pairwise divergences between the distributions of each label.

    Despite the name, the quantities are relative entropies as returned by
    ``scipy.stats.entropy(p, q)`` (natural logarithm), not cross entropies.

    Parameters
    ----------
    rgb_dict : dict
        Maps each label to a list of 1-D non-negative distributions (arrays
        or plain sequences).  They need not be normalised.
    symmetrized : bool
        When true, store the symmetrised divergence
        ``(D(p||q) + D(q||p)) / 2`` and the Jensen-Shannon divergence; when
        false, store ``D(p||q)`` only.

    Returns
    -------
    tuple of dict
        ``(entropy_dict, entropy_dict_js)``.  Each value is a flat list over
        all ordered pairs ``(i, j)`` with ``j`` varying fastest, self pairs
        included.  ``entropy_dict_js`` holds empty lists when
        ``symmetrized`` is false.
    """
    entropy_dict = {}
    entropy_dict_js = {}
    for key in rgb_dict:
        # Normalise once up front.  scipy.stats.entropy normalises its own
        # arguments, but the Jensen-Shannon mixture is only the true midpoint
        # when it is built from probabilities, which matters as soon as two
        # inputs do not share the same total.  Converting to arrays also keeps
        # ``+`` from concatenating plain lists.
        probabilities = []
        for dist in rgb_dict[key]:
            dist = np.asarray(dist, dtype=float)
            probabilities.append(dist / np.sum(dist))

        entropy_array = []
        entropy_array_js = []
        for p in probabilities:
            for q in probabilities:
                if symmetrized == True:
                    mixture = (p + q) / 2.
                    entropy_array.append(
                        (scipy.stats.entropy(p, q) + scipy.stats.entropy(q, p)) / 2.)
                    entropy_array_js.append(
                        (scipy.stats.entropy(p, mixture) + scipy.stats.entropy(q, mixture)) / 2.)
                else:
                    entropy_array.append(scipy.stats.entropy(p, q))
        entropy_dict[key] = entropy_array
        entropy_dict_js[key] = entropy_array_js
    return entropy_dict, entropy_dict_js

In [None]:
def cross_entropy_between_labels(rgb_dict, words, symmetrized=True):
    """Compute divergences between the mean distributions of pairs of labels.

    Every label's distributions are averaged, and the averages are compared
    with ``scipy.stats.entropy`` (relative entropy, natural logarithm).

    Parameters
    ----------
    rgb_dict : dict
        Maps each label to a list of 1-D non-negative distributions.
    words : iterable of str
        Labels to compare, in the row/column order of the returned matrices.
    symmetrized : bool
        When true, compute the symmetrised divergence and the Jensen-Shannon
        divergence; when false, compute ``D(word1||word2)`` and store an
        empty list in place of every Jensen-Shannon value.

    Returns
    -------
    tuple
        ``(labels_entropy_dict, color_sym_matrix, labels_entropy_dict_js,
        color_sym_matrix_js)``.  The dictionaries are keyed by the plain
        concatenation ``word1 + word2``; the matrices are nested lists
        indexed ``[word1][word2]``.
    """
    # Materialise once: ``words`` is walked in two nested loops.
    words = list(words)

    mean_rgb_dict = {}
    for key in rgb_dict:
        mean_dist = np.mean(np.array(rgb_dict[key], dtype=float), axis=0)
        # Normalised so the Jensen-Shannon mixture below is a true midpoint
        # even when two labels' averages do not share the same total.
        mean_rgb_dict[key] = mean_dist / np.sum(mean_dist)

    labels_entropy_dict = {}
    labels_entropy_dict_js = {}
    color_sym_matrix = []
    color_sym_matrix_js = []
    for word1 in words:
        p = mean_rgb_dict[word1]
        row = []
        row_js = []
        for word2 in words:
            q = mean_rgb_dict[word2]
            if symmetrized == True:
                mixture = (p + q) / 2.
                entropy = (scipy.stats.entropy(p, q) + scipy.stats.entropy(q, p)) / 2.
                entropy_js = (scipy.stats.entropy(p, mixture) + scipy.stats.entropy(q, mixture)) / 2.
            else:
                entropy = scipy.stats.entropy(p, q)
                entropy_js = []
            row.append(entropy)
            row_js.append(entropy_js)
            labels_entropy_dict[word1 + word2] = entropy
            labels_entropy_dict_js[word1 + word2] = entropy_js
        color_sym_matrix.append(row)
        color_sym_matrix_js.append(row_js)
    return labels_entropy_dict, color_sym_matrix, labels_entropy_dict_js, color_sym_matrix_js

### Load Data

In [None]:
# Top-level result containers.  Later cells store entries in them under the
# keys 'discipline' and 'subdiscipline'.
jzazbz_dict_discipline = {}
rgb_dict_discipline = {}
entropy_dict_js_discipline = {}
cross_entropy_dict_js_discipline = {}

In [None]:
# SECURITY: unpickling executes arbitrary code, so only load these files if
# they come from a trusted source.

# Keyed by discipline word; later cells read each entry's 'branching_fact'
# and 'precision' fields.
with open('super_disc.pickle', 'rb') as handle:
    disciplines_pickle = pickle.load(handle)#, encoding="latin1")
    
# Keyed by discipline word; each value is itself keyed by the subdiscipline
# words of that discipline.
with open('sub_disc.pickle', 'rb') as handle:
    subdisciplines_pickle = pickle.load(handle)#, encoding="latin1")

In [None]:
# Run the whole pipeline for the top-level discipline words: collect the
# images, derive the colour distributions, then compute the divergences
# within each label and between labels from the Jzazbz histograms.
path_to_words = 'Data/categories/disciplines/'
discipline_words = disciplines_pickle.keys()

img_dict, img_array_dict, img_array_complexity_dict = get_img_dict(discipline_words, path_to_words)
jzazbz_dict, jzazbz_dict_dist, hsv_dict, rgb_dict, rgb_dict_dist = get_color_distributions(img_array_dict, jzazbz=True, spacing=36)
entropy_dict, entropy_dict_js = cross_entropy_between_images(jzazbz_dict_dist)
cross_entropy_between_labels_dict, cross_entropy_matrix, cross_entropy_between_labels_dict_js, cross_entropy_matrix_js = cross_entropy_between_labels(jzazbz_dict_dist, discipline_words, symmetrized=True)
# Mean image per discipline.
compressed_img_array_dict = compress_img_array(discipline_words, img_array_dict)

# Keep the discipline-level results under the 'discipline' key.
jzazbz_dict_discipline['discipline'] = jzazbz_dict
rgb_dict_discipline['discipline'] = rgb_dict
entropy_dict_js_discipline['discipline'] = entropy_dict_js
cross_entropy_dict_js_discipline['discipline'] = cross_entropy_matrix_js

In [None]:
# Nested containers for the per-discipline subdiscipline results; the loop in
# the next cell adds one entry per discipline word.
jzazbz_dict_discipline['subdiscipline'] = {}
entropy_dict_js_discipline['subdiscipline'] = {}
cross_entropy_dict_js_discipline['subdiscipline'] = {}

In [None]:
# Repeat the pipeline for the subdisciplines of every discipline.
#
# The derived statistics get ``sub_``-prefixed names so that this loop does
# not overwrite the discipline-level ``jzazbz_dict``, ``jzazbz_dict_dist``,
# ``rgb_dict``, ``entropy_dict*`` and ``cross_entropy_*`` variables, which the
# analysis cells further down index by discipline word.  The image containers
# returned by get_img_dict keep their names; none of the later cells shown
# here read them.
for word in discipline_words:
    path_to_words = 'Data/categories/subdisciplines/{}/'.format(word)
    subdiscipline_words = list(subdisciplines_pickle[word].keys())
    img_dict, img_array_dict, img_array_complexity_dict = get_img_dict(subdiscipline_words, path_to_words)
    (sub_jzazbz_dict, sub_jzazbz_dict_dist, sub_hsv_dict,
     sub_rgb_dict, sub_rgb_dict_dist) = get_color_distributions(img_array_dict, jzazbz=True, spacing=36)
    sub_entropy_dict, sub_entropy_dict_js = cross_entropy_between_images(sub_jzazbz_dict_dist)
    (sub_cross_entropy_between_labels_dict, sub_cross_entropy_matrix,
     sub_cross_entropy_between_labels_dict_js,
     sub_cross_entropy_matrix_js) = cross_entropy_between_labels(
         sub_jzazbz_dict_dist, subdiscipline_words, symmetrized=True)
    jzazbz_dict_discipline['subdiscipline'][word] = sub_jzazbz_dict
    entropy_dict_js_discipline['subdiscipline'][word] = sub_entropy_dict_js
    cross_entropy_dict_js_discipline['subdiscipline'][word] = sub_cross_entropy_matrix_js

### Analysis

In [None]:
# Mean Jzazbz histogram of every discipline, averaged over its images.
avg_dist_dict = {word: np.mean(jzazbz_dict_dist[word], axis=0)
                 for word in discipline_words}

# Feature matrix for clustering: one row per discipline (in the iteration
# order of discipline_words), one column per histogram bin.
X = np.zeros((len(discipline_words), 8))
for row_index, word in enumerate(discipline_words):
    X[row_index] = avg_dist_dict[word]

from sklearn.cluster import KMeans

# Partition the disciplines into three clusters.
# NOTE(review): no random_state is given, so the cluster numbering may differ
# from run to run.
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)

labels = kmeans.predict(X)
centroids = kmeans.cluster_centers_

# Mean colour coordinate of every discipline: average over images, then over
# the two pixel axes.  The values come from jzazbz_dict, so despite the
# variable name they are Jzazbz rather than RGB coordinates.
avg_rgb_dict = {}
for word in discipline_words:
    mean_over_images = np.mean(jzazbz_dict[word], axis=0)
    mean_over_rows = np.mean(mean_over_images, axis=0)
    avg_rgb_dict[word] = np.mean(mean_over_rows, axis=0)

In [None]:
# Colour used for each k-means cluster label.  This used to be wrapped in
# ``map(lambda ...)``, which is not subscriptable in Python 3 and added
# nothing, since the lambda ignored its argument.
colorsmap = {1: 'r', 0: 'b', 2: 'g', 3: 'm', 4: 'k', 5: 'brown'}

# Cluster label of every discipline.  ``labels`` follows the row order of the
# feature matrix, which was filled by iterating discipline_words, so the two
# are zipped rather than relying on a separate positional counter.
label_by_word = dict(zip(discipline_words, labels))

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

for word in ['mathematics']:#discipline_words:
    ax.scatter(avg_rgb_dict[word][0], avg_rgb_dict[word][1], avg_rgb_dict[word][2],
               c=colorsmap[label_by_word[word]], label=word, marker='o')

#encircle(x2, y2, ec="orange", fc="none")

ax.legend(loc=1,bbox_to_anchor=(2, 1.05),ncol=2)
# NOTE(review): avg_rgb_dict is computed from jzazbz_dict, so these axes
# presumably show Jz/az/bz rather than r/g/b -- confirm before relabelling.
ax.set_xlabel('r')
ax.set_ylabel('g')
ax.set_zlabel('b')

plt.savefig('discipline_clustering.png')
plt.show()

In [None]:
# Clustered heat map of the between-discipline Jensen-Shannon divergences,
# with a dendrogram on the left and another on top.

# log2(exp(x)) rescales the stored values by 1/ln(2); the colorbar below
# labels the result as bits.
D = np.log2(np.exp(np.matrix(cross_entropy_dict_js_discipline['discipline'])))
# Condensed (vector) form of the square matrix, as passed to linkage below.
condensedD = squareform(D)

# Compute and plot first dendrogram.
fig = pylab.figure(figsize=(10,10))
ax1 = fig.add_axes([0.162,0.1,0.125,0.6])
Y = sch.linkage(condensedD, method='centroid')
Z1 = sch.dendrogram(Y, orientation='left', above_threshold_color='dimgrey')

ax1.set_xticks([])
ax1.set_yticks([])
ax1.axis('off')

# Compute and plot second dendrogram.
# The linkage is recomputed with identical arguments, so both dendrograms are
# built from the same clustering.
ax2 = fig.add_axes([0.3,0.71,0.6,0.125])
Y = sch.linkage(condensedD, method='centroid')
Z2 = sch.dendrogram(Y, above_threshold_color='dimgrey')
ax2.set_xticks([])
ax2.set_yticks([])
ax2.axis('off')

# Plot distance matrix.
# Rows are reordered with the reversed leaf order of the first dendrogram and
# columns with the leaf order of the second.
axmatrix = fig.add_axes([0.3,0.1,0.6,0.6])
idx1 = Z1['leaves'][::-1]
idx2 = Z2['leaves']
D = D[idx1,:]
D = D[:,idx2]
im = axmatrix.matshow(D, aspect='auto', origin='lower', cmap=sns.cubehelix_palette(light=1, as_cmap=True, hue=0.),
                     vmin=D.min(),vmax=D.max())
axmatrix.set_xticks([])
axmatrix.set_yticks([])

# NOTE(review): discipline_words_plot is not defined in any cell shown here;
# presumably a display-ready list of the discipline words -- confirm.
# NOTE(review): the x tick labels use idx1 although the columns were ordered
# with idx2 (and the y labels use idx2 although the rows were ordered with
# idx1) -- verify the labels line up with the matrix.
axmatrix.set_xticks(range(len(discipline_words_plot)))
axmatrix.set_xticklabels(np.array(discipline_words_plot)[idx1], minor=False, fontsize=13)
axmatrix.xaxis.set_label_position('bottom')
axmatrix.xaxis.tick_bottom()

pylab.xticks(rotation=-90)

axmatrix.set_yticks(range(len(discipline_words_plot)))
axmatrix.set_yticklabels(np.array(discipline_words_plot)[idx2], minor=False, fontsize=13)
axmatrix.yaxis.set_label_position('right')
axmatrix.yaxis.tick_right()

#axcolor = fig.add_axes([0.94,0.1,0.02,0.6])
# Plot colorbar.
# NOTE(review): the colorbar axes start at x = 1.05 in figure coordinates,
# i.e. beyond the right edge; savefig is called without bbox_inches, so
# check that the colorbar appears in the saved file.
axcolor = fig.add_axes([1.05,0.1,0.02,0.6])
cbar = pylab.colorbar(im, cax=axcolor)
cbar.ax.set_yticks([0,0.005,0.01,0.015,0.02,0.025,0.03])
cbar.ax.set_yticklabels(['0','','0.01','','0.02','',0.03],fontsize=10)
cbar.set_label('Jensen-Shannon Divergence [bits]', labelpad=24,rotation=270, fontsize=16)
fig.show()
fig.savefig('dendrogram.png')

In [None]:
# Per-discipline summary vectors, ordered like discipline_words: the
# 'branching_fact' and 'precision' fields stored in the discipline pickle and
# the mean of the within-label Jensen-Shannon divergences.
n_disciplines = len(discipline_words)
discipline_branching_factor = np.zeros(n_disciplines)
discipline_precision = np.zeros(n_disciplines)
discipline_entropy = np.zeros(n_disciplines)

for position, word in enumerate(discipline_words):
    record = disciplines_pickle[word]
    within_label_js = np.array(entropy_dict_js_discipline['discipline'][word])
    discipline_branching_factor[position] = record['branching_fact']
    discipline_precision[position] = record['precision']
    discipline_entropy[position] = np.mean(within_label_js)

In [None]:
# Log-log scatter of each discipline's branching factor against its mean
# within-label Jensen-Shannon divergence.
plt.xscale('log')
plt.yscale('log')
plt.scatter(discipline_branching_factor, discipline_entropy)
#plt.scatter(genre_branching_factor, genre_entropy)

plt.xticks([1,10,100], fontsize=10)
plt.yticks([0.2,0.3,0.4], fontsize=10)
# Applies the same three labels to the minor y ticks.
plt.gca().set_yticklabels([r'0.2',r'0.3',r'0.4'],minor=True, fontsize=10)

plt.xlabel(r'WordNet Branching Factor',fontsize=20, labelpad=8)
plt.ylabel('Within-Label\n JS Divergence [bits]',fontsize=20, labelpad=8)

In [None]:
# Normalised histograms of the between-label Jensen-Shannon divergences for
# random nouns, disciplines and the physics subdisciplines.
# log2(exp(x)) rescales the stored values by 1/ln(2) so the axis can be
# labelled in bits.
# NOTE(review): cross_entropy_matrix_js_rand is not defined in any cell shown
# here; it is expected to hold the matrix for the random-noun baseline.
js_random = np.ravel(np.log2(np.exp(np.asarray(cross_entropy_matrix_js_rand))))
js_disciplines = np.ravel(np.log2(np.exp(np.asarray(
    cross_entropy_dict_js_discipline['discipline']))))
js_subdisciplines = np.ravel(np.log2(np.exp(np.asarray(
    cross_entropy_dict_js_discipline['subdiscipline']['physics']))))

plt.figure(figsize=(8,6))
#plt.hist(np.ravel(cross_entropy_matrix_js),color='r',normed=True,alpha=0.5, label='physics subdisciplines')
#plt.hist(np.ravel(biocross_entropy_matrix_js),color='k',alpha=0.5,normed=True, label='bio subdisciplines')
# ``density=True`` replaces the ``normed=True`` keyword, which current
# matplotlib no longer accepts.
plt.hist(js_random,color='k',alpha=0.5,linewidth=2.,histtype='step',density=True, label='random nouns')
plt.hist(js_disciplines,color='b',density=True, label='disciplines', alpha=0.35)
plt.hist(js_subdisciplines,color='g',density=True, label='subdisciplines', alpha=0.35)
#plt.hist(genres,color='g',normed=True, bins=5, label='music genres', alpha=0.3)

plt.xlim(-0.025,0.65)
plt.yscale('log')
plt.legend(loc=1, frameon=False, fontsize=16)
#plt.xlim(0.01,0.25)

plt.xticks([0,0.15,0.3,0.45,0.6], fontsize=12)
plt.yticks([0.1,1.0,10], fontsize=12)

plt.xlabel(r'Jensen-Shannon Divergence [bits]', fontsize=18)
plt.ylabel(r'$\mathcal{P}(\rm{JS\ Divergence})$', fontsize=18)
plt.title('Perceptually Uniform Binning', fontsize=20)
plt.show()