In [1]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import joblib
import seaborn as sns
sns.set_palette(sns.color_palette("hls", 12))

%matplotlib inline

In [9]:
inc = joblib.load('./with_movie/inception_representations.joblib')
pca = joblib.load('./with_movie/pca_representation.joblib')
res = joblib.load('./with_movie/resnet_representations.joblib')
inc_fine = joblib.load("./fine_tuned_inception_xxx.joblib")
res_fine = joblib.load("./fine_tuned_resnet_xxx.joblib")
_, labels, _ = joblib.load('./with_movie/labels.joblib')

In [10]:
pca.shape

(963, 963)

In [11]:
label_order =  []
for label in labels:
    if label not in label_order:
        label_order.append(label)

In [12]:
import matplotlib.patches as mpatches
def plot_data(X, labels, title=''):
    
    # find 2 dim representation 
    tsne = TSNE(n_components=2)
    X_tsne = tsne.fit_transform(X)
    x, y = zip(*X_tsne)
    

    # plotting code 
    cm = []
    colors = sns.color_palette()
    for label in labels:
        idx = label_order.index(label)
        cm.append(colors[idx])
    plt.scatter(x,y,color=cm, marker='.')
    frame1 = plt.gca()
    frame1.axes.xaxis.set_ticklabels([])
    frame1.axes.yaxis.set_ticklabels([])
    patches = []
    for label in sorted(label_order):
        patches.append( mpatches.Patch(color=colors[label_order.index(label)], label=label))
    lgd = plt.legend(handles=patches, loc='upper left', prop=dict(size=14), bbox_to_anchor=(1.02, 1))
    plt.title(title)
    plt.savefig('{}.png'.format(title), bbox_extra_artists=(lgd,), bbox_inches="tight")

In [None]:
plot_data(pca, labels, "PCA'd raw image representations")

In [None]:
plot_data(inc, labels, "InceptionNet image representations")

In [None]:
plot_data(inc_fine, labels, "InceptionNet Finetuned image representations")

In [None]:
plot_data(res, labels, "ResNet image representations")

In [None]:
plot_data(res_fine, labels, "ResNet Finetuned image representations")

In [13]:
from sklearn.cluster import KMeans

In [14]:
import numpy as np
def purity_score(clusters, classes):
    
    A = np.c_[(clusters,classes)]

    n_accurate = 0.

    for j in np.unique(A[:,0]):
        z = A[A[:,0] == j, 1]
        x = np.argmax(np.bincount(z))
        n_accurate += len(z[z == x])

    return n_accurate / A.shape[0]

In [15]:
kmeans = KMeans(n_clusters=11, n_jobs=-1)
pca_cluster = kmeans.fit_transform(pca)
purity_score(kmeans.labels_.astype(int), np.array([label_order.index(l) for l in labels]).astype(int))

0.37175493250259606

In [16]:
kmeans = KMeans(n_clusters=11, n_jobs=-1)
cluster = kmeans.fit_transform(inc)
purity_score(kmeans.labels_.astype(int), np.array([label_order.index(l) for l in labels]).astype(int))

0.3582554517133956

In [17]:
kmeans = KMeans(n_clusters=11, n_jobs=-1)
cluster = kmeans.fit_transform(inc_fine)
purity_score(kmeans.labels_.astype(int), np.array([label_order.index(l) for l in labels]).astype(int))

0.3790238836967809

In [18]:
kmeans = KMeans(n_clusters=11, n_jobs=-1)
cluster = kmeans.fit_transform(res)
purity_score(kmeans.labels_.astype(int), np.array([label_order.index(l) for l in labels]).astype(int))

0.6957424714434061

In [19]:
kmeans = KMeans(n_clusters=11, n_jobs=-1)
cluster = kmeans.fit_transform(res_fine)
purity_score(kmeans.labels_.astype(int), np.array([label_order.index(l) for l in labels]).astype(int))

0.731048805815161

In [119]:
inc_cluster.shape

(963, 50)

In [124]:
purity_score(kmeans.labels_.astype(int), np.array([label2id[l] for l in labels]).astype(int))

0.5503634475597092

In [34]:
from sklearn.metrics import accuracy_score
import numpy as np

def purity_score(y_true, y_pred):
    # matrix which will hold the majority-voted labels
    y_labeled_voted = np.zeros(y_true.shape)
    labels = np.unique(y_true)
    # We set the number of bins to be n_classes+2 so that 
    # we count the actual occurence of classes between two consecutive bin
    # the bigger being excluded [bin_i, bin_i+1[
    bins = np.concatenate((labels, [np.max(labels)+1]), axis=0)

    for cluster in np.unique(y_pred):
        hist, _ = np.histogram(y_true[y_pred==cluster], bins=bins)
        # Find the most present label in the cluster
        winner = np.argmax(hist)
        y_labeled_voted[y_pred==cluster] = winner

    return accuracy_score(y_true, y_labeled_voted)

In [126]:
purity_score(kmeans.labels_.astype(int), np.array([label2id[l] for l in labels]).astype(int))

0.22741433021806853