# Initialization

In [3]:
import sys

# Put the parent directory on the import path (presumably where the project-local
# modules imported below live — confirm). Guarded so that re-running this cell
# does not keep appending duplicate entries.
if ".." not in sys.path:
    sys.path.append("..")

In [6]:
## Imports
import os
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sb
sb.set(style="white", palette="muted")
import mydatasets
import utils
import math
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import ipywidgets as widgets
from ipywidgets import interact,fixed,interact_manual

In [7]:
# Dataset to analyse, and the index of the train/test split to use
# (the split index is only consumed by the small-dataset loader below).
DATASET, SPLIT = "delicious", 0

In [8]:
# Echo the active configuration so the saved output records what was run.
print(
    "Dataset :", DATASET,
    "/ Split :", SPLIT,
)

Dataset : delicious / Split : 0


In [9]:
# Load the train/test data. The loaders are called from the parent directory
# (NOTE(review): presumably they resolve data paths relative to it — confirm).
curr_dir = os.getcwd()
os.chdir("..")
try:
    if DATASET in ["mediamill", "delicious"]:
        # Small datasets ship as one file plus several predefined splits; pick split SPLIT.
        full_dataset, trn_splits, tst_splits = mydatasets.load_small_dataset(DATASET)
        trn_data, tst_data = mydatasets.get_small_dataset_split(
            full_dataset, trn_splits, tst_splits, SPLIT
        )
    else:
        trn_data, tst_data = mydatasets.load_large_dataset(DATASET)
finally:
    # Always restore the working directory, even if loading fails. Otherwise a
    # failed run leaves the kernel one level up and every re-run climbs further.
    os.chdir(curr_dir)

# Feature (x_*) and label (y_*) arrays for the train and test partitions.
x_mat, y_mat = mydatasets.get_arrays(trn_data)
x_tst, y_tst = mydatasets.get_arrays(tst_data)

Loading datasets
../data/Delicious/Delicious_data.txt
../data/Delicious/delicious_trSplit.txt
../data/Delicious/delicious_tstSplit.txt
Number of splits : 10
## HEADER ##
#Point : 16105 , #Features : 500 , #Labels : 983


In [10]:
# Sanity check: report the shape of each train/test feature and label array.
for _caption, _array in (
    ("x_mat :", x_mat),
    ("y_mat :", y_mat),
    ("x_tst :", x_tst),
    ("y_tst :", y_tst),
):
    print(_caption, _array.shape)

x_mat : (12920, 500)
y_mat : (12920, 983)
x_tst : (3185, 500)
y_tst : (3185, 983)


Let us reorder the label columns by increasing training-set count, so that a label's column index is its frequency rank (0 = rarest label).

In [11]:
# Sort the label columns by how many training points carry each label (ascending),
# so that column index == frequency rank (0 = rarest).
# Note: label_counts itself stays in the ORIGINAL column order; use
# label_counts[reorder] for counts aligned with the reordered matrices.
label_counts = np.sum(y_mat, axis=0)

# A stable sort keeps tied labels in their current order, so re-running this cell
# on already-sorted data yields the identity permutation.
reorder = np.argsort(label_counts, kind="stable")

# Apply the SAME permutation to the test labels; otherwise column j of y_tst
# would no longer refer to the same label as column j of y_mat.
y_mat = y_mat[:, reorder]
y_tst = y_tst[:, reorder]

In [12]:
# Basic dimensions of the training set: rows and columns of the feature matrix,
# and columns of the label matrix.
num_points, num_features = x_mat.shape[0], x_mat.shape[1]
num_labels = y_mat.shape[1]

print("----Training set stats----")
print("Number of points :", num_points)
print("Feature dimensionality :", num_features)
print("Label dimensionality :", num_labels)

----Training set stats----
Number of points : 12920
Feature dimensionality : 500
Label dimensionality : 983


# Visualization of labels in X (TSNE)

In [13]:
# Number of principal components kept before running t-SNE.
PCA_COMPONENTS = 50

In [14]:
# Two-stage embedding of the training features:
#   1. PCA down to PCA_COMPONENTS dimensions,
#   2. t-SNE (cosine metric) down to 2-D for plotting.
# Both estimators are stochastic; fixed seeds make the layout repeatable across
# kernel restarts instead of changing on every run.
pca = PCA(n_components=PCA_COMPONENTS, random_state=0)
x_red = pca.fit_transform(x_mat)
x_tsne = TSNE(
    n_components=2,
    n_jobs=-1,
    metric="cosine",
    random_state=0,
).fit_transform(x_red)

In [16]:
def plot_labels_with_x_background(x_tsne, y_mat, lab_idx):
    """Highlight the points that carry one label on top of the full 2-D embedding.

    Parameters
    ----------
    x_tsne : array
        2-D embedding; columns 0 and 1 are used as the plot coordinates,
        one row per point.
    y_mat : array
        Label matrix, row-aligned with ``x_tsne``. An entry equal to 1 marks
        the label as present for that point.
    lab_idx : int
        Column of ``y_mat`` to highlight.

    Prints the number of points having the label and the percentage of all
    rows of ``y_mat`` they represent.
    """
    fig, ax = plt.subplots(figsize=(10, 10))

    # Background: every point, small and light. A scalar marker size applies to
    # all points, so no per-point list is needed.
    ax.scatter(x_tsne[:, 0], x_tsne[:, 1], color="#eda6ae", s=3)

    # Foreground: only the points whose label column equals 1.
    supp_bool = y_mat[:, lab_idx] == 1
    lab_points = x_tsne[supp_bool, :]
    ax.scatter(lab_points[:, 0], lab_points[:, 1], color="#43464a", marker="o")
    ax.set_title("Label column {}".format(lab_idx))

    num_lab_points = np.sum(supp_bool)
    total = y_mat.shape[0]
    # Guard against an empty matrix to avoid a division-by-zero warning / NaN.
    percent = np.round(100 * num_lab_points / total, decimals=2) if total else 0.0
    print("Number of points in label = ", num_lab_points, ",", percent, "%")

In [17]:
# Slider choosing which label column to highlight; the range covers every
# column index 0 .. num_labels-1. continuous_update=False means the value is
# only committed when the handle is released.
label_selector = widgets.IntSlider(
    description='Label Rank:',
    min=0,
    max=num_labels - 1,
    value=0,
    step=1,
    orientation='horizontal',
    continuous_update=False,
    disabled=False,
    readout=True,
    readout_format='d',
)

In [18]:
# Wire the slider to the plot function. The arrays are passed as fixed()
# arguments so that no widgets are generated for them; only lab_idx is interactive.
_ = interact_manual(
    plot_labels_with_x_background,
    x_tsne=fixed(x_tsne),
    y_mat=fixed(y_mat),
    lab_idx=label_selector,
)

interactive(children=(IntSlider(value=0, continuous_update=False, description='Label Rank:', max=982), Button(…

# Visualization of labels in Y (TSNE)

First, let's filter out duplicate label sets, so that each distinct label vector appears only once.

In [19]:
# Collapse identical rows of the label matrix.
#   y_unique : the distinct label rows
#   inv      : for each row of y_mat, the index of its match in y_unique
#   uncounts : how many rows of y_mat share each distinct label row
y_unique, inv, uncounts = np.unique(
    y_mat,
    return_inverse=True,
    return_counts=True,
    axis=0,
)

In [20]:
# Number of principal components kept before t-SNE in the label-space embedding.
# This rebinds the same global name used for the feature-space embedding.
PCA_COMPONENTS = 50

In [21]:
# Two-stage embedding of the distinct label rows: PCA down to PCA_COMPONENTS
# dimensions, then t-SNE down to 2-D for plotting.
# Both estimators are stochastic; fixed seeds make the layout repeatable across
# kernel restarts instead of changing on every run.
# Note: this rebinds `pca`, replacing the PCA object fitted on the features.
pca = PCA(n_components=PCA_COMPONENTS, random_state=0)
y_red = pca.fit_transform(y_unique)
y_tsne = TSNE(
    n_components=2,
    n_jobs=-1,
    random_state=0,
).fit_transform(y_red)

In [22]:
def plot_labels_with_y_background(y_tsne, y_unique, lab_idx):
    """Highlight the label rows that contain one label on top of the 2-D embedding.

    Parameters
    ----------
    y_tsne : array
        2-D embedding; columns 0 and 1 are used as the plot coordinates,
        one row per label row.
    y_unique : array
        Label matrix, row-aligned with ``y_tsne``. An entry equal to 1 marks
        the label as present in that row.
    lab_idx : int
        Column of ``y_unique`` to highlight.

    Prints the number of rows containing the label and the percentage of all
    rows of ``y_unique`` they represent.
    """
    fig, ax = plt.subplots(figsize=(10, 10))

    # Background: every row, small and light. A scalar marker size applies to
    # all points, so no per-point list is needed.
    ax.scatter(y_tsne[:, 0], y_tsne[:, 1], color="#eda6ae", s=3)

    # Foreground: only the rows whose label column equals 1.
    supp_bool = y_unique[:, lab_idx] == 1
    lab_points = y_tsne[supp_bool, :]
    ax.scatter(lab_points[:, 0], lab_points[:, 1], color="#43464a", marker="o")
    ax.set_title("Label column {}".format(lab_idx))

    num_lab_points = np.sum(supp_bool)
    total = y_unique.shape[0]
    # Guard against an empty matrix to avoid a division-by-zero warning / NaN.
    percent = np.round(100 * num_lab_points / total, decimals=2) if total else 0.0
    print("Number of unique labelsets in label = ", num_lab_points, ",", percent, "%")

In [23]:
# Slider choosing which label column to highlight in the label-space plot; the
# range covers every column index 0 .. num_labels-1. continuous_update=False
# means the value is only committed when the handle is released.
label_selector_2 = widgets.IntSlider(
    description='Label Rank:',
    min=0,
    max=num_labels - 1,
    value=0,
    step=1,
    orientation='horizontal',
    continuous_update=False,
    disabled=False,
    readout=True,
    readout_format='d',
)

In [24]:
# Wire the slider to the plot function. The arrays are passed as fixed()
# arguments so that no widgets are generated for them; only lab_idx is interactive.
_ = interact_manual(
    plot_labels_with_y_background,
    y_tsne=fixed(y_tsne),
    y_unique=fixed(y_unique),
    lab_idx=label_selector_2,
)

interactive(children=(IntSlider(value=0, continuous_update=False, description='Label Rank:', max=982), Button(…

# Visualization of labelsets in X (TSNE)

In [25]:
# Number of k-means clusters used to group the label sets.
NUM_CLUSTERS = 10

In [26]:
# Fit the cluster centres on the de-duplicated label rows (each distinct label
# set counts once), then assign every training row to its nearest centre.
#
# `n_jobs` and `precompute_distances` are no longer passed: both were deprecated
# in scikit-learn 0.23 and removed in 1.0, where they raise TypeError. They only
# affected speed/memory, not the clustering itself.
# `n_init` is pinned because its default changed in newer releases, and
# `random_state` is fixed so that cluster ids are repeatable across runs.
kmeans = KMeans(
    n_clusters=NUM_CLUSTERS,
    n_init=10,
    random_state=0,
).fit(y_unique)
cluster_labels = kmeans.predict(y_mat)

In [27]:
def plot_labelset_clusters_with_x_background(x_tsne, cluster_labels, cluster_idx):
    """Highlight the points assigned to one cluster on top of the full 2-D embedding.

    Parameters
    ----------
    x_tsne : array
        2-D embedding; columns 0 and 1 are used as the plot coordinates,
        one row per point.
    cluster_labels : array
        Cluster id of each point, aligned with the rows of ``x_tsne``.
    cluster_idx : int
        Cluster id to highlight.

    Prints the number of points in the cluster and the percentage of all rows
    of ``x_tsne`` they represent.
    """
    fig, ax = plt.subplots(figsize=(10, 10))

    # Background: every point, small and light. A scalar marker size applies to
    # all points, so no per-point list is needed.
    ax.scatter(x_tsne[:, 0], x_tsne[:, 1], color="#eda6ae", s=3)

    # Foreground: only the points assigned to the requested cluster.
    supp_bool = cluster_labels == cluster_idx
    lab_points = x_tsne[supp_bool, :]
    ax.scatter(lab_points[:, 0], lab_points[:, 1], color="#43464a", marker="o")
    ax.set_title("Labelset cluster {}".format(cluster_idx))

    num_lab_points = np.sum(supp_bool)
    total = x_tsne.shape[0]
    # Guard against an empty embedding to avoid a division-by-zero warning / NaN.
    percent = np.round(100 * num_lab_points / total, decimals=2) if total else 0.0
    print("Number of points in labelset cluster = ", num_lab_points, ",", percent, "%")

In [28]:
# Slider choosing which k-means cluster to highlight (ids 0 .. NUM_CLUSTERS-1).
# The caption used to read 'Label Rank:', copied from the label sliders, but this
# value is passed as `cluster_idx`, so it is now labelled as a cluster.
# continuous_update=False means the value is only committed on release.
label_selector_3 = widgets.IntSlider(
    value=0,
    min=0,
    max=NUM_CLUSTERS - 1,
    step=1,
    description='Cluster:',
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    readout_format='d',
)

In [29]:
# Wire the slider to the plot function. The arrays are passed as fixed()
# arguments so that no widgets are generated for them; only cluster_idx is interactive.
_ = interact_manual(
    plot_labelset_clusters_with_x_background,
    x_tsne=fixed(x_tsne),
    cluster_labels=fixed(cluster_labels),
    cluster_idx=label_selector_3,
)

interactive(children=(IntSlider(value=0, continuous_update=False, description='Label Rank:', max=9), Button(de…