In [1]:
from cnns.utils import clustering_utils as clu
from cnns.utils import cnn_utils as cu
from cnns.core import SOMClusterer as scl
from cnns.core import APClusterer as ap
from sklearn.metrics.cluster import homogeneity_completeness_v_measure
import pandas as pd
import pickle as pkl
import numpy as np

In [2]:
%load_ext autoreload
%autoreload 2

In [22]:
# computes h, c, v given a single set of parameters for multi-som
def compute_hcv_metrics(user_id, vis_tag='benchmark', grid_size=15, sigma=1.0, 
                        learning_rate=0.25, num_trials=5, num_iter=100, 
                        reduce_dim=False, num_dim=512):
    
    # keep these hard-coded for the time being...
    base_dirpath = '/Users/babasarala/Desktop/face_clustering_full_data'
    cluster_settings_ver = 1.2
    comp_type = 'None'
    tagged_csv_filepath = '%s/%i_clusters.csv'%(base_dirpath, user_id)
    tagged_df = pd.read_csv(open(tagged_csv_filepath, 'rb'))
    
    # combine with the CSV file
    url_csv_filepath = '%s/%i/%i_prod_fb_tags_img_urls_%.1f_%s.csv'\
                       %(base_dirpath, user_id, user_id, cluster_settings_ver,comp_type)
    url_df = pd.read_csv(open(url_csv_filepath, 'rb'))
    
    pkl_filepath = '%s/%i/%i_prod_1.2_None_cnn_codes.p'%(base_dirpath, user_id, user_id)
    img_urls, X = pkl.load(open(pkl_filepath, 'rb'))
    scl_ = scl.SOMClusterer(img_urls, X, grid_size=grid_size, sigma=sigma, learning_rate=learning_rate, 
                            num_trials=num_trials, reduce_dim=reduce_dim, num_dim=num_dim, num_iter=num_iter)
    cluster_df = scl_.run()
    
    gt_df = pd.merge(cluster_df, url_df, left_on='face_url', right_on='img_url')
    cols = ['face_id', 'tag', 'memorable_id', 'cluster_idx', 'face_url']

    merged_df = pd.merge(gt_df, tagged_df, on=['face_id', 'memorable_id', 'user_id'])[cols]
    true_labels = list(merged_df['tag'].values)
    cluster_labels = list(merged_df['cluster_idx'].values)
    h, c, v = homogeneity_completeness_v_measure(true_labels, cluster_labels)
    
    # visualize!
    #cluster_map, _, _ = scl_.convert_to_maps(cluster_df)
    #clu.visualize_clusters(cluster_map, '%s/%i/cluster_htmls/%i_%s_visualization.html'%(base_dirpath, 
    #                                                                                    user_id,
    #                                                                                    user_id,
    #                                                                                    vis_tag))
    return h, c, v

In [38]:
user_ids = [1946418, 8657185, 5626377, 5, 5692777, 3473194, 3928074, 4619758, 2685009, 1496616, 1341, 8, 34, 6007945]
for user_id in user_ids:
    print compute_hcv_metrics(user_id, vis_tag='hyperopt',grid_size=8, sigma=2.39, 
                        learning_rate=0.0002, num_trials=13, num_iter=366, 
                        reduce_dim=False, num_dim=512)

(0.94467167761099868, 0.62587234451776363, 0.75291602060862328)
(0.91164197472455044, 0.51806201456998691, 0.66067812852002117)
(0.95525492998131911, 0.4696769464768778, 0.62973006083041616)
(0.94488535690170561, 0.57847139005902215, 0.71761148128152297)
(0.96386913518529493, 0.49705294897486607, 0.65587891546607846)
(0.91953177353443116, 0.3297790670232067, 0.48545537352261375)
(0.93353778957693501, 0.48513913269593822, 0.63847618363827308)
(0.950062362614707, 0.62928898957117096, 0.75710041767709746)
(0.9679689081541154, 0.540678759068148, 0.69381372397033303)
(0.84222229416584626, 0.36225281586072056, 0.50660639659963491)
(0.95407190857232782, 0.43568679836638086, 0.59819957692197023)
(0.94392129818968118, 0.55497531289178093, 0.69898485850878211)
(0.95111001747459334, 0.66656405547588982, 0.78381147482360047)
(0.90468830983014625, 0.67594288002357661, 0.77376383003911597)


In [29]:
# computes h, c, v given a single set of parameters for affinity propagation
def compute_hcv_metrics_ap(user_id, damping=0.5, convergence_iter=15, max_iter=200, affinity='euclidean', 
                           reduce_dim=False, num_dim=512):
    
    # keep these hard-coded for the time being...
    base_dirpath = '/Users/babasarala/Desktop/face_clustering_full_data'
    cluster_settings_ver = 1.2
    comp_type = 'None'
    tagged_csv_filepath = '%s/%i_clusters.csv'%(base_dirpath, user_id)
    tagged_df = pd.read_csv(open(tagged_csv_filepath, 'rb'))
    
    # combine with the CSV file
    url_csv_filepath = '%s/%i/%i_prod_fb_tags_img_urls_%.1f_%s.csv'\
                       %(base_dirpath, user_id, user_id, cluster_settings_ver,comp_type)
    url_df = pd.read_csv(open(url_csv_filepath, 'rb'))
    
    pkl_filepath = '%s/%i/%i_prod_1.2_None_cnn_codes.p'%(base_dirpath, user_id, user_id)
    img_urls, X = pkl.load(open(pkl_filepath, 'rb'))
    ap_ = ap.APClusterer(img_urls, X, damping=damping, convergence_iter=convergence_iter, 
                         max_iter=max_iter, affinity=affinity)
    cluster_df = ap_.run()
    
    gt_df = pd.merge(cluster_df, url_df, left_on='face_url', right_on='img_url')
    cols = ['face_id', 'tag', 'memorable_id', 'cluster_idx', 'face_url']

    merged_df = pd.merge(gt_df, tagged_df, on=['face_id', 'memorable_id', 'user_id'])[cols]
    true_labels = list(merged_df['tag'].values)
    cluster_labels = list(merged_df['cluster_idx'].values)
    h, c, v = homogeneity_completeness_v_measure(true_labels, cluster_labels)
    
    # visualize!
    cluster_map, _, _ = ap_.convert_to_maps(cluster_df)
    clu.visualize_clusters(cluster_map, '%s/%i/cluster_htmls/%i_ap_benchmark_visualization.html'%(base_dirpath, 
                                                                                                  user_id, 
                                                                                                  user_id))
    return h, c, v

In [30]:
user_ids = [1946418, 8657185, 5626377, 5, 5692777, 3473194, 3928074, 4619758, 2685009, 1496616, 1341, 8, 34, 6007945]
for user_id in user_ids:
    print compute_hcv_metrics_ap(user_id, reduce_dim=False)

(0.81691319967933962, 0.64167025864308913, 0.71876436159516643)
(0.86055756673398431, 0.5170361700768773, 0.64596604433590632)
(0.87644561697223256, 0.47876248035198909, 0.61925438359429696)
(0.89696463651039604, 0.58480595628621734, 0.70800468650059223)
(0.64932365211847387, 0.55611640291205233, 0.59911653380839103)
(0.915168703284297, 0.34337817485148342, 0.49938379646285597)
(0.93365237009987734, 0.48579180580132514, 0.63906799374275036)
(0.83236176952702645, 0.99999999999999989, 0.90851248194495748)
(0.83387808223614646, 0.549254278999731, 0.66228094673895699)
(0.92002315855883798, 0.35599702298575048, 0.51335474197348907)
(0.9417033073552824, 0.32330699834520549, 0.48135460756455622)
(0.91600018566019303, 0.52515532224526262, 0.66757871727005458)
(0.97204690973654628, 0.71366727676827602, 0.82305538686965873)
(0.91801120244806245, 0.64788726221480875, 0.75965048572245686)


In [18]:
import pandas as pd
for user_id in user_ids:
    base_dirpath = '/Users/babasarala/Desktop/face_clustering_full_data'
    tagged_csv_filepath = '%s/%i_clusters.csv'%(base_dirpath, user_id)
    df = pd.read_csv(open(tagged_csv_filepath, 'rb'))
    print len(df['tag'].unique())

51
110
66
134
33
50
97
13
58
85
108
120
12
35


Variance for repeated trials

In [102]:
user_id = 5626377
H = np.empty((2, 10))
C = np.empty((2, 10))
V = np.empty((2, 10))

for idx, num_iter in enumerate([500]):
    for i in range(10):
        h, c, v = compute_hcv_metrics(user_id, num_iter=num_iter)
        H[idx, i] = h
        C[idx, i] = c
        V[idx, i] = v

In [103]:
H[0]

array([ 0.95675248,  0.97949008,  0.9713768 ,  0.97129576,  0.96846991,
        0.96413817,  0.97463498,  0.96487064,  0.9724939 ,  0.9733207 ])

In [104]:
print 'Homogeneity mean, std'
print np.mean(H, axis=1)
print np.std(H, axis=1)

print 'Completeness mean, std'
print np.mean(C, axis=1)
print np.std(C, axis=1)

print 'V-Measure mean, std'
print np.mean(V, axis=1)
print np.std(V, axis=1)


Homogeneity mean, std
[ 0.96968434  0.        ]
[ 0.00608177  0.        ]
Completeness mean, std
[ 0.42857745  0.        ]
[ 0.00670511  0.        ]
V-Measure mean, std
[  5.94392086e-001   4.94065646e-324]
[ 0.00649097  0.        ]


In [97]:
user_id = 5626377
num_iters = [10, 50, 100, 500, 1000]
hs = []
cs = []
vs = []
for num_iter in num_iters:
    print 'Currently processing with number of iterations: %i'%(num_iter)
    h, c, v = compute_hcv_metrics(user_id, num_iter=num_iter)
    hs.append(h)
    cs.append(c)
    vs.append(v)

Currently processing with number of iterations: 10
Currently processing with number of iterations: 50
Currently processing with number of iterations: 100
Currently processing with number of iterations: 500
Currently processing with number of iterations: 1000


In [101]:
cs

[0.42340902098957089,
 0.4326177007735828,
 0.41429745914816557,
 0.43944550881313643,
 0.43179167359057719]

In [30]:
num_dims = [128, 256, 512, 1024, 2048]
user_id = 1496616
for num_dim in num_dims:
    print 'Currently processing with number of dimensions: %i'%(num_dim)
    h, c, v = compute_hcv_metrics(user_id, reduce_dim=True, num_dim=num_dim)
    print h, c, v
h, c, v = compute_hcv_metrics(user_id)
print 'Currently processing with number of dimensions: 4096'
print h, c, v

Currently processing with number of dimensions: 128
(2520, 128)
-3.32997156363e-16 1.0 -6.65994312725e-16
Currently processing with number of dimensions: 256
(2520, 256)
0.0779097786969 0.199353388764 0.11203492006
Currently processing with number of dimensions: 512
(2520, 512)
0.673383748645 0.856998259316 0.754176012837
Currently processing with number of dimensions: 1024
(2520, 1024)
0.768963574699 0.740566930956 0.754498160055
Currently processing with number of dimensions: 2048
(2520, 2048)
0.7901482358 0.486010644583 0.601837998858
0.850587368382 0.422252389098 0.564348412791


In [None]:
# compute H-C-V metrics
def compute_hcv_metrics(tagged_csv_filepath, cluster_df, , vis_tag='benchmark', grid_size=15, sigma=1.0, 
                        learning_rate=0.25, num_trials=5, num_iter=100, 
                        reduce_dim=False, num_dim=512):
    
    # keep these hard-coded for the time being...
    base_dirpath = '/Users/babasarala/Desktop/face_clustering_full_data'
    cluster_settings_ver = 1.2
    comp_type = 'None'
    tagged_csv_filepath = '%s/%i_clusters.csv'%(base_dirpath, user_id)
    tagged_df = pd.read_csv(open(tagged_csv_filepath, 'rb'))
    
    # combine with the CSV file - we need this
    url_csv_filepath = '%s/%i/%i_prod_fb_tags_img_urls_%.1f_%s.csv'\
                       %(base_dirpath, user_id, user_id, cluster_settings_ver,comp_type)
    url_df = pd.read_csv(open(url_csv_filepath, 'rb'))
    
    pkl_filepath = '%s/%i/%i_prod_1.2_None_cnn_codes.p'%(base_dirpath, user_id, user_id)
    img_urls, X = pkl.load(open(pkl_filepath, 'rb'))
    scl_ = scl.SOMClusterer(img_urls, X, grid_size=grid_size, sigma=sigma, learning_rate=learning_rate, 
                            num_trials=num_trials, reduce_dim=reduce_dim, num_dim=num_dim, num_iter=num_iter)
    cluster_df = scl_.run()
    
    gt_df = pd.merge(cluster_df, url_df, left_on='face_url', right_on='img_url')
    cols = ['face_id', 'tag', 'memorable_id', 'cluster_idx', 'face_url']

    merged_df = pd.merge(gt_df, tagged_df, on=['face_id', 'memorable_id', 'user_id'])[cols]
    true_labels = list(merged_df['tag'].values)
    cluster_labels = list(merged_df['cluster_idx'].values)
    h, c, v = homogeneity_completeness_v_measure(true_labels, cluster_labels)
    
    # visualize!
    #cluster_map, _, _ = scl_.convert_to_maps(cluster_df)
    #clu.visualize_clusters(cluster_map, '%s/%i/cluster_htmls/%i_%s_visualization.html'%(base_dirpath, 
    #                                                                                    user_id,
    #                                                                                    user_id,
    #                                                                                    vis_tag))
    return h, c, v