In [2]:
import pandas as pd
import numpy as np
import pickle as pkl
from cnns.utils import clustering_utils as clu
from cnns.core import SOMClusterer as scl
from sklearn.metrics.cluster import homogeneity_completeness_v_measure

In [3]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to imported project code take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [4]:
def shuffle(df, n=1, axis=0):
    """Return a copy of ``df`` with its values shuffled independently along ``axis``.

    With ``axis=0`` every column is permuted on its own; with ``axis=1`` every
    row is permuted on its own.  Index and column labels stay where they are,
    only the values move, so values that shared a row (or column) before the
    call generally no longer do afterwards.  The input frame is not modified.

    The previous implementation called ``df.apply(np.random.shuffle, axis=axis)``
    and depended on ``apply`` handing out views that could be mutated in place.
    That is not guaranteed, so the frame could come back unshuffled.  The
    permutation is now written back explicitly.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shuffle.
    n : int
        Number of shuffle passes to apply.
    axis : {0, 'index', 1, 'columns'}
        Axis along which values are permuted.

    Returns
    -------
    pandas.DataFrame
        Shuffled copy of ``df``.

    Raises
    ------
    ValueError
        If ``axis`` is not one of the accepted values.
    """
    if axis in (1, 'columns'):
        # Shuffling within rows is the same as shuffling within the columns
        # of the transposed frame.
        return shuffle(df.T, n=n, axis=0).T
    if axis not in (0, 'index'):
        raise ValueError('No axis named %r' % (axis,))

    shuffled = df.copy()
    for _ in range(n):
        # Positional access keeps this correct even with duplicate column names.
        for col_pos in range(shuffled.shape[1]):
            column_values = shuffled.iloc[:, col_pos].values
            shuffled.iloc[:, col_pos] = np.random.permutation(column_values)
    return shuffled

def sample_faces(csv_filepath, p=0.1, shuffle_flag=True):
    """Read a CSV and return the first ``p`` fraction of its rows.

    Parameters
    ----------
    csv_filepath : str or file-like
        Anything ``pandas.read_csv`` accepts.
    p : float
        Fraction of rows to keep; the row count is ``int(p * number_of_rows)``.
    shuffle_flag : bool
        When true, the rows are randomly permuted before the leading slice is
        taken, so the result is a random sample rather than the file's head.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with a fresh ``0..m-1`` index.

    Notes
    -----
    Whole rows are permuted.  The earlier version shuffled every column
    independently, which broke the association between the values of a row,
    and sliced with the ``.ix`` indexer that newer pandas no longer provides.
    """
    df = pd.read_csv(csv_filepath)
    if shuffle_flag:
        row_order = np.random.permutation(len(df))
        # Renumber the index so the output looks like the unshuffled case.
        df = df.iloc[row_order].reset_index(drop=True)
    # Guard against a negative count, which would slice from the wrong end.
    m = max(0, int(p * len(df)))
    return df.iloc[:m]


In [5]:
# Ids of the users to process; other cells iterate over this list with
# `for user_id in user_ids`.
user_ids = [1946418, 8657185, 5626377, 5, 5692777, 3473194, 3928074, 4619758, 2685009, 1496616, 1341, 8, 34, 6007945]

In [122]:
# Exploration: join one user's face image URLs and capture timestamps with the
# CNN feature vectors stored in that user's pickle.
user_id = 2685009
cluster_settings_ver = 1.2
comp_type = 'None'
base_dirpath = '/Users/babasarala/Desktop/face_clustering_full_data'

url_csv_filepath = '%s/%i/%i_prod_fb_tags_img_urls_%.1f_%s.csv' % (
    base_dirpath, user_id, user_id, cluster_settings_ver, comp_type)
orig_csv_filepath = '%s/%i/%i_prod_fb_tags_%.1f.csv' % (
    base_dirpath, user_id, user_id, cluster_settings_ver)

# Pass the paths straight to pandas so it opens and closes the files itself,
# instead of leaving handles from bare open() calls dangling.
url_df = pd.read_csv(url_csv_filepath)
orig_df = pd.read_csv(orig_csv_filepath)

# Keep one row per (face_id, img_url, captured_at) combination.  Assigning the
# result avoids an in-place drop on a column slice of the merged frame.
url_with_tstamp_df = pd.merge(url_df, orig_df, on=['face_id'])
url_with_tstamp_df = url_with_tstamp_df[['face_id', 'img_url', 'captured_at']].drop_duplicates()

# Build the pickle path from the same settings as the CSV paths rather than
# repeating the literal "1.2" / "None" (the resulting string is unchanged).
pkl_filepath = '%s/%i/%i_prod_%.1f_%s_cnn_codes.p' % (
    base_dirpath, user_id, user_id, cluster_settings_ver, comp_type)
# NOTE: unpickling can execute arbitrary code; only load files you produced.
with open(pkl_filepath, 'rb') as pkl_file:
    img_urls, X = pkl.load(pkl_file)

# One named column per feature dimension, plus the URL used as the join key.
feat_cols = ['f%i' % i for i in range(X.shape[1])]
img_url_x_df = pd.DataFrame(data=X, columns=feat_cols)
img_url_x_df['img_url'] = img_urls
full_df = pd.merge(url_with_tstamp_df, img_url_x_df, on='img_url')

# Display the joined frame ordered by capture time.  The argument-less
# DataFrame.sort() used before is no longer available in pandas and never
# named the timestamp column this cell is about.
full_df.sort_values('captured_at')

In [27]:
def compute_hcv_metrics(user_id, vis_tag='hyperopt_sample', grid_size=10, sigma=3.576,
                        learning_rate=0.0005064, num_trials=3, num_iter=478, reduce_dim=True,
                        num_dim=1024, shuffle_type='random', sample_frac=0.1):
    """Cluster one user's faces with the SOM clusterer and score the result.

    Loads the user's tagged-cluster CSV, image-URL CSV and pickled
    ``(img_urls, X)`` features, optionally samples the photos, runs
    ``scl.SOMClusterer``, joins the clusters with the ground-truth tags and
    computes homogeneity, completeness and V-measure.  An HTML visualisation of
    the clusters is written under ``<base>/<user_id>/cluster_htmls/``.

    Parameters
    ----------
    user_id : int
        User whose data files are read.
    vis_tag : str
        Tag embedded in the visualisation file name.
    grid_size, sigma, learning_rate, num_trials, num_iter
        Passed through unchanged to ``scl.SOMClusterer``.
    reduce_dim, num_dim
        Accepted for interface compatibility; not used by this function.
    shuffle_type : str
        ``'random'`` keeps a random ``sample_frac`` of the photos.
        ``'tstamp'`` is unfinished: it builds the timestamp-joined frame but
        applies no sampling, so clustering runs on every loaded photo (a
        warning is emitted).  Any other value also leaves the data unsampled.
    sample_frac : float
        Fraction of photos kept when ``shuffle_type == 'random'``.

    Returns
    -------
    tuple
        ``(homogeneity, completeness, v_measure)``.  The photo count and the
        three scores are also printed on one line.

    Notes
    -----
    The global NumPy RNG is re-seeded with 0 on every call so the random
    sample is reproducible.
    """
    import warnings

    # keep these hard-coded for the time being...
    base_dirpath = '/Users/babasarala/Desktop/face_clustering_full_data'
    cluster_settings_ver = 1.2
    comp_type = 'None'

    tagged_csv_filepath = '%s/%i_clusters.csv' % (base_dirpath, user_id)
    url_csv_filepath = '%s/%i/%i_prod_fb_tags_img_urls_%.1f_%s.csv' \
                       % (base_dirpath, user_id, user_id, cluster_settings_ver, comp_type)
    # Derived from the same settings as the CSV path (same string as before).
    pkl_filepath = '%s/%i/%i_prod_%.1f_%s_cnn_codes.p' \
                   % (base_dirpath, user_id, user_id, cluster_settings_ver, comp_type)

    # Let pandas / the with-block own the file handles so none are leaked.
    tagged_df = pd.read_csv(tagged_csv_filepath)
    url_df = pd.read_csv(url_csv_filepath)
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(pkl_filepath, 'rb') as pkl_file:
        img_urls, X = pkl.load(pkl_file)

    np.random.seed(0)
    if shuffle_type == 'random':
        n = len(img_urls)
        m = int(n * sample_frac)
        sample_idxs = np.random.permutation(n)[:m]
        img_urls = [img_urls[idx] for idx in sample_idxs]
        X = X[sample_idxs]
        # Peek at the sampled features.  The single-argument parenthesised form
        # prints the same text under Python 2 and Python 3.
        print(X[:5, :5])
    elif shuffle_type == 'tstamp':
        # combine with a few other csv files to get timestamps, sort by that field and then sample
        # (this is closer to reality)
        orig_csv_filepath = '%s/%i/%i_prod_fb_tags_%.1f.csv' \
                            % (base_dirpath, user_id, user_id, cluster_settings_ver)
        orig_df = pd.read_csv(orig_csv_filepath)
        url_with_tstamp_df = pd.merge(url_df, orig_df, on=['face_id'])
        url_with_tstamp_df = url_with_tstamp_df[['face_id', 'img_url', 'captured_at']].drop_duplicates()
        feat_cols = ['f%i' % i for i in range(X.shape[1])]
        img_url_x_df = pd.DataFrame(data=X, columns=feat_cols)
        img_url_x_df['img_url'] = img_urls
        full_df = pd.merge(url_with_tstamp_df, img_url_x_df, on='img_url')

        # UNFINISHED!!!! full_df is built but not yet used to sort or sample,
        # so say so instead of silently scoring the unsampled data.
        warnings.warn("shuffle_type='tstamp' is unfinished: no sampling is applied, "
                      "clustering runs on all loaded photos")

    scl_ = scl.SOMClusterer(img_urls, X, grid_size=grid_size, sigma=sigma, learning_rate=learning_rate,
                            num_trials=num_trials, num_iter=num_iter)
    cluster_df = scl_.run()

    # Attach face ids to the clustered URLs, then the ground-truth tags.
    gt_df = pd.merge(cluster_df, url_df, left_on='face_url', right_on='img_url')
    cols = ['face_id', 'tag', 'memorable_id', 'cluster_idx', 'face_url']

    merged_df = pd.merge(gt_df, tagged_df, on=['face_id', 'memorable_id', 'user_id'])[cols]
    true_labels = list(merged_df['tag'].values)
    cluster_labels = list(merged_df['cluster_idx'].values)
    h, c, v = homogeneity_completeness_v_measure(true_labels, cluster_labels)

    # visualize!
    cluster_map = clu.get_map_from_dataframe(cluster_df, 'face_url', 'cluster_idx')
    clu.visualize_clusters(cluster_map, '%s/%i/cluster_htmls/%i_%s_visualization.html' % (base_dirpath,
                                                                                          user_id,
                                                                                          user_id,
                                                                                          vis_tag))
    # Same space-separated line as before; %s formats each score via str().
    print('%i %s %s %s' % (len(img_urls), h, c, v))
    return h, c, v

In [28]:
# Benchmark SOM settings on a 10% random sample.
# The full user list is kept for reference; the assignment right after it
# overrides the list so only a single user is processed.
user_ids = [1946418, 8657185, 5626377, 5, 5692777, 3473194, 3928074, 4619758, 2685009, 1496616, 1341, 8, 34, 6007945]
user_ids = [1946418]
for user_id in user_ids:
    compute_hcv_metrics(
        user_id,
        vis_tag='benchmark_10_perc',
        grid_size=15,
        sigma=1.0,
        learning_rate=0.25,
        num_trials=5,
        num_iter=100,
        reduce_dim=False,
        shuffle_type='random',
    )
    

[[ -3.87861347  -0.77776706 -16.28775215  -0.61007917  -0.40619224]
 [  0.8169356   -0.66338706  -6.88083935  -0.73424685  -0.33310664]
 [ -0.29154292  -0.7365123    2.05267668  -0.95801759  -0.51999474]
 [  1.12536311  -0.82720637 -17.75845909  -1.1278187   -0.56964034]
 [ -0.50154841  -0.72430342   2.59770107  -0.69903916  -0.3793076 ]]
44 1.0 0.611993219081 0.759299991882


In [25]:
# Hyperopt-derived SOM settings on a 10% random sample.
# The full user list is kept for reference; the assignment right after it
# overrides the list so only a single user is processed.
user_ids = [1946418, 8657185, 5626377, 5, 5692777, 3473194, 3928074, 4619758, 2685009, 1496616, 1341, 8, 34, 6007945]
user_ids = [1946418]
for user_id in user_ids:
    compute_hcv_metrics(
        user_id,
        vis_tag='hyperopt_10_perc',
        grid_size=10,
        sigma=3.576,
        learning_rate=0.0005064,
        num_trials=3,
        num_iter=478,
        reduce_dim=False,
        shuffle_type='random',
    )
    

44 0.591537028916 0.834434397825 0.692298366234
193 0.288302480055 0.421683542297 0.342464237929
121 0.430053332688 0.551833314035 0.483391349869
184 0.106227369508 0.504960393925 0.175529084716
15 0.662233978601 0.881794486449 0.756403504583
106 0.363053591953 0.378849133993 0.370783214819
252 0.264690941954 0.406264526222 0.320541512004
4 1.0 1.0 1.0
78 0.298862367847 0.485630658307 0.37001406922
252 0.207056197824 0.339957105245 0.2573620248
718 0.476334614458 0.337790283511 0.395274004923
388 0.34837558185 0.538055293851 0.42292147352
43 0.59564306101 0.860225440967 0.703892300878
71 0.487927998684 0.631386501892 0.550463970771


Playing with clustering metrics

In [55]:
# Show the ground-truth tags next to the predicted cluster ids.
# NOTE(review): in the visible cells these two names are only assigned inside
# compute_hcv_metrics, so this cell appears to rely on leftover kernel state.
print(true_labels)
print(cluster_labels)

['Hayden', 'Fai Alqadi', 'Diala Alqadi', 'Manu', 'Ammar', 'Justin Williams', 'Fai Alqadi', 'Fai Alqadi', 'Fai Alqadi', 'Manu', 'Baba', 'Baba', 'Diala Alqadi', 'Fai Alqadi', 'Manu', 'Heather', 'Diala Alqadi', 'Fai Alqadi', 'johnny', 'Talal ', 'Fai Alqadi', 'Kylie Mckenzie', 'Diala Alqadi', 'Fai Alqadi', 'akram', 'akram', 'Lhazin ', 'Fai Alqadi', 'Diala Alqadi', 'Talal ', 'Fai Alqadi', 'Lhazin ', 'Lhazin ', 'Kylie Mckenzie', 'Fai Alqadi', 'Hanna Kirby', 'Fai Alqadi', 'Fai Alqadi', 'Diala Alqadi', 'Diala Alqadi', 'rey', 'Amy', 'jake', 'Fai Alqadi', 'Fai Alqadi', 'Lina', 'Fai Alqadi', 'Kylie Mckenzie', 'Hanna Kirby', 'soheil', 'Kylie Mckenzie', 'Fai Alqadi']
[0.0, 1.0, 2.0, 4.0, 5.0, 7.0, 8.0, 9.0, 9.0, 10.0, 12.0, 12.0, 16.0, 18.0, 19.0, 21.0, 24.0, 26.0, 28.0, 29.0, 30.0, 31.0, 34.0, 35.0, 36.0, 36.0, 38.0, 41.0, 2.0, 43.0, 44.0, 46.0, 47.0, 49.0, 51.0, 65.0, 66.0, 55.0, 68.0, 69.0, 70.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 60.0, 62.0, 63.0, 64.0]


In [60]:
# Render the current clustering to an HTML page for manual inspection.
# NOTE(review): scl_, cluster_df and vis_tag are not assigned at top level in
# the visible cells; this cell appears to rely on leftover kernel state.
cluster_map, _, _ = scl_.convert_to_maps(cluster_df)
clu.visualize_clusters(
    cluster_map,
    '%s/%i/cluster_htmls/%i_%s_sample_visualization.html' % (base_dirpath, user_id, user_id, vis_tag))

In [59]:
# Homogeneity, completeness and V-measure of the predicted clusters against
# the ground-truth tags, printed space-separated on one line.
h, c, v = homogeneity_completeness_v_measure(true_labels, cluster_labels)
print('%s %s %s' % (h, c, v))

1.0 0.634637661925 0.776487262844


In [47]:
# Load one user's pickled (img_urls, X) pair and draw a 10% random sample.
# base_dirpath and user_id are taken from whatever earlier cells left in the kernel.
pkl_filepath = '%s/%i/%i_prod_1.2_None_cnn_codes.p'%(base_dirpath, user_id, user_id)
# The with-block closes the file deterministically instead of leaking the handle.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open(pkl_filepath, 'rb') as pkl_file:
    img_urls, X = pkl.load(pkl_file)
n = len(img_urls)
sample = 0.1
m = int(n*sample)
# The first m entries of a random permutation give m distinct random positions.
sample_idxs = np.random.permutation(n)[:m]
sample_img_urls = [img_urls[idx] for idx in sample_idxs]
X_samp = X[sample_idxs]

44

In [38]:
# Wrap the image URLs in a single-column frame.
# NOTE(review): img_urls must already exist in the kernel from an earlier cell.
# The saved output shows a table, but a cell ending in an assignment displays
# nothing, so that output presumably comes from an earlier version of this cell.
df = pd.DataFrame({'img_url': img_urls})

Unnamed: 0,img_url
0,/Users/babasarala/Desktop/face_clustering_full...
1,/Users/babasarala/Desktop/face_clustering_full...
2,/Users/babasarala/Desktop/face_clustering_full...
3,/Users/babasarala/Desktop/face_clustering_full...
4,/Users/babasarala/Desktop/face_clustering_full...
5,/Users/babasarala/Desktop/face_clustering_full...
6,/Users/babasarala/Desktop/face_clustering_full...
7,/Users/babasarala/Desktop/face_clustering_full...
8,/Users/babasarala/Desktop/face_clustering_full...
9,/Users/babasarala/Desktop/face_clustering_full...


In [16]:
# For every user, sample that user's cluster CSV with sample_faces' defaults
# and write the sample next to the source file.
base_dir = '/Users/babasarala/Desktop/face_clustering_full_data'
for user_id in user_ids:
    csv_filepath, csv_savepath = (
        '%s/%i_clusters.csv'%(base_dir, user_id),
        '%s/%i_sample_clusters.csv'%(base_dir, user_id),
    )
    sampled_df = sample_faces(csv_filepath)
    sampled_df.to_csv(csv_savepath)