The goal of this notebook is to see how h,c,v metrics change with more data

In [4]:
from cnns.utils import clustering_utils as clu
from cnns.utils import cnn_utils as cu
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import fcluster
from scipy.spatial.distance import cdist, pdist, squareform
from sklearn.metrics.cluster import homogeneity_completeness_v_measure
from cnns.core import FaceClusterer as face_clusterer
from collections import defaultdict
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import pickle as pkl
import numpy as np
import fastcluster as fc

In [None]:
%load_ext autoreload
%autoreload 2

In [6]:
%matplotlib inline
plt.style.use('bmh')
matplotlib.rcParams['figure.figsize'] = (20.0, 10.0)

In [9]:
feat_cols = ['f%i' % i for i in range(128)]

def get_tagged_data_sorted_by_tstamp(user_id):
    base_dirpath = '/Users/babasarala/Desktop/face_clustering_full_data'
    url_csv_filepath = '%s/%i/%i_prod_fb_tags_img_urls_1.2_None.csv'%(base_dirpath, user_id, user_id)
    orig_csv_filepath = '%s/%i/%i_prod_fb_tags_1.2.csv'%(base_dirpath, user_id, user_id)
    cnn_codes_filepath = '/Users/babasarala/Desktop/face_clustering/openface_cnn_codes/%i_cnn_codes.p'%(user_id)
    ground_truth_filepath = '/Users/babasarala/Desktop/face_clustering_full_data/%i_clusters.csv'%(user_id)

    # load csv file with local image url info and join together to get timestamps
    url_df = pd.read_csv(open(url_csv_filepath, 'rb'))
    orig_df = pd.read_csv(open(orig_csv_filepath, 'rb'))
    url_with_tstamp_df = pd.merge(url_df, orig_df, on=['face_id'])
    url_with_tstamp_df = url_with_tstamp_df[['face_id', 'img_url', 'captured_at']]
    url_with_tstamp_df.drop_duplicates(inplace=True)

    # join with tagged data
    tagged_df = pd.read_csv(open(ground_truth_filepath, 'rb'))
    gt_df = pd.merge(url_with_tstamp_df, tagged_df, on='face_id')
    
    img_urls, X = pkl.load(open(cnn_codes_filepath, 'rb'))
    img_url_x_df = pd.DataFrame(data=X, columns=feat_cols)
    img_url_x_df['img_url'] = img_urls
    
    full_df = pd.merge(gt_df, img_url_x_df, on='img_url')
    sorted_df = full_df.sort('captured_at')
    
    return sorted_df

def update_cluster_tags(C, df, user_tagging_thresh):
    # this function takes a cluster, looks at what's already been tagged and 
    # uses the following policy to assign new labels.
    #
    # 1. If the new cluster has no tags, but is more than user_tagging_thresh percent
    #    homogeneous, decide to tag that majority (leaving the rest unconfirmed)
    # 
    # 2. If the cluster's homogeneity is less than user_tagging_thresh, leave it alone
    #    and wait for future clusterings
    #
    # 3. If the cluster has any pre-filled tags and there isn't a conflict, 
    #    propagate the label. Assume that the user does NOT check the result of this
    
    num_confirms = 0
    for _, samples in C.iteritems():
        
        # look at what's already been tagged
        curr_user_tags = df.iloc[samples]['inc_tag'].value_counts()
        
        # if there is no tag at all for this cluster...
        if len(curr_user_tags) == 0:
            true_user_tags = df.iloc[samples]['tag'].value_counts()
            p = 1.*true_user_tags[0]/np.mean(true_user_tags)
            # ... simulate tagging under the conditions. if the cluster is not homogenous enough 
            # (i.e. p < user_tagging_thresh), assume that the user skips it
            if p >= user_tagging_thresh:
                # otherwise, assume that the dominant tag propagates (the rest are left "unconfirmed")
                num_confirms += 1
                s = df.iloc[samples]['tag'] == true_user_tags.index[0]
                majority_idxs = s[s==True].index
                df.loc[majority_idxs, 'inc_tag'] = true_user_tags.index[0]
            continue
        
        # dict of k->v: tag-> number of tag in cluster
        tag_to_nums = {tag:num_c for tag, num_c in zip(curr_user_tags.index.tolist(), curr_user_tags.tolist())}
        if len(tag_to_nums) > 1: # points to inconsistency
            continue
        else:
            # propagate the label
            df.loc[df.iloc[samples].index, 'inc_tag'] = curr_user_tags.index[0]
            
    return num_confirms

In [64]:
# Different clustering schemes
def perform_batch_clustering(user_id, dist_thresh=0.5, user_tagging_thresh=0.75, save_htmls=False, min_size=1):
    # How do the results compare if we did the clustering all at once?
    df = get_tagged_data_sorted_by_tstamp(user_id)
    df['inc_tag'] = None 
    X = df[feat_cols].values
    Z = fc.linkage(X, 'average')
    cluster_ids = fcluster(Z, dist_thresh, criterion='distance')
    C = defaultdict(list)
    for idx, cluster_id in enumerate(cluster_ids):
        C[cluster_id].append(idx)
    
    C = {cluster_id:data_samples for cluster_id, data_samples in C.iteritems() if len(data_samples) > min_size}
    
    num_confirms = update_cluster_tags(C, df, user_tagging_thresh)

    # 3. Write out html files ()
    #if save_htmls:
    #    cluster_map = {img_url:cluster_id for img_url, cluster_id in \
    #                   zip(df['img_url'].values, df['inc_tag'].values)}
    #    clu.visualize_clusters(cluster_map, curr_html_filepath)

    return df, num_confirms

def perform_incremental_clustering(user_id, dist_thresh=0.5, batch_size=50, user_thresh=0.75, 
                                   save_htmls=False, min_size=1):
    df = get_tagged_data_sorted_by_tstamp(user_id)
    df['inc_tag'] = None # incremental tags - start everything off as "None"
    user_tagging_thresh = 0.75 # simulate a user threshold for tagging vs skipping
    N = len(df)
    num_confirms_per_session = []

    idxs = range(0, N, batch_size) + [N]
    for n in idxs[1:]:
        curr_html_filepath = '/Users/babasarala/Desktop/face_clustering/incremental/%i_%i_%.2f_%i.html'% \
        (user_id, n, dist_thresh, min_size)
        X = df.iloc[:n][feat_cols].values

        # 1. Cluster the fvs
        Z = fc.linkage(X, 'average')
        cluster_ids = fcluster(Z, dist_thresh, criterion='distance')

        C = defaultdict(list)
        for idx, cluster_id in enumerate(cluster_ids):
            C[cluster_id].append(idx)
        
        # 2. Eliminate clusters that are < min_size - this should reduce the number of "confirmations"
        #    that we'll require of the user
        C = {cluster_id:data_samples for cluster_id, data_samples in C.iteritems() if len(data_samples) > min_size}

        # 2. Update cluster
        num_confirms = update_cluster_tags(C, df, user_tagging_thresh)
        num_confirms_per_session.append(num_confirms)

        # 3. Write out html files ()
        if save_htmls:
            cluster_map = {img_url:cluster_id for img_url, cluster_id in \
                           zip(df.iloc[:n]['img_url'].values, df.iloc[:n]['inc_tag'].values)}
            clu.visualize_clusters(cluster_map, curr_html_filepath)

    return df, np.mean(num_confirms_per_session)


def print_metrics(df, with_annot=False):
    true_labels = df['tag'].values
    pred_labels = df['inc_tag'].values
    
    num_omitted = np.sum([pred_label is None for pred_label in pred_labels])
    perc_omitted = 100.*num_omitted/len(df)
    acc = 100.*np.mean([true_label == pred_label for true_label, pred_label in zip(true_labels, pred_labels) \
                        if pred_label is not None])
    
    if with_annot:
        print 'Accuracy: %.2f%%, Percent of faces omitted: %.2f%% (%i/%i), Mean number of confirms required: %i'\
        %(acc, perc_omitted, num_omitted, len(df), mean_num_confirms)
    else:
        print acc, perc_omitted, mean_num_confirms
    

# Single account test

In [65]:
user_id = 5626377
dist_thresh = 0.5
df, mean_num_confirms = perform_batch_clustering(user_id, dist_thresh=dist_thresh, save_htmls=True)
print_metrics(df, with_annot=True)

Accuracy: 100.00%, Percent of faces omitted: 31.33% (303/967), Mean number of confirms required: 206


In [274]:
user_id = 5626377
dist_threshs = np.arange(0.4, 0.85, 0.05)
for dist_thresh in dist_threshs:
    acc, num_omitted, mean_num_confirms = perform_incremental_clustering(user_id, 
                                                                         dist_thresh=dist_thresh)
    print acc, num_omitted, mean_num_confirms

99.1726990693 0 34.5
98.2419855222 0 29.0
95.0361944157 9 23.3
91.8304033092 19 18.45
88.728024819 34 14.2
81.7993795243 77 11.35
72.5956566701 168 8.45
70.423991727 180 6.8
58.6349534643 262 5.2


In [51]:
user_ids = [1946418, 8657185, 5626377, 5, 5692777, 3473194, 3928074, 4619758, 2685009, 1496616, 1341, 8, 34]

In [53]:
len(user_ids)

14

In [52]:
dist_thresh = 0.5
for user_id in user_ids:
    df, mean_num_confirms = perform_incremental_clustering(user_id, dist_thresh=dist_thresh, save_htmls=True)
    print_metrics(df, with_annot=False)

96.6183574879 33.2258064516 8.0
95.4861111111 33.9449541284 7.55555555556
94.1690962099 29.0589451913 8.95
99.148029819 31.3094367228 7.92857142857
95.0 61.5384615385 5.0
100.0 32.8244274809 10.1875
98.8458927359 21.4818763326 6.39473684211
100.0 63.4146341463 7.0
99.1701244813 52.652259332 7.0
96.3870967742 14.2224681793 3.91891891892
99.231678487 25.165855816 7.5
97.8502080444 26.8020304569 8.9
100.0 20.6451612903 5.75


UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 6: ordinal not in range(128)

In [None]:
user_id = 5626377
plt.plot(range(1,len(num_tags_per_session)+1), num_tags_per_session, label='T = %.2f'%(dist_thresh))
plt.xlabel('Session'); plt.ylabel('Number of user taggings')
plt.title('Number of user taggings per session, Acc = %.2f%%, Num. omissions = %i'%(acc, num_omitted))
plt.legend(loc='upper right')
plt.show()