In [81]:
import pandas as pd
import numpy as np
import pickle as pkl
from cnns.utils import clustering_utils as clu
from cnns.core import SOMClusterer as scl
from sklearn.metrics.cluster import homogeneity_completeness_v_measure
from cnns.core import FaceClustererFactory as fcf

In [3]:
%load_ext autoreload
%autoreload 2

In [91]:
def shuffle(df, n=1, axis=0):
    """Return a copy of ``df`` whose values are randomly permuted along ``axis``.

    With ``axis=0`` (or ``'index'``) the values inside every column are
    permuted independently of the other columns, so rows are NOT kept
    intact. With ``axis=1`` (or ``'columns'``) the values inside every row
    are permuted independently. The index and column labels are unchanged
    and the input frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shuffle.
    n : int
        Number of shuffling passes; ``0`` returns an unshuffled copy.
    axis : {0, 'index', 1, 'columns'}
        Axis along which values are permuted.

    Returns
    -------
    pandas.DataFrame
        The shuffled copy. For ``axis=1`` the frame is rebuilt from
        ``df.values``, so a frame with mixed column dtypes comes back with
        the common dtype of that array.

    Raises
    ------
    ValueError
        If ``axis`` is not one of the accepted values.
    """
    if axis in (0, 'index'):
        within_columns = True
    elif axis in (1, 'columns'):
        within_columns = False
    else:
        raise ValueError('No axis named %r' % (axis,))

    shuffled = df.copy()
    if within_columns:
        # Assign the permuted values back explicitly instead of relying on
        # DataFrame.apply handing np.random.shuffle a writable view.
        for _ in range(n):
            for pos in range(shuffled.shape[1]):
                column_values = shuffled.iloc[:, pos].values
                shuffled.iloc[:, pos] = np.random.permutation(column_values)
        return shuffled

    cells = shuffled.values.copy()
    for _ in range(n):
        for row in cells:
            # Each ``row`` is a view into ``cells``, so this shuffles in place.
            np.random.shuffle(row)
    return pd.DataFrame(cells, index=df.index, columns=df.columns)

def sample_faces(csv_filepath, p=0.1, shuffle_flag=True):
    """Load a CSV and return the leading fraction ``p`` of its rows.

    Parameters
    ----------
    csv_filepath : str
        Path of the CSV file, read with ``pandas.read_csv``.
    p : float
        Fraction of rows to keep; ``int(p * len(df))`` rows are returned.
    shuffle_flag : bool
        When true, the rows are put in a random order before the leading
        rows are taken, which turns the result into a random sample.

    Returns
    -------
    pandas.DataFrame
        The selected rows. When ``shuffle_flag`` is true the index is
        renumbered from 0.
    """
    df = pd.read_csv(csv_filepath)
    if shuffle_flag:
        # Permute whole rows: shuffling every column independently would
        # pair values taken from different records.
        order = np.random.permutation(len(df))
        df = df.iloc[order].reset_index(drop=True)
    m = max(int(p * len(df)), 0)
    # Positional slice; the removed ``.ix`` indexer sliced by label.
    return df.iloc[:m]

def get_sample_img_urls_X_by_tstamp(user_id, n=50, fv_type='openface',
                                    base_dirpath='/Users/babasarala/Desktop/face_clustering_full_data',
                                    cnn_codes_dirpath='/Users/babasarala/Desktop/face_clustering'):
    """Return the ``n`` earliest-captured face images of a user with their feature vectors.

    Reads three files for ``user_id``:

    * ``<base_dirpath>/<id>/<id>_prod_fb_tags_img_urls_1.2_None.csv``
      (columns used: ``face_id``, ``img_url``),
    * ``<base_dirpath>/<id>/<id>_prod_fb_tags_1.2.csv``
      (columns used: ``face_id``, ``captured_at``),
    * ``<cnn_codes_dirpath>/<fv_type>_cnn_codes/<id>_cnn_codes.p``, a pickle
      holding an ``(img_urls, X)`` pair where ``X`` is a 2-D array with one
      row per entry of ``img_urls``.

    The two CSVs are joined on ``face_id``, the result is joined with the
    feature vectors on ``img_url``, and the rows are ordered by
    ``captured_at`` ascending.

    Parameters
    ----------
    user_id : int
        User whose files are loaded (formatted with ``%i``).
    n : int
        Maximum number of rows to return.
    fv_type : str
        Feature-vector family; selects the ``<fv_type>_cnn_codes`` directory.
    base_dirpath : str
        Directory holding the per-user CSV folders.
    cnn_codes_dirpath : str
        Directory holding the ``<fv_type>_cnn_codes`` folders.

    Returns
    -------
    (list, numpy.ndarray)
        The image URLs and the matching feature rows, in the same order.
    """
    url_csv_filepath = '%s/%i/%i_prod_fb_tags_img_urls_1.2_None.csv' % (base_dirpath, user_id, user_id)
    orig_csv_filepath = '%s/%i/%i_prod_fb_tags_1.2.csv' % (base_dirpath, user_id, user_id)
    cnn_codes_filepath = '%s/%s_cnn_codes/%i_cnn_codes.p' % (cnn_codes_dirpath, fv_type, user_id)

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(cnn_codes_filepath, 'rb') as codes_file:
        img_urls, X = pkl.load(codes_file)
    # Passing the path lets pandas open and close the file itself.
    orig_df = pd.read_csv(orig_csv_filepath)
    url_df = pd.read_csv(url_csv_filepath)

    url_with_tstamp_df = pd.merge(url_df, orig_df, on=['face_id'])
    # Non-inplace drop_duplicates avoids mutating a slice of the merged frame.
    url_with_tstamp_df = url_with_tstamp_df[['face_id', 'img_url', 'captured_at']].drop_duplicates()

    feat_cols = ['f%i' % i for i in range(X.shape[1])]
    img_url_x_df = pd.DataFrame(data=X, columns=feat_cols)
    img_url_x_df['img_url'] = img_urls

    full_df = pd.merge(url_with_tstamp_df, img_url_x_df, on='img_url')
    # DataFrame.sort was removed from pandas; sort_values is its replacement.
    sorted_df = full_df.sort_values('captured_at')
    n = min(n, len(sorted_df))
    sample_data = sorted_df.head(n)
    return sample_data['img_url'].tolist(), sample_data[feat_cols].values

def run_clustering_on_samples(user_id, fv_type='openface', model='dbscan', version='1.0'):
    """Cluster a user's earliest face samples and write an HTML visualisation.

    Loads the sample URLs and feature vectors for ``user_id``, builds a
    clusterer from ``<model>_settings_v<version>.ini``, runs it, and passes
    the resulting cluster map to ``clu.visualize_clusters`` together with an
    output path of the form
    ``<FvType>+<Model>_<version>_samples/<user_id>_clusters.html``.

    Parameters
    ----------
    user_id : int
        User whose samples are clustered.
    fv_type : {'openface', 'vgg'}
        Feature-vector family to load; also used in the output path.
    model : {'dbscan', 'agg'}
        Clustering model; selects the settings file and the output path.
    version : str
        Settings-file version, also used in the output path.

    Raises
    ------
    ValueError
        If ``fv_type`` or ``model`` is not one of the supported names. The
        check happens before any data is loaded or clustered.
    """
    fv_type_labels = {'openface': 'OpenFace', 'vgg': 'VGG'}
    model_labels = {'dbscan': 'DBSCAN', 'agg': 'Agg'}

    # Validate up front: otherwise an unknown name only fails after the whole
    # clustering run, when the output path is formatted.
    if fv_type not in fv_type_labels:
        raise ValueError('unsupported fv_type %r; expected one of %s'
                         % (fv_type, sorted(fv_type_labels)))
    if model not in model_labels:
        raise ValueError('unsupported model %r; expected one of %s'
                         % (model, sorted(model_labels)))
    fv_type_fmt = fv_type_labels[fv_type]
    model_fmt = model_labels[model]

    # Forward fv_type so the loaded vectors match the label in the output path.
    samp_img_urls, samp_X = get_sample_img_urls_X_by_tstamp(user_id, fv_type=fv_type)
    clusterer = fcf.FaceClustererFactory.from_config_file('/Users/babasarala/repos/cnns/config/%s_settings_v%s.ini'\
                                                          %(model, version))
    clusterer.load_data(samp_img_urls, samp_X)
    cluster_df = clusterer.run()
    cluster_map = clusterer.convert_to_maps(cluster_df)
    clu.visualize_clusters(cluster_map, '/Users/babasarala/Desktop/%s+%s_%s_samples/%i_clusters.html'\
                           %(fv_type_fmt, model_fmt, version, user_id))

In [92]:
# Run configuration: feature-vector family, clustering model and settings version.
fv_type = 'openface'
model = 'dbscan'
version = '1.1'
user_ids = [1946418, 8657185, 5626377, 5, 5692777, 3473194, 3928074, 4619758, 2685009, 1496616, 1341, 8, 34, 6007945]
for user_id in user_ids:
    try:
        run_clustering_on_samples(user_id, fv_type, model, version)
    except Exception as err:
        # Best-effort batch: keep going after a failing user, but report why
        # it failed. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit still stop the loop.
        print("uhh..%i didnt' work..." % (user_id) + " (%s: %s)" % (type(err).__name__, err))