In [1]:
%load_ext autoreload
%autoreload 2

from repo import *
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import roc_curve, auc

In [2]:
# Look up the train / test file names, then keep the training labels
# as an alphabetically sorted array.
train_files, test_files = train_test_all_ids('/data/home/shruti/voxceleb/vgg/leaders/')
all_lbls = np.array(sorted(train_files.keys()))

In [3]:

# repo with leaders + ff original + imposters + Google
def get_train_repo(params):
    """Create and build the reference repo that test files are compared against.

    Enrolled identities, all looked up in the module-level ``train_files``:
    ten leader ids and six ``*_imposter`` ids, ``FF_000``..``FF_999``,
    ``GG_01``..``GG_28``, ``steve_b`` and ``jen_l`` (1046 labels in total).

    Args:
        params: dict providing 'bsfldr', 'frames' and 'steps' (passed to
            ``Repo_maj_pool``) and 'N_comp' (passed to ``build_repo``).
            Other keys, such as 'istrain', are not read here.

    Returns:
        The ``Repo_maj_pool`` instance after ``build_repo`` has run.
    """
    leader_ids = ['bo','br','bs','cb','dt','ew','hc','jb','kh', 'pb',
                  'bo_imposter','bs_imposter','ew_imposter','dt_imposter',
                  'hc_imposter', 'jb_imposter']
    ff_ids = ['FF_{0:03d}'.format(i) for i in range(1000)]
    gg_ids = ['GG_{0:02d}'.format(i) for i in range(1, 29)]

    repo = Repo_maj_pool(params['bsfldr'], params['frames'], params['steps'], np.mean, 5000)

    # (ids, n1, n2) in enrolment order. Only the FF ids use the 0-0.5 window;
    # presumably n1/n2 are the start/end fraction of each file -- TODO confirm.
    enrolment = [
        (leader_ids, 0, 1),
        (ff_ids, 0, 0.5),
        (gg_ids, 0, 1),
        (['steve_b'], 0, 1),
        (['jen_l'], 0, 1),
    ]
    for ids, n1, n2 in enrolment:
        repo.add_ids({name: train_files[name] for name in ids}, n1, n2)

    repo.build_repo(params['N_comp'])
    return repo


def save_results(vgg_repo, fab_repo, test_ids, r_or_f, N_f, out_file, n1, n2, parallel=False):
    """Score the same test files with both repos and write the paired labels to CSV.

    Args:
        vgg_repo, fab_repo: repos exposing ``dist_using_dict``; each call must
            return a DataFrame with 'fileName', 'predLabel' and 'actualLabel'
            columns.
        test_ids: identity keys to look up in ``test_files[r_or_f]`` (the
            module-level test-file mapping).
        r_or_f: key into ``test_files`` ('real' or 'fake' in this notebook);
            also written to the 'RealFake' column.
        N_f: maximum number of files per identity; values <= 0 keep all files.
        out_file: path of the CSV to write.
        n1, n2: passed through unchanged to ``dist_using_dict`` (presumably the
            start/end fraction of each file to score -- TODO confirm).
        parallel: passed through to ``dist_using_dict``.

    Returns:
        None. The result is written to ``out_file`` with 'fileName' as index and
        the columns 'predLabel_vgg', 'actualLabel_vgg', 'RealFake_vgg' and
        'predLabel_fab'.
    """
    # Files to score per identity, truncated to the first N_f when a cap is set.
    test_dict = {}
    for cur_id in test_ids:
        cur_test_files = test_files[r_or_f][cur_id]
        if N_f > 0 and len(cur_test_files) > N_f:
            cur_test_files = cur_test_files[:N_f]
        # Copy so the repos never see (or mutate) the shared test_files entries.
        test_dict[cur_id] = cur_test_files.copy()

    vgg_result = vgg_repo.dist_using_dict(test_dict, n1, n2, parallel=parallel, dist='cosine')
    vgg_result['RealFake'] = r_or_f
    fab_result = fab_repo.dist_using_dict(test_dict, n1, n2, parallel=parallel, dist='cosine')
    fab_result['RealFake'] = r_or_f

    # Inner join keeps only files that both repos produced a row for.
    paired = vgg_result.set_index('fileName').join(fab_result.set_index('fileName'),
                                                   how='inner', lsuffix='_vgg', rsuffix='_fab')
    paired[['predLabel_vgg', 'actualLabel_vgg', 'RealFake_vgg', 'predLabel_fab']].to_csv(out_file)

# VGG pool + Fabnet Pool

### Repo of Leaders + Face Forensics Original + Imposter + Google

In [8]:
# Settings shared by both pooled repos; only the feature folder and N_comp differ.
pool_params = {'istrain': True, 'frames': 100, 'steps': 5}

# VGG features with N_comp=256 (the recorded output reports 256 PCA components).
vgg_repo = get_train_repo(dict(pool_params,
                               bsfldr='/data/home/shruti/voxceleb/vgg/leaders/',
                               N_comp=256))

# FabNet features with N_comp=-1 (no PCA line appears in the recorded output).
fab_repo = get_train_repo(dict(pool_params,
                               bsfldr='/data/home/shruti/voxceleb/fabnet/leaders/',
                               N_comp=-1))


Number of labels 1046
pca components: 256, explained variance 0.9931591374128844
Build repo time: 125.959
Number of labels 1046
Build repo time: 1.848


In [None]:
# Point each repo at the folder holding its features before scoring.
vgg_repo.bs_fldr = '/data/home/shruti/voxceleb/vgg/leaders/'
fab_repo.bs_fldr = '/data/home/shruti/voxceleb/fabnet/leaders/'

gg_ids = ['GG_{0:02d}'.format(i) for i in range(1, 29)]
leader_ids = ['bo','bs','dt','ew','hc','jb']

# (test ids, real/fake, max files per id, output csv), run in this order.
# GG uses every file (-1); the leaders are capped at 10 files per id.
pool_runs = [
    (gg_ids, 'real', -1, 'GG_real_vggfabpool.csv'),
    (gg_ids, 'fake', -1, 'GG_fake_vggfabpool.csv'),
    (leader_ids, 'real', 10, 'ldr_real_vggfabpool.csv'),
    (leader_ids, 'fake', 10, 'ldr_fake_vggfabpool.csv'),
]
for ids, r_or_f, n_files, out_csv in pool_runs:
    save_results(vgg_repo, fab_repo, ids, r_or_f, n_files, out_csv, 0, 1, parallel=True)


In [None]:
# Inspection only: show the entry stored for id 'FF_001' under the 'fake' test files.
test_files['fake']['FF_001']

In [9]:
# FF fake files, scored with n1=0.5, n2=1 (get_train_repo enrols FF ids with 0, 0.5).
ff_ids = ['FF_{0:03d}'.format(i) for i in range(1000)]
save_results(vgg_repo, fab_repo, ff_ids, 'fake', -1,
             'FF_fake_vggfabpool.csv', 0.5, 1, parallel=True)



[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:   29.7s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:  2.1min
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:  4.6min
[Parallel(n_jobs=16)]: Done 768 tasks      | elapsed:  8.5min
[Parallel(n_jobs=16)]: Done 1218 tasks      | elapsed: 13.2min
[Parallel(n_jobs=16)]: Done 1768 tasks      | elapsed: 18.9min
[Parallel(n_jobs=16)]: Done 2000 out of 2000 | elapsed: 21.2min finished
[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:   14.2s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:  1.7min
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:  4.2min
[Parallel(n_jobs=16)]: Done 768 tasks      | elapsed:  7.9min
[Parallel(n_jobs=16)]: Done 1218 tasks      | elapsed: 12.6min
[Parallel(n_jobs=16)]: Done 1768 tasks      | elapsed: 18.2min
[Parallel(n_jobs=16)]: Do

In [None]:
# Remaining pooled-repo runs: FF real with n1=0.5, then the two single-id runs
# ('jen_l' as real, 'steve_b' as fake) with n1=0. All use every file and n2=1.
ff_ids = ['FF_{0:03d}'.format(i) for i in range(1000)]
tail_runs = [
    (ff_ids, 'real', 'FF_real_vggfabpool.csv', 0.5),
    (['jen_l'], 'real', 'SJ_real_vggfabpool.csv', 0),
    (['steve_b'], 'fake', 'SJ_fake_vggfabpool.csv', 0),
]
for ids, r_or_f, out_csv, n1 in tail_runs:
    save_results(vgg_repo, fab_repo, ids, r_or_f, -1, out_csv, n1, 1, parallel=True)

In [None]:
# Fraction of GG files on which the VGG and FabNet repos predict the same label
# (real first, then fake).
real_df = pd.read_csv('GG_real_vggfabpool.csv')
real_same = np.asarray(real_df['predLabel_vgg']) == np.asarray(real_df['predLabel_fab'])
print(np.count_nonzero(real_same) / len(real_df))

fake_df = pd.read_csv('GG_fake_vggfabpool.csv')
fake_same = np.asarray(fake_df['predLabel_vgg']) == np.asarray(fake_df['predLabel_fab'])
print(np.count_nonzero(fake_same) / len(fake_df))

In [None]:
# Fraction of leader files on which the VGG and FabNet repos predict the same label
# (real first, then fake).
real_df = pd.read_csv('ldr_real_vggfabpool.csv')
real_same = np.asarray(real_df['predLabel_vgg']) == np.asarray(real_df['predLabel_fab'])
print(np.count_nonzero(real_same) / len(real_df))

fake_df = pd.read_csv('ldr_fake_vggfabpool.csv')
fake_same = np.asarray(fake_df['predLabel_vgg']) == np.asarray(fake_df['predLabel_fab'])
print(np.count_nonzero(fake_same) / len(fake_df))

In [None]:
# Fraction of FF files on which the VGG and FabNet repos predict the same label
# (real first, then fake).
real_df = pd.read_csv('FF_real_vggfabpool.csv')
real_same = np.asarray(real_df['predLabel_vgg']) == np.asarray(real_df['predLabel_fab'])
print(np.count_nonzero(real_same) / len(real_df))

fake_df = pd.read_csv('FF_fake_vggfabpool.csv')
fake_same = np.asarray(fake_df['predLabel_vgg']) == np.asarray(fake_df['predLabel_fab'])
print(np.count_nonzero(fake_same) / len(fake_df))

In [None]:
# Fraction of SJ files on which the VGG and FabNet repos predict the same label
# (real first, then fake).
real_df = pd.read_csv('SJ_real_vggfabpool.csv')
real_same = np.asarray(real_df['predLabel_vgg']) == np.asarray(real_df['predLabel_fab'])
print(np.count_nonzero(real_same) / len(real_df))

fake_df = pd.read_csv('SJ_fake_vggfabpool.csv')
fake_same = np.asarray(fake_df['predLabel_vgg']) == np.asarray(fake_df['predLabel_fab'])
print(np.count_nonzero(fake_same) / len(fake_df))

# Metric Learning

### Repo of Leaders + Face Forensics Original + Imposter + Google

In [None]:

# VGG side: same settings as the pooled section (frames=100, steps=5, N_comp=256).
vgg_metric_params = {'istrain': True,
                     'bsfldr': '/data/home/shruti/voxceleb/vgg/leaders/',
                     'frames': 100, 'steps': 5, 'N_comp': 256}
vgg_repo_metric = get_train_repo(vgg_metric_params)

# FabNet side: features from the fabnet_metric folder, with frames=1, steps=1, N_comp=-1.
fab_metric_params = {'istrain': True,
                     'bsfldr': '/data/home/shruti/voxceleb/fabnet_metric/',
                     'frames': 1, 'steps': 1, 'N_comp': -1}
fab_repo_metric = get_train_repo(fab_metric_params)


In [None]:
# Point each repo at the folder holding its features before scoring.
vgg_repo_metric.bs_fldr = '/data/home/shruti/voxceleb/vgg/leaders/'
fab_repo_metric.bs_fldr = '/data/home/shruti/voxceleb/fabnet_metric/'

gg_ids = ['GG_{0:02d}'.format(i) for i in range(1, 29)]
leader_ids = ['bo','bs','dt','ew','hc','jb']
ff_ids = ['FF_{0:03d}'.format(i) for i in range(1000)]

# (test ids, real/fake, max files per id, output csv, n1, n2), run in this order.
# The FF runs use the 0.5-1 window: get_train_repo enrols the FF ids with 0-0.5,
# and the pooled section above also scores FF with 0.5-1. Scoring FF with 0-1
# here would overlap the enrolled window and make the two sections incomparable.
# NOTE(review): assumes train and test FF entries refer to the same videos -- confirm.
metric_runs = [
    (gg_ids, 'real', -1, 'GG_real_100_vggpoolfabmet.csv', 0, 1),
    (gg_ids, 'fake', -1, 'GG_fake_100_vggpoolfabmet.csv', 0, 1),
    (leader_ids, 'real', 10, 'ldr_real_100_vggpoolfabmet.csv', 0, 1),
    (leader_ids, 'fake', 10, 'ldr_fake_100_vggpoolfabmet.csv', 0, 1),
    (ff_ids, 'real', -1, 'FF_real_100_vggpoolfabmet.csv', 0.5, 1),
    (ff_ids, 'fake', -1, 'FF_fake_100_vggpoolfabmet.csv', 0.5, 1),
    # The SJ pair scores 'jen_l' as the real id and 'steve_b' as the fake id.
    (['jen_l'], 'real', -1, 'SJ_real_100_vggpoolfabmet.csv', 0, 1),
    (['steve_b'], 'fake', -1, 'SJ_fake_100_vggpoolfabmet.csv', 0, 1),
]
for ids, r_or_f, n_files, out_csv, n1, n2 in metric_runs:
    save_results(vgg_repo_metric, fab_repo_metric, ids, r_or_f, n_files,
                 out_csv, n1, n2, parallel=True)


In [None]:

# Fraction of GG files on which the VGG and FabNet-metric repos predict the same label
# (real first, then fake).
real_df = pd.read_csv('GG_real_100_vggpoolfabmet.csv')
real_same = np.asarray(real_df['predLabel_vgg']) == np.asarray(real_df['predLabel_fab'])
print(np.count_nonzero(real_same) / len(real_df))

fake_df = pd.read_csv('GG_fake_100_vggpoolfabmet.csv')
fake_same = np.asarray(fake_df['predLabel_vgg']) == np.asarray(fake_df['predLabel_fab'])
print(np.count_nonzero(fake_same) / len(fake_df))


In [None]:

# Fraction of leader files on which the VGG and FabNet-metric repos predict the same label
# (real first, then fake).
real_df = pd.read_csv('ldr_real_100_vggpoolfabmet.csv')
real_same = np.asarray(real_df['predLabel_vgg']) == np.asarray(real_df['predLabel_fab'])
print(np.count_nonzero(real_same) / len(real_df))

fake_df = pd.read_csv('ldr_fake_100_vggpoolfabmet.csv')
fake_same = np.asarray(fake_df['predLabel_vgg']) == np.asarray(fake_df['predLabel_fab'])
print(np.count_nonzero(fake_same) / len(fake_df))



In [None]:

# Fraction of FF files on which the VGG and FabNet-metric repos predict the same label
# (real first, then fake).
real_df = pd.read_csv('FF_real_100_vggpoolfabmet.csv')
real_same = np.asarray(real_df['predLabel_vgg']) == np.asarray(real_df['predLabel_fab'])
print(np.count_nonzero(real_same) / len(real_df))

fake_df = pd.read_csv('FF_fake_100_vggpoolfabmet.csv')
fake_same = np.asarray(fake_df['predLabel_vgg']) == np.asarray(fake_df['predLabel_fab'])
print(np.count_nonzero(fake_same) / len(fake_df))


In [None]:
# Fraction of SJ files on which the VGG and FabNet-metric repos predict the same label
# (real first, then fake).
real_df = pd.read_csv('SJ_real_100_vggpoolfabmet.csv')
real_same = np.asarray(real_df['predLabel_vgg']) == np.asarray(real_df['predLabel_fab'])
print(np.count_nonzero(real_same) / len(real_df))

fake_df = pd.read_csv('SJ_fake_100_vggpoolfabmet.csv')
fake_same = np.asarray(fake_df['predLabel_vgg']) == np.asarray(fake_df['predLabel_fab'])
print(np.count_nonzero(fake_same) / len(fake_df))