# Make train candidates

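This notebook builds a candidates table for the training set that mimics the setup of the test set: each `left` image is paired with 20 candidate images (`c0`–`c19`), exactly one of which is its true `right` match.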

In [1]:
import pandas as pd
import random
import numpy as np
import os

# Read the ground-truth (left, right) pairs and preview the first rows
train_csv_path = 'data/train.csv'
train_data = pd.read_csv(train_csv_path)
print(train_data.head())

  left right
0  aaa   osr
1  aaz   mqw
2  abh   cdx
3  abn   uzp
4  abq   bbd


In [2]:
# Column layout of the output table: the query image id first,
# followed by the twenty candidate slots c0 .. c19
columns_names = ['left']
columns_names.extend('c' + str(slot) for slot in range(20))
print(columns_names)

['left', 'c0', 'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9', 'c10', 'c11', 'c12', 'c13', 'c14', 'c15', 'c16', 'c17', 'c18', 'c19']


In [4]:
# Collect the image ids in data/train/all (file name up to the first dot).
# os.listdir returns entries in arbitrary order, so the ids are sorted:
# later cells sample from this array, and an unstable order would make the
# sampled candidates differ between machines/runs.
all_files = np.array(sorted(x.split('.')[0] for x in os.listdir('data/train/all')))
# Compare the number of distinct ids with the total number of files
unique = np.unique(all_files)
print(unique.shape, all_files.shape)

(4000,) (4000,)


All file names are unique: the number of distinct names equals the total number of files.

In [5]:
# Build one record per training pair: the left image id followed by the true
# right image hidden among randomly drawn distractors.

# Fixed seed and a single generator so that re-running the cell draws the
# same candidates (for the same ordering of all_files).
SEED = 42
rng = np.random.default_rng(SEED)

# columns_names is 'left' + the candidate slots; one slot is taken by the
# true match, the remaining ones are filled with distractors.
n_distractors = len(columns_names) - 2

records = []
# Iterate over the values directly instead of label-based train_data[col][i],
# which only works when the frame has a default 0..n-1 index.
for left, right in zip(train_data['left'], train_data['right']):
    # Distractors may be any file except the two images of the pair itself
    pool = all_files[(all_files != left) & (all_files != right)]
    distractors = rng.choice(pool, size=n_distractors, replace=False).astype(str)

    # Shuffle so that the position of the true match carries no information
    candidates = rng.permutation(np.concatenate(([right], distractors)))

    records.append([left] + candidates.tolist())

df = pd.DataFrame(data=records, columns=columns_names)
df.head()

Unnamed: 0,left,c0,c1,c2,c3,c4,c5,c6,c7,c8,...,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19
0,aaa,zzz,szv,kai,aan,iki,jub,xdr,egz,fad,...,osr,ixm,izx,pxc,epe,ifd,jqu,vpy,cfy,vgw
1,aaz,xav,mqw,ury,nsw,xmk,llu,myx,kxg,sfd,...,moi,gna,bin,mmj,xmi,duy,zdx,kxj,wck,oyd
2,abh,alm,ufo,ytr,waw,psd,ohf,mye,wpc,wci,...,fhn,vsm,cdx,urg,maq,pdc,twv,mas,tys,mvv
3,abn,ztx,bej,uqv,gfm,xuo,xhc,gxm,tos,tfh,...,ddh,aqm,mwv,qki,tmh,emk,mtv,wzb,mut,uzp
4,abq,snm,kzd,fsy,gkl,ivf,dzk,mih,hqd,ipb,...,ueo,kuc,sig,cjc,vug,gvz,hyy,bbd,jcf,ujz


In [6]:
# Check that the true right image appears among the candidates of every row.
# Only the candidate columns are inspected: including 'left' could hide a
# missing match if a left id happened to equal the expected right id.
candidate_cols = [col for col in df.columns if col != 'left']
missing_rows = [
    row
    for row, (right, candidates) in enumerate(
        zip(train_data['right'], df[candidate_cols].values)
    )
    if right not in candidates
]

# Fail loudly with the offending rows instead of swallowing the error in a
# bare except, which would also mask unrelated failures (e.g. a KeyError).
if missing_rows:
    raise Exception('Error: missing true image on index', missing_rows)
print('All good!')

All good!


In [7]:
# Persist the candidate table as CSV, leaving out the DataFrame row index
output_csv_path = 'data/train_candidates.csv'
df.to_csv(output_csv_path, index=False)