## Setup

Initial setup steps for https://www.kaggle.com/c/state-farm-distracted-driver-detection:

- Create "nbs" folder with ipython notebooks and add this notebook to the folder
- Back to the base directory, download kaggle data: kg download -u `username` -p `password` -c state-farm-distracted-driver-detection
- Unzip then remove *.zip

In [3]:
import glob, numpy as np, os, pandas as pd, random, shutil
from shutil import copyfile

base_dir = '../'

## Folder Structure

### Initial train, validation and test sets

In [3]:
if not os.path.exists(base_dir + 'models'): 
    os.makedirs(base_dir + 'models')

if not os.path.exists(base_dir + 'valid'): 
    os.makedirs(base_dir + 'valid')
    [os.mkdir('{}/c{}'.format(base_dir + 'valid', i)) for i in range(10)]
    
if not os.path.exists(base_dir + 'test'): 
    os.makedirs(base_dir + 'test')
    os.makedirs(base_dir + 'test/unknown')

### Sample sets

In [None]:
if not os.path.exists(base_dir + 'sample'): 
    os.makedirs(base_dir + 'sample')
    os.makedirs(base_dir + 'sample/models')
    os.makedirs(base_dir + 'sample/train')
    [os.mkdir('{}/c{}'.format(base_dir + 'sample/train', i)) for i in range(10)]
    os.makedirs(base_dir + 'sample/valid')
    [os.mkdir('{}/c{}'.format(base_dir + 'sample/valid', i)) for i in range(10)]
    os.makedirs(base_dir + 'sample/test')
    os.makedirs(base_dir + 'sample/test/unknown')

### Bounding Box Detection Sets

In [None]:
if not os.path.exists(base_dir + 'bb_train'):
    os.makedirs(base_dir + 'bb_train')
    os.makedirs(base_dir + 'bb_train/unknown')
    
if not os.path.exists(base_dir + 'bb_valid'):
    os.makedirs(base_dir + 'bb_valid')
    os.makedirs(base_dir + 'bb_valid/unknown')

### Bounding Box Cropped Sets

In [None]:
if not os.path.exists(base_dir + 'cropped_train'): 
    os.makedirs(base_dir + 'cropped_train')
    [os.mkdir('{}/c{}'.format(base_dir + 'cropped_train', i)) for i in range(10)]

if not os.path.exists(base_dir + 'cropped_valid'): 
    os.makedirs(base_dir + 'cropped_valid')
    [os.mkdir('{}/c{}'.format(base_dir + 'cropped_valid', i)) for i in range(10)]
    
if not os.path.exists(base_dir + 'cropped_test'): 
    os.makedirs(base_dir + 'cropped_test')
    os.makedirs(base_dir + 'cropped_test/unknown')

## Validation Set Generation

In [42]:
imgs_table = pd.read_csv(base_dir + 'driver_imgs_list.csv')
driver_ids = df['subject'].unique().tolist()
valid_driver_ids = random.sample(driver_ids, 5)
valid_imgs_table = df[df['subject'].isin(valid_driver_ids)]
print('trn/valid ratio: ' + str(valid_imgs_table.shape[0]/imgs_table.shape[0]) + '% (proceed if around 20%)')

4574 /22424 -> 0.20397788084195506% validation


In [44]:
for index, row in valid_imgs_table.iterrows():
    src_path = base_dir + 'train/' + row['classname'] + '/' + row['img']
    dest_path = base_dir + 'valid/' + row['classname']
    shutil.move(src_path, dest_path)

## Sample Sets Generation

In [46]:
%cd /home/ubuntu/fast/kaggle/farmer/train
g = glob.glob('c?/*.jpg')
shuf = np.random.permutation(g)
for i in range(1500): copyfile(shuf[i], '../sample/train/' + shuf[i])
    
%cd /home/ubuntu/fast/kaggle/farmer/valid
g = glob.glob('c?/*.jpg')
shuf = np.random.permutation(g)
for i in range(1000): copyfile(shuf[i], '../sample/valid/' + shuf[i])

/home/ubuntu/fast/kaggle/farmer/train


## Bounding Box Detection Sets Generation

In [10]:
%cd /home/ubuntu/fast/kaggle/farmer/train
g = glob.glob('c?/*.jpg')
shuf = np.random.permutation(g)
# print(shuf[0].split('/')[-1])
for i in range(360): copyfile(shuf[i], '../bb_train/unknown/' + shuf[i].split('/')[-1])
    
%cd /home/ubuntu/fast/kaggle/farmer/valid
g = glob.glob('c?/*.jpg')
shuf = np.random.permutation(g)
for i in range(180): copyfile(shuf[i], '../bb_valid/unknown/' + shuf[i].split('/')[-1])

/home/ubuntu/fast/kaggle/farmer/train
/home/ubuntu/fast/kaggle/farmer/valid


In [3]:
from IPython.display import FileLink
FileLink("/home/ubuntu/fast/kaggle/farmer/bb_valid.zip") # Download for manual labeling

In [4]:
FileLink("/home/ubuntu/fast/kaggle/farmer/bb_train.zip") # Download for manual labeling

## Dividing Test Set for Pseudo Labeling

In [2]:
# Divide test set in 4 for pseudo labeling
if not os.path.exists(base_dir + 'test1'): 
    os.makedirs(base_dir + 'test1')
    os.makedirs(base_dir + 'test1/unknown')
    os.makedirs(base_dir + 'test2')
    os.makedirs(base_dir + 'test2/unknown')
    os.makedirs(base_dir + 'test3')
    os.makedirs(base_dir + 'test3/unknown')
    os.makedirs(base_dir + 'test4')
    os.makedirs(base_dir + 'test4/unknown') 

In [4]:
%cd /home/ubuntu/fast/kaggle/farmer/test
g = glob.glob('unknown/*.jpg')
shuf = np.random.permutation(g)
for i in range(79726):
    if i % 4 == 0:
        copyfile(shuf[i], '../test1/' + shuf[i])
    if i % 4 == 1:
        copyfile(shuf[i], '../test2/' + shuf[i])
    if i % 4 == 2:
        copyfile(shuf[i], '../test3/' + shuf[i])
    if i % 4 == 3:
        copyfile(shuf[i], '../test4/' + shuf[i])

/home/ubuntu/fast/kaggle/farmer/test
