## Structure

In [3]:
from collections import Counter
import numpy as np
import pandas as pd
import random

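"""Split by driver: 21 drivers are used for training and the remaining
drivers (4 according to the original note) are held out for validation."""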

def read_data(csv_path):
    """Load the driver/image listing and group image paths per (class, driver).

    Parameters
    ----------
    csv_path : str
        Path to a CSV file with at least the columns ``classname``,
        ``subject`` and ``img``.

    Returns
    -------
    pandas.DataFrame
        One row per (``classname``, ``subject``) pair with an ``imgs`` column
        holding the list of ``"<classname>/<img>"`` relative paths.
    """
    # Read from the argument, not from a notebook-level global, so the
    # function works for any listing file and on a fresh kernel.
    drivers_df = pd.read_csv(csv_path)
    # Prefix each file name with its class so the path is relative to the
    # image root directory.
    drivers_df['img'] = drivers_df['classname'] + '/' + drivers_df['img']
    grouped_imgs = drivers_df.groupby(["classname", "subject"])['img'].apply(list)
    return grouped_imgs.reset_index(name="imgs")

def split_subject(df, train_count=21, seed=42):
    """Randomly partition the distinct values of ``df['subject']`` in two.

    The global ``random`` generator is seeded with ``seed`` and then used to
    shuffle the distinct subjects in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``subject`` column.
    train_count : int
        Number of shuffled subjects placed in the first group.
    seed : int
        Seed passed to ``random.seed`` before shuffling.

    Returns
    -------
    tuple
        ``(first train_count subjects, remaining subjects)`` after shuffling.
    """
    shuffled_subjects = df['subject'].unique()
    random.seed(seed)
    random.shuffle(shuffled_subjects)
    train_part = shuffled_subjects[:train_count]
    held_out_part = shuffled_subjects[train_count:]
    return train_part, held_out_part


# Root of the dataset and the CSV that lists every image with its class and driver.
data_directory = '/home/ubuntu/nbs/state_farm/data/'
driver_imgs_list_path = '{}{}'.format(data_directory, 'driver_imgs_list.csv')

# One row per (classname, subject) with the list of image paths.
classname_subject_grouped = read_data(driver_imgs_list_path)

# Split at driver level, then select the rows belonging to each group of drivers.
train_drivers, valid_drivers = split_subject(classname_subject_grouped)
train_df = classname_subject_grouped.loc[
    classname_subject_grouped['subject'].isin(train_drivers)
]
valid_df = classname_subject_grouped.loc[
    classname_subject_grouped['subject'].isin(valid_drivers)
]

print(train_df.head())


import os
# imgs_directory is the source of the copies; train_directory and
# valid_directory are the destinations created and filled below.
imgs_directory, train_directory, valid_directory = (
    data_directory + leaf for leaf in ('train/', 'train_set/', 'valid_set/')
)

# Distinct class labels; one sub-directory per label is created in each destination.
classname_column = classname_subject_grouped['classname']
classnames = classname_column.unique()

def make_dirs(dirs, classnames):
    """Ensure every directory in ``dirs`` exists with one sub-directory per class.

    Parameters
    ----------
    dirs : iterable of str
        Root directories to create (with or without a trailing slash).
    classnames : iterable of str
        Names of the sub-directories to create inside each root.

    A notice is printed for each root that already exists. Any class
    sub-directory missing from an existing root is still created, so a
    partially built tree is repaired on re-run.
    """
    for directory in dirs:
        if os.path.exists(directory):
            print('Directory %s already exists' %directory)
        else:
            os.makedirs(directory)
        for classname in classnames:
            class_directory = os.path.join(directory, classname)
            # Only create what is missing so the call stays idempotent.
            if not os.path.isdir(class_directory):
                os.makedirs(class_directory)

# Create the train/validation destination trees (one sub-directory per class).
make_dirs([train_directory, valid_directory], classnames)


def get_flatten_list_from_column(df, col):
    """Concatenate the iterables stored in ``df[col]`` into one flat list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column ``col`` holds an iterable (e.g. a list) per row.
    col : str
        Name of the column to flatten.

    Returns
    -------
    list
        All items of every row, in row order.
    """
    flattened = []
    for row_items in df[col]:
        flattened.extend(row_items)
    return flattened

# Flat lists of relative image paths for each split, followed by their sizes.
train_imgs, valid_imgs = (
    get_flatten_list_from_column(split_df, 'imgs')
    for split_df in (train_df, valid_df)
)
print(len(train_imgs))
print(len(valid_imgs))

from shutil import copy, copyfile

def copy_files(files_list, imgs_directory, dst_directory):
    """Copy images into per-class sub-directories of ``dst_directory``.

    Parameters
    ----------
    files_list : iterable of str
        Paths relative to ``imgs_directory`` of the form
        ``"<classname>/<file>"``.
    imgs_directory : str
        Directory the relative paths are resolved against.
    dst_directory : str
        Destination root; each file is copied to
        ``dst_directory/<classname>``, which is expected to exist already.

    Nothing is copied when ``dst_directory`` is empty, i.e. when it has no
    class sub-directories. A message is printed instead, because
    ``shutil.copy`` would otherwise create plain files named after the
    classes.
    """
    pending = list(files_list)
    if not pending:
        return
    # The guard does not depend on the file being copied, so evaluate it once
    # instead of listing the destination for every image.
    if not os.listdir(dst_directory):
        print('THE DIRECTORY %s IS EMPTY (NO CLASS SUBDIRECTORIES)!' %dst_directory)
        return
    for filename in pending:
        # Derive the class from the path rather than from a fixed-width
        # prefix, so class names of any length work.
        classname = os.path.dirname(filename)
        path = os.path.join(imgs_directory, filename)
        dst = os.path.join(dst_directory, classname)
        copy(path, dst)
    
# Populate the validation tree first, then the training tree, from the
# original images under imgs_directory.
copy_files(valid_imgs, imgs_directory, valid_directory)
copy_files(train_imgs, imgs_directory, train_directory)

  classname subject                                               imgs
1        c0    p012  [c0/img_10206.jpg, c0/img_27079.jpg, c0/img_50...
2        c0    p014  [c0/img_72495.jpg, c0/img_62101.jpg, c0/img_34...
3        c0    p015  [c0/img_48693.jpg, c0/img_44903.jpg, c0/img_58...
4        c0    p016  [c0/img_51066.jpg, c0/img_19066.jpg, c0/img_13...
7        c0    p024  [c0/img_66836.jpg, c0/img_54333.jpg, c0/img_28...
Directory /home/ubuntu/nbs/state_farm/data/train_set/ already exists
Directory /home/ubuntu/nbs/state_farm/data/valid_set/ already exists
17616
4808


## Sample

In [14]:
import glob 

!mkdir /home/ubuntu/nbs/state_farm/data/sample


train_directory = '/home/ubuntu/nbs/state_farm/data/sample/train_set/'
valid_directory = '/home/ubuntu/nbs/state_farm/data/sample/valid_set/'

def make_dirs(dirs, classnames):
    """Ensure every directory in ``dirs`` exists with one sub-directory per class.

    Parameters
    ----------
    dirs : iterable of str
        Root directories to create (with or without a trailing slash).
    classnames : iterable of str
        Names of the sub-directories to create inside each root.

    A notice is printed for each root that already exists. Any class
    sub-directory missing from an existing root is still created, so a
    partially built tree is repaired on re-run.
    """
    for directory in dirs:
        if os.path.exists(directory):
            print('Directory %s already exists' %directory)
        else:
            os.makedirs(directory)
        for classname in classnames:
            class_directory = os.path.join(directory, classname)
            # Only create what is missing so the call stays idempotent.
            if not os.path.isdir(class_directory):
                os.makedirs(class_directory)
            
# Create the sample train/validation trees (one sub-directory per class).
make_dirs([train_directory, valid_directory], classnames)

from shutil import copyfile

%cd /home/ubuntu/nbs/state_farm/data/train_set/
g = glob.glob('c?/*.jpg')
shuf = np.random.permutation(g)
for i in range(1500): 
    copyfile(shuf[i], '../sample/train_set/' + shuf[i])

%cd /home/ubuntu/nbs/state_farm/data/valid_set/
g = glob.glob('c?/*.jpg')
shuf = np.random.permutation(g)
for i in range(1000): 
    copyfile(shuf[i], '../sample/valid_set/' + shuf[i])

!mkdir -p /home/ubuntu/nbs/state_farm/data/sample/test/unknown/
    
%cd /home/ubuntu/nbs/state_farm/data/test
g = glob.glob('*.jpg')
shuf = np.random.permutation(g)
for i in range(100): 
    copyfile(shuf[i], '../sample/test/unknown/' + shuf[i])


/home/ubuntu/nbs/state_farm/data/train_set
/home/ubuntu/nbs/state_farm/data/valid_set
/home/ubuntu/nbs/state_farm/data/test
