In this notebook, the daytime images selected in the previous notebook ("01-moving_files.ipynb") are split into train, validation, and test directories.

In [1]:
import pandas as pd 
import numpy as np
import os
import glob
import shutil

In [2]:
# Root data folder, plus the two label folders (SIM and NAO) that hold the daytime images

base_dir = "../data/TTG_TUR/"
base_dir_SIM = base_dir + "SIM/"
base_dir_NAO = base_dir + "NAO/"

In [3]:
# Count the images available for each label (SIM / NAO)

from os import listdir
from os.path import isfile, join

# keep regular files only, so that sub-directories are not counted as images
list_SIM = [f for f in listdir(base_dir_SIM) if isfile(join(base_dir_SIM, f))]
list_NAO = [f for f in listdir(base_dir_NAO) if isfile(join(base_dir_NAO, f))]

print("Number of images with label SIM : ", len(list_SIM))
print("Number of images with label NAO : ", len(list_NAO))

n_images = len(list_SIM) + len(list_NAO)
assert n_images > 0, "no images found in {} or {}".format(base_dir_SIM, base_dir_NAO)

# Scale to a percentage *before* rounding: round(x, 2) * 100 can expose float
# noise in the printed value (e.g. 0.07 * 100 == 7.000000000000001).
print("SIM:{} %".format(round(100 * len(list_SIM) / n_images, 0)))
print("NAO:{} %".format(round(100 * len(list_NAO) / n_images, 0)))


Number of images with label SIM :  1504
Number of images with label NAO :  4821
SIM:24.0 %
NAO:76.0 %


As we can see, the data is imbalanced: **24% SIM** and **76% NAO**.

# Create directories for the images split into train, validation, and test sets

In [4]:
# Directories for the training, validation and test splits, each holding one
# sub-directory per label (SIM, NAO).

# Split-level directories
train_dir = os.path.join(base_dir, 'train')
validation_dir = os.path.join(base_dir, 'validation')
test_dir = os.path.join(base_dir, 'test')

# Training images, one directory per label
train_SIM_dir = os.path.join(train_dir, 'SIM')
train_NAO_dir = os.path.join(train_dir, 'NAO')

# Validation images, one directory per label
validation_SIM_dir = os.path.join(validation_dir, 'SIM')
validation_NAO_dir = os.path.join(validation_dir, 'NAO')

# Test images, one directory per label
test_SIM_dir = os.path.join(test_dir, 'SIM')
test_NAO_dir = os.path.join(test_dir, 'NAO')

# exist_ok=True makes this cell safe to re-run and removes the gap between
# "check that the path exists" and "create it" of the exists()/mkdir() pattern.
for _new_dir in (
    train_dir, validation_dir, test_dir,
    train_SIM_dir, train_NAO_dir,
    validation_SIM_dir, validation_NAO_dir,
    test_SIM_dir, test_NAO_dir,
):
    os.makedirs(_new_dir, exist_ok=True)


# Create a dataframe with the image names and labels (0=NAO, 1=SIM)

In [5]:
# Build a dataframe with one row per image name (SIM images first, then NAO)

# Drop only the file extension. os.path.splitext keeps names that contain
# extra dots intact and does not fail on a name without an extension, whereas
# name.split('.')[-2] raises IndexError on "name" and turns "a.b.jpeg" into "b".
list_images = [os.path.splitext(name)[0] for name in list_SIM + list_NAO]

df = pd.DataFrame({'image_name':list_images})

In [6]:
def create_labels(image):
    """Return the numeric label encoded in an image name.

    The text before the first '.' is taken and its last '_'-separated token
    is compared with 'SIM'.

    Input:
        - image: image name (string), e.g. '2017-12-12 11-53-41_SIM'

    Returns 1 when that token is exactly 'SIM', 0 otherwise.
    """
    stem, _, _ = image.partition('.')
    _, _, tag = stem.rpartition('_')
    return 1 if tag == 'SIM' else 0
    
# derive the 0/1 label of every image from its name
df['label'] = df['image_name'].map(create_labels)

In [7]:
# first rows with label 0
df[df['label'] == 0].head()

Unnamed: 0,image_name,label
1504,2017-12-12 11-54-05_NAO,0
1505,2017-12-12 11-56-30_NAO,0
1506,2017-12-12 11-56-38_NAO,0
1507,2017-12-12 13-00-22_NAO,0
1508,2017-12-18 09-27-14_NAO,0


In [8]:
# first rows with label 1
df[df['label'] == 1].head()

Unnamed: 0,image_name,label
0,2017-12-12 11-53-41_SIM,1
1,2017-12-12 11-54-15_SIM,1
2,2017-12-12 11-55-28_SIM,1
3,2017-12-12 11-56-54_SIM,1
4,2017-12-12 12-58-24_SIM,1


In [9]:
# Cross-check against the table generated while the daytime images were selected

df2 = pd.read_csv("../data/info_day_images.csv")

# put both frames in the same row order, each with a fresh 0..n-1 index,
# before comparing them
sort_key = "image_name"
df, df2 = (frame.sort_values(sort_key).reset_index(drop=True) for frame in (df, df2))

# the result of the comparison is the cell output
df2.equals(df)

True

# Split the data into train, validation, and test sets

In [10]:
# Randomly shuffle the SIM image names before splitting them into train, validation, and test

import random

random.seed(123)

# list_SIM comes from os.listdir, whose order is arbitrary. Sorting the copy
# first gives the seeded shuffle the same starting point on every machine, so
# the resulting split is reproducible. list_SIM itself is left untouched.
list_SIM_shuffled = sorted(list_SIM)

random.shuffle(list_SIM_shuffled)

print(list_SIM[:10])
print(list_SIM_shuffled[:10])

['2017-12-12 11-53-41_SIM.jpeg', '2017-12-12 11-54-15_SIM.jpeg', '2017-12-12 11-55-28_SIM.jpeg', '2017-12-12 11-56-54_SIM.jpeg', '2017-12-12 12-58-24_SIM.jpeg', '2017-12-18 08-56-16_SIM.jpeg', '2017-12-18 09-29-10_SIM.jpeg', '2018-01-09 15-31-06_SIM.jpeg', '2018-01-09 15-33-26_SIM.jpeg', '2018-01-09 15-34-22_SIM.jpeg']
['2018-01-09 15-34-22_SIM.jpeg', '2018-02-25 11-58-05_SIM.jpeg', '2018-03-21 12-34-12_SIM.jpeg', '2018-12-02 08-48-12_SIM.jpeg', '2018-03-17 17-42-13_SIM.jpeg', '2018-01-30 16-12-12_SIM.jpeg', '2018-01-30 11-22-07_SIM.jpeg', '2018-02-25 14-41-51_SIM.jpeg', '2019-02-02 08-21-03_SIM.jpeg', '2019-03-10 08-30-52_SIM.jpeg']


In [11]:
# Randomly shuffle the NAO image names; re-seeding makes this cell independent
# of how much randomness was consumed before it
random.seed(123)

# list_NAO comes from os.listdir, whose order is arbitrary. Sorting the copy
# first gives the seeded shuffle the same starting point on every machine, so
# the resulting split is reproducible. list_NAO itself is left untouched.
list_NAO_shuffled = sorted(list_NAO)

random.shuffle(list_NAO_shuffled)

print(list_NAO[:10])
print(list_NAO_shuffled[:10])

['2017-12-12 11-54-05_NAO.jpeg', '2017-12-12 11-56-30_NAO.jpeg', '2017-12-12 11-56-38_NAO.jpeg', '2017-12-12 13-00-22_NAO.jpeg', '2017-12-18 09-27-14_NAO.jpeg', '2018-01-09 15-42-26_NAO.jpeg', '2018-01-09 16-17-15_NAO.jpeg', '2018-01-09 16-22-07_NAO.jpeg', '2018-01-09 16-26-53_NAO.jpeg', '2018-01-09 16-56-54_NAO.jpeg']
['2018-03-16 08-42-18_NAO.jpeg', '2018-03-18 07-42-46_NAO.jpeg', '2018-03-17 14-30-18_NAO.jpeg', '2018-03-08 08-51-43_NAO.jpeg', '2018-03-20 07-36-46_NAO.jpeg', '2018-03-20 09-33-16_NAO.jpeg', '2018-03-14 15-53-35_NAO.jpeg', '2018-03-18 14-29-10_NAO.jpeg', '2018-03-18 11-06-22_NAO.jpeg', '2018-03-17 10-50-50_NAO.jpeg']


In [12]:
import math

def _copy_files(fnames, src_dir, dst_dir):
    """Copy every file named in `fnames` from `src_dir` into `dst_dir`."""
    # create the destination on demand so a missing folder does not abort the copy
    os.makedirs(dst_dir, exist_ok=True)
    for fname in fnames:
        shutil.copyfile(os.path.join(src_dir, fname), os.path.join(dst_dir, fname))


def split_data(list_data, base_dir, train_dir, validation_dir, test_dir, percentage_train = 0.8):
    """ Copy images into train, validation, and test directories.

    The first ceil(percentage_train * n) names of `list_data` go to the
    training directory; the remaining names are divided in two, with the
    validation directory receiving the extra file when the remainder is odd.
    The order of `list_data` is used as given, so shuffle it beforehand for a
    random split. Files are copied, not moved: the source directory is left
    unchanged.

    Input:
        - list_data: list with the file names of the images
        - base_dir: directory that currently contains the files in list_data
        - train_dir: directory receiving the images used to train the model
        - validation_dir: directory receiving the images used to validate the model
        - test_dir: directory receiving the images used to test the final model
        - percentage_train: fraction (between 0 and 1) of the images assigned
          to training. If not defined 0.8

    Prints the two cut positions (end of the training slice, end of the
    validation slice) and returns None.

    Raises:
        - ValueError: if percentage_train is not between 0 and 1

    """

    # outside [0, 1] the slice arithmetic below would silently put everything in training
    if not 0 <= percentage_train <= 1:
        raise ValueError(
            "percentage_train must be between 0 and 1, got {}".format(percentage_train)
        )

    n_samples = len(list_data)

    # end of the training slice
    pos1 = math.ceil(percentage_train*n_samples)
    print(pos1)

    # end of the validation slice: half of what is left, rounded up
    pos2 = pos1 + math.ceil((n_samples - pos1)/2)
    print(pos2)

    _copy_files(list_data[:pos1], base_dir, train_dir)
    _copy_files(list_data[pos1:pos2], base_dir, validation_dir)
    _copy_files(list_data[pos2:], base_dir, test_dir)


In [13]:
# Split the images with label SIM into training, validation and test.
# The arguments are passed directly instead of first rebinding the
# notebook-level names base_dir / train_dir / validation_dir / test_dir:
# overwriting them would make the directory-creation cell build its folders
# inside the SIM folder if it were re-run afterwards.

split_data(list_SIM_shuffled, base_dir_SIM, train_SIM_dir, validation_SIM_dir, test_SIM_dir, percentage_train = 0.8)

1204
1354


In [14]:
# Split the images with label NAO into training, validation and test.
# The arguments are passed directly instead of first rebinding the
# notebook-level names base_dir / train_dir / validation_dir / test_dir:
# overwriting them would make the directory-creation cell build its folders
# inside the NAO folder if it were re-run afterwards.

split_data(list_NAO_shuffled, base_dir_NAO, train_NAO_dir, validation_NAO_dir, test_NAO_dir, percentage_train = 0.8)

3857
4339


In [15]:
# Report how many entries each split / label directory now contains
for description, folder in (
    ('total training SIM images:', train_SIM_dir),
    ('total training NAO images:', train_NAO_dir),
    ('total validation SIM images:', validation_SIM_dir),
    ('total validation NAO images:', validation_NAO_dir),
    ('total test SIM images:', test_SIM_dir),
    ('total test NAO images:', test_NAO_dir),
):
    print(description, len(os.listdir(folder)))

total training SIM images: 1204
total training NAO images: 3857
total validation SIM images: 150
total validation NAO images: 482
total test SIM images: 150
total test NAO images: 482


In [16]:
# Show the path stored in train_SIM_dir
train_SIM_dir

'../data/TTG_TUR/train\\SIM'

In [17]:
# Show the current working directory (the relative "../data/..." paths above are resolved against it).
# NOTE(review): the saved output of this cell exposes an absolute local path; consider clearing it before sharing.
os.getcwd()

'C:\\PROJECTS_2019\\project_omdena_firebreak_detection_REVIEW\\notebooks'