In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the training labels; the displayed frame has an `id` and a `label` column.
train_labels = pd.read_csv('./data/trainLabels.csv')
train_labels

Unnamed: 0,id,label
0,1,frog
1,2,truck
2,3,truck
3,4,deer
4,5,automobile
...,...,...
49995,49996,bird
49996,49997,frog
49997,49998,truck
49998,49999,automobile


In [4]:
# Load the sample submission file; it shows the same `id` / `label` columns
# as the training labels.
sample_submission = pd.read_csv('./data/sampleSubmission.csv')
sample_submission

Unnamed: 0,id,label
0,1,cat
1,2,cat
2,3,cat
3,4,cat
4,5,cat
...,...,...
299995,299996,cat
299996,299997,cat
299997,299998,cat
299998,299999,cat


## Split into train/validation sets

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Pull the image ids and their class labels out of the frame as plain arrays,
# in the same row order, so they can be split together.
ids, labels = (train_labels[column].values for column in ('id', 'label'))

In [7]:
# Hold out 10% of the labelled images for validation.
# `stratify=labels` splits per class, and the fixed `random_state` makes the
# split reproducible across runs.
train_id, valid_id, train_label, valid_label = train_test_split(
    ids,
    labels,
    test_size=0.1,
    random_state=42,
    stratify=labels,
)

In [8]:
# Sanity check: shapes of the four arrays produced by the split.
tuple(part.shape for part in (train_id, train_label, valid_id, valid_label))

((45000,), (45000,), (5000,), (5000,))

In [9]:
from collections import Counter

# Per-class counts for each split (train first, then valid) to confirm the
# stratified split kept the classes balanced.
for split_labels in (train_label, valid_label):
    print(Counter(split_labels))

Counter({'automobile': 4500, 'frog': 4500, 'bird': 4500, 'ship': 4500, 'horse': 4500, 'cat': 4500, 'dog': 4500, 'deer': 4500, 'truck': 4500, 'airplane': 4500})
Counter({'horse': 500, 'bird': 500, 'cat': 500, 'truck': 500, 'automobile': 500, 'ship': 500, 'airplane': 500, 'dog': 500, 'deer': 500, 'frog': 500})


In [88]:
!mkdir -p valid

In [90]:
import os
import shutil

# Make sure the destination exists before moving anything into it.
valid_dir = './data/valid'
os.makedirs(valid_dir, exist_ok=True)

# Move every image selected for validation out of the training directory.
# NOTE: this cell is not re-runnable as-is; after a successful run the source
# files no longer exist under ./data/train.
for n in valid_id:
    src = './data/train/{}.png'.format(n)
    # Target must live under ./data/valid, which is where the later cells
    # (class sub-directories, flow_from_directory) look for validation images.
    tgt = './data/valid/{}.png'.format(n)
    shutil.move(src, tgt)

## Assigning images to a directory by class
- This layout is what Keras's `flow_from_directory` expects: one sub-directory per class.
- The classes of the test data are unknown.

In [10]:
# Distinct class names present in the training split.
classes = {class_name for class_name in train_label}

In [11]:
# Show the distinct class names collected from the training labels.
classes

{'airplane',
 'automobile',
 'bird',
 'cat',
 'deer',
 'dog',
 'frog',
 'horse',
 'ship',
 'truck'}

In [93]:
import os

# One sub-directory per class under both the train and the valid folder.
# `exist_ok=True` makes this cell safe to run more than once.
class_dir_templates = ('./data/train/{}', './data/valid/{}')
for cls in classes:
    for template in class_dir_templates:
        os.makedirs(template.format(cls), exist_ok=True)

In [95]:
import shutil

# File each training image under the sub-directory named after its class.
# NOTE: shutil.move relocates the files, so re-running this cell after a
# successful run fails because the sources are gone.
for image_id, class_name in zip(train_id, train_label):
    shutil.move(
        './data/train/{}.png'.format(image_id),
        './data/train/{}/{}.png'.format(class_name, image_id),
    )

In [96]:
# File each validation image under the sub-directory named after its class.
# Relies on `shutil` having been imported by an earlier cell.
for image_id, class_name in zip(valid_id, valid_label):
    shutil.move(
        './data/valid/{}.png'.format(image_id),
        './data/valid/{}/{}.png'.format(class_name, image_id),
    )

## ImageDataGenerator test

In [13]:
from keras.preprocessing.image import ImageDataGenerator

In [14]:
# Settings shared by both directory iterators: 32x32 images, 500 per batch.
flow_kwargs = dict(target_size=(32, 32), batch_size=500)

# Both pipelines only rescale pixel values by 1/255; no augmentation is applied.
train_datagen = ImageDataGenerator(rescale=1./255)
valid_datagen = ImageDataGenerator(rescale=1./255)

# Build the train iterator first, then the valid one, reading from the
# per-class sub-directories created earlier.
train_generator = train_datagen.flow_from_directory('./data/train', **flow_kwargs)
valid_generator = valid_datagen.flow_from_directory('./data/valid', **flow_kwargs)

Found 45000 images belonging to 10 classes.
Found 5000 images belonging to 10 classes.


In [15]:
# Pull a single batch from the training generator and report its shapes.
# `x` and `y` stay defined for the cells that follow.
x, y = next(iter(train_generator))
print(x.shape, y.shape)

(500, 32, 32, 3) (500, 10)


In [16]:
# Value range of the sampled batch, to confirm the 1/255 rescaling took effect.
x.min(), x.max()

(0.0, 1.0)

In [17]:
# Class-name -> integer index mapping reported by the training generator.
train_generator.class_indices

{'airplane': 0,
 'automobile': 1,
 'bird': 2,
 'cat': 3,
 'deer': 4,
 'dog': 5,
 'frog': 6,
 'horse': 7,
 'ship': 8,
 'truck': 9}

In [18]:
# Class-name -> integer index mapping reported by the validation generator;
# compare it with the training generator's mapping to confirm they agree.
valid_generator.class_indices

{'airplane': 0,
 'automobile': 1,
 'bird': 2,
 'cat': 3,
 'deer': 4,
 'dog': 5,
 'frog': 6,
 'horse': 7,
 'ship': 8,
 'truck': 9}

In [19]:
# Batch size the training generator was configured with.
train_generator.batch_size

500

In [20]:
# Total number of samples reported by the training generator.
train_generator.n

45000