# Import Libraries

In [70]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from keras.preprocessing.image import ImageDataGenerator
import os
import splitfolders

# Obtain & Pre-Process Data

Split the image folder of each class into train, validation, and test sets

In [65]:
# Source images are read from input_folder and the split copy is written
# under output_folder (the later cells read its train/val/test sub-folders).
input_folder = 'Data/class_data'
output_folder = 'Data/ttsplit_data'
classes = ['paper', 'rock', 'scissors']

# Ratio order is (train, val, test); the fixed seed keeps the split repeatable.
split_ratio = (.64, .2, .16)

splitfolders.ratio(input_folder, output=output_folder, seed=37, ratio=split_ratio)


Copying files: 0 files [00:00, ? files/s][A
Copying files: 106 files [00:00, 1055.18 files/s][A
Copying files: 232 files [00:00, 1098.57 files/s][A
Copying files: 350 files [00:00, 1120.27 files/s][A
Copying files: 477 files [00:00, 1161.09 files/s][A
Copying files: 584 files [00:00, 1130.06 files/s][A
Copying files: 713 files [00:00, 1136.23 files/s][A
Copying files: 856 files [00:00, 1210.53 files/s][A
Copying files: 992 files [00:00, 1249.30 files/s][A
Copying files: 1140 files [00:00, 1308.88 files/s][A
Copying files: 1274 files [00:01, 1316.51 files/s][A
Copying files: 1404 files [00:01, 1237.04 files/s][A
Copying files: 1528 files [00:01, 1085.72 files/s][A
Copying files: 1640 files [00:01, 1093.60 files/s][A
Copying files: 1752 files [00:01, 1099.69 files/s][A
Copying files: 1896 files [00:01, 1183.48 files/s][A
Copying files: 2018 files [00:01, 1146.14 files/s][A
Copying files: 2188 files [00:01, 1174.14 files/s][A


Count the images in each split (across all 3 classes); each count is used as that split's batch size so the whole split loads as a single batch

In [66]:
train_folder = 'Data/ttsplit_data/train'
test_folder = 'Data/ttsplit_data/test'
val_folder = 'Data/ttsplit_data/val'
classes = ['paper', 'rock', 'scissors']


def _png_names(split_folder):
    """Return the names of the .png files in every class sub-folder of split_folder."""
    names = []
    for img_class in classes:
        for file in os.listdir(split_folder+'/'+img_class):
            if file.endswith('.png'):
                names.append(file)
    return names


train_imgs = _png_names(train_folder)
test_imgs = _png_names(test_folder)
val_imgs = _png_names(val_folder)

# Each split is later requested from its generator as one batch, so the
# batch size is simply the number of images in that split.
train_batch_size, test_batch_size, val_batch_size = (
    len(train_imgs), len(test_imgs), len(val_imgs))

print(train_batch_size, test_batch_size, val_batch_size, sep='\n')

1399
352
437


Create a directory iterator for each split that loads its images and labels as arrays

In [90]:
def _split_generator(split_name, batch_size):
    """Build a directory iterator over output_folder/<split_name> with the given batch size."""
    # NOTE(review): Keras documents target_size as (height, width). If the
    # source images are 300 px wide by 200 px tall, (300, 200) swaps the aspect
    # ratio -- confirm before changing, later cells may rely on this shape.
    return ImageDataGenerator().flow_from_directory(
        output_folder+'/'+split_name, target_size=(300, 200), batch_size=batch_size)


train_generator = _split_generator('train', train_batch_size)

test_generator = _split_generator('test', test_batch_size)

val_generator = _split_generator('val', val_batch_size)

Found 1399 images belonging to 3 classes.
Found 352 images belonging to 3 classes.
Found 437 images belonging to 3 classes.


Pull the single batch from each generator and split it into image and label arrays

In [91]:
# Every generator was created with a batch size equal to its split size, so
# one next() call returns the entire split as an (images, labels) pair.
train_batch = next(train_generator)
test_batch = next(test_generator)
val_batch = next(val_generator)

train_images, train_labels = train_batch
test_images, test_labels = test_batch
val_images, val_labels = val_batch

# Explore Data

Calculate shape of image data

In [93]:
# Report the array shape of each split's image batch.
for split_name, split_images in (('Train', train_images),
                                 ('Test', test_images),
                                 ('Val', val_images)):
    print(f'{split_name} images shape: ', split_images.shape)

Train images shape:  (1399, 300, 200, 3)
Test images shape:  (352, 300, 200, 3)
Val images shape:  (437, 300, 200, 3)


Calculate shape of label data

In [94]:
# Report the array shape of each split's label batch.
for split_name, split_labels in (('Train', train_labels),
                                 ('Test', test_labels),
                                 ('Val', val_labels)):
    print(f'{split_name} labels shape: ', split_labels.shape)

Train labels shape:  (1399, 3)
Test labels shape:  (352, 3)
Val labels shape:  (437, 3)


Calculate label class distribution

In [114]:
# Wrap each split's label array in a DataFrame (one column per class), then
# stack the three frames to get the distribution over the full dataset.
train_labels_df, test_labels_df, val_labels_df = (
    pd.DataFrame(split_labels)
    for split_labels in (train_labels, test_labels, val_labels)
)
total_labels_df = pd.concat([train_labels_df, test_labels_df, val_labels_df], axis=0)

# The column mean of the label columns is the share of samples in each class.
distribution_reports = (
    ('Train set class distribution', train_labels_df),
    ('Test set class distribution', test_labels_df),
    ('Val set class distribution', val_labels_df),
    ('Overall class distribution', total_labels_df),
)

for heading, labels_df in distribution_reports:
    print(heading)
    print(labels_df.mean())

Train set class distribution
0    0.325232
1    0.331665
2    0.343102
dtype: float32
Test set class distribution
0    0.326705
1    0.332386
2    0.340909
dtype: float32
Val set class distribution
0    0.324943
1    0.331808
2    0.343249
dtype: float32
Overall class distribution
0    0.325411
1    0.331810
2    0.342779
dtype: float32


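Plot label class distribution (TODO: the plotting code for this step is missing; the array below is leftover output from another cell)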

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.]], dtype=float32)

# Model Data

In [88]:
# The training labels are already in memory as `train_labels`; reading their
# shape there avoids indexing the generator, which would load the whole
# single-batch training split from disk again just to inspect a shape.
train_labels.shape

(1399, 3)

In [77]:
# Preview the first 20 label rows of the training split.
train_labels[:20]

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]], dtype=float32)

In [123]:
def dedummy(label_list):
    """Convert one-hot (dummy-encoded) label rows into integer class indices.

    Each row is mapped to the position of its largest value. For a one-hot row
    that is the position of the 1. Unlike a fixed three-way ``if``/``elif``
    chain, this works for any number of classes and always yields exactly one
    index per input row, so the result stays aligned with the samples.

    Parameters
    ----------
    label_list : iterable of sequences of numbers
        One row of class indicators per sample (for example ``train_labels``).

    Returns
    -------
    list of int
        The class index of every row, in input order. Empty input gives ``[]``.

    Raises
    ------
    ValueError
        If a row has no entries.
    """
    dedummy_list = []
    for item in label_list:
        scores = list(item)
        # index() returns the first position of the maximum, so ties resolve
        # to the lowest class index.
        dedummy_list.append(scores.index(max(scores)))
    return dedummy_list

# Integer class indices for every split, derived from the one-hot label arrays.
train_classes, test_classes, val_classes = (
    dedummy(train_labels),
    dedummy(test_labels),
    dedummy(val_labels),
)