In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import os,errno
import math
import random
import shutil

In [5]:
np.random.seed = 10
random.seed = 100

In [6]:
df_y = pd.read_csv('../Selfie-dataset/selfie_dataset.txt',sep="\s+",header=None,usecols=[0,3],names=['name','sex'])
print(df_y.shape)

(46836, 2)


In [7]:
num_samples = df_y.shape[0]
num_train = math.floor(num_samples*0.9)
num_test = num_samples - num_train
print("Total samples", num_samples)
print("Total training samples", num_train)
print("Total test samples", num_test)

Total samples 46836
Total training samples 42152
Total test samples 4684


In [9]:
## shuffle the contents of the directory and split them in train and test folders
files = [i for i in os.listdir('../Selfie-dataset/images/')]
print(files[0])
print(len(files))
np.random.shuffle(files)
try:
    os.mkdir('../Selfie-dataset/images/Train')
    os.mkdir('../Selfie-dataset/images/Test')
    for i,j in enumerate(files):
        if i <= num_train:
            shutil.move(os.path.join('../Selfie-dataset/images/',j), os.path.join('../Selfie-dataset/images/Train/',j))
        else:
            shutil.move(os.path.join('../Selfie-dataset/images/',j), os.path.join('../Selfie-dataset/images/Test/',j))
except OSError as e:
    if e.errno != errno.EEXIST:
        raise


Augmentation
3


In [10]:
train_name = [i[:-4] for i in os.listdir('../Selfie-dataset/images/Train/')]
test_name = [i[:-4] for i in os.listdir('../Selfie-dataset/images/Test/')]
y_train_df = df_y[df_y['name'].isin(train_name)]
y_test_df = df_y[df_y['name'].isin(test_name)]

In [11]:
##some validation
print("y_train_shape", y_train_df.shape)
print("y_test_shape",y_test_df.shape)
print("100th train file name", y_train_df.iloc[100])
print("100th test file name", y_test_df.iloc[100])

y_train_shape (42153, 2)
y_test_shape (4683, 2)
100th train file name name    10004126_244002322450107_1223819319_a
sex                                         1
Name: 112, dtype: object
100th test file name name    10005526_625500830878042_699529605_a
sex                                        1
Name: 992, dtype: object


In [27]:
import tensorflow as tf
from tensorflow.contrib.data import Dataset, Iterator
from tensorflow.python.framework.ops import convert_to_tensor
from tensorflow.python.framework import dtypes
tf.set_random_seed(100)

In [18]:
train_images = [os.path.abspath(os.path.join(os.sep,'Selfie-dataset/images/Train/',i)) for i in os.listdir('../Selfie-dataset/images/Train/')]
test_images = [os.path.abspath(os.path.join(os.sep,'Selfie-dataset/images/Test/',i)) for i in os.listdir('../Selfie-dataset/images/Test/')]


In [17]:
train_images[:2]

['D:\\Selfie-dataset\\images\\Train\\00a454da495e11e28a7322000a1fa414_6.jpg',
 'D:\\Selfie-dataset\\images\\Train\\00cddb96ac4c11e3a30212279ba1b65f_6.jpg']

In [20]:
train_images_tf = convert_to_tensor(train_images,dtype=tf.string)
test_images_tf = convert_to_tensor(test_images,dtype=tf.string)

In [25]:
train_labels_tf = convert_to_tensor(list(y_train_df['sex']),dtype=dtypes.int32)
test_labels_tf = convert_to_tensor(list(y_test_df['sex']),dtype = dtypes.int32)

In [26]:
train_data = Dataset.from_tensor_slices((train_images_tf,train_labels_tf))
test_data  = Dataset.from_tensor_slices((test_images_tf,test_labels_tf))


In [30]:
NUM_CLASSES = 2
IMAGE_SIZE = 300
def distort_img_train(img_file, labels):
    one_hot_labels = tf.one_hot(labels,NUM_CLASSES)
    img_str = tf.read_file(img_file)
    img_decode = tf.image.decode_jpeg(img,channels=3)
    img_resize = tf.image.resize_image_with_crop_or_pad(img_decode,target_height=IMAGE_SIZE,target_width=IMAGE_SIZE)
    distort_img = tf.image.random_flip_left_right(img_resize,seed=tf.set_random_seed(10))
    img = tf.image.random_brightness(distort_img,max_delta=2.0,seed=10)
    img_std = tf.image.per_image_standardization(img)
    
    return img_std,one_hot_labels
    
    

In [29]:
def distort_img_test(img_file, labels):
    one_hot_labels = tf.one_hot(labels,NUM_CLASSES)
    img_str = tf.read_file(img_file)
    img_decode = tf.image.decode_jpeg(img,channels=3)
    img_resize = tf.image.resize_image_with_crop_or_pad(img_decode,target_height=IMAGE_SIZE,target_width=IMAGE_SIZE)
    img_std = tf.image.per_image_standardization(img_resize)
    
    return img_std,one_hot_labels

In [31]:
def get_data(batch_size,is_train_data = True,num_threads = 6):
    if is_train_data:
        data = train_data.map(distort_img_train,num_threads=num_threads)
    else:
        data = test_data.map(distort_img_test, num_threads = num_threads)
    data = data.batch(batch_size)
    return data