In [1]:
import re, math, os, cv2, random
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold

### Notes

This notebook creates TFRecords from the Flickr Faces dataset so that I can train a CycleGAN on TPUs with real faces and fake faces.

Check out the main model notebook here -> [Cycle GAN](https://www.kaggle.com/brendanartley/cyclegan-kaokore-model)


--

NOTE: I ended up leaving out about 15% of the data because the notebook output was too large for the Kaggle environment. We still get over 44,000 high-quality images!

--

Datasets:

- [Arnaud58's Flickr Faces Data](https://www.kaggle.com/arnaud58/flickrfaceshq-dataset-ffhq) - 52,000 png images of faces scraped from Flickr with size 512x512

- [Kaokore Dataset](https://github.com/rois-codh/kaokore) - 8,000 faces from Japanese paintings in the 16th century with size 256x256

-- 



In [2]:
#setting seeds for reproducibility
SEED = 3141

def seed_everything(seed):
    """Seed every random number generator this notebook has in scope.

    Args:
        seed: integer seed applied to Python's `random`, NumPy and TensorFlow,
            and exported (as a string) through the PYTHONHASHSEED variable.
    """
    # PYTHONHASHSEED is read at interpreter start-up, so setting it here only
    # affects Python processes launched after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # `random` is imported by the notebook, so seed it too; otherwise any use of
    # the stdlib RNG stays non-deterministic despite calling this helper.
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

seed_everything(SEED) 

### TFrecord Functions

In [3]:
def _bytes_feature(value):
    """Wrap one string / bytes value in a `tf.train.Feature` holding a BytesList.

    If `value` has the type that `tf.constant` produces, it is first converted
    with `.numpy()`, because BytesList won't unpack a string from an EagerTensor.
    """
    tensor_type = type(tf.constant(0))
    raw = value.numpy() if isinstance(value, tensor_type) else value
    payload = tf.train.BytesList(value=[raw])
    return tf.train.Feature(bytes_list=payload)

def _float_feature(value):
    """Wrap one float / double in a `tf.train.Feature` holding a one-element FloatList."""
    payload = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=payload)

def _int64_feature(value):
    """Wrap one bool / enum / int / uint in a `tf.train.Feature` holding a one-element Int64List."""
    payload = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=payload)

In [4]:
def serialize_example(image, image_name):
    """Build a `tf.train.Example` for one image and return it serialized.

    Args:
        image: encoded image data, stored under the 'image' key as a bytes feature.
        image_name: image identifier, stored under the 'image_id' key as a bytes feature.

    Returns:
        The result of `SerializeToString()` on the assembled Example.
    """
    features = tf.train.Features(feature={
        'image': _bytes_feature(image),
        'image_id': _bytes_feature(image_name),
    })
    return tf.train.Example(features=features).SerializeToString()

### Flickr Faces TFrecords

In [5]:
# --- Output configuration ---
IMG_SIZE = 512
N_FILES = 30 #number of TFrecords created
HEIGHT, WIDTH = (IMG_SIZE, IMG_SIZE)  # square output, derived from IMG_SIZE so the two cannot drift apart
IMG_QUALITY = 100

# --- Input location ---
PATH = '../input/flickrfaceshq-dataset-ffhq'
# Image paths are built later by plain string concatenation, so guarantee
# exactly one trailing slash here instead of maintaining a second literal.
PATH_TO_IMG = PATH.rstrip('/') + '/'

# Keep only image files. os.listdir returns every directory entry, and any
# non-image entry would only surface later as an "Error: ... not added" line
# while writing the TFRecords.
# NOTE(review): the dataset is described as 52,000 PNGs but the unfiltered
# listing reported 52001 entries - presumably one stray file; confirm.
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')
IMGS = sorted(
    name for name in os.listdir(PATH)
    if name.lower().endswith(IMAGE_EXTENSIONS)
)
print("Number of Images: {}".format(len(IMGS)))

#creating data_frame with image names (sorted, so the row order is deterministic)
train_df = pd.DataFrame({'image_id': IMGS})

Number of Images: 52001


In [6]:
#dropping the last 15% of the (sorted) image names
# Rebuilt from IMGS instead of dropping in place so that re-running this cell
# always yields the same frame (an in-place drop removes another 15% per run).
# The resulting index is a clean 0..n-1 range.
DROP_FRACTION = .15
n_dropped = round(len(IMGS) * DROP_FRACTION)
train_df = pd.DataFrame({'image_id': IMGS[:len(IMGS) - n_dropped]})

In [7]:
# Use KFold purely as a shuffled, near-equal partitioner: the "validation"
# indices of fold k are the rows that will be written to TFRecord file k.
folds = KFold(n_splits=N_FILES, shuffle=True, random_state=SEED)
train_df['file'] = -1  # -1 marks a row that has not been assigned to a file yet
file_col = train_df.columns.get_loc('file')

for fold_n, (train_idx, val_idx) in enumerate(folds.split(train_df)):
    print('File: %s has %s samples' % (fold_n+1, len(val_idx)))
    # KFold yields positional indices, so assign through .iloc in a single
    # indexing step. The chained form train_df['file'].loc[...] = ... raises
    # SettingWithCopyWarning and does not write back under pandas copy-on-write.
    train_df.iloc[val_idx, file_col] = fold_n

assert (train_df['file'] >= 0).all(), 'every image must be assigned to a TFRecord file'

File: 1 has 1474 samples
File: 2 has 1474 samples
File: 3 has 1474 samples
File: 4 has 1474 samples
File: 5 has 1474 samples
File: 6 has 1474 samples
File: 7 has 1474 samples
File: 8 has 1474 samples
File: 9 has 1474 samples
File: 10 has 1474 samples
File: 11 has 1474 samples
File: 12 has 1473 samples
File: 13 has 1473 samples
File: 14 has 1473 samples
File: 15 has 1473 samples
File: 16 has 1473 samples
File: 17 has 1473 samples
File: 18 has 1473 samples
File: 19 has 1473 samples
File: 20 has 1473 samples
File: 21 has 1473 samples
File: 22 has 1473 samples
File: 23 has 1473 samples
File: 24 has 1473 samples
File: 25 has 1473 samples
File: 26 has 1473 samples
File: 27 has 1473 samples
File: 28 has 1473 samples
File: 29 has 1473 samples
File: 30 has 1473 samples


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Note: Make sure the PATH_TO_IMG variable ends with a forward slash; otherwise none of the images will be found!

EXAMPLE: ../input/flickrfaceshq-dataset-ffhq/

In [8]:
def _encode_image(img_path):
    """Read an image from disk, resize it to WIDTH x HEIGHT and PNG-encode it.

    Returns the encoded bytes, or None when the file cannot be read, resized
    or encoded.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread signals an unreadable / missing file by returning None
        # rather than raising.
        return None
    try:
        # cv2.resize takes dsize as (width, height).
        img = cv2.resize(img, (WIDTH, HEIGHT))
        # PNG is lossless; a JPEG quality flag has no effect on the PNG
        # encoder, so none is passed.
        ok, buffer = cv2.imencode('.png', img)
    except cv2.error:
        return None
    # .tobytes() replaces the deprecated .tostring() (removed in NumPy 2.0).
    return buffer.tobytes() if ok else None


for tfrec_num in range(N_FILES):
    print('\nWriting TFRecord %i of %i...'%(tfrec_num, N_FILES))
    samples = train_df[train_df['file'] == tfrec_num]
    n_samples = len(samples)
    print(f'{n_samples} samples')

    # Write under a temporary name first: the final file name embeds the number
    # of records, which is only known once unreadable images have been skipped.
    tmp_name = 'FlickrFaces%.2i.tfrec.tmp' % tfrec_num
    n_written = 0
    with tf.io.TFRecordWriter(tmp_name) as writer:
        for image_name in samples['image_id']:
            img = _encode_image(f'{PATH_TO_IMG}{image_name}')
            if img is None:
                print('Error: {} not added'.format(image_name))
                continue

            example = serialize_example(img, str.encode(image_name))
            writer.write(example)
            n_written += 1

    # Same name as before when nothing was skipped; otherwise the count in the
    # name reflects the records actually written.
    os.replace(tmp_name, 'FlickrFaces%.2i-%i.tfrec'%(tfrec_num, n_written))


Writing TFRecord 0 of 30...
1474 samples

Writing TFRecord 1 of 30...
1474 samples

Writing TFRecord 2 of 30...
1474 samples

Writing TFRecord 3 of 30...
1474 samples

Writing TFRecord 4 of 30...
1474 samples

Writing TFRecord 5 of 30...
1474 samples

Writing TFRecord 6 of 30...
1474 samples

Writing TFRecord 7 of 30...
1474 samples

Writing TFRecord 8 of 30...
1474 samples

Writing TFRecord 9 of 30...
1474 samples

Writing TFRecord 10 of 30...
1474 samples

Writing TFRecord 11 of 30...
1473 samples

Writing TFRecord 12 of 30...
1473 samples

Writing TFRecord 13 of 30...
1473 samples

Writing TFRecord 14 of 30...
1473 samples

Writing TFRecord 15 of 30...
1473 samples

Writing TFRecord 16 of 30...
1473 samples

Writing TFRecord 17 of 30...
1473 samples

Writing TFRecord 18 of 30...
1473 samples

Writing TFRecord 19 of 30...
1473 samples

Writing TFRecord 20 of 30...
1473 samples

Writing TFRecord 21 of 30...
1473 samples

Writing TFRecord 22 of 30...
1473 samples

Writing TFRecord 23 