In [1]:
import re, math, os, cv2, random, io
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold

from PIL import Image
from urllib.request import urlopen

### Notes

The following notebook is used to create TFrecords of the Kaokore Faces Dataset. I had to use web scraping to retrive these from individual image URL's. I did this so that I can train a CycleGAN on TPU's that generates and disriminates real and fake faces.

Check out the main model notebook here -> [Cycle GAN](https://www.kaggle.com/brendanartley/cyclegan-kaokore-model)

--

Datasets:

- [Arnaud58's Flickr Faces Data](https://www.kaggle.com/arnaud58/flickrfaceshq-dataset-ffhq) - 52,000 png images of faces scraped from Flickr with size 512x512

- [Kaokore Dataset](https://github.com/rois-codh/kaokore) - 8,000 faces from japanese paintings in the 16th century with size 256x256

-- 

In [2]:
#setting seeds for reproducability
SEED = 3141

def seed_everything(seed):
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

seed_everything(SEED) 

### Reading Kaokore Image URL's

In [3]:
my_file = open("../input/kaokoregithubrepo/kaokore-master/dataset_v1.2/urls.txt", "r")
lines = [line for line in my_file.readlines()]

for x, url in enumerate(lines):
    lines[x] = url[:-1]
    
#creating metadata for images
train_df = pd.DataFrame(lines, columns=['urls'])
train_df = train_df.reset_index(drop=False)
train_df = train_df.rename(columns={"index": "row_id"})
    
print('Number of Images: {}'.format(len(lines)))

Number of Images: 8848


### Variables + Split

In [4]:
IMG_SIZE = 256
IMG_QUALITY = 100
N_FILES = 8 #number of TFrecords created
IMG_PER_TFREC = len(lines) / N_FILES
HEIGHT, WIDTH = (256, 256)

In [5]:
#splitting images into random TFrecords 'reproducable with seed'
folds = KFold(n_splits=N_FILES, shuffle=True, random_state=SEED)
train_df['file']=-1

for fold_n, (train_idx, val_idx) in enumerate(folds.split(train_df)):
    print('File: %s has %s samples' % (fold_n+1, len(val_idx)))
    train_df['file'].loc[val_idx] = fold_n

File: 1 has 1106 samples
File: 2 has 1106 samples
File: 3 has 1106 samples
File: 4 has 1106 samples
File: 5 has 1106 samples
File: 6 has 1106 samples
File: 7 has 1106 samples
File: 8 has 1106 samples


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


### Tfrecord Functions

In [6]:
def _bytes_feature(value):
    #Returns a bytes_list from a string / byte.
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy() # BytesList won't unpack a string from an EagerTensor.
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
    #Returns a float_list from a float / double.
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def _int64_feature(value):
    #Returns an int64_list from a bool / enum / int / uint.
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

In [7]:
def serialize_example(image, image_name):
    feature = {
      'image': _bytes_feature(image),
      'image_id': _bytes_feature(image_name),
      }
    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()

In [8]:
train_df.to_csv('train.csv',index=False)

### Scrape + Write TFrecords

In [9]:
for tfrec_num in range(N_FILES):
    print('\nWriting TFRecord %i of %i...'%((tfrec_num+1), N_FILES))
    samples = train_df[train_df['file'] == tfrec_num]
    n_samples = len(samples)
    print(f'{n_samples} samples')
    with tf.io.TFRecordWriter('KaokoreFaces%.2i-%i.tfrec'%(tfrec_num, IMG_PER_TFREC)) as writer:
        for row in samples.itertuples():
            image_id = "kaokore_{}".format(row.row_id)
            try:
                image = Image.open(urlopen(lines[row.row_id]))
                if image.size == (256, 256):
    
                    byteIO = io.BytesIO()
                    image.save(byteIO, format='JPEG')
                    img_byte_Arr = byteIO.getvalue()
                    
                    #cant figure out how to encode image and then write to tfrecords
                    example = serialize_example(img_byte_Arr, str.encode(image_id))
                    writer.write(example)

            except:
                print("bad url found:")


Writing TFRecord 1 of 8...
1106 samples

Writing TFRecord 2 of 8...
1106 samples

Writing TFRecord 3 of 8...
1106 samples

Writing TFRecord 4 of 8...
1106 samples

Writing TFRecord 5 of 8...
1106 samples

Writing TFRecord 6 of 8...
1106 samples

Writing TFRecord 7 of 8...
1106 samples

Writing TFRecord 8 of 8...
1106 samples
