In [14]:
from benatools.tf.tfrec import bytes_feature, convert
import tensorflow as tf
import numpy as np
from glob import glob
import os
import pandas as pd
import cv2
from sklearn.model_selection import KFold

In [15]:
input_folder = 'ssid'


def list_stems(folder):
    """Return the extension-less names of the entries directly inside `folder`.

    Names are returned in the order of their sorted full paths. The name is
    derived with os.path.basename / os.path.splitext instead of splitting on
    '/' and dropping the last four characters, so it also works with Windows
    path separators and with extensions that are not three letters long.
    A folder that does not exist yields an empty list.
    """
    return [os.path.splitext(os.path.basename(path))[0]
            for path in sorted(glob(os.path.join(folder, '*')))]


# prepare Data
noisy = np.array(list_stems(os.path.join(input_folder, 'noisy')))

# Keep only the clean images that have a noisy counterpart; a set makes each
# membership test O(1) instead of a scan over the whole `noisy` array.
noisy_names = set(noisy.tolist())
clean = np.array([name
                  for name in list_stems(os.path.join(input_folder, 'clean'))
                  if name in noisy_names])

# Rows are paired by position below, so both lists must have the same length.
# A noisy image without a clean counterpart is reported here instead of being
# silently mis-paired.
assert len(clean)==len(noisy), f"Clean length {len(clean)} is not equal to Noisy length {len(noisy)}"
print(f"Noisy images {len(noisy)}  Clean images {len(clean)}")

# NOTE: the '.png' extension is re-appended here, so every image is assumed to be a PNG file.
df = pd.DataFrame({'image':[os.path.join(input_folder, 'noisy', p)+'.png' for p in noisy],
                   'label':[os.path.join(input_folder, 'clean', p)+'.png' for p in clean]})
df

Noisy images 1280  Clean images 1280


Unnamed: 0,image,label
0,ssid/noisy/0000-0000.png,ssid/clean/0000-0000.png
1,ssid/noisy/0000-0001.png,ssid/clean/0000-0001.png
2,ssid/noisy/0000-0002.png,ssid/clean/0000-0002.png
3,ssid/noisy/0000-0003.png,ssid/clean/0000-0003.png
4,ssid/noisy/0000-0004.png,ssid/clean/0000-0004.png
...,...,...
1275,ssid/noisy/0039-0027.png,ssid/clean/0039-0027.png
1276,ssid/noisy/0039-0028.png,ssid/clean/0039-0028.png
1277,ssid/noisy/0039-0029.png,ssid/clean/0039-0029.png
1278,ssid/noisy/0039-0030.png,ssid/clean/0039-0030.png


In [16]:
def _read_rgb(path):
    """Load the image at `path` with OpenCV and return it in RGB channel order.

    Raises:
        OSError: if cv2.imread returns None, which it does when the file is
            missing or cannot be decoded. Without this check the failure
            surfaces later inside cv2.cvtColor with a message that does not
            name the offending file.
    """
    bgr = cv2.imread(path)
    if bgr is None:
        raise OSError(f"Could not read image file: {path!r}")
    return cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)


def serialize_fn(data):
    """Serialize one pair of images into a tf.train.Example byte string.

    Args:
        data: indexable whose element 0 is the path of the input image and
            whose element 1 is the path of the target image.
            NOTE(review): presumably one row of the image/label DataFrame as
            handed over by `convert` - confirm against benatools.

    Returns:
        The serialized tf.train.Example with two bytes features: 'x' (input
        image) and 'y' (target image).

    Raises:
        OSError: if either image cannot be read.
    """
    x = _read_rgb(data[0])
    y = _read_rgb(data[1])

    # Only the raw pixel buffers are stored; the image shape is not written to
    # the record, so a reader has to know it in order to rebuild the arrays.
    feature = {
      'x': bytes_feature(x.tobytes()),  # input image as raw bytes
      'y': bytes_feature(y.tobytes()),  # target image as raw bytes
    }

    # Wrap the features in a tf.train.Example message and serialize it.
    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()

In [19]:
# Shuffled 5-fold split with a fixed seed so the fold assignment is repeatable.
cv = KFold(5, shuffle=True, random_state=42)

# Keyword arguments shared by every convert() call below.
# NOTE(review): the meaning of `dtype` and `max_mb` is defined by
# benatools.tf.tfrec.convert and is not visible in this notebook - confirm there.
convert_options = dict(
    folder='tfrec',
    serialize_fn=serialize_fn,
    max_mb=1000,
    dtype=np.int8,
    verbose=True,
    zfill=3,
)

# Only the held-out (test) rows of each fold are handed to convert();
# train_idx is not used.
for fold, (train_idx, test_idx) in enumerate(cv.split(df)):
    held_out = df.iloc[test_idx]
    convert(held_out, file_prefix=f'ssid_fold_{fold}_', **convert_options)

File saved to tfrec/ssid_fold_0_000_256.tfrec
File saved to tfrec/ssid_fold_1_000_256.tfrec
File saved to tfrec/ssid_fold_2_000_256.tfrec
File saved to tfrec/ssid_fold_3_000_256.tfrec
File saved to tfrec/ssid_fold_4_000_256.tfrec
