# Chapter 13: Loading and Preprocessing Data with TensorFlow - Notebook Reproduksi Kode

Bab ini membahas cara memuat dan memproses dataset besar secara efisien
menggunakan API `tf.data` dari TensorFlow, yang sangat penting untuk membangun
pipeline data yang skalabel dan efisien untuk Deep Learning.

Kita akan melihat:
- tf.data API untuk membuat pipeline input.
- Membangun pipeline sederhana (from_tensor_slices, batch, shuffle, prefetch).
- Transformasi data (map, filter, interleave).
- Menguraikan (parsing) data dalam format TFRecord.
- tf.io.TFRecordWriter dan TFRecordDataset.
- Input preprocessor menggunakan Keras Preprocessing Layers.

In [3]:
# -*- coding: utf-8 -*-
"""
Chapter 13: Loading and Preprocessing Data with TensorFlow - Reproduksi Kode Lengkap
"""

import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Setup: seed NumPy's and TensorFlow's global random generators before any
# random operation below runs.
np.random.seed(42)
tf.random.set_seed(42)

# Output directory for this chapter's images; created up front, and
# exist_ok=True makes re-running the cell harmless.
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "tf_data"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

# Load California Housing dataset
# Two successive splits: full-train vs. test first, then the full-train part
# is split again into train vs. validation (both with random_state=42).
housing = fetch_california_housing()
X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data, housing.target, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, random_state=42)

# The scaler is fitted on the training split only; the validation and test
# splits are transformed with those same fitted statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

# --- 1. tf.data API ---

print("--- tf.data API: Membangun Pipeline Sederhana ---")
# Each dataset element is one (features, target) pair sliced from the arrays.
dataset = tf.data.Dataset.from_tensor_slices((X_train_scaled, y_train))
for item in dataset.take(3):
    print(item)

batch_size = 32
shuffle_buffer_size = 100
n_epochs = 5

# Shuffle through a fixed-size buffer, group samples into batches, and keep
# one batch prefetched.
dataset = dataset.shuffle(shuffle_buffer_size).batch(batch_size).prefetch(1)

print("\nMelatih model dengan tf.data.Dataset...")
# Declare the input shape with an explicit Input object rather than passing
# `input_shape=` to the first Dense layer: recent Keras versions emit a
# UserWarning for the latter and recommend Input(shape) as the first entry.
model_tf_data = keras.models.Sequential([
    keras.Input(shape=X_train_scaled.shape[1:]),
    keras.layers.Dense(30, activation="relu"),
    keras.layers.Dense(1)
])
model_tf_data.compile(loss="mse", optimizer=keras.optimizers.SGD(learning_rate=0.001))
# The dataset already yields (features, target) batches, so it is passed to
# fit() on its own.
history_tf_data = model_tf_data.fit(dataset, epochs=n_epochs, verbose=0)
print("Model berhasil dilatih dengan tf.data.Dataset.")
print(f"Loss terakhir: {history_tf_data.history['loss'][-1]:.4f}")

# --- Chained transformations (map, prefetch) ---
print("\n--- tf.data API: Chain Transformasi ---")
# Build the base pipeline one step at a time: slice per sample, shuffle with
# a 100-element buffer, then group into batches.
dataset_complex = tf.data.Dataset.from_tensor_slices((X_train_scaled, y_train))
dataset_complex = dataset_complex.shuffle(100)
dataset_complex = dataset_complex.batch(batch_size)

def add_extra_features(X_batch, y_batch):
    """Widen a feature batch by appending a copy of its leading columns.

    The slice ``X_batch[:, :5]`` (up to the first five columns) is concatenated
    to ``X_batch`` along axis 1. The targets are returned untouched.

    Returns:
        A ``(widened_features, y_batch)`` tuple.
    """
    leading_columns = X_batch[:, :5]
    widened_features = tf.concat([X_batch, leading_columns], axis=1)
    return widened_features, y_batch

# Apply the feature-widening function to every batch, then prefetch one
# element (here: one batch).
dataset_complex = dataset_complex.map(add_extra_features).prefetch(1)

# Pull a single batch to show the shapes produced by the mapped pipeline.
for X_batch_sample, y_batch_sample in dataset_complex.take(1):
    print(f"Bentuk X_batch setelah map: {X_batch_sample.shape}")
    print(f"Bentuk y_batch setelah map: {y_batch_sample.shape}")

# --- Filter item
def filter_high_prices(X, y):
    """Predicate that is true where the target ``y`` is strictly above 2.0.

    ``X`` is part of the signature because the predicate is applied to
    (features, target) pairs, but it is not inspected.
    """
    price_threshold = 2.0
    return y > price_threshold

# Filter individual samples first, then batch whatever passes the predicate.
per_item_dataset = tf.data.Dataset.from_tensor_slices((X_train_scaled, y_train))
dataset_filter_item = (
    per_item_dataset
    .filter(filter_high_prices)
    .batch(batch_size)
    .prefetch(1)
)

# Count how many batches the filtered pipeline yields.
count_filtered = sum(1 for _ in dataset_filter_item)
print(f"Jumlah batch setelah filter (y > 2.0 per item): {count_filtered}")

# --- Interleave from CSV files
print("\n--- tf.data API: Interleave ---")

def make_dummy_csv_files(num_files=5, num_rows_per_file=100, filename_prefix="my_data"):
    """Write CSV files filled with random numbers and return their paths.

    The files go into ``<PROJECT_ROOT_DIR>/dummy_data`` (created when missing)
    and are named ``{filename_prefix}_{i}.csv`` for ``i`` in
    ``range(num_files)``. Each file holds ``num_rows_per_file`` rows in the
    columns ``col1``, ``col2``, ``col3``, drawn with ``np.random.rand``, and
    is written without the DataFrame index.

    Args:
        num_files: Number of files to create.
        num_rows_per_file: Number of data rows in each file.
        filename_prefix: Leading part of every file name.

    Returns:
        List of the file paths, in creation order.
    """
    dummy_dir = os.path.join(PROJECT_ROOT_DIR, "dummy_data")
    os.makedirs(dummy_dir, exist_ok=True)

    # Work out all target paths first, then fill the files one by one.
    file_paths = [
        os.path.join(dummy_dir, f"{filename_prefix}_{i}.csv")
        for i in range(num_files)
    ]
    column_names = ['col1', 'col2', 'col3']
    for csv_path in file_paths:
        random_values = np.random.rand(num_rows_per_file, 3)
        pd.DataFrame(random_values, columns=column_names).to_csv(csv_path, index=False)

    print(f"Dibuat {num_files} file CSV dummy di {dummy_dir}")
    return file_paths

# Create the dummy CSV files with the default arguments and keep their paths.
dummy_csv_files = make_dummy_csv_files()

def parse_csv(filepath):
    """Return a dataset of the text lines in ``filepath``, minus the first one.

    No CSV field splitting happens here: every element is a whole raw line.
    The skipped first line is presumably the header row of the file.
    """
    text_lines = tf.data.TextLineDataset(filepath)
    return text_lines.skip(1)

# One dataset element per CSV file path.
filepath_dataset = tf.data.Dataset.from_tensor_slices(dummy_csv_files)

# Read from two files at a time, taking one line from each in turn.
interleaved_dataset = filepath_dataset.interleave(
    map_func=parse_csv,
    cycle_length=2,
    block_length=1,
)

print("\nInterleaved data (beberapa baris):")
# Each element is a scalar string tensor; decode its bytes for display.
for line in interleaved_dataset.take(5):
    raw_bytes = line.numpy()
    print(raw_bytes.decode("utf-8"))

# --- TFRecord Serialization ---
# Section banner for the TFRecord write/read demonstration that follows.
print("\n--- Menguraikan Data TensorFlow Record ---")

def serialize_example(x, y):
    """Pack one (features, label) pair into a serialized ``tf.train.Example``.

    ``x`` is flattened with ``.ravel()`` and stored as a float list under the
    key ``"features"``. ``y`` is stored as a one-element float list under
    ``"label"``.

    Returns:
        The result of ``SerializeToString()`` on the assembled Example.
    """
    features_field = tf.train.Feature(float_list=tf.train.FloatList(value=x.ravel()))
    label_field = tf.train.Feature(float_list=tf.train.FloatList(value=[y]))
    example = tf.train.Example(
        features=tf.train.Features(
            feature={"features": features_field, "label": label_field}
        )
    )
    return example.SerializeToString()

tfrecord_path = os.path.join(PROJECT_ROOT_DIR, "my_housing_data.tfrecord")
# Write one serialized Example per training sample, in row order; the writer
# is closed automatically when the with-block exits.
with tf.io.TFRecordWriter(tfrecord_path) as writer:
    for serialized_record in map(serialize_example, X_train_scaled, y_train):
        writer.write(serialized_record)
print("Data berhasil ditulis ke TFRecord.")

# Parsing TFRecord
def parse_tfrecord_example(example_proto):
    """Decode one serialized Example into a ``(features, label)`` pair.

    The expected layout is a fixed-length float32 vector under ``"features"``
    (its length is the column count of ``X_train_scaled``) and a fixed-length
    float32 vector of size 1 under ``"label"``.
    """
    n_features = X_train_scaled.shape[1]
    schema = {
        "features": tf.io.FixedLenFeature([n_features], tf.float32),
        "label": tf.io.FixedLenFeature([1], tf.float32),
    }
    parsed = tf.io.parse_single_example(example_proto, schema)
    return parsed["features"], parsed["label"]

# Read the raw records back, decode each one, then batch three at a time.
tfrecord_dataset = tf.data.TFRecordDataset(tfrecord_path)
parsed_tfrecord_dataset = tfrecord_dataset.map(parse_tfrecord_example)
parsed_tfrecord_dataset = parsed_tfrecord_dataset.batch(3)
parsed_tfrecord_dataset = parsed_tfrecord_dataset.prefetch(1)

print("\nMembaca dan menguraikan TFRecord (beberapa contoh):")
# Show the shapes and the first sample of a single decoded batch.
for features_batch, labels_batch in parsed_tfrecord_dataset.take(1):
    print(f"Bentuk fitur: {features_batch.shape}")
    print(f"Bentuk label: {labels_batch.shape}")
    print(f"Contoh fitur (pertama): {features_batch[0].numpy()}")
    print(f"Contoh label (pertama): {labels_batch[0].numpy()}")

# --- Keras Preprocessing Layers ---
print("\n--- Keras Preprocessing Layers ---")

# Keep a direct handle on the normalization layer so it can be adapted
# without looking it up through the model's layer list.
normalization_layer = keras.layers.Normalization(axis=-1)
model_preprocess_layer = keras.models.Sequential([
    normalization_layer,
    keras.layers.Dense(30, activation="relu"),
    keras.layers.Dense(1)
])

print("Mengadaptasi Normalization layer...")
# Adapt on the raw (unscaled) training features; the layer does the scaling.
normalization_layer.adapt(X_train)
print("Normalization layer berhasil diadaptasi.")

model_preprocess_layer.compile(loss="mse", optimizer="adam")
print("\nMelatih model dengan Normalization layer...")
# Train directly on the unscaled arrays, since normalization is part of the model.
history_preprocess = model_preprocess_layer.fit(
    X_train,
    y_train,
    epochs=5,
    verbose=0,
)
print("Model dengan Normalization layer berhasil dilatih.")
print(f"Loss terakhir: {history_preprocess.history['loss'][-1]:.4f}")

print("\n--- Selesai Reproduksi Kode Chapter 13 ---")


--- tf.data API: Membangun Pipeline Sederhana ---
(<tf.Tensor: shape=(8,), dtype=float64, numpy=
array([-0.19397883, -1.07781319, -0.94338545,  0.01485314,  0.02073335,
       -0.57291624,  0.92926047, -1.42215523])>, <tf.Tensor: shape=(), dtype=float64, numpy=1.442>)
(<tf.Tensor: shape=(8,), dtype=float64, numpy=
array([ 0.75198318, -1.868895  ,  0.40547793, -0.23327682,  1.8614649 ,
        0.20516532, -0.91654738,  1.09666969])>, <tf.Tensor: shape=(), dtype=float64, numpy=1.687>)
(<tf.Tensor: shape=(8,), dtype=float64, numpy=
array([-0.41469108,  0.02970134,  0.81808819,  1.05678372, -0.08786707,
       -0.29983271,  1.30872858, -1.697027  ])>, <tf.Tensor: shape=(), dtype=float64, numpy=1.621>)

Melatih model dengan tf.data.Dataset...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Model berhasil dilatih dengan tf.data.Dataset.
Loss terakhir: 0.5577

--- tf.data API: Chain Transformasi ---
Bentuk X_batch setelah map: (32, 13)
Bentuk y_batch setelah map: (32,)
Jumlah batch setelah filter (y > 2.0 per item): 155

--- tf.data API: Interleave ---
Dibuat 5 file CSV dummy di ./dummy_data

Interleaved data (beberapa baris):
0.3745401188473625,0.9507143064099162,0.7319939418114051
0.0516817211686077,0.531354631568148,0.5406351216101065
0.5986584841970366,0.15601864044243652,0.15599452033620265
0.6374299014982066,0.7260913337226615,0.9758520794625346
0.05808361216819946,0.8661761457749352,0.6011150117432088

--- Menguraikan Data TensorFlow Record ---
Data berhasil ditulis ke TFRecord.

Membaca dan menguraikan TFRecord (beberapa contoh):
Bentuk fitur: (3, 8)
Bentuk label: (3, 1)
Contoh fitur (pertama): [-0.19397883 -1.0778131  -0.9433854   0.01485314  0.02073335 -0.57291627
  0.9292605  -1.4221553 ]
Contoh label (pertama): [1.442]

--- Keras Preprocessing Layers ---
Mengad