# Chapter 13: Loading and Preprocessing Data with TensorFlow - Notebook Reproduksi Kode

Bab ini membahas cara memuat dan memproses dataset besar secara efisien
menggunakan API `tf.data` milik TensorFlow, yang sangat penting untuk membangun
pipeline data yang skalabel untuk Deep Learning.

Kita akan melihat:
- tf.data API untuk membuat pipeline input.
- Membangun pipeline sederhana (from_tensor_slices, batch, shuffle, prefetch).
- Transformasi data (map, filter, interleave).
- Menguraikan (parsing) data dalam format TFRecord.
- Menulis dan membaca TFRecord dengan tf.io.TFRecordWriter dan tf.data.TFRecordDataset.
- Input preprocessor menggunakan Keras Preprocessing Layers.

In [2]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# --- Setup ---
# Seed the NumPy and TensorFlow random generators.
np.random.seed(42)
tf.random.set_seed(42)

# --- Data Preparation ---
# Split California housing into train / validation / test sets using the
# default train_test_split proportions, with a fixed random_state.
housing = fetch_california_housing()
X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data, housing.target, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, random_state=42)

# Fit the scaler on the training split only, then apply the same statistics
# to the validation and test splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
# NOTE: the validation split is prepared here but not used in this cell.
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)

# --- tf.data Pipeline ---
batch_size = 32
n_epochs = 5
shuffle_buffer_size = 1000

# Build (features, target) pairs from the in-memory arrays, then shuffle,
# batch, and prefetch one batch ahead of the consumer.
# NOTE(review): shuffle() only randomizes within a buffer of
# `shuffle_buffer_size` elements; the training set appears to be larger than
# 1000 rows, so the shuffle is approximate -- confirm this is intended.
dataset = tf.data.Dataset.from_tensor_slices((X_train_scaled, y_train))
dataset = dataset.shuffle(shuffle_buffer_size).batch(batch_size).prefetch(1)

# --- Model ---
# Declare the input shape with an explicit Input layer instead of passing
# `input_shape` to the first Dense layer; Keras 3 warns about the latter
# for Sequential models.
model = keras.models.Sequential([
    keras.layers.Input(shape=X_train_scaled.shape[1:]),
    keras.layers.Dense(30, activation="relu"),
    keras.layers.Dense(1)
])

model.compile(loss="mse", optimizer=keras.optimizers.SGD(learning_rate=0.001))

# --- Training ---
# The dataset already yields batched (features, target) tuples, so fit()
# receives no separate y or batch_size argument.
print("Melatih model dengan tf.data.Dataset...")
history = model.fit(dataset, epochs=n_epochs, verbose=1)

# --- Evaluation ---
# No metrics were passed to compile(), so evaluate() returns the scalar loss.
loss = model.evaluate(X_test_scaled, y_test)
print(f"\nLoss pada set pengujian: {loss:.4f}")


Melatih model dengan tf.data.Dataset...
Epoch 1/5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 2.8732
Epoch 2/5
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.7677
Epoch 3/5
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.6808
Epoch 4/5
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.6203
Epoch 5/5
[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.5940
[1m162/162[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.5848

Loss pada set pengujian: 0.5813
