In [2]:
# Module 4 Notes: Training Deep Neural Networks & Regularization

# TensorFlow is a framework for building and optimizing deep neural networks. It handles...

# Automatic differentiation
# Tensor computation (like NumPy but GPU accelerated)
# Model training and graph execution



# %pip install tensorflow

import tensorflow as tf

# Two immutable scalar tensors built from Python ints.
a, b = tf.constant(3), tf.constant(4)

# tf.add is the functional spelling of the overloaded `+` operator.
c = tf.add(a, b)
print(c)

# tf.constant() creates immutable tensors

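# TensorFlow 2 runs operations eagerly by default; a computational graph is only built inside tf.function

# Operations like a + b are recorded for gradient computation only inside a tf.GradientTape that watches the inputs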

# TensorFlow automatically manages data types (float32, int32) and uses GPU if available. 

tf.Tensor(7, shape=(), dtype=int32)


In [4]:
# Trainable Weights & Model Parameters

# In Keras, layers with parameters (such as Dense) have trainable weights: a kernel and, usually, a bias

from tensorflow import keras
from keras import layers

# Create the layer without `input_shape`: Keras 3 warns when that argument is
# passed to a layer, and the explicit build() call below already fixes the
# input dimension.
layer = layers.Dense(3)

# Build for inputs of shape (batch, 2) so the kernel and bias variables exist
# before the layer is ever called.
layer.build(input_shape=(None, 2))
print(layer.trainable_weights)

# A dense layer connects every input neuron to every output neuron. 

# The layer has:

# Kernel (W): weights matrix of shape (input_dim, output_dim)
# Bias (b): vector added to each neuron output
# trainable_weights returns parameters that are updated during training. 


[<Variable path=dense_1/kernel, shape=(2, 3), dtype=float32, value=[[ 0.8272064  -0.03398919 -0.48470825]
 [-0.53453773 -0.54874766 -0.58118814]]>, <Variable path=dense_1/bias, shape=(3,), dtype=float32, value=[0. 0. 0.]>]


In [7]:
# MNIST and the Flatten Layer

from tensorflow.keras.datasets import mnist

# Load the predefined train/test split of MNIST digit images and integer labels.
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Pixel intensities are 8-bit (0-255), so divide by 255 -- not 225 -- to map
# every input into the [0, 1] range.
PIXEL_MAX = 255.0
X_train = X_train / PIXEL_MAX
X_test = X_test / PIXEL_MAX

# MNIST = 70,000 grayscale images of handwritten digits (0-9).
# Divided into 60k train, 10k test.
# Scaling pixel values to the 0-1 range helps gradient-based optimization converge.

In [8]:
# Flatten Layer

# Declare the input shape with an explicit keras.Input instead of passing
# `input_shape` to the first layer, which Keras 3 flags with a warning.
model = keras.Sequential([
    keras.Input(shape=(28, 28)),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(10, activation='softmax'),
])

# Flatten converts each 28 x 28 image to 784 element vector
# Dense(128) hidden layer with 128 neurons and ReLU activation
# Dense(10) output layer for 10 digits (softmax converts to probabilities)

In [10]:
# Compile and train

# Configure the optimizer, the loss, and the metric reported during training.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 10 epochs; the test split is passed as validation data, so it is
# scored at the end of every epoch (see the val_* values in the log).
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=10,
    validation_data=(X_test, y_test),
)

# Loss: sparse categorical cross-entropy, used when labels are integers

# Optimizer: Adam adjusts learning rates adaptively

# Epochs: Each full pass through training data



Epoch 1/10
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 927us/step - accuracy: 0.9265 - loss: 0.2558 - val_accuracy: 0.9587 - val_loss: 0.1365
Epoch 2/10
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 834us/step - accuracy: 0.9665 - loss: 0.1122 - val_accuracy: 0.9688 - val_loss: 0.1035
Epoch 3/10
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 905us/step - accuracy: 0.9764 - loss: 0.0775 - val_accuracy: 0.9750 - val_loss: 0.0817
Epoch 4/10
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 899us/step - accuracy: 0.9824 - loss: 0.0579 - val_accuracy: 0.9766 - val_loss: 0.0762
Epoch 5/10
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 830us/step - accuracy: 0.9863 - loss: 0.0441 - val_accuracy: 0.9725 - val_loss: 0.0850
Epoch 6/10
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 866us/step - accuracy: 0.9885 - loss: 0.0353 - val_accuracy: 0.9749 - val_loss: 0.0805
Epoc

In [12]:
# Evaluation

# evaluate() yields two values here, unpacked as loss followed by accuracy.
evaluation = model.evaluate(X_test, y_test)
test_loss, test_acc = evaluation
print(test_acc)

# Evaluates on unseen data

# Typically > 97% accuracy for this architecture

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 383us/step - accuracy: 0.9720 - loss: 0.1033
0.972000002861023


In [20]:
# Regularization (from regularization.ipynb)

# Regularization reduces overfitting by penalizing large weights

 # L2 Regularization

# Declare the 784-feature input with keras.Input rather than the deprecated
# `input_shape` argument on the first Dense layer.
model = keras.Sequential([
    keras.Input(shape=(784,)),
    layers.Dense(
        64,
        activation='relu',
        # L2 penalty with coefficient 0.001 on this layer's kernel.
        kernel_regularizer=keras.regularizers.l2(0.001),
    ),
    layers.Dense(10, activation='softmax'),
])

# Adds λ∑w² to the loss (here λ = 0.001)

# Encourages smaller weights, smoother decision boundaries.


In [22]:
# Dropout

# Declare the 784-feature input with keras.Input rather than the deprecated
# `input_shape` argument on the first Dense layer.
model = keras.Sequential([
    keras.Input(shape=(784,)),
    layers.Dense(64, activation='relu'),
    # Dropout rate of 0.5 applied to the hidden layer's outputs.
    layers.Dropout(0.5),
    layers.Dense(10, activation='softmax'),
])

# Randomly drops 50% of the previous layer's outputs on each training step
# Prevents co-dependence between neurons
# Applied only during training, not inference

In [25]:
 # Example Initialization

# He-normal initializer instance, handed to the layer as its kernel initializer.
initializer = keras.initializers.HeNormal()
layer = layers.Dense(
    units=64,
    activation='relu',
    kernel_initializer=initializer,
)

# Keeps gradient magnitude stable across layers to prevent vanishing/exploding behavior.

In [28]:
# Advanced training techniques

# From Geron_11_training_deep_neural_networks

# Learning Rate Scheduling

# Exponential decay schedule: start at 0.01 with a decay factor of 0.9 per
# 10,000 optimizer steps.
lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.01,
    decay_rate=0.9,
    decay_steps=10_000,
)

# SGD takes the schedule object in place of a fixed learning rate.
optimizer = keras.optimizers.SGD(learning_rate=lr_schedule)

# Decreases learning rate over time
# Prevents overshooting as model converges



In [31]:
# Early Stopping

# Early-stopping callback keyed on validation loss, with a patience of 5 and
# best-weight restoration enabled.
callback = keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    patience=5,
    monitor='val_loss',
)

# Stops training when validation loss hasn't improved for 5 consecutive epochs (patience).

# Restores the weights with the best validation score.

In [32]:
# Saving and Loading Models

# Write the current `model` to disk, then read it back from the same path.
model_path = 'model.h5'
model.save(model_path)
reloaded = keras.models.load_model(model_path)

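# Saves architecture + weights + optimizer state (optimizer state only if the model was compiled)

# HDF5 (.h5) is a legacy format for model persistence; recent Keras versions recommend the native .keras format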



