In [11]:
# Problem: internal covariate shift
# Solution: address this problem by normalizing layer inputs
# This method draws its strength from making normalization a part of the model
# architecture and performing the normalization for each training mini-batch.
# BN allows us to use much higher learning rates and be less careful about initialization.
# It also acts as a regularizer, in some cases eliminating the need for Dropout.

In [1]:
# Worked example: apply batch normalization by hand to a single 1-D
# mini-batch (size=8 for clarity), printing every intermediate quantity.
import numpy as np

# NOTE: nothing in this cell draws random numbers (the mini-batch below is
# hard-coded), so the seed does not influence any value printed here.
np.random.seed(42)
mini_batch = np.array([2.1, 4.3, -1.2, 0.8, 3.5, -0.3, 1.7, 2.9])
print("Original mini-batch:", mini_batch)
print()

# Step 1: Compute mean and variance of the batch.
# np.var uses ddof=0 by default, i.e. it divides by the batch size m.
mean = np.mean(mini_batch)
variance = np.var(mini_batch)
print("Step 1 - Compute Statistics:")
print(f"  Mean (μ): {mean:.3f}")
print(f"  Variance (σ²): {variance:.3f}")
print()

# Step 2: Normalize to (approximately) zero mean and unit variance.
# epsilon keeps the denominator strictly positive even when every element of
# the batch is identical (variance == 0).
epsilon = 1e-8
normalized = (mini_batch - mean) / np.sqrt(variance + epsilon)
print("Step 2 - Normalize:")
print("  Normalized values:", np.round(normalized, 3))
print(f"  Check - Mean of normalized: {np.mean(normalized):.3f}")
print(f"  Check - Variance of normalized: {np.var(normalized):.3f}")
print()

# Enforce the checks instead of relying on eyeballing the rounded printout.
# The variance of `normalized` equals variance / (variance + epsilon), which is
# marginally below 1, hence the explicit absolute tolerance.
np.testing.assert_allclose(np.mean(normalized), 0.0, atol=1e-9)
np.testing.assert_allclose(np.var(normalized), 1.0, atol=1e-6)

# Step 3: Scale and shift.
# gamma and beta stand in for the learned parameters; with gamma=1 and beta=0
# this step would return `normalized` unchanged.
gamma = 1.2  # Learned parameter (scale)
beta = 0.5   # Learned parameter (shift)
output = gamma * normalized + beta
print("Step 3 - Scale (γ) and Shift (β):")
print(f"  γ = {gamma}, β = {beta}")
print("  Final output:", np.round(output, 3))

Original mini-batch: [ 2.1  4.3 -1.2  0.8  3.5 -0.3  1.7  2.9]

Step 1 - Compute Statistics:
  Mean (μ): 1.725
  Variance (σ²): 3.102

Step 2 - Normalize:
  Normalized values: [ 0.213  1.462 -1.661 -0.525  1.008 -1.15  -0.014  0.667]
  Check - Mean of normalized: 0.000
  Check - Variance of normalized: 1.000

Step 3 - Scale (γ) and Shift (β):
  γ = 1.2, β = 0.5
  Final output: [ 0.756  2.254 -1.493 -0.13   1.709 -0.88   0.483  1.301]
