# Feature Scaling

This notebook demonstrates the effect of feature scaling on the convergence of gradient descent.

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Use the z-score normalisation function to see how the spread of the different features changes.
# Scatter-plot the first feature against each of the next three, before and after normalisation.

from gradient_descent import zscore_normalize_features, gradient_descent
from house_price import generate_house_price_dataset

# Generate the dataset and z-score normalise its features. mu and sigma are
# returned alongside X_norm and are not used in this cell.
X, y, feature_names = generate_house_price_dataset(n_samples=1000)
X_norm, mu, sigma = zscore_normalize_features(X)

# 2x3 grid: the top row shows the raw features, the bottom row (red markers)
# shows the normalised ones. Each column plots feature 0 against one of
# features 1..3.
fig, axes = plt.subplots(2, 3, figsize=(12, 8))
panels = [
    (X, 'o', 'Before normalisation'),
    (X_norm, 'ro', 'After normalisation'),
]
for row_axes, (data, marker, panel_title) in zip(axes, panels):
    for ax, col in zip(row_axes, range(1, 4)):
        ax.plot(data[:, 0], data[:, col], marker)
        ax.set_xlabel(feature_names[0])
        ax.set_ylabel(feature_names[col])
        # Both rows share the same axis labels, so the title is what tells
        # the reader which row is raw and which is normalised.
        ax.set_title(panel_title)

# Keep titles and axis labels of neighbouring panels from overlapping.
fig.tight_layout()
plt.show()



In [None]:
# Let's see how gradient descent converges without feature scaling.
# The learning rate alpha has been chosen so that it does not cause gradient
# descent to diverge. See the example in learning_rate.ipynb.
# If you see "overflow" warnings, gradient descent has diverged to the extent
# that the cost is becoming very large.

alpha = 1e-2
X, y, feature_names = generate_house_price_dataset(n_samples=1000)

# Start from all-zero weights (one per column of X) and a zero bias.
n_features = X.shape[1]
w_in, b_in = np.zeros(n_features), 0

# Run on the raw, unscaled X; the last two return values are not needed here.
w, b, cost_history, _, _ = gradient_descent(X, y, w_in, b_in, alpha, 1000)

# Plot the recorded cost history.
fig, ax = plt.subplots()
ax.plot(cost_history)
ax.set_xlabel('Iteration')
ax.set_ylabel('Cost')
ax.set_title(f'Cost vs Iteration for alpha = {alpha}')
plt.show()

In [None]:
# Now do the same with feature scaling and compare how quickly the cost converges.
# This reuses the learning rate `alpha` set in the previous (unscaled) cell, so
# the two runs differ only in whether the features were normalised.
X, y, feature_names = generate_house_price_dataset(n_samples=1000)
X_norm, mu, sigma = zscore_normalize_features(X)
w_in = np.zeros(X.shape[1])
b_in = 0
# Same call as the unscaled run, but on the z-score normalised features.
w, b, cost_history, _, _ = gradient_descent(X_norm, y, w_in, b_in, alpha, 1000)
plt.plot(cost_history)
plt.xlabel('Iteration')
plt.ylabel('Cost')
# Title the figure so this cost curve can be told apart from the unscaled one.
plt.title(f'Cost vs Iteration for alpha = {alpha} (with feature scaling)')
plt.show()