In [3]:
from keras.datasets import boston_housing

Using TensorFlow backend.


In [4]:
# Load the Boston housing dataset. The loader's result is unpacked into a
# (data, targets) pair for the training split and another for the test split.
(train_data, train_targets), (test_data, test_targets) = boston_housing.load_data()
# Show the raw training data so it can be compared with the normalized
# version printed further below.
print(train_data)
# "you subtract the mean of the feature and divide by the standard deviation, so that the feature is centered around 0 and has a unit standard deviation"
# Called z score.

# >>> a = np.array([[1, 2], [3, 4]])
# >>> np.mean(a, axis=0)   -> Calculates the mean of each column (along axis 0).
# array([2., 3.])          -> (1+3)/2 = 2 and (2+4)/2 = 3
# >>> np.std(a, axis=0)    -> Calculates the standard deviation of each column (along axis 0).
# array([1.,  1.])         -> sqrt(((1-2)^2 + (3-2)^2) / 2) = 1 and sqrt(((2-3)^2 + (4-3)^2) / 2) = 1
#
# We should normalize our data, otherwise it is going to be hard for the network to learn,
# since some values of the input layer are much larger than others and it's therefore harder for the model to adjust the weights to keep the wide spread in "mind".
# Here we use the standard score (z-score): https://en.wikipedia.org/wiki/Standard_score
# To achieve this we need to reach goals 1. and 2. (see further below).

# Goal 1: an "expected value" of 0 for every column.
# https://en.wikipedia.org/wiki/Expected_value
# Subtracting the per-column mean shifts the centre of the input deviation
# graph (the distribution of the input values) onto x = 0 for all data arrays.
mean = train_data.mean(axis=0)
train_data = train_data - mean

# Goal 2: a "variance" of 1 for every column.
# https://en.wikipedia.org/wiki/Variance
# Dividing by the per-column standard deviation normalizes the X axis of the
# input deviation graph, so that it reads ...-2σ -1σ 0σ 1σ 2σ...
# (σ = standard deviation) for all data arrays.
# Note: std is computed on the already-centred training data.
std = train_data.std(axis=0)
train_data = train_data / std

# The test data is shifted and scaled with the mean/std computed from the
# training data above; no statistics are derived from the test data itself.
test_data = (test_data - mean) / std

print(train_data)
print(train_data.std(axis=0))  # every entry is 1, i.e. goal 2 is reached

# Basically, in the end every data array uses the same input scale: it is an array of values x, where each old value can be recovered as x*σ + mean. (σ and mean are different for each axis.)
# "normalization helps the backpropagation algorithm converge faster"
# For comparison, in the MNIST example:
# the Y axis of the input deviation graph is the count,
# the X axis of the input deviation graph is the gray-scale value.

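# Q: Can't we just map all axes into a range of [-1, +1] (min-max method)?
# A: Yes, but the min-max method is better suited to non-bell-shaped distributions with known, fixed bounds, like MNIST (gray-scale pixel values always lie within 0..255). With min-max scaling, a single extreme outlier in a feature squashes all its other values into a narrow part of the range; the z-score is less sensitive to that and does not need fixed bounds.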

[[1.23247e+00 0.00000e+00 8.14000e+00 ... 2.10000e+01 3.96900e+02
  1.87200e+01]
 [2.17700e-02 8.25000e+01 2.03000e+00 ... 1.47000e+01 3.95380e+02
  3.11000e+00]
 [4.89822e+00 0.00000e+00 1.81000e+01 ... 2.02000e+01 3.75520e+02
  3.26000e+00]
 ...
 [3.46600e-02 3.50000e+01 6.06000e+00 ... 1.69000e+01 3.62250e+02
  7.83000e+00]
 [2.14918e+00 0.00000e+00 1.95800e+01 ... 1.47000e+01 2.61950e+02
  1.57900e+01]
 [1.43900e-02 6.00000e+01 2.93000e+00 ... 1.56000e+01 3.76700e+02
  4.38000e+00]]
[[-0.27224633 -0.48361547 -0.43576161 ...  1.14850044  0.44807713
   0.8252202 ]
 [-0.40342651  2.99178419 -1.33391162 ... -1.71818909  0.43190599
  -1.32920239]
 [ 0.1249402  -0.48361547  1.0283258  ...  0.78447637  0.22061726
  -1.30850006]
 ...
 [-0.40202987  0.99079651 -0.7415148  ... -0.71712291  0.07943894
  -0.67776904]
 [-0.17292018 -0.48361547  1.24588095 ... -1.71818909 -0.98764362
   0.42083466]
 [-0.40422614  2.04394792 -1.20161456 ... -1.30866202  0.23317118
  -1.15392266]]
[1. 1. 1. 1. 1. 