In [1]:
# Here is a demonstration of the Softmax function: suppose we had the following output
# vector from a multi-ouput neural network (each value represents the output of the 
# sigmoid activation function for the output node).
import math

z = [.10, .20, .30, .40, .10, .20, .30]
z_exp = [math.exp(i) for i in z]
print([round(i, 2) for i in z_exp])


[1.11, 1.22, 1.35, 1.49, 1.11, 1.22, 1.35]


In [2]:
sum_z_exp = sum(z_exp)
print(round(sum_z_exp, 2))

8.84


In [3]:
softmax = [round(i / sum_z_exp, 3) for i in z_exp]
print(softmax)

[0.125, 0.138, 0.153, 0.169, 0.125, 0.138, 0.153]


In [5]:
# Here the softmax function would pick the output with value 0.4. 

In [6]:
# Here we build a neural network to do regression. First we recall the linear regression model ...

In [1]:
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [2]:
diabetes = datasets.load_diabetes()
train_x, test_x, train_y, test_y = train_test_split(diabetes.data, diabetes.target,
                                                        test_size=0.25)

In [10]:
regr = linear_model.LinearRegression()
regr.fit(train_x, train_y)
diabetes_y_pred = regr.predict(test_x)

In [11]:
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(test_y, diabetes_y_pred))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(test_y, diabetes_y_pred))

Mean squared error: 2578.07
Variance score: 0.50


In [3]:
# Now let's build a MLP regressor ...
from sklearn.neural_network import MLPRegressor

In [16]:
MLPregr = MLPRegressor(hidden_layer_sizes=(10,20,10), max_iter=100000)

In [17]:
MLPregr.fit(train_x,train_y)

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10, 20, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=100000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [19]:
predictions = MLPregr.predict(test_x)

In [20]:
predictions[0:2]

array([ 100.66072266,  127.15233723])

In [24]:
# this is the MSE for the model
sum((predictions - test_y)**2)/len(predictions)

2607.2969735016177

In [25]:
# Comparable to Linear regression. Let's see if scaling the data helps ...
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [27]:
scaler.fit(train_x)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [28]:
train_x = scaler.transform(train_x)
test_x = scaler.transform(test_x)

In [29]:
MLPregr.fit(train_x,train_y)

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10, 20, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=100000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [30]:
predictions = MLPregr.predict(test_x)

In [31]:
sum((predictions - test_y)**2)/len(predictions)

2760.7787921044674

In [32]:
# No help. Let's use a more complicated model ...
MLPregr = MLPRegressor(hidden_layer_sizes=(20,100,20), max_iter=100000)

In [33]:
MLPregr.fit(train_x,train_y)

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(20, 100, 20), learning_rate='constant',
       learning_rate_init=0.001, max_iter=100000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [34]:
predictions = MLPregr.predict(test_x)

In [35]:
sum((predictions - test_y)**2)/len(predictions)

5259.994382830796

In [4]:
# Worse - we have overfit the model! Let's go simpler
MLPregr = MLPRegressor(hidden_layer_sizes=(5,), max_iter=100000)

In [37]:
MLPregr.fit(train_x,train_y)

MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=100000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [38]:
predictions = MLPregr.predict(test_x)

In [39]:
sum((predictions - test_y)**2)/len(predictions)

2689.0875235466269

In [40]:
# Still no better than Linear regression.

In [5]:
# here we do a 5-fold CV using the Neural Network - should give a better understanding of the 
# true MSE for this model.
from sklearn.model_selection import KFold

In [6]:
kf = KFold(n_splits=5)

In [7]:
kf.get_n_splits(diabetes.data)

5

In [9]:
# Now we compute the models and average the MSEs:
MSE = 0.0
for train_index, test_index in kf.split(diabetes.data):
    MLPregr = MLPRegressor(hidden_layer_sizes=(10,20,10), max_iter=100000)
    MLPregr.fit(diabetes.data[train_index],diabetes.target[train_index])
    predictions = MLPregr.predict(diabetes.data[test_index])
    MSE += sum((predictions - diabetes.target[test_index])**2)/len(predictions)
    
print(MSE/5.0)
    

2988.59725992
