# Build your own neural network

Now it is all up to you to build your own neural network from scratch and apply a hyperparameter search.

## The data

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Read 'cars.csv' (path is relative to the current working directory) into a DataFrame.
df = pd.read_csv('cars.csv')
# Preview the first rows; as the last expression of the cell it is shown as the cell output.
df.head()

In [None]:
# Replace every missing value with 0 so the frame contains no NaNs.
df = df.fillna(value=0)

# Target: the price column, kept two-dimensional (a single-column DataFrame).
# Features: every remaining column.
target_columns = ['price']
y = df[target_columns]
X = df.drop(columns=target_columns)

In [None]:
# One-hot encode every object-dtype column of X: the column is replaced by its
# dummy columns (first level dropped to avoid a redundant indicator).
# X.columns is evaluated once, before the loop, so reassigning X inside the
# loop does not affect which columns are visited.
for column in X.columns:
    # Compare against the builtin `object`: the `np.object` alias was deprecated
    # in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
    if X[column].dtype == object:
        print('Converting ', column)
        dummies = pd.get_dummies(X[column], prefix=column, drop_first=True)
        X = pd.concat([X, dummies], axis=1).drop([column], axis=1)

## Create the model

Creating train and test set and standardising:

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import StandardScaler

# Hold out 30% of the rows as a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

# Fit the scalers on the *raw* training split only and reuse them for the test
# split. Fitting a new scaler on the already standardised training data (mean
# ~0, std ~1) would leave the test data effectively unscaled.
x_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)

X_train = x_scaler.transform(X_train)
y_train = y_scaler.transform(y_train)

X_test = x_scaler.transform(X_test)
y_test = y_scaler.transform(y_test)

### Baseline models: linear regression and random forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

# Baselines to compare the neural network against: an ordinary linear
# regression and a 100-tree random forest.
lr = LinearRegression()
rf = RandomForestRegressor(n_estimators=100)

# Each baseline is fitted on the training split (target flattened to 1-D with
# np.ravel) and scored by RMSE on the test split.
for label, estimator in (('RMSE linear regression:', lr),
                         ('RMSE random forest:', rf)):
    estimator.fit(X_train, np.ravel(y_train))
    prediction = estimator.predict(X_test)
    print(label, np.sqrt(mse(y_test, prediction)))

### Create your neural network

Incorporate the following:
- 2 different activation functions ("kernels") in the hidden layers.
- 2 different activation functions for the output layer ("output kernels").
- Different sizes for the hidden layers.
- Different number of hidden layers.
- Use 10 epochs.

Here, create your model compatible with Keras:

In [None]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from tensorflow.keras.optimizers import Adam

def nn_model(no_neurons,no_layers,kernel,output_kernel):
    """Build and compile a fully connected Keras regression network.

    The network always has one hidden layer; ``no_layers`` is the number of
    *extra* hidden layers stacked on top of it. The input and output widths
    are read from the module-level globals ``input_dim`` and ``output_dim``,
    which must be defined before this function is called.

    Args:
        no_neurons: Number of units in every hidden layer.
        no_layers: Number of additional hidden layers (0 gives one hidden layer).
        kernel: Activation applied after every hidden layer.
        output_kernel: Activation applied after the output layer.

    Returns:
        A Sequential model compiled with the Adam optimizer and a mean squared
        error loss (also tracked as a metric).
    """
    # First hidden layer; it also declares the input width.
    layers = [Dense(no_neurons,input_dim=input_dim), Activation(kernel)]

    # Extra hidden layers
    for _ in range(no_layers):
        layers.append(Dense(no_neurons))
        layers.append(Activation(kernel))

    # Output
    layers.append(Dense(output_dim))
    layers.append(Activation(output_kernel))

    model = Sequential()
    for layer in layers:
        model.add(layer)

    model.compile(optimizer=Adam(),loss='mean_squared_error',metrics=['mean_squared_error'])
    return model

Here, run your grid search (10 epochs):

In [None]:
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import GridSearchCV

# Network dimensions; nn_model reads these as module-level globals.
input_dim = X.shape[1]
output_dim = 1

# Number of cross-validation folds; reused below to collect the per-fold scores.
n_folds = 5

parameters = {'no_neurons':[50,100],'kernel':['relu','linear'],'output_kernel':['linear','sigmoid'],'no_layers':[1,2],'verbose':[0]}

grid_search = GridSearchCV(KerasRegressor(nn_model), parameters, cv=n_folds,scoring='neg_mean_squared_error')
# NOTE(review): the search is fitted on the unscaled X / y, whereas the baseline
# models were trained on the standardised X_train / y_train, so the RMSE values
# are presumably not directly comparable - confirm this is intended.
grid_search.fit(X, y,epochs=10)

# The scorer returns the negated MSE of each fold. Convert every fold score to
# an RMSE first, so that both the mean and the standard deviation are in RMSE
# units (the square root of the std of the MSE is not the std of the RMSE).
fold_rmse = np.sqrt(-1 * np.array(
    [grid_search.cv_results_['split%d_test_score' % fold] for fold in range(n_folds)]))
means = fold_rmse.mean(axis=0)
stds = fold_rmse.std(axis=0)

print('Mean RMSE (+/- standard deviation), for parameters')
for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
    print("%0.3f (+/- %0.03f) for %r"
          % (mean, std, params))

It seems that there is quite some difference between using the sigmoid and the linear output kernel. The former gives a lower RMSE. The better results are obtained using ReLU as the activation function in the hidden layers.

Smaller hidden layers:

In [None]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

# Price prediction is a regression task, so wrap the model in KerasRegressor
# (KerasClassifier would treat the continuous target as class labels).
from keras.wrappers.scikit_learn import KerasRegressor

# Network dimensions; nn_model reads these as module-level globals.
input_dim = X.shape[1]
output_dim = 1

# Number of cross-validation folds; reused below to collect the per-fold scores.
n_folds = 5

parameters = {'no_neurons':[10,20],'kernel':['relu','linear'],'output_kernel':['linear','sigmoid'],'no_layers':[0,1],'verbose':[0]}

grid_search = GridSearchCV(KerasRegressor(nn_model), parameters, cv=n_folds,scoring='neg_mean_squared_error')
# NOTE(review): the search is fitted on the unscaled X / y, whereas the baseline
# models were trained on the standardised X_train / y_train, so the RMSE values
# are presumably not directly comparable - confirm this is intended.
grid_search.fit(X, y,epochs=10)

# The scorer returns the negated MSE of each fold. Convert every fold score to
# an RMSE first, so that both the mean and the standard deviation are in RMSE
# units (the square root of the std of the MSE is not the std of the RMSE).
fold_rmse = np.sqrt(-1 * np.array(
    [grid_search.cv_results_['split%d_test_score' % fold] for fold in range(n_folds)]))
means = fold_rmse.mean(axis=0)
stds = fold_rmse.std(axis=0)

print('Mean RMSE (+/- standard deviation), for parameters')
for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
    print("%0.3f (+/- %0.03f) for %r"
          % (mean, std, params))

We clearly have a higher error (RMSE), meaning that a bigger network pays off in this case.

Let's try more epochs for the smaller hidden layers:

In [None]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

# Price prediction is a regression task, so wrap the model in KerasRegressor
# (KerasClassifier would treat the continuous target as class labels).
from keras.wrappers.scikit_learn import KerasRegressor

# Network dimensions; nn_model reads these as module-level globals.
input_dim = X.shape[1]
output_dim = 1

# Number of cross-validation folds; reused below to collect the per-fold scores.
n_folds = 5

parameters = {'no_neurons':[10,20],'kernel':['relu','linear'],'output_kernel':['linear','sigmoid'],'no_layers':[0,1],'verbose':[0]}

grid_search = GridSearchCV(KerasRegressor(nn_model), parameters, cv=n_folds,scoring='neg_mean_squared_error')
# Same small-network grid as the 10-epoch search, but trained for 100 epochs.
# NOTE(review): the search is fitted on the unscaled X / y, whereas the baseline
# models were trained on the standardised X_train / y_train, so the RMSE values
# are presumably not directly comparable - confirm this is intended.
grid_search.fit(X, y,epochs=100)

# The scorer returns the negated MSE of each fold. Convert every fold score to
# an RMSE first, so that both the mean and the standard deviation are in RMSE
# units (the square root of the std of the MSE is not the std of the RMSE).
fold_rmse = np.sqrt(-1 * np.array(
    [grid_search.cv_results_['split%d_test_score' % fold] for fold in range(n_folds)]))
means = fold_rmse.mean(axis=0)
stds = fold_rmse.std(axis=0)

print('Mean RMSE (+/- standard deviation), for parameters')
for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
    print("%0.3f (+/- %0.03f) for %r"
          % (mean, std, params))

No difference here except for the ReLU-kernel-based ones. We might be overfitting in the other cases.