In [26]:
import numpy as np
import matplotlib.pyplot as plt

In [27]:
# Seed NumPy's global random number generator with a fixed value
np.random.seed(138)

In [28]:
## Helper function for visualization
def plot_decision_boundary(model, X, y, h=0.01):
    '''
    Plot the decision regions of a model trained on 2-dimensional data, as
    well as the scatter plot of the data X, using the target values in y for
    the colors.
    ----------------------
    Input
            - model : Scikit-learn style estimator, already trained, must have
                    a .predict method that takes an (n_points, 2) array and
                    returns one numeric label per row
            - X : Data for scatter plot, array-like with exactly 2 features.
            - y : Target classes for scatter plot colors
            - h : Distance between neighbouring grid points (default 0.01).
                    A larger value evaluates the model on fewer points.
    Raises
            - ValueError : if X is not of shape (n_samples, 2) or if h <= 0
    '''
    # Accept any array-like (e.g. a DataFrame) by working on a NumPy view/copy
    X = np.asarray(X)
    if X.ndim != 2 or X.shape[1] != 2:
        raise ValueError(
            "X must have shape (n_samples, 2), got {}".format(X.shape))
    if h <= 0:
        raise ValueError("h must be strictly positive, got {}".format(h))

    # Set min and max values and give it some padding
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    # Generate a grid of points with distance h between them
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, h),
                           np.arange(x2_min, x2_max, h))
    # Predict the class of every grid point, then fold the flat predictions
    # back into the shape of the grid so they can be drawn as filled contours
    X_grid = np.c_[xx1.ravel(), xx2.ravel()]
    Z = np.asarray(model.predict(X_grid)).reshape(xx1.shape)
    # Plot the contour and training examples
    plt.contourf(xx1, xx2, Z, cmap=plt.cm.Spectral)
    plt.ylabel('x2')
    plt.xlabel('x1')
    plt.scatter(X[:, 0], X[:, 1], c=y, edgecolor='k', cmap=plt.cm.Spectral)
    plt.show()

# Classification example

We will first go through a quick classification example using the Scikit-learn library. In this example, we generate three different "blobs" of data points in a 2D plane.

In [29]:
from sklearn.datasets import make_blobs

In [None]:
# Generate 500 samples of data points, with 2 features and 3 different blobs of points
X, y = make_blobs(n_samples=500, n_features=2, centers=3, random_state=42)

# Plot the generated dataset, one marker colour per class label
for label, fmt in enumerate(("ro", "yo", "bo")):
    plt.plot(X[y == label, 0], X[y == label, 1], fmt)
plt.xlabel("x1")
plt.ylabel("x2")
plt.show()

## Normalizing the data

Depending on the optimization technique used by an ML algorithm, you may need to normalize the data to help the model converge during its optimization/fitting/learning process. This is not necessary for all ML algorithms, but if you are going to test a bunch of different ML models, it is better to do it straight away.

In [31]:
from sklearn.preprocessing import StandardScaler

In [None]:
normalizer = StandardScaler()
# fit_transform finds the mean and std of X and then normalizes X with them.
# X is overwritten, so from here on X holds the normalized data.
X = normalizer.fit_transform(X)

# Plot the normalized dataset, one marker colour per class label
for label, fmt in enumerate(("ro", "yo", "bo")):
    plt.plot(X[y == label, 0], X[y == label, 1], fmt)
plt.xlabel("x1")
plt.ylabel("x2")
plt.show()

In [None]:
# You can perform the inverse transform later if you want like this.
# NOTE: X is overwritten, so run this cell once; running it again would apply
# the inverse transform a second time.
X = normalizer.inverse_transform(X)

# Plot the de-normalized dataset, one marker colour per class label
for label, fmt in enumerate(("ro", "yo", "bo")):
    plt.plot(X[y == label, 0], X[y == label, 1], fmt)
plt.xlabel("x1")
plt.ylabel("x2")
plt.show()

In [None]:
# And transform new data with the previously found values for the mean and std
# (X itself plays the role of the "new data" here).
# NOTE: X is overwritten, so run this cell once; running it again would
# normalize data that is already normalized.
X = normalizer.transform(X)

# Plot the re-normalized dataset, one marker colour per class label
for label, fmt in enumerate(("ro", "yo", "bo")):
    plt.plot(X[y == label, 0], X[y == label, 1], fmt)
plt.xlabel("x1")
plt.ylabel("x2")
plt.show()

## Decision Tree

To demonstrate how this works, we train a decision tree model to show how it classifies the data points and how it creates the boundaries. Run this code a couple of times and observe how the boundaries change.

In [None]:
from sklearn import tree
from sklearn.metrics import accuracy_score


# Decision tree classifier whose depth is capped at 3
dt = tree.DecisionTreeClassifier(max_depth = 3)
dt.fit(X, y)
# Predictions are made on the same X used for fitting, so this is the training accuracy
y_pred = dt.predict(X)
acc = accuracy_score(y, y_pred)
print("Accuracy:", acc)

# Open a new figure, then draw the tree's decision regions behind the data points
plt.figure()
plot_decision_boundary(dt, X, y)

# Neural networks

The previous example was a simple problem where each class could be linearly separated. We will now look at a different problem where it is no longer possible to linearly separate the classes. The next dataset consists of 500 data points in a 2D plane, belonging to 2 different classes.

In [None]:
from sklearn.datasets import make_moons

In [None]:
# Two interleaving half-moon classes, 500 points in total, with some noise added
X, y = make_moons(n_samples=500, noise=0.1, random_state=42)

# Plot the dataset, one marker colour per class label
for label, fmt in enumerate(("ro", "bo")):
    plt.plot(X[y == label, 0], X[y == label, 1], fmt)
plt.xlabel("x1")
plt.ylabel("x2")
plt.show()

When training a neural network, it is important to normalize the data. If the data is not normalized, the training process can have difficulty converging to a good solution when using gradient descent to optimize the model parameters.

In [None]:
normalizer = StandardScaler()
# X is overwritten: from here on it holds the normalized data
X = normalizer.fit_transform(X)

# Plot the normalized dataset, one marker colour per class label
for label, fmt in enumerate(("ro", "bo")):
    plt.plot(X[y == label, 0], X[y == label, 1], fmt)
plt.xlabel("x1")
plt.ylabel("x2")
plt.show()

Let's train a first neural network. We will use the MLPClassifier neural network from Scikit-learn. This model is a multi-layer perceptron where each perceptron/neuron from a specific layer is connected to all neurons of the next layer.

To modify the neural network architecture, we can modify the parameter "hidden_layer_sizes":

- hidden_layer_sizes : tuple, length = n_layers - 2, default=(100,)
    The ith element represents the number of neurons in the ith
    hidden layer.
    
We can also choose the activation function applied to the neurons inside the hidden layers:
- activation : {'identity', 'logistic', 'tanh', 'relu'}, default='relu'
    Activation function for the hidden layer.

    - 'identity', no-op activation, useful to implement linear bottleneck,
      returns f(x) = x

    - 'logistic', the logistic sigmoid function,
      returns f(x) = 1 / (1 + exp(-x)).

    - 'tanh', the hyperbolic tan function,
      returns f(x) = tanh(x).

    - 'relu', the rectified linear unit function,
      returns f(x) = max(0, x)

In [None]:
from sklearn.neural_network import MLPClassifier

First, we use the architecture of the model from question 1. Only 2 neurons in one hidden layer, using the identity activation function (the neurons perform a linear transformation of their inputs).

In [None]:
# One hidden layer with 2 neurons and no non-linearity.
# Note the trailing comma: (2,) is a one-element tuple, whereas (2) is just the integer 2.
net = MLPClassifier(hidden_layer_sizes=(2,), activation="identity")
net.fit(X, y)

# Predictions are made on the same X used for fitting, so this is the training accuracy
y_pred = net.predict(X)
acc = accuracy_score(y, y_pred)
print("Accuracy:", acc)

# Open a new figure, then draw the network's decision regions behind the data points
plt.figure()
plot_decision_boundary(net, X, y)

The separation of the two classes is linear as we can see. Next, we use a more complex architecture and the hyperbolic tangent activation function.

In [None]:
# Two hidden layers of 100 neurons each, with tanh activations
net = MLPClassifier(hidden_layer_sizes=(100, 100), activation="tanh")
net.fit(X, y)

# Predictions are made on the same X used for fitting, so this is the training accuracy
y_pred = net.predict(X)
acc = accuracy_score(y, y_pred)
print("Accuracy:", acc)

# Open a new figure, then draw the network's decision regions behind the data points
plt.figure()
plot_decision_boundary(net, X, y)

# 1) Experiment with different architectures and activation functions

Try different numbers of hidden layers and neurons in each layer by yourself. Also vary the type of activation function used. Try to estimate whether you are underfitting or overfitting in each case.

If any warning messages appear, try to understand their cause and how to solve the issue.

# 2) Using a validation dataset to optimize the architecture

Separate your dataset into a training, a validation and a test set, then use the validation set to optimize the architecture. Try to pay attention to the computational cost of your models as well.

Then use the test set to evaluate the final architecture you have selected.

In [None]:
# Larger and noisier version of the two-moons dataset: 1500 points
X, y = make_moons(n_samples=1500, noise=0.3, random_state=42)

# Plot the dataset, one marker colour per class label
for label, fmt in enumerate(("ro", "bo")):
    plt.plot(X[y == label, 0], X[y == label, 1], fmt)
plt.xlabel("x1")
plt.ylabel("x2")
plt.show()

### Separate the dataset into a training, a validation and a test set

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 1000 samples; whatever remains becomes the training set
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=1000, random_state=42)
# Split the 1000 held-out samples into 500 validation and 500 test samples
X_val, X_test, y_val, y_test = train_test_split(X_holdout, y_holdout, test_size=500, random_state=42)

print('# training samples:', len(X_train))
print('# validation samples:', len(X_val))
print('# test samples:', len(X_test))

### Normalize the data using the statistics of the training set.

In [None]:
normalizer = StandardScaler()
# Fit the scaler on the training set only, then normalize the training set with it
X_train = normalizer.fit_transform(X_train)
# Validation and test sets are normalized with the scaler fitted on the training set.
# NOTE: all three arrays are overwritten in place, so run this cell only once.
X_val = normalizer.transform(X_val)
X_test = normalizer.transform(X_test)

### Architecture search

Try out a couple of architectures to see which one performs best.

Why do we use the validation set here instead of the test set?

### Test accuracy

Write some code here to assess the accuracy of your best model from the previous section. Plot the decision boundaries; what do they look like?

## Additional info: TensorFlow

For neural networks there are also other packages, such as PyTorch or TensorFlow, that allow you to go more in depth with the type of architectures you want to use.
We are going to give some example code here that you can play around with.

In [1]:
# No `%tensorflow_version` magic is needed: it is specific to Colab, and Colab
# only includes TensorFlow 2.x, so the magic has no effect there.
import tensorflow as tf
import pandas as pd

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


In [2]:
from sklearn import datasets
# return_X_y=True returns (features, target) instead of a Bunch object;
# as_frame=True returns them as pandas objects rather than NumPy arrays
X, y = datasets.load_iris(return_X_y=True, as_frame=True)

In [None]:
# Display the iris feature table (the last expression of a cell is rendered)
X

In [None]:
# Display the iris target labels
y

In [3]:
from sklearn.preprocessing import LabelEncoder
# Map the class labels to consecutive integer codes
y_enc = LabelEncoder().fit_transform(y)
# Converting the label into a matrix form (one column per class)
y_label = tf.keras.utils.to_categorical(y_enc)

In [None]:
# Display the label matrix produced by to_categorical
y_label

In [5]:
from sklearn.model_selection import train_test_split
# Hold out 50 samples; whatever remains becomes the training set
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y_label, test_size=50, random_state=138)
# Split the 50 held-out samples into 25 validation and 25 test samples
X_val, X_test, y_val, y_test = train_test_split(X_holdout, y_holdout, test_size=25, random_state=138)

In [7]:
from sklearn.preprocessing import StandardScaler
normalizer = StandardScaler()
# Fit the scaler on the training set only, then normalize the training set with it
X_train = normalizer.fit_transform(X_train)
# Validation and test sets are normalized with the scaler fitted on the training set.
# NOTE: all three variables are overwritten, so run this cell only once.
X_val = normalizer.transform(X_val)
X_test = normalizer.transform(X_test)

Note that there are a lot of hyperparameters and settings here. What do they do?

Try varying the number of layers and the number of nodes per layer to change the performance of the model.

In [None]:
from tensorflow import keras
# Fully connected network: input -> 100 -> dropout -> 50 -> 3 class probabilities.
# The input shape is declared with an explicit keras.Input instead of passing
# input_dim to the first Dense layer.
NNmodel = keras.Sequential([
    keras.Input(shape=(X_train.shape[1],)),  # one input per feature column
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dropout(0.2), #what does this do
    keras.layers.Dense(50, activation='relu'),
    keras.layers.Dense(3, activation='softmax'),
])


# Categorical cross-entropy matches the one-hot label matrix built with to_categorical
NNmodel.compile(optimizer='adam',
                loss=keras.losses.CategoricalCrossentropy(),
                metrics=['accuracy'])
# The validation data is evaluated after each epoch; the per-epoch values are kept in `history`
history = NNmodel.fit(X_train,y_train, epochs=50, batch_size=10, validation_data=(X_val,y_val), verbose=2)

In [None]:
# One curve per recorded quantity in history.history, drawn on a single Axes
ax = pd.DataFrame(history.history).plot(figsize=(10,6))
ax.grid(True)
# Restrict the vertical axis to [0, 1]
ax.set_ylim(0, 1)
plt.show()

In [None]:
# Evaluate on the held-out test split; with the compile settings above this
# returns the loss followed by the accuracy metric
loss, accuracy = NNmodel.evaluate(X_test, y_test, verbose=1)
print('Test loss:', loss)
print('Test accuracy:', accuracy)

When testing out this model, you will find that its performance is different on every run. What is going on here?

Tensorflow can also be used for convolutional NN and other fun models such as LSTM.