# Week 10 Machine Learning & Statistics Workalong

## Classification of Iris

### Package Imports

In [1]:
# For building neural networks.
import keras as kr

# For interacting with data sets.
import pandas as pd

# For encoding categorical variables.
import sklearn.preprocessing as pre

# For splitting into training and test sets.
import sklearn.model_selection as mod


Using TensorFlow backend.


## Load Data

In [2]:
# Load the iris data set from a URL.
df = pd.read_csv("https://raw.githubusercontent.com/ianmcloughlin/datasets/master/iris.csv")

In [3]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


## Inputs

In [4]:
# Separate the inputs from the rest of the variables.
inputs = df[['petal_length', 'petal_width', 'sepal_length', 'sepal_width']]

In [5]:
inputs

Unnamed: 0,petal_length,petal_width,sepal_length,sepal_width
0,1.4,0.2,5.1,3.5
1,1.4,0.2,4.9,3.0
2,1.3,0.2,4.7,3.2
3,1.5,0.2,4.6,3.1
4,1.4,0.2,5.0,3.6
5,1.7,0.4,5.4,3.9
6,1.4,0.3,4.6,3.4
7,1.5,0.2,5.0,3.4
8,1.4,0.2,4.4,2.9
9,1.5,0.1,4.9,3.1


## Encoded outputs
$$
\begin{align*}
  setosa      \rightarrow  [1,0,0] \\
  versicolor  \rightarrow  [0,1,0] \\
  virginica   \rightarrow  [0,0,1]
\end{align*}
$$

In [6]:
# Encode the classes as above.
encoder = pre.LabelBinarizer() # LabelBinarizer like OneHotEncoder
encoder.fit(df['class'])
outputs = encoder.transform(df['class'])

outputs

array([[1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0,

## Idea
The neural network will turn four floating point inputs into three "floating point" outputs.

$$ [5.1, 3.5, 1.4, 0.2] \rightarrow [0.8, 0.19, 0.01] $$
$$ [5.1, 3.5, 1.4, 0.2] \rightarrow [1, 0, 0] $$


## Build Model

In [7]:
# Start a neural network, building it by layers.
model = kr.models.Sequential()

# Add a hidden layer with x neurons and an input layer with 4.
model.add(kr.layers.Dense(units=30, activation='relu', input_dim=4))
# Add a three neuron output layer.
model.add(kr.layers.Dense(units=3, activation='softmax'))

# Build the graph.
model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])

## Split

In [8]:
# Split the inputs and outputs into training and test sets.
inputs_train, inputs_test, outputs_train, outputs_test = mod.train_test_split(inputs, outputs, test_size=0.2)

In [9]:
inputs_test.iloc[0]

petal_length    4.2
petal_width     1.3
sepal_length    5.6
sepal_width     2.7
Name: 94, dtype: float64

In [10]:
# Stupid prediction as the model has not yet been trained
model.predict(inputs_test.as_matrix()[0:1])

  """Entry point for launching an IPython kernel.


array([[0.09315881, 0.81978244, 0.0870588 ]], dtype=float32)

## Train

In [11]:
# Train the neural network.
model.fit(inputs_train, outputs_train, epochs=15, batch_size=10)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.callbacks.History at 0x26d3c1eceb8>

## Predict

In [12]:
model.predict(inputs_test.as_matrix()[0:1])

  """Entry point for launching an IPython kernel.


array([[0.08379745, 0.5316977 , 0.3845049 ]], dtype=float32)

In [13]:
# Have the network predict the classes of the test inputs.
predictions = model.predict(inputs_test)
predictions_labels = encoder.inverse_transform(predictions) # Unencode
predictions_labels

array(['versicolor', 'setosa', 'versicolor', 'virginica', 'setosa',
       'virginica', 'setosa', 'setosa', 'virginica', 'versicolor',
       'setosa', 'versicolor', 'virginica', 'virginica', 'versicolor',
       'virginica', 'setosa', 'versicolor', 'versicolor', 'setosa',
       'setosa', 'setosa', 'versicolor', 'virginica', 'versicolor',
       'versicolor', 'versicolor', 'setosa', 'setosa', 'setosa'],
      dtype='<U10')

## Evaluate

In [14]:
# Compare the predictions to the actual classes.
predictions_labels == encoder.inverse_transform(outputs_test)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True])

In [15]:
(predictions_labels == encoder.inverse_transform(outputs_test)).sum()

27

## Whitening

In [16]:
import sklearn.decomposition as dec

In [17]:
# Pre-component Analysis
pca = dec.PCA(n_components=4, whiten=True)
pca.fit(inputs_train)
# Whiten the dataset
inputs_train_white = pd.DataFrame(pca.transform(inputs_train), columns=inputs_train.columns)
inputs_train_white

Unnamed: 0,petal_length,petal_width,sepal_length,sepal_width
0,0.187146,1.445908,-0.128590,-1.811704
1,0.247343,1.054208,1.536132,-1.721308
2,0.577086,0.405157,0.162445,0.977871
3,-1.498508,1.013567,0.199452,-0.068258
4,-1.419334,0.158521,0.865250,-0.314005
5,-1.360898,-0.270391,-0.350761,0.278249
6,1.676656,-0.964249,-1.997965,-0.858986
7,-0.045082,1.563044,0.957627,-0.083311
8,0.903693,-0.849588,0.443797,3.378778
9,1.145723,-0.615288,1.814397,1.577036


In [18]:
# Start a neural network, building it by layers.
model = kr.models.Sequential()

# Add a hidden layer with x neurons and an input layer with 4.
model.add(kr.layers.Dense(units=30, activation='relu', input_dim=4))
# Add a three neuron output layer.
model.add(kr.layers.Dense(units=3, activation='softmax'))

# Build the graph.
model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])

In [19]:
# Train the neural network.
model.fit(inputs_train_white, outputs_train, epochs=15, batch_size=10)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.callbacks.History at 0x26d3d65eb00>

In [20]:
# Have the network predict the classes of the test inputs.
predictions = model.predict(pca.transform(inputs_test)) # Need to whiten training inputs
predictions_labels = encoder.inverse_transform(predictions)
predictions_labels

array(['versicolor', 'setosa', 'versicolor', 'versicolor', 'setosa',
       'virginica', 'setosa', 'setosa', 'virginica', 'virginica',
       'setosa', 'virginica', 'virginica', 'virginica', 'versicolor',
       'virginica', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'versicolor', 'virginica', 'versicolor', 'versicolor',
       'virginica', 'setosa', 'setosa', 'setosa'], dtype='<U10')

In [21]:
(predictions_labels == encoder.inverse_transform(outputs_test)).sum()

26

## End