In [1]:
import os

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import Adam

# Single seed shared by every stochastic step in this notebook.
SEED = 2017
# Seed NumPy's global RNG as well, so anything drawing from it is repeatable
# across "Restart & Run All".
# NOTE(review): the Keras backend may need its own seed for fully
# deterministic training — confirm for the backend in use.
np.random.seed(SEED)

Using TensorFlow backend.


In [2]:
# Data can be downloaded at https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv

In [3]:
# Read the semicolon-separated wine-quality CSV, then separate the
# 'quality' target column from the remaining feature columns.
data = pd.read_csv('Data/winequality-red.csv', sep=';')
X, y = data.drop(['quality'], axis=1), data['quality']

In [4]:
# Hold out 20% of the rows as a test split; the fixed random_state keeps the
# partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=SEED,
)

In [5]:
# Report the mean target value of the training split, then preview the
# first rows of the (still unscaled) training features.
mean_train_quality = y_train.mean()
print('Average quality training set: {:.4f}'.format(mean_train_quality))
X_train.head()

Average quality training set: 5.6231


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
1140,7.3,0.4,0.3,1.7,0.08,33.0,79.0,0.9969,3.41,0.65,9.5
920,9.6,0.41,0.37,2.3,0.091,10.0,23.0,0.99786,3.24,0.56,10.5
1198,7.7,0.26,0.26,2.0,0.052,19.0,77.0,0.9951,3.15,0.79,10.9
423,10.5,0.24,0.47,2.1,0.066,6.0,24.0,0.9978,3.15,0.9,11.0
601,13.2,0.46,0.52,2.2,0.071,12.0,35.0,1.0006,3.1,0.56,9.0


In [6]:
# Standardise the features using statistics fitted on the training split
# only; the same fitted scaler is then applied to the test split.
scaler = StandardScaler().fit(X_train)
# Carry the column labels and the row index over to the scaled frames.
# Wrapping the transformed array in a bare DataFrame would replace both with
# 0..n-1, leaving X_* out of step with the index of y_*.
X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)


In [7]:
# Naive baseline: predict the training-set mean quality for every test row
# and measure the mean squared error of that constant prediction.
baseline_prediction = y_train.mean()
baseline_mse = np.mean((y_test - baseline_prediction) ** 2)
print('MSE:', baseline_mse.round(4))

MSE: 0.594


In [8]:
# Feed-forward regressor: two ReLU hidden layers followed by one linear output.
model = Sequential([
    # First hidden layer with 200 hidden units; input width = number of features
    Dense(200, input_dim=X_train.shape[1], activation='relu'),
    # Second hidden layer with 25 hidden units
    Dense(25, activation='relu'),
    # Output layer: a single linear unit for the predicted quality
    Dense(1, activation='linear'),
])
# Set optimizer (Adam with its default settings)
opt = Adam()
# Compile model with a mean-squared-error loss.
# NOTE(review): 'accuracy' is a classification metric attached to a regression
# output here — confirm how the installed Keras version computes it.
model.compile(loss='mse', optimizer=opt, metrics=['accuracy'])

In [9]:
# Where the best weights seen during training are written.
checkpoint_path = 'checkpoints/multi_layer_best_model.h5'
# Create the target directory up front so the first checkpoint save does not
# depend on the directory already existing on a fresh checkout.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

callbacks = [
    # Stop once 'val_acc' has gone 20 epochs without improving
    EarlyStopping(monitor='val_acc', patience=20, verbose=2),
    # Overwrite the checkpoint only when 'val_acc' improves
    ModelCheckpoint(checkpoint_path, monitor='val_acc', save_best_only=True, verbose=0),
]

In [10]:
# Training hyper-parameters: mini-batch size and the maximum number of epochs
batch_size, n_epochs = 64, 5000

In [11]:
# Train for at most n_epochs epochs, with the callbacks list deciding when to
# stop early and which weights to checkpoint.
# validation_split is deliberately omitted: Keras ignores it whenever
# validation_data is supplied, so passing both is misleading.
# NOTE(review): the test split is used as the validation set here, so early
# stopping and checkpoint selection both see the test data — consider holding
# out part of the training data for validation instead.
model.fit(X_train.values, y_train,
          batch_size=batch_size,
          epochs=n_epochs,
          validation_data=(X_test.values, y_test),
          verbose=2,
          callbacks=callbacks)

Train on 1279 samples, validate on 320 samples
Epoch 1/5000
0s - loss: 19.5184 - acc: 0.0078 - val_loss: 10.1719 - val_acc: 0.0500
Epoch 2/5000
0s - loss: 4.9020 - acc: 0.1384 - val_loss: 3.5278 - val_acc: 0.2031
Epoch 3/5000
0s - loss: 2.8680 - acc: 0.2353 - val_loss: 2.5458 - val_acc: 0.2625
Epoch 4/5000
0s - loss: 2.1114 - acc: 0.2807 - val_loss: 2.1699 - val_acc: 0.2844
Epoch 5/5000
0s - loss: 1.8369 - acc: 0.2971 - val_loss: 1.9367 - val_acc: 0.2906
Epoch 6/5000
0s - loss: 1.6484 - acc: 0.3135 - val_loss: 1.7661 - val_acc: 0.3063
Epoch 7/5000
0s - loss: 1.4913 - acc: 0.3268 - val_loss: 1.6222 - val_acc: 0.3281
Epoch 8/5000
0s - loss: 1.3515 - acc: 0.3315 - val_loss: 1.5042 - val_acc: 0.3531
Epoch 9/5000
0s - loss: 1.2412 - acc: 0.3542 - val_loss: 1.3874 - val_acc: 0.3688
Epoch 10/5000
0s - loss: 1.1294 - acc: 0.3690 - val_loss: 1.2921 - val_acc: 0.3875
Epoch 11/5000
0s - loss: 1.0416 - acc: 0.3753 - val_loss: 1.1757 - val_acc: 0.4000
Epoch 12/5000
0s - loss: 0.9525 - acc: 0.3886 -

<keras.callbacks.History at 0x7f6007e79f60>

In [12]:
# Restore the checkpointed weights. best_model is a second name for the same
# object as model, not an independent copy.
best_model = model
best_model.load_weights('checkpoints/multi_layer_best_model.h5')
best_model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])

# Evaluate on test set; score[1] is the metric requested in compile()
score = best_model.evaluate(X_test.values, y_test, verbose=0)
test_accuracy_pct = score[1] * 100
print('Test accuracy: %.2f%%' % test_accuracy_pct)

# Results differ between runs: an earlier run reported 65.62% test accuracy,
# while the output saved with this notebook shows 65.00%.
# Benchmark accuracy on dataset 62.4%

Test accuracy: 65.00%
