In [1]:
import os

import numpy as np
import pandas as pd
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Single seed shared by every stochastic step in the notebook.
SEED = 42
# Seed NumPy's global RNG too, so anything drawing from it is repeatable
# across "Restart & Run All".
# NOTE(review): this alone may not make Keras/TensorFlow training
# bit-for-bit reproducible — confirm whether a backend seed is also needed.
np.random.seed(SEED)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the red-wine quality table; the file uses ';' as its field separator.
data = pd.read_csv(
    'data/winequality-red.csv',
    sep=';',
)

In [3]:
# Preview the first five rows (head()'s default) as a quick sanity check.
data.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
# Predictors: every column except the target.
X = data.drop(labels=['quality'], axis='columns')
# Target: the wine quality score.
y = data['quality']

In [5]:
# Hold out 20% of the rows as a test set; SEED makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)

In [6]:
# Report the mean target value of the training split, then preview its
# feature rows (rendered as the cell's output).
print(
    'Average quality training set: {:.4f}'.format(
        y_train.mean()
    )
)
X_train.head(n=5)

Average quality training set: 5.6239


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
493,8.7,0.69,0.31,3.0,0.086,23.0,81.0,1.0002,3.48,0.74,11.6
354,6.1,0.21,0.4,1.4,0.066,40.5,165.0,0.9912,3.25,0.59,11.9
342,10.9,0.39,0.47,1.8,0.118,6.0,14.0,0.9982,3.3,0.75,9.8
834,8.8,0.685,0.26,1.6,0.088,16.0,23.0,0.99694,3.32,0.47,9.4
705,8.4,1.035,0.15,6.0,0.073,11.0,54.0,0.999,3.37,0.49,9.9


In [7]:
# Standardise the features. The scaler is fitted on the training split only,
# and the same fitted transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)
# scaler.transform() output is re-wrapped as a DataFrame; carry over the
# original column names and row index so the scaled frames stay labelled and
# remain index-aligned with y_train / y_test.
X_train = pd.DataFrame(
    scaler.transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [8]:
# Naive baseline: predict the training-set mean quality for every test row
# and report the resulting mean squared error.
baseline_prediction = y_train.mean()
baseline_mse = np.mean((y_test - baseline_prediction) ** 2)
print('MSE:', baseline_mse.round(4))

MSE: 0.6572


In [9]:
# Feed-forward regressor: two ReLU hidden layers followed by one linear unit.
model = Sequential([
    # First hidden layer: 200 units; input width = number of feature columns
    Dense(200, input_dim=X_train.shape[1], activation='relu'),
    # Second hidden layer: 25 units
    Dense(25, activation='relu'),
    # Output layer: a single linear unit
    Dense(1, activation='linear'),
])
# Adam optimizer with its default settings
opt = Adam()
# Mean-squared-error loss, with 'accuracy' tracked as an extra metric.
# NOTE(review): how 'accuracy' is computed for a continuous output depends on
# the Keras version — confirm it means "rounded prediction == label" here.
model.compile(optimizer=opt, loss='mse', metrics=['accuracy'])

In [10]:
# File that receives the best weights seen during training.
checkpoint_path = 'data/checkpoints/multi_layer_best_model.h5'
# Create the checkpoint directory up front so the first checkpoint write does
# not fail on a fresh checkout; exist_ok makes re-running the cell harmless.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

callbacks = [
    # Stop once validation accuracy has not improved for 20 consecutive epochs.
    EarlyStopping(monitor='val_acc', patience=20, verbose=2),
    # Keep only the weights from the epoch with the best validation accuracy.
    ModelCheckpoint(checkpoint_path, monitor='val_acc', save_best_only=True, verbose=0),
]

In [11]:
# Training hyper-parameters: mini-batch size and the maximum number of epochs.
batch_size, n_epochs = 64, 5000

In [12]:
# Train the network with the early-stopping / checkpoint callbacks.
# - batch_size now uses the notebook's `batch_size` variable rather than a
#   duplicated literal.
# - The former validation_split=0.2 argument was removed: it had no effect
#   because an explicit validation_data takes precedence (the training log
#   shows 1279 train / 320 validation samples, i.e. the full test split).
# NOTE(review): the test split doubles as validation data here, so early
# stopping and checkpoint selection see the test set and any later "test"
# score is optimistically biased. Consider validating on a slice of the
# training data instead.
model.fit(
    X_train.values,
    y_train,
    batch_size=batch_size,
    epochs=n_epochs,
    verbose=2,
    validation_data=(X_test.values, y_test),
    callbacks=callbacks,
)

Train on 1279 samples, validate on 320 samples
Epoch 1/5000
 - 1s - loss: 25.5278 - acc: 0.0000e+00 - val_loss: 17.6565 - val_acc: 0.0000e+00
Epoch 2/5000
 - 0s - loss: 11.2190 - acc: 0.0156 - val_loss: 5.6958 - val_acc: 0.0781
Epoch 3/5000
 - 0s - loss: 3.7000 - acc: 0.1704 - val_loss: 2.7426 - val_acc: 0.2250
Epoch 4/5000
 - 0s - loss: 2.4997 - acc: 0.2635 - val_loss: 2.2036 - val_acc: 0.2719
Epoch 5/5000
 - 0s - loss: 2.0671 - acc: 0.2783 - val_loss: 1.9393 - val_acc: 0.2812
Epoch 6/5000
 - 0s - loss: 1.8566 - acc: 0.2901 - val_loss: 1.7821 - val_acc: 0.2844
Epoch 7/5000
 - 0s - loss: 1.7209 - acc: 0.3018 - val_loss: 1.6660 - val_acc: 0.2938
Epoch 8/5000
 - 0s - loss: 1.5988 - acc: 0.3120 - val_loss: 1.5481 - val_acc: 0.3000
Epoch 9/5000
 - 0s - loss: 1.4918 - acc: 0.3213 - val_loss: 1.4504 - val_acc: 0.3156
Epoch 10/5000
 - 0s - loss: 1.3951 - acc: 0.3284 - val_loss: 1.3718 - val_acc: 0.3250
Epoch 11/5000
 - 0s - loss: 1.3148 - acc: 0.3425 - val_loss: 1.2806 - val_acc: 0.3375
Epoch

Epoch 96/5000
 - 0s - loss: 0.2688 - acc: 0.7045 - val_loss: 0.3581 - val_acc: 0.5969
Epoch 00096: early stopping


<keras.callbacks.History at 0x7f8f66f2eef0>

In [None]:
# `best_model` is another reference to `model` (not a copy): loading the
# checkpoint below overwrites the weights of the model trained above.
best_model = model
# Load from the path the ModelCheckpoint callback writes to
# ('data/checkpoints/...'); the previous 'checkpoints/...' path pointed at a
# different directory.
best_model.load_weights('data/checkpoints/multi_layer_best_model.h5')
best_model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

# Evaluate on test set
# evaluate() returns [loss, accuracy] for this compile configuration, so
# score[1] is the accuracy metric.
# NOTE(review): the checkpoint was selected using val_acc measured on this
# same test split, so the figure below is an optimistic estimate.
score = best_model.evaluate(X_test.values, y_test, verbose=0)
print('Test accuracy: %.2f%%' % (score[1]*100))

# Test accuracy: 65.62% 
# Benchmark accuracy on dataset 62.4%