# Build a Regression Model in Keras

### Download and Clean Dataset

In [14]:
import pandas as pd
import numpy as np
concrete_data = pd.read_csv('https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DL0101EN/labs/data/concrete_data.csv')
concrete_data.head()

concrete_data_columns = concrete_data.columns

predictors = concrete_data[concrete_data_columns[concrete_data_columns != 'Strength']] # all columns except Strength
target = concrete_data['Strength'] # Strength column
predictors_norm = (predictors - predictors.mean()) / predictors.std()
n_cols = predictors_norm.shape[1]

In [15]:
predictors.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360


In [16]:
target.head()

0    79.99
1    61.89
2    40.27
3    41.05
4    44.30
Name: Strength, dtype: float64

### Import Keras¶

In [17]:
import keras

### A. Build a baseline model (5 marks) 

Use the Keras library to build a neural network with the following: 
- One hidden layer of 10 nodes, and a ReLU activation function 
- Use the adam optimizer and the mean squared error as the loss function. 
1. Randomly split the data into a training and test sets by holding 30% of the data for testing. You can use the train_test_split helper function from Scikit-learn. 
2. Train the model on the training data using 50 epochs. 
3. Evaluate the model on the test data and compute the mean squared error between the predicted concrete strength and the actual concrete strength. You can use the mean_squared_error function from Scikit-learn. 
4. Repeat steps 1 - 3, 50 times, i.e., create a list of 50 mean squared errors. 5. Report the mean and the standard deviation of the mean squared errors.

In [18]:
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split

In [19]:
# define regression model
def regression_model():
    # create model
    model = Sequential()
    model.add(Dense(10, activation='relu', input_shape=(n_cols,)))
    
    model.add(Dense(1))
    
    # compile model
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

In [20]:
# build model
model = regression_model()






In [24]:
list_of_mean_squared_error = []
for cycle in range(50):
    #Randomly split the data into a training and test sets by holding 30% of the data for testing
    X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size=0.3)
    
    res = model.fit(X_train, y_train, epochs=50, verbose=0, validation_data=(X_test, y_test))
    
    mean_squared_error = res.history['val_loss'][-1]
    list_of_mean_squared_error.append(mean_squared_error)
    print('Cycle #{}: mean_squared_error {}'.format(cycle+1, mean_squared_error))

Cycle #1: mean_squared_error 36831.98242819579
Cycle #2: mean_squared_error 5282.880445363066
Cycle #3: mean_squared_error 2396.8130981050263
Cycle #4: mean_squared_error 1932.9984857750555
Cycle #5: mean_squared_error 2164.8257061115746
Cycle #6: mean_squared_error 1523.3018214154783
Cycle #7: mean_squared_error 1370.467361400814
Cycle #8: mean_squared_error 1255.4746046344053
Cycle #9: mean_squared_error 1252.6869200672531
Cycle #10: mean_squared_error 1090.3929715943568
Cycle #11: mean_squared_error 1027.841434614558
Cycle #12: mean_squared_error 1108.6445016212833
Cycle #13: mean_squared_error 900.6285451747067
Cycle #14: mean_squared_error 808.091890106695
Cycle #15: mean_squared_error 789.1931016051653
Cycle #16: mean_squared_error 684.4735617035801
Cycle #17: mean_squared_error 614.1206975152963
Cycle #18: mean_squared_error 697.9337077217966
Cycle #19: mean_squared_error 588.0026586835053
Cycle #20: mean_squared_error 571.1883074812905
Cycle #21: mean_squared_error 551.20198863

In [23]:
print('The mean of the mean squared errors: {}'.format(np.mean(list_of_mean_squared_error)))
print('The standard deviation of the mean squared errors: {}'.format(np.std(list_of_mean_squared_error)))


The mean of the mean squared errors: 60.3748195097593
The standard deviation of the mean squared errors: 42.033134569441636


### B. Normalize the data (5 marks) 
Repeat Part A but use a normalized version of the data. Recall that one way to normalize the data is by subtracting the mean from the individual predictors and dividing by the standard deviation.

In [26]:
n_cols = predictors_norm.shape[1]
def regression_model2():
    model2 = Sequential()
    model2.add(Dense(10, activation='relu', input_shape=(n_cols,)))
    model2.add(Dense(1))
    
    model2.compile(optimizer='adam', loss='mean_squared_error')
    return model2

model2 = regression_model2()

list_of_mean_squared_error = []
for cycle in range(50):
    #Randomly split the data into a training and test sets by holding 30% of the data for testing
    X_train, X_test, y_train, y_test = train_test_split(predictors_norm, target, test_size=0.3)
    res = model2.fit(X_train, y_train, epochs=50, verbose=0, validation_data=(X_test, y_test))
    mean_squared_error = res.history['val_loss'][-1]
    #Add value of mean_squared_error for every cycle in list.
    list_of_mean_squared_error.append(mean_squared_error)
    print('Cycle #{}: mean_squared_error {}'.format(cycle+1, mean_squared_error))


Cycle #1: mean_squared_error 284.77674979299405
Cycle #2: mean_squared_error 147.54615299755702
Cycle #3: mean_squared_error 112.16345417306647
Cycle #4: mean_squared_error 91.13640463622257
Cycle #5: mean_squared_error 75.59786950268791
Cycle #6: mean_squared_error 78.07618696095489
Cycle #7: mean_squared_error 72.5763211666959
Cycle #8: mean_squared_error 72.05382584753931
Cycle #9: mean_squared_error 56.79433947319352
Cycle #10: mean_squared_error 65.83881154723923
Cycle #11: mean_squared_error 58.22125537958731
Cycle #12: mean_squared_error 47.43738694175548
Cycle #13: mean_squared_error 44.38718280977416
Cycle #14: mean_squared_error 46.89315536647167
Cycle #15: mean_squared_error 38.90766019111312
Cycle #16: mean_squared_error 37.53701408164015
Cycle #17: mean_squared_error 40.11530349632683
Cycle #18: mean_squared_error 38.3339255373069
Cycle #19: mean_squared_error 38.518517997272575
Cycle #20: mean_squared_error 40.610376302478386
Cycle #21: mean_squared_error 38.2982042553355

In [27]:
print('The mean of the mean squared errors: {}'.format(np.mean(list_of_mean_squared_error)))
print('The standard deviation of the mean squared errors: {}'.format(np.std(list_of_mean_squared_error)))

The mean of the mean squared errors: 51.7763349940476
The standard deviation of the mean squared errors: 39.81077073753699


### C. Increate the number of epochs (5 marks) 

Repeat Part B but use 100 epochs this time for training.

In [28]:
def regression_model3():
    model3 = Sequential()
    model3.add(Dense(10, activation='relu', input_shape=(n_cols,)))
    model3.add(Dense(1))
    
    model3.compile(optimizer='adam', loss='mean_squared_error')
    return model3

model3 = regression_model3() 

In [29]:
list_of_mean_squared_error = []
for cycle in range(50):
    #Randomly split the data into a training set (70%) and a test set (30%):  
    X_train, X_test, y_train, y_test = train_test_split(predictors_norm, target, test_size=0.3)
    
    res = model3.fit(X_train, y_train, epochs=100, verbose=0, validation_data=(X_test, y_test))

    mean_squared_error = res.history['val_loss'][-1]
    #Add value of mean_squared_error for every cycle in list.
    list_of_mean_squared_error.append(mean_squared_error)
    print('Cycle #{}: mean_squared_error {}'.format(cycle+1, mean_squared_error))

Cycle #1: mean_squared_error 168.36650910115165
Cycle #2: mean_squared_error 98.88765086942506
Cycle #3: mean_squared_error 55.46916357984821
Cycle #4: mean_squared_error 59.94760189858841
Cycle #5: mean_squared_error 48.54547406786082
Cycle #6: mean_squared_error 38.80288074086013
Cycle #7: mean_squared_error 40.47357638214012
Cycle #8: mean_squared_error 39.26551950942351
Cycle #9: mean_squared_error 39.720271064239796
Cycle #10: mean_squared_error 28.806404533509685
Cycle #11: mean_squared_error 33.96547918720924
Cycle #12: mean_squared_error 36.0389624166643
Cycle #13: mean_squared_error 36.743526273560754
Cycle #14: mean_squared_error 39.08382129360557
Cycle #15: mean_squared_error 35.89478204474094
Cycle #16: mean_squared_error 33.21608567469328
Cycle #17: mean_squared_error 37.054719153345594
Cycle #18: mean_squared_error 32.746995425918726
Cycle #19: mean_squared_error 34.60758033308011
Cycle #20: mean_squared_error 37.985461966505326
Cycle #21: mean_squared_error 32.3455417102

In [31]:
print('The mean of the mean squared errors: {}'.format(np.mean(list_of_mean_squared_error)))
print('The standard deviation of the mean squared errors: {}'.format(np.std(list_of_mean_squared_error)))


The mean of the mean squared errors: 38.374971489026706
The standard deviation of the mean squared errors: 21.57879486975877


### D. Increase the number of hidden layers (5 marks) 
Repeat part B but use a neural network with the following instead: 
- Three hidden layers, each of 10 nodes and ReLU activation function.

In [32]:
def regression_model4():
    model4 = Sequential()
    model4.add(Dense(10, activation='relu', input_shape=(n_cols,)))
    model4.add(Dense(10, activation='relu'))
    model4.add(Dense(10, activation='relu'))
    model4.add(Dense(1))
    
    model4.compile(optimizer='adam', loss='mean_squared_error')
    return model4
model4 = regression_model4()

In [33]:
list_of_mean_squared_error = []
for cycle in range(50):
    #Randomly split the data into a training set (70%) and a test set (30%):  
    X_train, X_test, y_train, y_test = train_test_split(predictors_norm, target, test_size=0.3)
   
    res = model4.fit(X_train, y_train, epochs=50, verbose=0, validation_data=(X_test, y_test))
    mean_squared_error = res.history['val_loss'][-1]
    #Add value of mean_squared_error for every cycle in list.
    list_of_mean_squared_error.append(mean_squared_error)
    print('Cycle #{}: mean_squared_error {}'.format(cycle+1, mean_squared_error))

Cycle #1: mean_squared_error 110.22419798026965
Cycle #2: mean_squared_error 64.1749012524256
Cycle #3: mean_squared_error 42.73045767009451
Cycle #4: mean_squared_error 41.24843651731423
Cycle #5: mean_squared_error 36.87208608361895
Cycle #6: mean_squared_error 35.65919894616581
Cycle #7: mean_squared_error 34.72473376622864
Cycle #8: mean_squared_error 26.288962891958292
Cycle #9: mean_squared_error 32.55446760554144
Cycle #10: mean_squared_error 28.031074918975335
Cycle #11: mean_squared_error 27.51808809693963
Cycle #12: mean_squared_error 27.958058082555876
Cycle #13: mean_squared_error 30.16694363961328
Cycle #14: mean_squared_error 29.832561813897684
Cycle #15: mean_squared_error 29.791568570924035
Cycle #16: mean_squared_error 22.486880496867652
Cycle #17: mean_squared_error 24.317123937761128
Cycle #18: mean_squared_error 20.179107573425885
Cycle #19: mean_squared_error 25.114907952959868
Cycle #20: mean_squared_error 29.628307614125866
Cycle #21: mean_squared_error 23.648440

In [34]:
print('The mean of the mean squared errors: {}'.format(np.mean(list_of_mean_squared_error)))
print('The standard deviation of the mean squared errors: {}'.format(np.std(list_of_mean_squared_error)))


The mean of the mean squared errors: 28.309430312770857
The standard deviation of the mean squared errors: 13.904076068719386
