# Peer-graded Assignment: Build a Regression Model in Keras

# Part A - Build a baseline model 

Use the Keras library to build a neural network with the following:
- One hidden layer of 10 nodes, and a ReLU activation function
- Use the adam optimizer and the mean squared error as the loss function.
1. Randomly split the data into a training and test sets by holding 30% of the data for testing. You can use the train_test_splithelper function from Scikit-learn.
2. Train the model on the training data using 50 epochs.
3. Evaluate the model on the test data and compute the mean squared error between the predicted concrete strength and the actual concrete strength. You can use the mean_squared_error function from Scikit-learn.
4. Repeat steps 1 - 3, 50 times, i.e., create a list of 50 mean squared errors.
5. Report the mean and the standard deviation of the mean squared errors.

In [1]:
import pandas as pd
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### Perpare data

In [4]:
concrete_data = pd.read_csv('https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DL0101EN/labs/data/concrete_data.csv')
concrete_data.head()

concrete_data_columns = concrete_data.columns
predictors = concrete_data[concrete_data_columns[concrete_data_columns != 'Strength']] # all columns except Strength
target = concrete_data['Strength'] # Strength column
predictors.head()
target.head()

predictors_norm = (predictors - predictors.mean()) / predictors.std()
predictors_norm.head()

n_cols = predictors_norm.shape[1] # number of predictors

In [5]:
predictors.head()
target.head()
predictors_norm.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,0.862735,-1.217079,-0.279597
1,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,1.055651,-1.217079,-0.279597
2,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,3.55134
3,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,5.055221
4,-0.790075,0.678079,-0.846733,0.488555,-1.038638,0.070492,0.647569,4.976069


### Establish model

In [6]:
# define regression model
# One hidden layer of 10 nodes, and a ReLU activation function
# Use the adam optimizer and the mean squared error as the loss function.

def regression_model():
    # create model
    model = Sequential()
    model.add(Dense(10, activation='relu', input_shape=(n_cols,)))
    model.add(Dense(1))
    
    # compile model
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model


In [26]:
# build the model
model = regression_model()

a_results= [];

# Repeat steps 1 - 3, 50 times, i.e., create a list of 50 mean squared errors.
for i in range(50):
    # slow, so we add a print to indicate that we are still running
    print(i) 
    # Split the data, holding 30% of the data for testing
    x_train, x_test, y_train, y_test = train_test_split(predictors, target, test_size=0.30, shuffle= True)
    # train 
    model.fit(x_train, y_train, epochs=50, verbose=0)
    # evaluate
    score = model.evaluate(x_test, y_test, verbose=0)
    a_results.append(score)

a_results

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


[312.8182069846342,
 109.62822664131238,
 104.39523990717521,
 96.67395997278899,
 83.59552513048487,
 68.46969767450129,
 61.363684644976864,
 47.63056350757389,
 47.38710661459123,
 47.18154792415286,
 43.281295640568906,
 54.036223994875414,
 40.53880058214502,
 45.44369253757316,
 49.84480280706412,
 44.79949593158216,
 45.5800992231153,
 43.96236365975685,
 42.37126949767079,
 49.950644125830394,
 47.47710279199298,
 53.30354862151408,
 49.31288869172624,
 43.83629726669163,
 46.87818165504431,
 51.27525318479075,
 52.482123754556895,
 51.468994881342915,
 47.43500143501751,
 77.77640368020265,
 47.0137430579917,
 46.607024010716906,
 48.94574833842157,
 44.25172097783258,
 55.23900420920363,
 57.061827329370196,
 51.7921081345444,
 48.94589100069213,
 51.43438979954396,
 46.64414608902916,
 49.38746001110879,
 48.47934446365702,
 54.44034699869002,
 42.46160522016507,
 39.531101615683546,
 49.630448887649095,
 56.73860770907603,
 44.63266712176375,
 49.444137499170395,
 52.933852

In [27]:
# Report the mean and the standard deviation of the mean squared errors.
a_mean = np.mean( a_results )
a_stddev = np.std( a_results )
print( '-'*80 )
print( f' For part A, mean is {a_mean} , stddev is {a_stddev} ' )
print( '-'*80 )

--------------------------------------------------------------------------------
 For part A, mean is 58.916268356718284 , stddev is 39.24250209779374 
--------------------------------------------------------------------------------


# Part B - Normalize the data

Repeat Part A but use a **normalized** version of the data. Recall that one way to normalize the data is by subtracting the mean from the individual predictors and dividing by the standard deviation.

How does the mean of the mean squared errors compare to that from Step A?

In [19]:
b_results= [];

# Repeat steps 1 - 3, 50 times, i.e., create a list of 50 mean squared errors.
for i in range(50):
    # slow, so we add a print to indicate that we are still running
    print(i) 
    # Split the data, holding 30% of the data for testing
    x_train, x_test, y_train, y_test = train_test_split(predictors_norm, target, test_size=0.30, shuffle= True)
    # train 
    model.fit(x_train, y_train, epochs=50, verbose=0)
    # evaluate
    score = model.evaluate(x_test, y_test, verbose=0)
    b_results.append(score)

b_results

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


[339.96640389945514,
 150.82398522633179,
 84.92537825007268,
 70.68456395467122,
 72.11245196691223,
 56.679659105813236,
 42.12045615853615,
 52.85162722640053,
 46.34334317303013,
 44.12514867443097,
 40.37616430980102,
 42.69070978380716,
 36.67284555033959,
 37.97436549362627,
 40.90555547819169,
 37.158125892811995,
 36.70102944111747,
 35.27932426921758,
 32.010561575781566,
 31.640631197339893,
 34.8559324641058,
 35.747983160913954,
 31.285426408341788,
 31.948412861253065,
 31.99126151618834,
 33.3791749330786,
 31.64372214530278,
 28.036259216012308,
 30.463006442418763,
 34.96166543435896,
 27.331315568349893,
 31.26811279988212,
 32.33292287067302,
 31.572376757377945,
 29.805836007818822,
 32.460326247230704,
 28.951421712980302,
 30.61505538051568,
 30.462466971388142,
 31.921025779255,
 28.24339362332736,
 26.051319813651176,
 29.403087998671054,
 29.392124842671514,
 30.29295981200382,
 32.70064581096365,
 29.698785232494565,
 25.07513619086503,
 28.317676865167215,
 2

In [20]:
# Report the mean and the standard deviation of the mean squared errors.
b_mean = np.mean( b_results )
b_stddev = np.std( b_results )
print( '-'*80 )
print( f' For part B, mean is {b_mean} , stddev is {b_stddev} ' )
print( '-'*80 )

--------------------------------------------------------------------------------
 For part B, mean is 44.99501803574053 , stddev is 46.59437401248902 
--------------------------------------------------------------------------------


# Part C - Increate the number of epochs

Repeat Part B but use **100 epochs** this time for training.
How does the mean of the mean squared errors compare to that from Step B?

In [22]:
c_results= [];

# Repeat steps 1 - 3, 50 times, i.e., create a list of 50 mean squared errors.
for i in range(50):
    # slow, so we add a print to indicate that we are still running
    print(i)
    # Split the data, holding 30% of the data for testing
    x_train, x_test, y_train, y_test = train_test_split(predictors_norm, target, test_size=0.30, shuffle= True)
    # train 
    model.fit(x_train, y_train, epochs=100, verbose=0)
    # evaluate
    score = model.evaluate(x_test, y_test, verbose=0)
    c_results.append(score)

c_results

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


[21.51963337648262,
 24.499458930253212,
 23.953483544507073,
 21.858882706527957,
 25.987320649971082,
 24.959736777740776,
 24.048364571383086,
 26.032496677633244,
 25.341517800266303,
 24.5500610993518,
 22.805844921124407,
 25.861222668373085,
 24.092188671568838,
 21.226689983725933,
 25.345024158267915,
 25.600166425735818,
 21.81816331005405,
 22.75658378477621,
 26.35387050295339,
 23.83398016525318,
 24.34605555241162,
 20.820474766604722,
 25.65071151634636,
 20.682758374507372,
 27.845660836179665,
 26.923069296531306,
 23.483855596252244,
 25.208579578831742,
 24.21463849706557,
 23.210838842546284,
 22.451352412646642,
 18.715248749865683,
 20.69169778113998,
 21.679396095399333,
 21.202471282489864,
 26.103799431069383,
 20.215403461147666,
 24.361562858507472,
 22.238550698487117,
 25.34331676643643,
 25.017144144545867,
 26.721869002653943,
 19.84603171672636,
 22.82432017897325,
 27.40529921756979,
 22.450104457274996,
 23.446726925550543,
 26.833463637959994,
 24.025

In [23]:
# Report the mean and the standard deviation of the mean squared errors.
c_mean = np.mean( c_results )
c_stddev = np.std( c_results )
print( '-'*80 )
print( f' For part C, mean is {b_mean} , stddev is {b_stddev} ' )
print( '-'*80 )

--------------------------------------------------------------------------------
 For part C, mean is 44.99501803574053 , stddev is 46.59437401248902 
--------------------------------------------------------------------------------


# Part D. Increase the number of hidden layers

Repeat part B but use a neural network with the following instead:

- **Three hidden layers***, each of 10 nodes and ReLU activation function.

How does the mean of the mean squared errors compare to that from Step B?

In [24]:
def d_regression_model():
    # create model
    model = Sequential()
    # Three hidden layers
    model.add(Dense(10, activation='relu', input_shape=(n_cols,)))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1))
    
    # compile model
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

d_model = d_regression_model()

d_results= [];

# Repeat steps 1 - 3, 50 times, i.e., create a list of 50 mean squared errors.
for i in range(50):
    # slow, so we add a print to indicate that we are still running
    print(i)
    # Split the data, holding 30% of the data for testing
    x_train, x_test, y_train, y_test = train_test_split(predictors_norm, target, test_size=0.30, shuffle= True)
    # train 
    model.fit(x_train, y_train, epochs=50, verbose=0)
    # evaluate
    score = model.evaluate(x_test, y_test, verbose=0)
    d_results.append(score)

d_results

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


[21.929436337986424,
 23.2154896498498,
 24.53581448588942,
 26.18675870185531,
 25.927092549098735,
 26.45963047237458,
 24.705719858311525,
 28.96744632412315,
 22.35291255716367,
 23.53125798740819,
 20.685249032326116,
 19.953190133795385,
 22.976332605849578,
 23.990239448917723,
 15.501729847929624,
 23.38435577343197,
 18.391993957815817,
 26.42341870551742,
 20.680395280659006,
 24.701862810499073,
 25.115012783062884,
 19.051896623037393,
 24.138151656462536,
 22.211666421982848,
 22.461515994519477,
 25.163497480373938,
 18.336507488608746,
 23.460275020414187,
 26.29338026818334,
 21.764635752705694,
 26.01736552661291,
 22.7546773126596,
 21.432366935952196,
 23.36327616990963,
 23.160560469025548,
 21.76359442059662,
 22.750370642899696,
 26.7673185959603,
 24.996167414396712,
 19.88294258241129,
 21.26754528650574,
 22.3320786682919,
 19.33339933438594,
 25.971072014867296,
 22.328778072468285,
 21.45904310158541,
 20.674063951066397,
 27.722431645810026,
 20.888859251942

In [25]:
# Report the mean and the standard deviation of the mean squared errors.
d_mean = np.mean( d_results )
d_stddev = np.std( d_results )
print( '-'*80 )
print( f' For part D, mean is {d_mean} , stddev is {d_stddev} ' )
print( '-'*80 )

--------------------------------------------------------------------------------
 For part D, mean is 22.93449048607095 , stddev is 2.694321805291805 
--------------------------------------------------------------------------------


# Thanks

Thanks for reviewing~