In [1]:
import time
import os
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import Dense

In [3]:
COL_NAME_CEMENT = "Cement"
COL_NAME_BLAST_FURNACE_SLAG = "Blast Furnace Slag"
COL_NAME_FLY_ASH = "Fly Ash"
COL_NAME_WATER = "Water"
COL_NAME_SUPERPLASTICIZER = "Superplasticizer"
COL_NAME_COARSE_AGGREGATE = "Coarse Aggregate"
COL_NAME_FINE_AGGREGATE = "Fine Aggregate"
COL_NAME_AGE = "Age"
COL_NAME_STRENGTH = "Strength"

COL_NAME_EXPERIMENT = "Experiment"
COL_NAME_MSE = "Mean MSE"
COL_NAME_RMSE = "Std Deviation MSE"

# This dataframe contains three columns: 
# name_of_experiments, mse, rmse
header_of_df_mse_and_rmse = [COL_NAME_EXPERIMENT, COL_NAME_MSE, COL_NAME_RMSE]
df_mse_and_rmse = pd.DataFrame(columns=header_of_df_mse_and_rmse, data=[])


def get_round(score, num_of_digits=2):
    """Get round with given number of decimal digits 
    """
    return round(score, num_of_digits)


def get_mean(list_of_mse_scores):
    """Get mean
    """
    if list_of_mse_scores:
        return get_round(np.mean(list_of_mse_scores))
    return None


def get_standard_deviation(list_of_mse_scores):
    """Get standard deviation
    """
    if list_of_mse_scores:
        return get_round(np.std(list_of_mse_scores))
    return None
def build_model_with_one_hidden_layer(num_of_features=3):
    """ Building baseline model that contains:

    + One hidden layer of 10 nodes, and a ReLU activation function.
    + Use the adam optimizer and the mean squared error as the loss function.
    """    
    # Create model
    model = Sequential()

    model.add(Dense(10, activation="relu", input_shape=(num_of_features,)))
    model.add(Dense(1))

    # Compile model
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model


def build_model_with_three_hidden_layers(num_of_features=3):
    """ Building model that contains:
    
    + Three hidden layers, each of 10 nodes and ReLU activation function.    
    + Use the adam optimizer and the mean squared error as the loss function.
    """
    
    # Create model
    model = Sequential()

    model.add(Dense(10, activation="relu", input_shape=(num_of_features,)))
    model.add(Dense(10, activation="relu"))
    model.add(Dense(10, activation="relu"))
    model.add(Dense(1))

    # Compile model
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model
def get_mean_squared_error(compiled_model, X, y, epochs=50, verbose=1):
    """Get report (dataframe) of two metrics: 
    The mean and the standard deviation of the mean squared errors
    """   
    
    # 1. Randomly split the data into a training and test sets by holding 30% 
    # of the data for testing. You can use the train_test_split helper function 
    # from Scikit-learn.    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=24)   
    print("Training set: ", X_train.shape, y_train.shape)
    print("Testing set: ", X_test.shape, y_test.shape)
    
    
    # 2. Train the model on the training data using 50 epochs.
    # Note that: given model which is compiled
    # Fit the built model with training set
    model.fit(X_train, y_train, epochs=epochs, verbose=verbose)    

    # 3. Evaluate the model on the test data and compute the mean squared error 
    # between the predicted concrete strength and the actual concrete strength. 
    # You can use the mean_squared_error function from Scikit-learn.    
    y_hat = model.predict(X_test)    
    mse = mean_squared_error(y_test, y_hat)
    
    # Return the mean squared error
    return mse
def get_mean_and_std_of_mse(df_X, 
                            df_y, 
                            compiled_model,                
                            max_iteration=50, 
                            epochs=50, 
                            verbose=0):
    """Generate the mean and the standard deviation of the mean squared errors 
    """
    # Repeat steps 1 - 3, 50 times, i.e., create a list of 50 mean squared errors.    
    list_of_mean_squared_errors = []
    for i in range(max_iteration):
        start_time = time.time()
        print("-" * 36)
        print("Processing current number of iteration : {}".format(i+1))        
        mse = get_mean_squared_error(compiled_model, df_X, df_y, epochs=epochs, verbose=verbose)
        list_of_mean_squared_errors.append(mse)
        print("Duration (seconds): {}".format(time.time()-start_time))
    # end for

    print("Finished - {} times.\nAnd the list of mean squared errors : {}".format(max_iteration,
                                                                                  list_of_mean_squared_errors))
    mean_mse = get_mean(list_of_mean_squared_errors)
    std_mse = get_standard_deviation(list_of_mean_squared_errors)

    print("-" * 72)
    print("The mean and the standard deviation of the mean squared errors are: {} and {}, respectively".format(
           mean_mse, std_mse))
    
    return mean_mse, std_mse


def get_report(name_of_experiment, mean_mse, std_mse):
    """Get report (dataframe) of two metrics: 
    The mean and the standard deviation of the mean squared errors
    """
    values = [[name_of_experiment, mean_mse, std_mse]]

    return pd.DataFrame(columns=header_of_df_mse_and_rmse, data=values)

In [7]:
file_input_path = "concrete_data.csv"

In [8]:
if os.path.exists(file_input_path):
    print("We will load the data from file '{}' to dataframe.".format(file_input_path))
    df = pd.read_csv(file_input_path, header=0)
else:
    print("File not found : {}".format(file_input_path))
    df = pd.read_csv('https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DL0101EN/labs/data/concrete_data.csv')

We will load the data from file 'concrete_data.csv' to dataframe.


In [9]:
df.columns

Index(['Cement', 'Blast Furnace Slag', 'Fly Ash', 'Water', 'Superplasticizer',
       'Coarse Aggregate', 'Fine Aggregate', 'Age', 'Strength'],
      dtype='object')

In [10]:
df.describe()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Cement              1030 non-null   float64
 1   Blast Furnace Slag  1030 non-null   float64
 2   Fly Ash             1030 non-null   float64
 3   Water               1030 non-null   float64
 4   Superplasticizer    1030 non-null   float64
 5   Coarse Aggregate    1030 non-null   float64
 6   Fine Aggregate      1030 non-null   float64
 7   Age                 1030 non-null   int64  
 8   Strength            1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 72.5 KB


In [12]:
print("(row, column) = {}".format(df.shape))

(row, column) = (1030, 9)


In [13]:
df.isnull().sum()

Cement                0
Blast Furnace Slag    0
Fly Ash               0
Water                 0
Superplasticizer      0
Coarse Aggregate      0
Fine Aggregate        0
Age                   0
Strength              0
dtype: int64

## Normalizing Input Data

In [14]:

list_of_column_names = df.columns
list_of_column_names

Index(['Cement', 'Blast Furnace Slag', 'Fly Ash', 'Water', 'Superplasticizer',
       'Coarse Aggregate', 'Fine Aggregate', 'Age', 'Strength'],
      dtype='object')

## Splitting predictors and target

In [15]:
list_of_col_names_predictors = [x for x in list_of_column_names 
                                if x != COL_NAME_STRENGTH]

In [16]:
list_of_col_names_predictors

['Cement',
 'Blast Furnace Slag',
 'Fly Ash',
 'Water',
 'Superplasticizer',
 'Coarse Aggregate',
 'Fine Aggregate',
 'Age']

In [17]:
df_predictors = df[list_of_col_names_predictors]

In [19]:
df_target = df[[COL_NAME_STRENGTH]]

In [20]:

df_predictors.head(3)

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270


In [21]:
df_target.head(3)

Unnamed: 0,Strength
0,79.99
1,61.89
2,40.27


## Applying normalizing method

In [22]:
df_predictors_norm = (df_predictors - df_predictors.mean())/df_predictors.std()

In [23]:
df_predictors_norm.head(3)

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,0.862735,-1.217079,-0.279597
1,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,1.055651,-1.217079,-0.279597
2,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,3.55134


## Building and Training with baseline model

In [24]:
num_of_features = len(df.columns) - 1
print("Number of features for input layer : ", num_of_features)

Number of features for input layer :  8


In [25]:
max_iteration = 50
epochs = 50
verbose = 0

# Get the compiled model
model = build_model_with_one_hidden_layer(num_of_features=num_of_features)

mean_mse, std_mse = get_mean_and_std_of_mse(df_predictors, 
                                            df_target, 
                                            model, 
                                            max_iteration=max_iteration, 
                                            epochs=epochs, verbose=verbose)

------------------------------------
Processing current number of iteration : 1
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 1.3918001651763916
------------------------------------
Processing current number of iteration : 2
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 0.8207714557647705
------------------------------------
Processing current number of iteration : 3
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 0.9384942054748535
------------------------------------
Processing current number of iteration : 4
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 0.8003332614898682
------------------------------------
Processing current number of iteration : 5
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 0.8796916007995605
------------------------------------
Processing current number of iteration : 6


Duration (seconds): 0.8031373023986816
------------------------------------
Processing current number of iteration : 46
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 0.9336540699005127
------------------------------------
Processing current number of iteration : 47
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 0.8516185283660889
------------------------------------
Processing current number of iteration : 48
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 0.8632793426513672
------------------------------------
Processing current number of iteration : 49
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 0.8507273197174072
------------------------------------
Processing current number of iteration : 50
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 0.9042677879333496
Finished - 50 times.
And the list of

## Mean and Standard deviation on mean squared error

In [26]:
name_of_experiment = "Baseline-Raw (50 epochs)"

# Report the mean and the standard deviation of the mean squared errors
df_result_baseline = get_report(name_of_experiment, mean_mse, std_mse)
df_result_baseline

Unnamed: 0,Experiment,Mean MSE,Std Deviation MSE
0,Baseline-Raw (50 epochs),70.07,96.53


In [27]:
# Concat baseline dataframe into result
df_mse_and_rmse = pd.concat([df_mse_and_rmse, df_result_baseline], axis=0)

# Review the result dataframe
df_mse_and_rmse.reset_index(drop=True)

Unnamed: 0,Experiment,Mean MSE,Std Deviation MSE
0,Baseline-Raw (50 epochs),70.07,96.53


## B - Experiment with Normalized data

In [28]:
#Before normalization
df_predictors.head(3)

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270


In [30]:
#After normalization
df_predictors_norm.head(3)

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,0.862735,-1.217079,-0.279597
1,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,1.055651,-1.217079,-0.279597
2,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,3.55134


## Building and Training with the baseline model after normalizing the data with 50 epochs

In [31]:
max_iteration = 50
epochs = 50
verbose = 0

# Get the compiled model
model = build_model_with_one_hidden_layer(num_of_features=num_of_features)

mean_mse, std_mse = get_mean_and_std_of_mse(df_predictors_norm, 
                                            df_target, 
                                            model, 
                                            max_iteration=max_iteration, 
                                            epochs=epochs, verbose=verbose)

------------------------------------
Processing current number of iteration : 1
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 1.0679576396942139
------------------------------------
Processing current number of iteration : 2
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 0.9514567852020264
------------------------------------
Processing current number of iteration : 3
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 0.9589664936065674
------------------------------------
Processing current number of iteration : 4
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 0.9689643383026123
------------------------------------
Processing current number of iteration : 5
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 0.8818895816802979
------------------------------------
Processing current number of iteration : 6


Duration (seconds): 0.9088623523712158
------------------------------------
Processing current number of iteration : 46
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 0.8682608604431152
------------------------------------
Processing current number of iteration : 47
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 0.8677258491516113
------------------------------------
Processing current number of iteration : 48
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 0.858684778213501
------------------------------------
Processing current number of iteration : 49
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 0.8986110687255859
------------------------------------
Processing current number of iteration : 50
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 0.8659780025482178
Finished - 50 times.
And the list of 

## Mean and Standard deviation on mean squared error

In [32]:
name_of_experiment = "Normalized-1 Hidden Layers(50 epochs)"

# Report the mean and the standard deviation of the mean squared errors
df_result_baseline = get_report(name_of_experiment, mean_mse, std_mse)
df_result_baseline

Unnamed: 0,Experiment,Mean MSE,Std Deviation MSE
0,Normalized-1 Hidden Layers(50 epochs),50.28,60.88


In [33]:
# Concat baseline dataframe into result
df_mse_and_rmse = pd.concat([df_mse_and_rmse, df_result_baseline], axis=0)

# Review the result dataframe
df_mse_and_rmse.reset_index(drop=True)

Unnamed: 0,Experiment,Mean MSE,Std Deviation MSE
0,Baseline-Raw (50 epochs),70.07,96.53
1,Normalized-1 Hidden Layers(50 epochs),50.28,60.88


## C. Increate the number of epochs

## Building and Training with the baseline model after normalizing the data with 100 epochs

In [34]:
max_iteration = 50
epochs = 100
verbose = 0

# Get the compiled model
model = build_model_with_one_hidden_layer(num_of_features=num_of_features)

mean_mse, std_mse = get_mean_and_std_of_mse(df_predictors_norm, 
                                            df_target, 
                                            model, 
                                            max_iteration=max_iteration, 
                                            epochs=epochs, verbose=verbose)

------------------------------------
Processing current number of iteration : 1
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 1.854759931564331
------------------------------------
Processing current number of iteration : 2
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 1.8492217063903809
------------------------------------
Processing current number of iteration : 3
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 1.74550199508667
------------------------------------
Processing current number of iteration : 4
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 1.7516765594482422
------------------------------------
Processing current number of iteration : 5
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 1.7615890502929688
------------------------------------
Processing current number of iteration : 6
Tra

Duration (seconds): 1.625398874282837
------------------------------------
Processing current number of iteration : 46
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 1.6332547664642334
------------------------------------
Processing current number of iteration : 47
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 1.57027268409729
------------------------------------
Processing current number of iteration : 48
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 1.6649587154388428
------------------------------------
Processing current number of iteration : 49
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 1.774498701095581
------------------------------------
Processing current number of iteration : 50
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 1.7289631366729736
Finished - 50 times.
And the list of mea

In [35]:
ame_of_experiment = "Normalized-1 Hidden Layers(100 epochs)"

# Report the mean and the standard deviation of the mean squared errors
df_result_baseline = get_report(name_of_experiment, mean_mse, std_mse)
df_result_baseline

Unnamed: 0,Experiment,Mean MSE,Std Deviation MSE
0,Normalized-1 Hidden Layers(50 epochs),43.05,15.6


In [36]:
# Concat baseline dataframe into result
df_mse_and_rmse = pd.concat([df_mse_and_rmse, df_result_baseline], axis=0)

# Review the result dataframe
df_mse_and_rmse.reset_index(drop=True)

Unnamed: 0,Experiment,Mean MSE,Std Deviation MSE
0,Baseline-Raw (50 epochs),70.07,96.53
1,Normalized-1 Hidden Layers(50 epochs),50.28,60.88
2,Normalized-1 Hidden Layers(50 epochs),43.05,15.6


## D. Increase the number of hidden layers

In [37]:
max_iteration = 50
epochs = 50
verbose = 0

# Get the compiled model
model = build_model_with_three_hidden_layers(num_of_features=num_of_features)

mean_mse, std_mse = get_mean_and_std_of_mse(df_predictors_norm, 
                                            df_target, 
                                            model, 
                                            max_iteration=max_iteration, 
                                            epochs=epochs, 
                                            verbose=verbose)

------------------------------------
Processing current number of iteration : 1
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 1.2007529735565186
------------------------------------
Processing current number of iteration : 2
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 0.9726855754852295
------------------------------------
Processing current number of iteration : 3
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 0.8849258422851562
------------------------------------
Processing current number of iteration : 4
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 0.9277732372283936
------------------------------------
Processing current number of iteration : 5
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 1.0347826480865479
------------------------------------
Processing current number of iteration : 6


Duration (seconds): 1.1023859977722168
------------------------------------
Processing current number of iteration : 46
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 0.9445722103118896
------------------------------------
Processing current number of iteration : 47
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 0.9217174053192139
------------------------------------
Processing current number of iteration : 48
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 0.9192147254943848
------------------------------------
Processing current number of iteration : 49
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 0.9472312927246094
------------------------------------
Processing current number of iteration : 50
Training set:  (721, 8) (721, 1)
Testing set:  (309, 8) (309, 1)
Duration (seconds): 0.9909088611602783
Finished - 50 times.
And the list of

In [38]:
name_of_experiment = "Normalized-3 Hidden Layers(50 epochs)"

# Report the mean and the standard deviation of the mean squared errors
df_result_baseline = get_report(name_of_experiment, mean_mse, std_mse)
df_result_baseline

Unnamed: 0,Experiment,Mean MSE,Std Deviation MSE
0,Normalized-3 Hidden Layers(50 epochs),39.52,10.52


In [39]:
# Concat baseline dataframe into result
df_mse_and_rmse = pd.concat([df_mse_and_rmse, df_result_baseline], axis=0)

# Review the result dataframe
df_mse_and_rmse.reset_index(drop=True)

Unnamed: 0,Experiment,Mean MSE,Std Deviation MSE
0,Baseline-Raw (50 epochs),70.07,96.53
1,Normalized-1 Hidden Layers(50 epochs),50.28,60.88
2,Normalized-1 Hidden Layers(50 epochs),43.05,15.6
3,Normalized-3 Hidden Layers(50 epochs),39.52,10.52
