# Deep Learning Fundamentals - LU04 Lab Exercise

## 4.1 Import the required packages and load the data file into a pandas DataFrame

In [15]:
import pandas as pd
import numpy as np
from keras import models
from keras import layers
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Read the Ames housing sales CSV (comma-separated, which is the pandas default)
# from the notebook's working directory.
data = pd.read_csv('Ames_Housing_Sales.csv')

In [24]:
# Display the raw DataFrame as loaded, before any preparation step is applied.
data

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,SalePrice
0,856,854,0,,3,1Fam,TA,No,706,0,...,0,Pave,8,856,AllPub,0,2003,2003,2008,208500
1,1262,0,0,,3,1Fam,TA,Gd,978,0,...,0,Pave,6,1262,AllPub,298,1976,1976,2007,181500
2,920,866,0,,3,1Fam,TA,Mn,486,0,...,0,Pave,6,920,AllPub,0,2001,2002,2008,223500
3,961,756,0,,3,1Fam,Gd,No,216,0,...,0,Pave,7,756,AllPub,0,1915,1970,2006,140000
4,1145,1053,0,,4,1Fam,TA,Av,655,0,...,0,Pave,9,1145,AllPub,192,2000,2000,2008,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1354,968,0,0,Grvl,4,1Fam,,,0,0,...,0,Pave,5,656,AllPub,0,1927,2007,2007,64500
1355,962,830,0,,3,1Fam,,,0,0,...,0,Pave,8,936,AllPub,0,2000,2000,2008,186500
1356,1126,0,0,,3,1Fam,TA,No,936,0,...,0,Pave,5,1126,AllPub,295,1977,1977,2006,160000
1357,1537,0,0,,3,1Fam,,,0,0,...,0,Pave,7,1319,AllPub,0,1962,2005,2008,174000


## 4.2 Data Preparation
Extract the label column (`SalePrice`) and remove it from the feature dataset.

In [17]:
# Name of the target (label) column the network will learn to predict.
y_col_name = 'SalePrice'

# Features are every column except the label; the label is kept as its own Series.
x_data = data.drop(columns=[y_col_name])
y_data = data[y_col_name]

In [25]:
# Display the extracted SalePrice labels (still in their original units at this point).
y_data

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1354     64500
1355    186500
1356    160000
1357    174000
1358    120500
Name: SalePrice, Length: 1359, dtype: int64

In [26]:
# Display the feature frame that remains after the SalePrice column was dropped.
x_data

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,SaleType,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
0,856,854,0,,3,1Fam,TA,No,706,0,...,WD,0,Pave,8,856,AllPub,0,2003,2003,2008
1,1262,0,0,,3,1Fam,TA,Gd,978,0,...,WD,0,Pave,6,1262,AllPub,298,1976,1976,2007
2,920,866,0,,3,1Fam,TA,Mn,486,0,...,WD,0,Pave,6,920,AllPub,0,2001,2002,2008
3,961,756,0,,3,1Fam,Gd,No,216,0,...,WD,0,Pave,7,756,AllPub,0,1915,1970,2006
4,1145,1053,0,,4,1Fam,TA,Av,655,0,...,WD,0,Pave,9,1145,AllPub,192,2000,2000,2008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1354,968,0,0,Grvl,4,1Fam,,,0,0,...,WD,0,Pave,5,656,AllPub,0,1927,2007,2007
1355,962,830,0,,3,1Fam,,,0,0,...,WD,0,Pave,8,936,AllPub,0,2000,2000,2008
1356,1126,0,0,,3,1Fam,TA,No,936,0,...,WD,0,Pave,5,1126,AllPub,295,1977,1977,2006
1357,1537,0,0,,3,1Fam,,,0,0,...,COD,0,Pave,7,1319,AllPub,0,1962,2005,2008


Perform One-Hot Encoding on all categorical data columns

In [18]:
# One-hot encode the categorical features.
# 1. Select the string-typed (object) columns and cast them to pandas' 'category'
#    dtype. The cast only changes the dtype; it does not encode anything by itself.
# 2. pd.get_dummies then expands every category into its own indicator column
#    named '<column>_<category>'.
categorical_data = x_data.select_dtypes(include=['object']).astype('category')
categorical_data = pd.get_dummies(categorical_data)
print(categorical_data)

      Alley_Grvl  Alley_None  Alley_Pave  BldgType_1Fam  BldgType_2fmCon  \
0              0           1           0              1                0   
1              0           1           0              1                0   
2              0           1           0              1                0   
3              0           1           0              1                0   
4              0           1           0              1                0   
...          ...         ...         ...            ...              ...   
1354           1           0           0              1                0   
1355           0           1           0              1                0   
1356           0           1           0              1                0   
1357           0           1           0              1                0   
1358           0           1           0              1                0   

      BldgType_Duplex  BldgType_Twnhs  BldgType_TwnhsE  BsmtCond_Fa  \
0               

Standardize the numerical feature columns and min-max scale the labels

In [27]:
# NOTE(review): both scalers below are fitted on the full dataset, before any
# train/test split is made, so held-out rows influence the scaling statistics.
# Confirm this is acceptable for the lab.

# Standardise the numeric feature columns with StandardScaler
# (zero mean, unit variance per column) and rebuild a DataFrame with the same column names.
numerical_data = x_data.select_dtypes(include=['float64', 'int64']).copy()
std_scaler = StandardScaler()
data_tmp = std_scaler.fit_transform(numerical_data.values)
numerical_data = pd.DataFrame(data_tmp, columns=numerical_data.columns)

# Min-max scale the label (MinMaxScaler, default range [0, 1]) -- not standard scaling.
# The scaler needs a 2-D array, so the Series is wrapped in a one-column DataFrame
# and the result is flattened back to 1-D before being stored as a Series again.
y_scaler = MinMaxScaler()
y_tmp = y_scaler.fit_transform(pd.DataFrame(y_data).values).reshape(-1)
y_data = pd.Series(y_tmp)

In [34]:
# Display the numeric feature columns after standard scaling.
numerical_data

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BsmtFinSF1,BsmtFinSF2,BsmtFullBath,BsmtHalfBath,BsmtUnfSF,EnclosedPorch,...,OverallCond,OverallQual,PoolArea,ScreenPorch,TotRmsAbvGrd,TotalBsmtSF,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
0,-0.826699,1.135796,-0.117593,0.166928,0.542926,-0.290667,1.111135,-0.248513,-0.950275,-0.345643,...,-0.531239,0.606778,-0.071211,-0.280331,0.913378,-0.499969,-0.773100,1.024926,0.863140,0.146693
1,0.220579,-0.805878,-0.117593,0.166928,1.133743,-0.290667,-0.834783,3.920983,-0.648062,-0.345643,...,2.245505,-0.136785,-0.071211,-0.280331,-0.348086,0.428718,1.606504,0.104254,-0.456328,-0.605595
2,-0.661611,1.163079,-0.117593,0.166928,0.065058,-0.290667,1.111135,-0.248513,-0.309765,-0.345643,...,-0.531239,0.606778,-0.071211,-0.280331,-0.348086,-0.353575,-0.773100,0.956728,0.814271,0.146693
3,-0.555852,0.912981,-0.117593,0.166928,-0.521415,-0.290667,1.111135,-0.248513,-0.070701,4.173057,...,-0.531239,0.606778,-0.071211,-0.280331,0.282646,-0.728709,-0.773100,-1.975783,-0.749544,-1.357883
4,-0.081223,1.588246,-0.117593,1.441395,0.432147,-0.290667,1.111135,-0.248513,-0.183467,-0.345643,...,-0.531239,1.350341,-0.071211,-0.280331,1.544110,0.161091,0.760068,0.922629,0.716533,0.146693
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1354,-0.537795,-0.805878,-0.117593,1.441395,-0.990594,-0.290667,-0.834783,-0.248513,0.190915,-0.345643,...,0.394343,-1.623911,-0.071211,-0.280331,-0.978818,-0.957450,-0.773100,-1.566595,1.058617,-0.605595
1355,-0.553272,1.081229,-0.117593,0.166928,-0.990594,-0.290667,1.111135,-0.248513,0.822404,-0.345643,...,-0.531239,-0.136785,-0.071211,-0.280331,0.913378,-0.316976,-0.773100,0.922629,0.716533,0.146693
1356,-0.130234,-0.805878,-0.117593,0.166928,1.042514,-0.290667,1.111135,-0.248513,-0.860062,-0.345643,...,-0.531239,-0.880348,-0.071211,-0.280331,-0.978818,0.117631,1.582548,0.138353,-0.407459,-1.357883
1357,0.929942,-0.805878,-0.117593,0.166928,-0.990594,-0.290667,1.111135,-0.248513,1.686190,-0.345643,...,3.171087,-0.136785,-0.071211,-0.280331,0.282646,0.559100,-0.773100,-0.373132,0.960879,0.146693


In [35]:
# Display the labels after min-max scaling.
y_data

0       0.240644
1       0.203128
2       0.261487
3       0.145464
4       0.298308
          ...   
1354    0.040558
1355    0.210075
1356    0.173254
1357    0.192707
1358    0.118369
Length: 1359, dtype: float64

In [28]:
# Join the scaled numeric columns and the one-hot encoded columns side by side.
# NOTE: this rebinds x_data, so the encoding and scaling cells should not be re-run
# afterwards without first re-running the label-extraction cell.
x_data = pd.concat((numerical_data, categorical_data), axis='columns')

# Column labels of the final feature matrix and how many there are
# (the count is the input width of the networks built later).
x_col_name = x_data.columns
x_col_count = x_data.shape[1]

In [32]:
# Number of feature columns after combining; used as the networks' input width.
x_col_count

294

## 4.3 Train, Validation, Test Dataset Split

### 4.3.1 Method 1: train/test split with `validation_split`

In [37]:
# Hold out 30% of the rows as the test set; the remaining 70% is used for training.
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3)
print('X_train shape: {}'.format(X_train.shape))
print('y_train shape: {}'.format(y_train.shape))
print('X_test shape: {}'.format(X_test.shape))
print('y_test shape: {}'.format(y_test.shape))

# Network layout:
#   input  : x_col_count features
#   hidden : Dense(5, relu) -> Dense(3, relu)
#   output : Dense(1, sigmoid) -- the predicted (scaled) sale price
# ReLU is used for the hidden layers and sigmoid only on the output.
# The simplest alternative is a single layer:
#   Dense(1, activation='sigmoid', input_shape=(x_col_count,))
network = models.Sequential([
    layers.Dense(5, activation='relu', input_shape=(x_col_count,)),
    layers.Dense(3, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

# Mean squared error is used both as the training loss and as the reported metric,
# so the two values returned by evaluate() below are the same quantity.
network.compile(loss='mean_squared_error', optimizer='sgd', metrics=['mse'])

# Exercise: vary epochs and batch_size and watch the effect (aim for a loss < 0.001).
# - validation_split=0.2 reserves 20% of the training rows for per-epoch validation.
# - shuffle=True reshuffles the training rows every epoch.
# - batch_size sets how many rows feed each weight update, so a smaller batch
#   means more iterations per epoch.
network.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=20,
    batch_size=8,
    shuffle=True,
)

# Score the trained model on the held-out test rows.
test_loss, test_error = network.evaluate(X_test, y_test)
print('Test loss: {:.4f}'.format(test_loss))
print('Test error: {:.4f}'.format(test_error))

# If the loss / MSE is still falling from epoch to epoch, the model is still
# improving and has not started to overfit yet.

X_train shape: (951, 294)
y_train shape: (951,)
X_test shape: (408, 294)
y_test shape: (408,)
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test loss: 0.0042
Test error: 0.0042


### 4.3.2 Method 2: explicit train/validation/test split

In [6]:
# Two-stage split: first keep 60% for training, then cut the remaining 40%
# in half to obtain separate validation and test sets (20% each).
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.4)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5)
print('X_train shape: {}'.format(X_train.shape))
print('y_train shape: {}'.format(y_train.shape))
print('X_val shape: {}'.format(X_val.shape))
print('y_val shape: {}'.format(y_val.shape))
print('X_test shape: {}'.format(X_test.shape))
print('y_test shape: {}'.format(y_test.shape))

# Network layout:
#   input  : x_col_count features
#   hidden : Dense(5, relu) -> Dense(3, relu)
#   output : Dense(1, sigmoid) -- the predicted (scaled) sale price
# The simplest alternative is a single layer:
#   Dense(1, activation='sigmoid', input_shape=(x_col_count,))
network = models.Sequential([
    layers.Dense(5, activation='relu', input_shape=(x_col_count,)),
    layers.Dense(3, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

# Train on mean squared error; additionally report mean absolute error.
network.compile(loss='mean_squared_error', optimizer='sgd', metrics=['mae'])

# Exercise: vary epochs and batch_size and watch the effect (aim for a loss < 0.001).
# Validation uses the explicit (X_val, y_val) set rather than a fraction of the
# training rows; shuffle=True reshuffles the training rows every epoch.
history = network.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=8,
    shuffle=True,
)

# Score the trained model on the untouched test rows.
test_loss, test_mae = network.evaluate(X_test, y_test)
print('Test loss: {:.4f}'.format(test_loss))
print('Test mae: {:.4f}'.format(test_mae))

NameError: name 'x_data' is not defined

### 4.3.3 K-fold cross validation using scikit-learn

The following is a simple k-fold implementation. All of the data preparation above must still be run before this cell.

In [7]:
# Model used by the k-fold section below.
# Layout:
#   input  : x_col_count features
#   hidden : Dense(5, relu) -> Dense(3, relu)
#   output : Dense(1, sigmoid) -- the predicted (scaled) sale price
network2 = models.Sequential([
    layers.Dense(5, activation='relu', input_shape=(x_col_count,)),
    layers.Dense(3, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

# Train on mean squared error; additionally report mean absolute error.
network2.compile(loss='mean_squared_error', optimizer='sgd', metrics=['mae'])

NameError: name 'x_col_count' is not defined

In [8]:
from numpy import array

# K-fold cross validation with scikit-learn. API reference:
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
from sklearn.model_selection import KFold

# 5 folds; the rows are shuffled with a fixed seed so fold membership is reproducible.
# Change n_splits / random_state to see how the printed index sets change.
kf = KFold(n_splits=5, random_state=40, shuffle=True)

for i, (train_index, test_index) in enumerate(kf.split(x_data)):
    # First row index and size of each partition, for the progress print-out only.
    firstTrain = train_index[0]
    totalTrainRec = len(train_index)
    firstTest = test_index[0]
    totalTestRec = len(test_index)

    print('********************************** Running fold '+ str(i))
    print('=====Training set=======')
    print('Train set from ' + str(firstTrain) + ' with total of  ' + str(totalTrainRec))
    print(train_index)

    print('=====Testing set=======')
    print('Test set from ' + str(firstTest) + ' with total of  ' + str(totalTestRec))
    print(test_index)

    # Select this fold's rows by position.
    K_train, K_label = x_data.iloc[train_index], y_data.iloc[train_index]
    K_test, Ktest_label = x_data.iloc[test_index], y_data.iloc[test_index]

    # Start every fold from a freshly initialised copy of network2's architecture.
    # Re-using one model instance across folds would let it train on rows that
    # later serve as test rows, which makes the per-fold scores too optimistic.
    # clone_model does not carry over the compile settings, so compile again;
    # keep these arguments in sync with the network2.compile(...) call.
    fold_network = models.clone_model(network2)
    fold_network.compile(optimizer='sgd', loss='mean_squared_error', metrics=['mae'])

    fold_network.fit(K_train, K_label, epochs=5, batch_size=8)

    # Evaluate the model trained on this fold (not the `network` model from the
    # earlier sections) on the rows this fold held out.
    test_loss, test_mae = fold_network.evaluate(K_test, Ktest_label)
    print('Test error: {}'.format(test_mae))

NameError: name 'x_data' is not defined