# MidTerm Exam
Deadline : 02/06/23
Amir Mobayen

## Build and train a neural network on the dataset provided using the Tensorflow-Keras framework. You should:

In [1]:
import matplotlib.pyplot as plt
# Basic packages
import numpy as np
import pandas as pd
# Tensorflow
import tensorflow as tf
from numpy import ravel
from sklearn.compose import ColumnTransformer
# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
data = pd.read_csv("heart.csv", index_col=0)
data.head(10)

Unnamed: 0_level_0,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
57,1,0,140,192,0,1,148,0,0.4,1,0,1,1
56,0,1,140,294,0,0,153,0,1.3,1,0,2,1
44,1,1,120,263,0,1,173,0,0.0,2,0,3,1
52,1,2,172,199,1,1,162,0,0.5,2,0,3,1
57,1,2,150,168,0,1,174,0,1.6,2,0,2,1


Small explanation about the features:
age : Age of the patient
sex : Sex of the patient
cp : Chest Pain type chest pain type
Value 1: typical angina
Value 2: atypical angina
Value 3: non-anginal pain
Value 4: asymptomatic
trtbps : resting blood pressure (in mm Hg)chol : cholestoral in mg/dl fetched via BMI sensorfbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)restecg :

resting electrocardiographic results
Value 0: normal
Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
thalachh : maximum heart rate achievedexng : exercise induced angina (1 = yes; 0 = no)oldpeak : ST depression induced by exercise relative to restslp : the slope of the peak exercise ST segment (2 = upsloping; 1 = flat; 0 = downsloping)caa : number of major vessels (0-3)thall : 2 = normal; 1 = fixed defect; 3 = reversable defectTarget:
output : 0= less chance of heart attack 1= more chance of heart attack

In [3]:
# make a copy in order not to touch the original file
data_set = data.copy()

# clean input data:
# remove null values on numerical columns
data_set = data_set.fillna(data_set.median())

# remove duplicates
data_set = data_set.drop_duplicates()

# separate features and label
y = data_set.pop('output')
X = data_set
y = y.to_numpy().reshape(-1, 1)

X.shape, y.shape

((302, 12), (302, 1))

### Create a test set:
- for a regression task: use 20% of the data and set the random seed to 42
- for a classification task: use 20% of the data, set the random seed to 42 and use a stratified splitting method.

In [4]:
def split(X, y, stratify=False):
    if stratify:
        train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    else:
        train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)
    train_X, val_X, train_y, val_y = train_test_split(train_X, train_y, test_size=0.1 / (1 - 0.2), random_state=42)
    print(train_X.shape, train_y.shape, test_X.shape, test_y.shape, val_X.shape, val_y.shape)
    return train_X, train_y, test_X, test_y, val_X, val_y

In [5]:
# Regression Task:
reg_train_X, reg_train_y, reg_test_X, reg_test_y, reg_val_X, reg_val_y = split(X, y, False)

# classification Task:
class_train_X, class_train_y, class_test_X, class_test_y, class_val_X, class_val_y = split(X, y, True)

(210, 12) (210, 1) (61, 12) (61, 1) (31, 12) (31, 1)
(210, 12) (210, 1) (61, 12) (61, 1) (31, 12) (31, 1)


### Preprocess the data:
While preprocessing is not the subject of this exam, wrong/poor preprocessing steps will be sanctioned.

In [6]:
def preprocess_data(train_X, train_y, test_X, test_y, val_X, val_y):
    #  performs data preprocessing on the provided training, testing, and validation datasets.
    # standardize numerical features
    std = StandardScaler()
    lbl = LabelEncoder()

    # Identifies the numerical columns in the training dataset.
    num_col = train_X.select_dtypes(include='number').columns

    # applies the standardization to the numerical columns.
    ct = ColumnTransformer([
        ('num', std, num_col),
    ])

    # Standardizing the Features
    X_train_scl = ct.fit_transform(train_X)
    X_val_scl = ct.transform(val_X)
    X_test_scl = ct.transform(test_X)

    # Encoding the Target Variable
    y_train_scl = lbl.fit_transform(ravel(train_y))
    y_val_scl = lbl.transform(ravel(val_y))
    y_test_scl = lbl.transform(ravel(test_y))

    return X_train_scl, y_train_scl, X_test_scl, y_test_scl, X_val_scl, y_val_scl


reg_train_X, reg_train_y, reg_test_X, reg_test_y, reg_val_X, reg_val_y = preprocess_data(reg_train_X, reg_train_y,
                                                                                         reg_test_X, reg_test_y,
                                                                                         reg_val_X, reg_val_y)

class_train_X, class_train_y, class_test_X, class_test_y, class_val_X, class_val_y = preprocess_data(class_train_X,
                                                                                                     class_train_y,
                                                                                                     class_test_X,
                                                                                                     class_test_y,
                                                                                                     class_val_X,
                                                                                                     class_val_y)

In [7]:
CLASSIFICATION = True
if CLASSIFICATION:
    X_train_scl, y_train_scl, X_test_scl, y_test_scl, X_val_scl, y_val_scl = class_train_X, class_train_y, class_test_X, class_test_y, class_val_X, class_val_y
else:
    X_train_scl, y_train_scl, X_test_scl, y_test_scl, X_val_scl, y_val_scl = reg_train_X, reg_train_y, reg_test_X, reg_test_y, reg_val_X, reg_val_y

In [8]:
# Measuring Execution Time
from datetime import datetime


def measure_execution_time(func):
    def wrapper(*args, **kwargs):
        start_time = datetime.now()
        result = func(*args, **kwargs)
        end_time = datetime.now()
        elapsed_time = end_time - start_time
        print(f"Elapsed time: {elapsed_time.total_seconds()} seconds")
        return result

    return wrapper

# STUDY

In [9]:
regression_looses = [tf.keras.losses.mean_squared_error,
                     # Useful when outliers are rare but need to be taken into account
                     tf.keras.losses.mean_absolute_error,  # Useful when outliers are common,when the data is very noisy
                     tf.keras.losses.huber,
                     # Useful when outliers exist, but most of the data follows a normal distribution
                     tf.keras.losses.log_cosh,
                     # Useful when outliers exist, but most of the data follows a normal distribution
                     ]
# This is a classification problem (goal is yes or no) so using regression_looses is useless. I will keep it just for a small test if my prediction with classification was not correct !

# We will not use (Sparse) Categorical Cross-Entropy -> it's for more than two output classes

binary_loss = [tf.keras.losses.BinaryCrossentropy(),  # binary classification problems
               tf.keras.losses.hinge,  # Useful for binary classification problems with outliers or unbalanced classes
               ]

kernel_initializer = ['glorot', 'he_normal', 'lecun']
activation_function = ['relu', 'leaky_relu', 'rrelu', 'prelu', 'elu', 'selu']

optimizers = [tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.01, nesterov=True),  # Good for quallity
              tf.keras.optimizers.RMSprop,  # more robust than adagrad
              tf.keras.optimizers.Adam(learning_rate=0.01)  # combination of RMSprop and SGD -> default
              ]


# leaky relu is better than relu
# selu -> maximize out put but slow

several solutions to the vanishing and exploding gradient problem in deep neural networks:
- Using a better weight initialization method
- Using alternative activation functions
- Using gradient clipping methods
- Using batch normalization
- Building networks with skip connections
- Creating new types of neurons

In [10]:
# Glorot initialization is not good with RELU Activation!
# LeCun initialization is generally used in convolutional neural networks and paired with the SELU

# relu and its variant -> he
# selu -> lecun
# rest -> glorot

## Methods to prevent overfitting
# EARLY STOPPING
# REGULARIZATION:
# penalty term encourages the model to learn simpler and more generalizable patterns by adding a cost to more complex models.
# Drop out
# tf.keras.layers.Dense(16, kernel_initializer='he_normal', activation='relu')

# Building Model

1- Early Stop is going to use in all models -> for better estimation and reducing the number of repititions.
2- Epochs -> consider 100 which is seems to be too much for 300 rows.
Update -> 100 seems not enough because early stop didnt active and chart still have space to go! -> try 200

In [11]:
@measure_execution_time
def build_model(model, epochs, loss, optimizer, metric, batch_size=32, save=False):
    tf.keras.backend.clear_session()
    early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)
    model.compile(loss=loss,
                  optimizer=optimizer,
                  metrics=[metric])
    model.summary()
    if save:
        model.save('model')
    history = model.fit(X_train_scl, y_train_scl,
                        batch_size=batch_size,
                        epochs=epochs,
                        validation_data=(X_val_scl, y_val_scl),
                        callbacks=[early_stopping_cb])
    return history, early_stopping_cb

__PERFORMANCE MODEL__

In [12]:
batch_size = 32  # Large: speed and faster, small: generalization
epochs = 200
hidden_layer_unit = 8  # Total number of 300 record! First try is 8!
hidden_layer_activation = 'relu'  # This is a hidden layer, Relu is good choice for this section

model1 = tf.keras.Sequential([
    tf.keras.Input(shape=(X_train_scl.shape[1],)),
    tf.keras.layers.Dense(units=hidden_layer_unit, activation=hidden_layer_activation),
    tf.keras.layers.Dense(units=hidden_layer_unit, activation=hidden_layer_activation),
    tf.keras.layers.Dense(units=1, activation='sigmoid')
    # This is classification output -> Sigmoid or softmax are the best chooses
])

loss = tf.keras.losses.BinaryCrossentropy()
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
metric = tf.keras.metrics.BinaryAccuracy()

history1 ,early_stop1 = build_model(model1, epochs, loss, optimizer, metric ,batch_size)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 8)                 104       
                                                                 
 dense_1 (Dense)             (None, 8)                 72        
                                                                 
 dense_2 (Dense)             (None, 1)                 9         
                                                                 
Total params: 185
Trainable params: 185
Non-trainable params: 0
_________________________________________________________________
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 

In [13]:
batch_size = 16  # Large: speed and faster, small: generalization
epochs = 200
hidden_layer_unit = 16  # Total number of 300 record! First try is 8!
hidden_layer_activation = 'relu'  # This is a hidden layer, Relu is good choice for this section

model2 = tf.keras.Sequential([
    tf.keras.Input(shape=(X_train_scl.shape[1],)),
    tf.keras.layers.Dense(units=hidden_layer_unit, activation=hidden_layer_activation),
    tf.keras.layers.Dense(units=hidden_layer_unit, activation=hidden_layer_activation),
    tf.keras.layers.Dense(units=1, activation='sigmoid')
    # This is classification output -> Sigmoid or softmax are the best chooses
])

loss = tf.keras.losses.BinaryCrossentropy()
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
metric = tf.keras.metrics.BinaryAccuracy()

history2 ,early_stop2 = build_model(model2, epochs, loss, optimizer, metric ,batch_size)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 16)                208       
                                                                 
 dense_1 (Dense)             (None, 16)                272       
                                                                 
 dense_2 (Dense)             (None, 1)                 17        
                                                                 
Total params: 497
Trainable params: 497
Non-trainable params: 0
_________________________________________________________________
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 

In [16]:
batch_size = 8  # Large: speed and faster, small: generalization
epochs = 200
hidden_layer_unit = 32
hidden_layer_activation = 'selu'

model_3 = tf.keras.Sequential([
    tf.keras.Input(shape=(X_train_scl.shape[1],)),

    tf.keras.layers.Dense(units=hidden_layer_unit, kernel_initializer=tf.initializers.LecunNormal(),
                           kernel_regularizer=tf.keras.regularizers.l1_l2()),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Activation(hidden_layer_activation),

    tf.keras.layers.Dense(units=hidden_layer_unit, kernel_initializer=tf.initializers.LecunNormal(),
                          kernel_regularizer=tf.keras.regularizers.l1_l2()),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Activation(hidden_layer_activation),

    tf.keras.layers.Dense(units=1, activation='sigmoid')
])

loss = tf.keras.losses.BinaryCrossentropy()
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.01, nesterov=True)
metric = tf.keras.metrics.BinaryAccuracy()

history_3, early_stop_3 = build_model(model_3, epochs, loss, optimizer, metric,
                                                          batch_size)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 32)                416       
                                                                 
 batch_normalization (BatchN  (None, 32)               128       
 ormalization)                                                   
                                                                 
 activation (Activation)     (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 32)                1056      
                                                                 
 batch_normalization_1 (Batc  (None, 32)               128       
 hNormalization)                                                 
                                                                 
 activation_1 (Activation)   (None, 32)                0

__Speed Model__

In [15]:
batch_size = 32  # Large: speed and faster, small: generalization
epochs = 200
hidden_layer_unit = 32
hidden_layer_activation = 'leaky_relu'

model_best = tf.keras.Sequential([
    tf.keras.Input(shape=(X_train_scl.shape[1],)),
    tf.keras.layers.Dense(units=16, activation=hidden_layer_activation,
                          kernel_initializer=tf.initializers.HeNormal()),
    tf.keras.layers.Dense(units=16, activation=hidden_layer_activation,
                          kernel_initializer=tf.initializers.HeNormal()),
    tf.keras.layers.Dense(units=1, activation='sigmoid')
])

loss = tf.keras.losses.BinaryCrossentropy()
optimizer = tf.keras.optimizers.Adam()
metric = tf.keras.metrics.BinaryAccuracy()

history_best, early_stop_speed = build_model(model_best, epochs, loss, optimizer, metric, batch_size)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 16)                208       
                                                                 
 dense_1 (Dense)             (None, 16)                272       
                                                                 
 dense_2 (Dense)             (None, 1)                 17        
                                                                 
Total params: 497
Trainable params: 497
Non-trainable params: 0
_________________________________________________________________
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 

And this model seems to have a good balance between all the provided models

### (Bonus point) Save your model.


The Code is implemented in build_model function as model.save()
this code is same as tf.keras.Model.save
for loading that we call tf.keras.models.load_model('saved_model/my_model') function