Import Libraries

In [1]:
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import regularizers

!pip install -q git+https://github.com/tensorflow/docs
import tensorflow_docs as tfdocs
import tensorflow_docs.modeling
import tensorflow_docs.plots

  Building wheel for tensorflow-docs (setup.py) ... [?25l[?25hdone


Load and prepare dataset

In [2]:
gz = tf.keras.utils.get_file('HIGGS.csv.gz', 'http://mlphysics.ics.uci.edu/data/higgs/HIGGS.csv.gz')

Downloading data from http://mlphysics.ics.uci.edu/data/higgs/HIGGS.csv.gz


In [3]:
FEATURES = 28
ds = tf.data.experimental.CsvDataset(gz,[float(),]*(FEATURES+1), compression_type="GZIP")

def pack_row(*row):
  label = row[0]
  features = tf.stack(row[1:],1)
  return features, label

packed_ds = ds.batch(10000).map(pack_row).unbatch()

In [4]:
N_VALIDATION = int(1e3)
N_TRAIN = int(1e4)
BUFFER_SIZE = int(1e4)
BATCH_SIZE = 500
STEPS_PER_EPOCH = N_TRAIN//BATCH_SIZE

In [5]:
validate_ds = packed_ds.take(N_VALIDATION).cache()
train_ds = packed_ds.skip(N_VALIDATION).take(N_TRAIN).cache()
validate_ds = validate_ds.batch(BATCH_SIZE)
train_ds = train_ds.shuffle(BUFFER_SIZE).repeat().batch(BATCH_SIZE)

Simple model

In [6]:
model = tf.keras.Sequential([
    layers.Dense(16, activation='elu', input_shape=(FEATURES,)),
    layers.Dense(1)
])

In [7]:
def get_callbacks():
  return [
    tfdocs.modeling.EpochDots(),
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=200),
  ]

In [8]:
def compile_and_fit(model, max_epochs=10000):
  model.compile(optimizer="adam",
                loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                metrics=[
                  tf.keras.losses.BinaryCrossentropy(
                      from_logits=True, name='binary_crossentropy'),
                  'accuracy'])

  model.summary()

  history = model.fit(
    train_ds,
    steps_per_epoch = STEPS_PER_EPOCH,
    epochs=max_epochs,
    validation_data=validate_ds,
    callbacks=get_callbacks(),
    verbose=0)
  return history

In [9]:
compile_and_fit(model)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 16)                464       
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 481
Trainable params: 481
Non-trainable params: 0
_________________________________________________________________

Epoch: 0, accuracy:0.5125,  binary_crossentropy:0.8545,  loss:0.8545,  val_accuracy:0.4890,  val_binary_crossentropy:0.7722,  val_loss:0.7722,  
....................................................................................................
Epoch: 100, accuracy:0.5949,  binary_crossentropy:0.6267,  loss:0.6267,  val_accuracy:0.5570,  val_binary_crossentropy:0.6387,  val_loss:0.6387,  
............................................................

<keras.callbacks.History at 0x7f11301e9150>

# Question 3

define a deeper model which has more capacity, because training accuracy is low and seems underfited.

In [11]:
model = tf.keras.Sequential([
    layers.Dense(256, activation='elu', input_shape=(FEATURES,)),
    layers.Dense(256, activation='elu'),
    layers.Dense(256, activation='elu'),
    layers.Dense(256, activation='elu'),
    layers.Dense(256, activation='elu'),
    layers.Dense(1)
])
compile_and_fit(model)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_8 (Dense)             (None, 256)               7424      
                                                                 
 dense_9 (Dense)             (None, 256)               65792     
                                                                 
 dense_10 (Dense)            (None, 256)               65792     
                                                                 
 dense_11 (Dense)            (None, 256)               65792     
                                                                 
 dense_12 (Dense)            (None, 256)               65792     
                                                                 
 dense_13 (Dense)            (None, 1)                 257       
                                                                 
Total params: 270,849
Trainable params: 270,849
Non-tr

<keras.callbacks.History at 0x7f10c0c10a90>

we have overfitting obviously. because we have 100 percent accuracy on training and a much less accuracy on validation data. the model stopped after only 200 epochs because it hasn't progressed in validation loss. so as explained in the article, we can use weight regularizer and dropout to prevent overfitting.

In [12]:
model = tf.keras.Sequential([
    layers.Dense(256, activation='elu', input_shape=(FEATURES,), kernel_regularizer='l2'),
    layers.Dropout(0.5),
    layers.Dense(256, activation='elu', kernel_regularizer='l2'),
    layers.Dropout(0.5), 
    layers.Dense(256, activation='elu', kernel_regularizer='l2'),
    layers.Dropout(0.4),
    layers.Dense(256, activation='elu', kernel_regularizer='l2'),
    layers.Dropout(0.4),
    layers.Dense(256, activation='elu', kernel_regularizer='l2'),
    layers.Dropout(0.3),
    layers.Dense(1)
])
compile_and_fit(model, max_epochs=1000)

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_14 (Dense)            (None, 256)               7424      
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_15 (Dense)            (None, 256)               65792     
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense_16 (Dense)            (None, 256)               65792     
                                                                 
 dropout_2 (Dropout)         (None, 256)               0         
                                                                 
 dense_17 (Dense)            (None, 256)              

<keras.callbacks.History at 0x7f10c09c4310>

but we can see the model didn't work perfect. maybe units of each layer is not enough due to dropout probabilities. 

In [13]:
model = tf.keras.Sequential([
    layers.Dense(512, activation='elu', input_shape=(FEATURES,), kernel_regularizer='l2'),
    layers.Dropout(0.5),
    layers.Dense(512, activation='elu', kernel_regularizer='l2'),
    layers.Dropout(0.5), 
    layers.Dense(512, activation='elu', kernel_regularizer='l2'),
    layers.Dropout(0.4),
    layers.Dense(512, activation='elu', kernel_regularizer='l2'),
    layers.Dropout(0.4),
    layers.Dense(512, activation='elu', kernel_regularizer='l2'),
    layers.Dropout(0.3),
    layers.Dense(1)
])
compile_and_fit(model, max_epochs=1000)

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_20 (Dense)            (None, 512)               14848     
                                                                 
 dropout_5 (Dropout)         (None, 512)               0         
                                                                 
 dense_21 (Dense)            (None, 512)               262656    
                                                                 
 dropout_6 (Dropout)         (None, 512)               0         
                                                                 
 dense_22 (Dense)            (None, 512)               262656    
                                                                 
 dropout_7 (Dropout)         (None, 512)               0         
                                                                 
 dense_23 (Dense)            (None, 512)              

<keras.callbacks.History at 0x7f10c0715e50>

the model improvement is not satisfying. maybe that's because the droput probability is high.

In [15]:
model = tf.keras.Sequential([
    layers.Dense(512, activation='elu', input_shape=(FEATURES,), kernel_regularizer='l2'),
    layers.Dropout(0.2),
    layers.Dense(512, activation='elu', kernel_regularizer='l2'),
    layers.Dropout(0.2), 
    layers.Dense(512, activation='elu', kernel_regularizer='l2'),
    layers.Dropout(0.2),
    layers.Dense(512, activation='elu', kernel_regularizer='l2'),
    layers.Dropout(0.2),
    layers.Dense(512, activation='elu', kernel_regularizer='l2'),
    layers.Dropout(0.2),
    layers.Dense(1)
])
compile_and_fit(model, max_epochs=1000)

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_32 (Dense)            (None, 512)               14848     
                                                                 
 dropout_15 (Dropout)        (None, 512)               0         
                                                                 
 dense_33 (Dense)            (None, 512)               262656    
                                                                 
 dropout_16 (Dropout)        (None, 512)               0         
                                                                 
 dense_34 (Dense)            (None, 512)               262656    
                                                                 
 dropout_17 (Dropout)        (None, 512)               0         
                                                                 
 dense_35 (Dense)            (None, 512)              

<keras.callbacks.History at 0x7f10c0710fd0>

we didn,t have big difference here. but again we can't reach the accuracy of the first simple model. maybe our model is very complex. so we decrease number of hidden layers, and increase the probability of dropout again.

In [16]:
model = tf.keras.Sequential([
    layers.Dense(512, activation='elu', input_shape=(FEATURES,), kernel_regularizer='l2'),
    layers.Dropout(0.2),
    layers.Dense(512, activation='elu', kernel_regularizer='l2'),
    layers.Dropout(0.2), 
    layers.Dense(512, activation='elu', kernel_regularizer='l2'),
    layers.Dropout(0.2),
    layers.Dense(1)
])
compile_and_fit(model, max_epochs=1000)

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_38 (Dense)            (None, 512)               14848     
                                                                 
 dropout_20 (Dropout)        (None, 512)               0         
                                                                 
 dense_39 (Dense)            (None, 512)               262656    
                                                                 
 dropout_21 (Dropout)        (None, 512)               0         
                                                                 
 dense_40 (Dense)            (None, 512)               262656    
                                                                 
 dropout_22 (Dropout)        (None, 512)               0         
                                                                 
 dense_41 (Dense)            (None, 1)                

<keras.callbacks.History at 0x7f10b637d750>

we have the problem of underfitting just like before models. as we decrease the dropout rate and it didnt help, we can decrease alpha of kernel regularizer to see what happens.

reducing number of hidden layers didn't have a big effect on results. so we can get them back.

In [17]:
model = tf.keras.Sequential([
    layers.Dense(512, activation='elu', input_shape=(FEATURES,), kernel_regularizer=regularizers.L2(0.0001)),
    layers.Dropout(0.2),
    layers.Dense(512, activation='elu', kernel_regularizer=regularizers.L2(0.0001)),
    layers.Dropout(0.2), 
    layers.Dense(512, activation='elu', kernel_regularizer=regularizers.L2(0.0001)),
    layers.Dropout(0.2),
    layers.Dense(512, activation='elu', kernel_regularizer=regularizers.L2(0.0001)),
    layers.Dropout(0.2),
    layers.Dense(512, activation='elu', kernel_regularizer=regularizers.L2(0.0001)),
    layers.Dropout(0.2),
    layers.Dense(1)
])
compile_and_fit(model, max_epochs=1000)

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_42 (Dense)            (None, 512)               14848     
                                                                 
 dropout_23 (Dropout)        (None, 512)               0         
                                                                 
 dense_43 (Dense)            (None, 512)               262656    
                                                                 
 dropout_24 (Dropout)        (None, 512)               0         
                                                                 
 dense_44 (Dense)            (None, 512)               262656    
                                                                 
 dropout_25 (Dropout)        (None, 512)               0         
                                                                 
 dense_45 (Dense)            (None, 512)              

<keras.callbacks.History at 0x7f10ba189d90>

the model overfitted. we can use another kernel regularizer and we can increase the dropout rate a little.

a combination of L2 and L1 can be reasonable.

In [20]:
model = tf.keras.Sequential([
    layers.Dense(512, activation='elu', input_shape=(FEATURES,), kernel_regularizer=regularizers.L1L2(0.0001)),
    layers.Dropout(0.3),
    layers.Dense(512, activation='elu', kernel_regularizer=regularizers.L1L2(0.0001)),
    layers.Dropout(0.3), 
    layers.Dense(512, activation='elu', kernel_regularizer=regularizers.L1L2(0.0001)),
    layers.Dropout(0.3),
    layers.Dense(512, activation='elu', kernel_regularizer=regularizers.L1L2(0.0001)),
    layers.Dropout(0.3),
    layers.Dense(512, activation='elu', kernel_regularizer=regularizers.L1L2(0.0001)),
    layers.Dropout(0.3),
    layers.Dense(1)
])
compile_and_fit(model, max_epochs=1000)

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_60 (Dense)            (None, 512)               14848     
                                                                 
 dropout_38 (Dropout)        (None, 512)               0         
                                                                 
 dense_61 (Dense)            (None, 512)               262656    
                                                                 
 dropout_39 (Dropout)        (None, 512)               0         
                                                                 
 dense_62 (Dense)            (None, 512)               262656    
                                                                 
 dropout_40 (Dropout)        (None, 512)               0         
                                                                 
 dense_63 (Dense)            (None, 512)             

<keras.callbacks.History at 0x7f10aff33f90>

this isn't a bad result. this time we try to get better results by changing our optimizer. as we learned that sometimes SGD can achieve better results than Adam.

In [22]:
def compile_and_fit(model, max_epochs=10000):
  model.compile(optimizer=tf.keras.optimizers.SGD(),
                loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                metrics=[
                  tf.keras.losses.BinaryCrossentropy(
                      from_logits=True, name='binary_crossentropy'),
                  'accuracy'])

  model.summary()

  history = model.fit(
    train_ds,
    steps_per_epoch = STEPS_PER_EPOCH,
    epochs=max_epochs,
    validation_data=validate_ds,
    callbacks=get_callbacks(),
    verbose=0)
  return history

model = tf.keras.Sequential([
    layers.Dense(512, activation='elu', input_shape=(FEATURES,), kernel_regularizer=regularizers.L1L2(0.0001)),
    layers.Dropout(0.3),
    layers.Dense(512, activation='elu', kernel_regularizer=regularizers.L1L2(0.0001)),
    layers.Dropout(0.3), 
    layers.Dense(512, activation='elu', kernel_regularizer=regularizers.L1L2(0.0001)),
    layers.Dropout(0.3),
    layers.Dense(512, activation='elu', kernel_regularizer=regularizers.L1L2(0.0001)),
    layers.Dropout(0.3),
    layers.Dense(512, activation='elu', kernel_regularizer=regularizers.L1L2(0.0001)),
    layers.Dropout(0.3),
    layers.Dense(1)
])

compile_and_fit(model, max_epochs=1000)

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_66 (Dense)            (None, 512)               14848     
                                                                 
 dropout_43 (Dropout)        (None, 512)               0         
                                                                 
 dense_67 (Dense)            (None, 512)               262656    
                                                                 
 dropout_44 (Dropout)        (None, 512)               0         
                                                                 
 dense_68 (Dense)            (None, 512)               262656    
                                                                 
 dropout_45 (Dropout)        (None, 512)               0         
                                                                 
 dense_69 (Dense)            (None, 512)             

<keras.callbacks.History at 0x7f10afa011d0>

you can see the results where better using adam optimizer. so we don't change that.

using elu the value of layer doesn't get high but with relu they can. so mabe that can increase our accuracy.

In [31]:
def compile_and_fit(model, max_epochs=10000):
  model.compile(optimizer="adam",
                loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                metrics=[
                  tf.keras.losses.BinaryCrossentropy(
                      from_logits=True, name='binary_crossentropy'),
                  'accuracy'])

  model.summary()

  history = model.fit(
    train_ds,
    steps_per_epoch = STEPS_PER_EPOCH,
    epochs=max_epochs,
    validation_data=validate_ds,
    callbacks=get_callbacks(),
    verbose=0)
  return history

model = tf.keras.Sequential([
    layers.Dense(512, activation='relu', input_shape=(FEATURES,), kernel_regularizer=regularizers.L1L2(0.0001)),
    layers.Dropout(0.3),
    layers.Dense(512, activation='relu', kernel_regularizer=regularizers.L1L2(0.0001)),
    layers.Dropout(0.3), 
    layers.Dense(512, activation='relu', kernel_regularizer=regularizers.L1L2(0.0001)),
    layers.Dropout(0.3),
    layers.Dense(512, activation='relu', kernel_regularizer=regularizers.L1L2(0.0001)),
    layers.Dropout(0.3),
    layers.Dense(512, activation='relu', kernel_regularizer=regularizers.L1L2(0.0001)),
    layers.Dropout(0.3),
    layers.Dense(1)
])

compile_and_fit(model, max_epochs=1000)

Model: "sequential_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_88 (Dense)            (None, 512)               14848     
                                                                 
 dropout_60 (Dropout)        (None, 512)               0         
                                                                 
 dense_89 (Dense)            (None, 512)               262656    
                                                                 
 dropout_61 (Dropout)        (None, 512)               0         
                                                                 
 dense_90 (Dense)            (None, 512)               262656    
                                                                 
 dropout_62 (Dropout)        (None, 512)               0         
                                                                 
 dense_91 (Dense)            (None, 512)             

<keras.callbacks.History at 0x7f10bdb02d50>

training accuracy improved. but the model overfitted. so we can increase the rate of dropout again :)

In [34]:
model = tf.keras.Sequential([
    layers.Dense(512, activation='relu', input_shape=(FEATURES,), kernel_regularizer=regularizers.L1L2(0.0001)),
    layers.Dropout(0.5),
    layers.Dense(512, activation='relu', kernel_regularizer=regularizers.L1L2(0.0001)),
    layers.Dropout(0.5), 
    layers.Dense(512, activation='relu', kernel_regularizer=regularizers.L1L2(0.0001)),
    layers.Dropout(0.5),
    layers.Dense(512, activation='relu', kernel_regularizer=regularizers.L1L2(0.0001)),
    layers.Dropout(0.5),
    layers.Dense(512, activation='relu', kernel_regularizer=regularizers.L1L2(0.0001)),
    layers.Dropout(0.5),
    layers.Dense(1)
])

compile_and_fit(model, max_epochs=1000)

Model: "sequential_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_106 (Dense)           (None, 512)               14848     
                                                                 
 dropout_75 (Dropout)        (None, 512)               0         
                                                                 
 dense_107 (Dense)           (None, 512)               262656    
                                                                 
 dropout_76 (Dropout)        (None, 512)               0         
                                                                 
 dense_108 (Dense)           (None, 512)               262656    
                                                                 
 dropout_77 (Dropout)        (None, 512)               0         
                                                                 
 dense_109 (Dense)           (None, 512)             

<keras.callbacks.History at 0x7f10af78a550>

as we overfit again, maybe change the kernel regularizer to L1 sill help us because it is stronger than L2.

In [38]:
model = tf.keras.Sequential([
    layers.Dense(512, activation='relu', input_shape=(FEATURES,), kernel_regularizer=regularizers.L1(0.0001)),
    layers.Dropout(0.5),
    layers.Dense(512, activation='relu', kernel_regularizer=regularizers.L1(0.0001)),
    layers.Dropout(0.5), 
    layers.Dense(512, activation='relu', kernel_regularizer=regularizers.L1(0.0001)),
    layers.Dropout(0.5),
    layers.Dense(512, activation='relu', kernel_regularizer=regularizers.L1(0.0001)),
    layers.Dropout(0.5),
    layers.Dense(512, activation='relu', kernel_regularizer=regularizers.L1(0.0001)),
    layers.Dropout(0.5),
    layers.Dense(1)
])

compile_and_fit(model, max_epochs=1000)

Model: "sequential_22"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_130 (Dense)           (None, 512)               14848     
                                                                 
 dropout_95 (Dropout)        (None, 512)               0         
                                                                 
 dense_131 (Dense)           (None, 512)               262656    
                                                                 
 dropout_96 (Dropout)        (None, 512)               0         
                                                                 
 dense_132 (Dense)           (None, 512)               262656    
                                                                 
 dropout_97 (Dropout)        (None, 512)               0         
                                                                 
 dense_133 (Dense)           (None, 512)             

<keras.callbacks.History at 0x7f10ac2b3850>

we can feel that with L1 the model can't converge well. so we use L2 again.

we get bach to elu activation function because with relu the model overfitting probability increases.

we saw that with these parameteres the model acted well but I think that a deeper model can be better here.

In [39]:
model = tf.keras.Sequential([
    layers.Dense(512, activation='elu', input_shape=(FEATURES,), kernel_regularizer=regularizers.L2(0.0001)),
    layers.Dropout(0.5),
    layers.Dense(512, activation='elu', kernel_regularizer=regularizers.L2(0.0001)),
    layers.Dropout(0.5), 
    layers.Dense(512, activation='elu', kernel_regularizer=regularizers.L2(0.0001)),
    layers.Dropout(0.5),
    layers.Dense(512, activation='elu', kernel_regularizer=regularizers.L2(0.0001)),
    layers.Dropout(0.5),
    layers.Dense(512, activation='elu', kernel_regularizer=regularizers.L2(0.0001)),
    layers.Dropout(0.5),
    layers.Dense(512, activation='elu', kernel_regularizer=regularizers.L2(0.0001)),
    layers.Dropout(0.5),
    layers.Dense(512, activation='elu', kernel_regularizer=regularizers.L2(0.0001)),
    layers.Dropout(0.5),
    layers.Dense(1)
])

compile_and_fit(model, max_epochs=1000)

Model: "sequential_23"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_136 (Dense)           (None, 512)               14848     
                                                                 
 dropout_100 (Dropout)       (None, 512)               0         
                                                                 
 dense_137 (Dense)           (None, 512)               262656    
                                                                 
 dropout_101 (Dropout)       (None, 512)               0         
                                                                 
 dense_138 (Dense)           (None, 512)               262656    
                                                                 
 dropout_102 (Dropout)       (None, 512)               0         
                                                                 
 dense_139 (Dense)           (None, 512)             

<keras.callbacks.History at 0x7f10ac2a6b50>

this seems the best results we had. 