## Step 1.  Import libraries

In [1]:
import numpy as np
import pandas as pd 
import tensorflow as tf
from sklearn import preprocessing

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Step 2. Extract the data from the csv

In [2]:
raw_csv_data = np.loadtxt('data.csv',delimiter=',')
# excludng the ID and the targets from dataset 
unscaled_inputs_all = raw_csv_data[:,1:-1] 
target_all = raw_csv_data[:,-1]

## Step 3.  Balancing the dataset 

In [3]:
num_one_targets = int(np.sum(target_all))

print("Shape (numbre of line) :", target_all.shape[0] )
print("the number of targets that has One : ", num_one_targets)
print("the number of targets that has Zero :", target_all.shape[0] - num_one_targets)

zero_targets_counter = 0
indices_to_remove = []

for i in range( target_all.shape[0] ):
    if target_all[i] == 0:
        zero_targets_counter += 1
        if zero_targets_counter >  num_one_targets:
            indices_to_remove.append(i)
        
print(" AFTER BALANCING : ")            


unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis = 0)
targets_equal_priors = np.delete(target_all, indices_to_remove, axis=0)

print("the number of targets that has One :", int(sum(targets_equal_priors)))
print("the number of targets that has One :", targets_equal_priors.shape[0] - int(sum(targets_equal_priors)) )

Shape (numbre of line) : 14084
the number of targets that has One :  2237
the number of targets that has Zero : 11847
 AFTER BALANCING : 
the number of targets that has One : 2237
the number of targets that has One : 2237


## Step 4. Standardize the inputs 

In [4]:
scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors)

## Step 5. Shuffle the data  

In [5]:
shuffled_indices = np.arange(scaled_inputs.shape[0])

print(" before shuffle :", shuffled_indices)
np.random.shuffle(shuffled_indices)

print(" after shuffle  :" , shuffled_indices)
shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_equal_priors[shuffled_indices]

 before shuffle : [   0    1    2 ... 4471 4472 4473]
 after shuffle  : [1073  334 4314 ... 3835 1533 1223]


## Step 6.  Split the dataset into train, validation and test  

In [6]:
samples_count = shuffled_inputs.shape[0]

print(samples_count)

train_sample_count = int(0.8*samples_count)
validation_sample_count = int(0.1*samples_count)
test_sample_count = samples_count - train_sample_count -validation_sample_count

# let's extract them from the dataset 

# train 
train_inputs = shuffled_inputs[:train_sample_count]
train_targets = shuffled_targets[:train_sample_count]

# validation
validation_inputs = shuffled_inputs[ train_sample_count:train_sample_count + validation_sample_count ]
validation_targets = shuffled_targets[ train_sample_count:train_sample_count + validation_sample_count ]

# test 
test_inputs = shuffled_inputs[ train_sample_count+validation_sample_count:]
test_targets = shuffled_targets[ train_sample_count+validation_sample_count:]


print(np.sum(train_targets),train_sample_count,np.sum(train_targets)/train_sample_count)
print(np.sum(validation_targets),validation_sample_count,np.sum(validation_targets)/validation_sample_count)
print(np.sum(test_targets),test_sample_count,np.sum(test_targets)/test_sample_count)
# All three sets are balanced

4474
1777.0 3579 0.4965074043028779
211.0 447 0.4720357941834452
249.0 448 0.5558035714285714


## Step 7.  Save the three datasets *.npz 

In [7]:
np.savez('data_train', input= train_inputs, target=train_targets)
np.savez('data_validation', input= validation_inputs, target=validation_targets)
np.savez('data_test', input= test_inputs, target=test_targets)

## Step 8. Create a class that will do the batching

In [8]:
class Reader():
    
   
    def __init__(self,dataset,batch_size = None):

        npz = np.load('data_{0}.npz'.format(dataset) )
        self.inputs, self.targets = npz['input'].astype(np.float), npz['target'].astype(np.int)
        
        if batch_size is None:
            self.batch_size = self.inputs.shape[0]
        else:
            self.batch_size = batch_size
        self.curr_batch = 0
        self.batch_count = self.inputs.shape[0] // self.batch_size
    
    
    
    # A method which loads the next batch 
    # the next function slices the next batch out of the dataset and load it 
    def __next__(self):
        
        if self.curr_batch >= self.batch_count:
            self.curr_batch = 0
            raise StopIteration()
        
        # You slice the dataset in batches and then the "next" function loads them one after other 
        batch = slice(self.curr_batch * self.batch_size ,(self.curr_batch + 1) * self.batch_size)
        inputs_batch = self.inputs[batch] 
        targets_batch = self.targets[batch]
        self.curr_batch += 1
  
        
        # the function will return the inputs batch and the targets batch 
        # return inputs_batch, targets_one_hot
        return inputs_batch, targets_batch
    
    # A method needed for iterating over the batches, as we will put them in a loop
    # This tells python that the class we're definig is iterable, i.e that we can  use it 
    def __iter__(self):
        return self 
        

## Step 9. Create the machine learning algorithm   

### Step 9.1.  

In [9]:
input_size = 10
output_size = 2 # such we have 2 input 
hidden_layer_size = 50 # hyperparameter

# To clear the defined variables and operations of the previous cell
tf.reset_default_graph()

### Step 9.2. 

In [10]:
#  A placeholder is simply a variable that we will assign data to at a later date. 
#  It allows us to create our operations and build our computation graph, 
#  without needing the data. In TensorFlow terminology, 
#  we then feed data into the graph through these placeholders.
#  place in memory where we will store value later on
inputs = tf.placeholder(tf.float32, [None, input_size])
targets = tf.placeholder(tf.int32, [None, output_size])

### Step 9.3.  

In [11]:

### Hiden layer 1 ###
weights_1 = tf.get_variable("weights_1",[input_size,hidden_layer_size])
biases_1 = tf.get_variable("biases_1",[hidden_layer_size])
# time to applay an activation function
outputs_1 = tf.nn.relu(tf.matmul(inputs,weights_1) + biases_1 )


### Hiden layer 2 ###
weights_1 = tf.get_variable("weights_2",[hidden_layer_size,hidden_layer_size])
biases_2 = tf.get_variable("biases_2",[hidden_layer_size])
# time to applay an activation function
outputs_2 = tf.nn.relu(tf.matmul(outputs_1,weights_1) + biases_2)


### Output layers ###
weights_3 = tf.get_variable("weights_3",[hidden_layer_size,output_size])
biases_3 = tf.get_variable("biases_3",[output_size])
outputs = tf.matmul(outputs_2,weights_3) + biases_3

# That's how we stauck layers 

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


### Step 9.4 

In [12]:

# is a function that apply a softmax activation and calculates a cross entropy loss
loss = tf.nn.softmax_cross_entropy_with_logits(logits = outputs, labels = targets) 
# tf.reduce_mean() is a method which finds the mean of the elements of a tensor across a dimension
mean_loss = tf.reduce_mean(loss)


Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



### Step 9.5 

In [13]:
# Now we have the model and the loss
# choose the optimization method
optimize = tf.train.AdamOptimizer( learning_rate=0.001 ).minimize(mean_loss)

### Step 9.6 

In [14]:
# tf.equal() is a method checks if two values are equal. in the case of tensors so element-wise
# however to show we only care about the column indices we include a second argument to the Tf.argmax
# compare the argmax of outputs and argmax of targets
outputs_equals_target = tf.equal( tf.argmax(outputs,1),tf.argmax(targets,1) ) 


# example of horse cat dogin this case out_equals_ target = [1,0,1,1] T
# the accuracy is the mean of the out_equals_ target vector mean = (1+0+1+1)/4 = 0,75
# already this done throuth the tf reduce mean 
accuracy = tf.reduce_mean( tf.cast(outputs_equals_target, tf.float32) )

### Step 9.7 

In [15]:
sess = tf.InteractiveSession()
initializer = tf.global_variables_initializer()
sess.run(initializer)

### Step 9.8 

In [16]:
sess = tf.InteractiveSession()
initializer = tf.global_variables_initializer()
sess.run(initializer)


# we want a number small enough to learn faster but big enough to preserve the underlying dependencies
batch_size = 100 

# i'll change the max number of epochs to 50 as i already know how the algo will behave  
max_epochs = 50
# the true early stopping will come if the validation loss starts increasing
# this value large enough to ensure the early stpping won't be triggered on the first epoch
prev_validation_loss = 9999999.

# we preprocessed the data on our. Now we must load it 
# both variables will be instance of the Audiobooks_Data_Reader class
train_data = Reader('train', batch_size)
validation_data = Reader('validation')

for epoch_counter in range(max_epochs):
    ### Training  ###
    curr_epoch_loss = 0.
    # Since the class is iterable, we can iterate over data using the code : 
    for input_batch, target_batch in train_data:
        _, batch_loss = sess.run([optimize,mean_loss], feed_dict ={inputs: input_batch, targets:target_batch } )
        curr_epoch_loss += batch_loss
    # Numbers of batches: train_data    
    curr_epoch_loss /= train_data.batch_count 
    ### end Training  ###
    
    # time to validate 
    # notice : 
    # Audiobooks_("train", 5) take batches of 5 samples at a time 
    # Audiobooks_("validation") take the whole data in a single batch
    validation_loss = 0.
    validation_accuracy = 0.
    # validation 
    for input_batch, target_batch in validation_data:
        validation_loss, validation_accuracy = sess.run([mean_loss,accuracy], feed_dict ={inputs: input_batch, targets:target_batch } )
    
    # so we will simply forward propagate the whole validation data set through the net 
    
    # Forward propagation does not require a loop ( we simply feed the model with data )
    # However, our class was an iterator (to be used with loops )
    # Logically, this loop will always have a single iteration 
    # using a loop won't affect the speed of the algo 
    
    print('Epoch '+  str(epoch_counter+1) +
      '. Trining loss:'+'{0:.3f}'.format(curr_epoch_loss)+
      '. Validation loss:'+'{0:.3f}'.format(validation_loss)+
      '. Validation accuracy:'+'{0:.3f}'.format(validation_accuracy * 100.)+'%'
     )
    
    # breaks the loop when the validation loss i higher than the previous validation 
    if validation_loss > prev_validation_loss:
        break 
    
    prev_validation_loss = validation_loss
    
    # In this way we wil know when the valdation loss starts increasing 

print("End of training.")



ValueError: Cannot feed value of shape (100,) for Tensor 'Placeholder_1:0', which has shape '(?, 2)'