<a href="https://colab.research.google.com/github/caropilardiaz/DeepLearning/blob/master/AprendizajeProfundo_Practico1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Aprendizaje Profundo - Pr&aacute;ctico 1**

Integrantes:


*   Buzzi, Sergio
*   Diaz, Carolina
*   Fabro, Juan



# **Consigna:**

1.   Construir un pipeline de clasificación con un modelo Keras MLP. Pueden comenzar con una versión simplificada que sólo tenga una capa de Input donde pasen los valores de las columnas de *one-hot-encodings*.
2.   Entrenar uno o varios modelos (con dos o tres es suficiente, veremos más de esto en el práctico 2). Evaluar los modelos en el conjunto de dev y test.

In [0]:
%%bash
# Upgrade pip itself before installing the packages below.
pip install --upgrade pip
# TensorFlow is pinned to the 2.0.0 GPU build.
pip install --upgrade tensorflow-gpu==2.0.0
# mlflow, graphviz and pydot are installed at their latest versions (not pinned).
pip install --upgrade mlflow graphviz pydot

In [0]:
import os
import numpy
import pandas
import seaborn
import argparse
import mlflow
import tensorflow as tf

from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models

# Global seaborn look applied to the plots drawn in this notebook.
seaborn.set_style('whitegrid')
seaborn.set_palette('colorblind')
seaborn.set_context('paper')

# Name of the label column in the training CSV.
TARGET_COL = 'AdoptionSpeed'
 
# Shuffle buffer size used for the training tf.data pipeline.
SHUFFLE_BUFFER_SIZE = 100

In [0]:
# Report the TensorFlow build in use and whether a GPU is visible to it.
tf_version = tf.__version__
gpu_available = tf.test.is_gpu_available()
print("TensorFlow Version: {} - Is GPU available: {}".format(tf_version, gpu_available))

TensorFlow Version: 2.0.0 - Is GPU available: True


In [0]:
# Mount Google Drive at /gdrive and make that mount point the working directory.
# NOTE(review): the next cell mounts Drive again at /content/drive, which is the
# location DATA_DIRECTORY actually uses -- confirm whether this cell is still needed.
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive

In [0]:
# Mount Google Drive at /content/drive; DATA_DIRECTORY points inside this mount.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Drive folder that holds train.csv / test.csv; the submission CSV is written here too.
DATA_DIRECTORY = '/content/drive/My Drive/practico_aprendizaje_profundo/petfinder_dataset/'

In [0]:
def load_dataset(dataset_dir, test_size=0.2, random_state=None):
    """Read the train/test CSV files and hold out part of train as a dev split.

    Args:
        dataset_dir: Directory containing ``train.csv`` and ``test.csv``.
        test_size: Fraction (or row count) of ``train.csv`` reserved for the
            dev split; forwarded to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split``. Pass an integer
            to get the same train/dev partition on every run; the default
            ``None`` keeps the original, non-deterministic behaviour.

    Returns:
        A ``(dataset, dev_dataset, test_dataset)`` tuple of dataframes, where
        the first two are the split of ``train.csv`` and the last one is
        ``test.csv`` as read from disk.
    """
    train_frame = pandas.read_csv(os.path.join(dataset_dir, 'train.csv'))

    # Carve the dev split out of the labelled training data.
    dataset, dev_dataset = train_test_split(
        train_frame, test_size=test_size, random_state=random_state)

    test_dataset = pandas.read_csv(os.path.join(dataset_dir, 'test.csv'))

    print('Training samples {}, test_samples {}'.format(
        dataset.shape[0], test_dataset.shape[0]))

    return dataset, dev_dataset, test_dataset


In [0]:
def process_features(df, one_hot_columns, numeric_columns, embedded_columns, test=False):
    """Convert a raw dataframe into the feature dict (and targets) fed to the model.

    Args:
        df: Dataframe holding every column named in the other arguments (and
            ``TARGET_COL`` unless ``test`` is True).
        one_hot_columns: Mapping ``column name -> largest category value``.
            Each column contributes ``max_value`` one-hot columns.
        numeric_columns: Iterable of column names to standardize.
        embedded_columns: Mapping whose keys are the columns passed through
            untouched for the embedding inputs (the values are not used here).
        test: When True no targets are built and ``None`` is returned instead.

    Returns:
        ``(features, targets)``. ``features`` is a dict with the key
        ``'direct_features'`` (all one-hot encodings stacked side by side) plus
        one entry per embedded and per numeric column. ``targets`` is the
        one-hot encoded ``TARGET_COL`` or ``None``.
    """
    direct_features = []

    # Create one hot encodings: category v (1..max_value) lights column v - 1.
    # Encoding into max_value + 1 classes and dropping column 0 keeps that
    # layout but maps the value 0 to an all-zero row. Subtracting 1 up front
    # would turn 0 into index -1, which wraps around and collides with the
    # largest category.
    for one_hot_col, max_value in one_hot_columns.items():
        encoded = tf.keras.utils.to_categorical(df[one_hot_col], max_value + 1)
        direct_features.append(encoded[:, 1:])

    # Concatenate all features that don't need further embedding into a single matrix.
    features = {'direct_features': numpy.hstack(direct_features)}

    # Create embedding columns - nothing to do here. We will use the zero embedding for OOV
    for embedded_col in embedded_columns.keys():
        features[embedded_col] = df[embedded_col].values

    # Standardize numeric columns: (x - mean) / std, using the statistics of `df` itself.
    for n_col in numeric_columns:
        column = df[n_col]
        std = column.std()
        # A constant (std == 0) or single-row (std is NaN) column cannot be
        # scaled; only center it instead of producing inf/NaN.
        if not std > 0:
            std = 1.0
        features[n_col] = (column.values - column.mean()) / std

    if not test:
        nlabels = df[TARGET_COL].unique().shape[0]
        # Convert labels to one-hot encodings
        targets = tf.keras.utils.to_categorical(df[TARGET_COL], nlabels)
    else:
        targets = None

    return features, targets

In [0]:
# Load the train / dev / test dataframes and count the distinct labels present in
# the training split; nlabels later sizes the model's softmax output layer.
dataset, dev_dataset, test_dataset = load_dataset(DATA_DIRECTORY)
nlabels = dataset[TARGET_COL].unique().shape[0]

Training samples 8465, test_samples 4411


In [0]:
# We do this to decide which columns get one-hot encodings and which get embeddings
print("Cantidad de valores distintos por feature:")
print("******************************")
for column_name in dataset.columns.values:
    distinct_count = dataset[column_name].unique().shape[0]
    print(column_name + ": " + str(distinct_count))

Cantidad de valores distintos por feature:
******************************
Type: 2
Age: 95
Breed1: 159
Breed2: 109
Gender: 3
Color1: 7
Color2: 7
Color3: 6
MaturitySize: 4
FurLength: 3
Vaccinated: 3
Dewormed: 3
Sterilized: 3
Health: 3
Quantity: 19
Fee: 63
State: 13
Description: 8042
AdoptionSpeed: 5
PID: 8465


En base a estos resultados definimos a qué features aplicamos one-hot encoding y a cuáles embeddings.

In [0]:

# One-hot encoded columns, each mapped to its largest value in the training split.
one_hot_columns = {}
for one_hot_col in ['Type', 'Gender', 'Color1', 'Color2', 'Color3', 'MaturitySize',
                    'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized', 'Health']:
    one_hot_columns[one_hot_col] = dataset[one_hot_col].max()

# Embedded columns, each mapped to (largest value + 1), used as the embedding input size.
embedded_columns = {}
for embedded_col in ['Breed1', 'Breed2', 'State']:
    embedded_columns[embedded_col] = dataset[embedded_col].max() + 1

# Columns treated as plain numeric features.
numeric_columns = ['Age', 'Fee', 'Quantity']

In [0]:
# Build the tf.data pipelines for the train and dev splits
BATCH_SIZE = 100

X_train, y_train = process_features(dataset, one_hot_columns, numeric_columns, embedded_columns)

# Shuffle individual samples *before* batching. Shuffling after .batch() only
# reorders whole batches, so every epoch would reuse the same batch contents.
# The buffer covers the full training set so the shuffle is uniform.
train_ds = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    .shuffle(buffer_size=max(SHUFFLE_BUFFER_SIZE, y_train.shape[0]))
    .batch(BATCH_SIZE)
)

X_dev, y_dev = process_features(dev_dataset, one_hot_columns, numeric_columns, embedded_columns)

# NOTE: despite its name, test_ds wraps the dev split held out from train.csv.
test_ds = tf.data.Dataset.from_tensor_slices((X_dev, y_dev)).batch(BATCH_SIZE)

In [0]:
# Features for the unlabeled test.csv rows; with test=True no targets are built,
# so y_kagg is None and the dataset yields features only.
X_kagg, y_kagg = process_features(test_dataset, one_hot_columns, numeric_columns, embedded_columns, test=True)

kagg_ds = tf.data.Dataset.from_tensor_slices(X_kagg).batch(BATCH_SIZE)

In [0]:
# Clear Keras' global session state before (re)building the model.
tf.keras.backend.clear_session()

In [0]:
# Dropout rate applied after the first two hidden layers.
DROPOUT_RATE = 0.25
# Width of the first hidden layer; the following layers use 1/2 and 1/4 of it.
HIDDEN_LAYER_SIZE = 36
# Input shape of the stacked one-hot matrix: one entry per one-hot column.
DIRECT_FEATURES_INPUT_SHAPE = (X_train['direct_features'].shape[1],)

In [0]:
# Add one input and one embedding for each embedded column
embedding_layers = []
inputs = []

for embedded_col, max_value in embedded_columns.items():
    input_layer = layers.Input(shape=(1,), name=embedded_col)
    inputs.append(input_layer)
    # Define the embedding layer: its width is a quarter of the input size.
    # NOTE(review): max_value is (largest raw id + 1), not the number of
    # distinct categories. For State (13 distinct values) this yields a
    # 10350-wide embedding -- consider remapping ids to contiguous indices.
    embedding_size = int(max_value // 4)
    embedded = layers.Embedding(input_dim=max_value, output_dim=embedding_size)(input_layer)
    # The embedding output is (batch, 1, embedding_size); drop the length-1 axis.
    embedding_layers.append(tf.squeeze(embedded, axis=-2))

    print('Adding embedding of size {} for layer {}'.format(embedding_size, embedded_col))

# Add the direct features already calculated
direct_features_input = layers.Input(shape=DIRECT_FEATURES_INPUT_SHAPE, name='direct_features')
inputs.append(direct_features_input)

# Concatenate everything together
features = layers.concatenate(embedding_layers + [direct_features_input])

# MLP head: three shrinking ReLU layers with dropout after the first two.
# Integer division keeps the `units` argument an int (a plain `/` passes a float).
dense1 = layers.Dense(HIDDEN_LAYER_SIZE, activation='relu')(features)

drop1 = layers.Dropout(DROPOUT_RATE)(dense1)

dense2 = layers.Dense(HIDDEN_LAYER_SIZE // 2, activation='relu')(drop1)

drop2 = layers.Dropout(DROPOUT_RATE)(dense2)

dense3 = layers.Dense(HIDDEN_LAYER_SIZE // 4, activation='relu')(drop2)

# One softmax unit per label seen in the training split.
output_layer = layers.Dense(nlabels, activation='softmax')(dense3)

model = models.Model(inputs=inputs, outputs=output_layer)


Adding embedding of size 77 for layer Breed1
Adding embedding of size 77 for layer Breed2
Adding embedding of size 10350 for layer State


In [0]:
# Categorical cross-entropy matches the one-hot targets and the softmax output layer.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Breed1 (InputLayer)             [(None, 1)]          0                                            
__________________________________________________________________________________________________
Breed2 (InputLayer)             [(None, 1)]          0                                            
__________________________________________________________________________________________________
State (InputLayer)              [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 77)        23716       Breed1[0][0]                     
______________________________________________________________________________________________

In [0]:
# Train and evaluate inside an mlflow run so hyperparameters and results are tracked.
# (mlflow is already imported in the imports cell at the top of the notebook.)
mlflow.set_experiment('ejemplo')

with mlflow.start_run(nested=True):
    # Log model hyperparameters first
    mlflow.log_param('hidden_layer_size', HIDDEN_LAYER_SIZE)
    mlflow.log_param('embedded_columns', embedded_columns)
    mlflow.log_param('one_hot_columns', one_hot_columns)
    mlflow.log_param('numerical_columns', numeric_columns)
    mlflow.log_param('train_dataset.shuffle', True)
    mlflow.log_param('dropout', DROPOUT_RATE)

    # The epoch count is an input of the run, so it is logged as a param, not a metric.
    epochs = 12
    mlflow.log_param('epochs', epochs)

    # Train
    history = model.fit(train_ds, epochs=epochs)

    # Evaluate (test_ds holds the dev split)
    loss, accuracy = model.evaluate(test_ds)
    print("*** Test loss: {} - accuracy: {}".format(loss, accuracy))
    mlflow.log_metric('loss', loss)
    mlflow.log_metric('accuracy', accuracy)

In [0]:
# Fit the same `model` object again for 13 epochs. This call is outside the mlflow
# run, so it is not tracked, and it overwrites `epochs` and `history`.
# NOTE(review): the model was already fitted in the previous cell, so this continues
# from those weights rather than starting fresh -- confirm this is intended.
epochs = 13
history = model.fit(train_ds, epochs=epochs)
history

Epoch 1/13
Epoch 2/13
Epoch 3/13
Epoch 4/13
Epoch 5/13
Epoch 6/13
Epoch 7/13
Epoch 8/13
Epoch 9/13
Epoch 10/13
Epoch 11/13
Epoch 12/13
Epoch 13/13


<tensorflow.python.keras.callbacks.History at 0x7fde5d46dac8>

In [0]:
# Plot how many dev-split samples fall into each predicted class
class_probabilities = model.predict(test_ds)
predictions = numpy.argmax(class_probabilities, axis=1)
seaborn.countplot(predictions)

In [0]:
# Pair every test-set PID with the class the model scores highest.
kaggle_labels = numpy.argmax(model.predict(kagg_ds), axis=1)
submission_rows = list(zip(test_dataset['PID'], kaggle_labels))
submission = pandas.DataFrame(submission_rows, columns=["PID", "AdoptionSpeed"])

In [0]:
# Write the submission next to the input data, with a header row and without the index column.
submission.to_csv(DATA_DIRECTORY + "submision_20191113_2329.csv", header=True, index=False)

In [0]:
# Show the per-epoch loss and accuracy recorded by the most recent model.fit call.
print(history.history)

{'loss': [1.5401608597507015, 1.4894305121877527, 1.465494959155836, 1.454129143783628, 1.4448032731215628, 1.4374748585988837, 1.428235229696667, 1.4190483054534542, 1.4135040135617378, 1.4086707830429077, 1.4043831196467833, 1.3992141708423593], 'accuracy': [0.25304192, 0.2786769, 0.29604253, 0.31329003, 0.31293562, 0.32793856, 0.3344359, 0.33951566, 0.34270525, 0.34790313, 0.3463674, 0.35640875]}


In [0]:
# List the attributes of the History callback object.
# NOTE(review): exploratory/debugging cell -- consider removing it from the final notebook.
dir(history)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_chief_worker_only',
 '_keras_api_names',
 '_keras_api_names_v1',
 'epoch',
 'history',
 'model',
 'on_batch_begin',
 'on_batch_end',
 'on_epoch_begin',
 'on_epoch_end',
 'on_predict_batch_begin',
 'on_predict_batch_end',
 'on_predict_begin',
 'on_predict_end',
 'on_test_batch_begin',
 'on_test_batch_end',
 'on_test_begin',
 'on_test_end',
 'on_train_batch_begin',
 'on_train_batch_end',
 'on_train_begin',
 'on_train_end',
 'params',
 'set_model',
 'set_params',
 'validation_data']