<a href="https://colab.research.google.com/github/boscherj/tensorflow/blob/master/iris_Tf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#  Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""An Example of a DNNClassifier for the Iris dataset."""

# https://www.tensorflow.org/get_started/get_started_for_beginners
# J'ai modifié l'exemple Getting Started for ML Beginners
# J'ai supprimé le import iris_data et j'ai intégré ici les fonctions définies dans le fichier iris_data
# Je commente à la fois le Python et le TensorFlow

# https://docs.python.org/2/library/__future__.html
# __future__ is a real module, and serves three purposes:
# To avoid confusing existing tools that analyze import statements and expect to find the modules they’re importing.
# To ensure that future statements run under releases prior to 2.1 at least yield runtime exceptions 
# (the import of __future__ will fail, because there was no module of that name prior to 2.1).
# To document when incompatible changes were introduced, and when they will be — or were — made mandatory. 
# This is a form of executable documentation, and can be inspected programmatically via 
# importing __future__ and examining its contents.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

# https://docs.python.org/3/library/argparse.html#argumentparser-objects
# parser = argparse.ArgumentParser()
import argparse
import tensorflow as tf
import pandas as pd

In [0]:
# https://docs.python.org/3/library/argparse.html#argparse.ArgumentParser.add_argument
# parser.add_argument('--batch_size', default=100, type=int, help='batch size')
# parser.add_argument('--train_steps', default=1000, type=int, help='number of training steps')
# J'ai remplacé le argv par des constantes

# Test set accuracy: 0.967 si je mets CONST_BATCH_SIZE = 100
# Test set accuracy: 0.967 si je mets CONST_BATCH_SIZE = 1000
# CONST_BATCH_SIZE = 1000 par défaut
CONST_BATCH_SIZE = 1000

# Test set accuracy: 0.533 si je mets CONST_BATCH_SIZE = 1000 et CONST_TRAINING_STEPS = 10
# Test set accuracy: 0.533 si je mets CONST_BATCH_SIZE = 1000 et CONST_TRAINING_STEPS = 200
# Si je mets CONST_TRAINING_STEPS = 50 alors Test set accuracy: 0.833

# CONST_TRAINING_STEPS = 100 par défaut
CONST_TRAINING_STEPS = 100


TRAIN_URL = "http://download.tensorflow.org/data/iris_training.csv"
# 120,4,setosa,versicolor,virginica
# 6.4,2.8,5.6,2.2,2
# 5.0,2.3,3.3,1.0,1
# 4.9,2.5,4.5,1.7,2
# ...
TEST_URL = "http://download.tensorflow.org/data/iris_test.csv"
# 30,4,setosa,versicolor,virginica
# 5.9,3.0,4.2,1.5,1
# 6.9,3.1,5.4,2.1,2
# 5.1,3.3,1.7,0.5,0
# 6.0,3.4,4.5,1.6,1

# 0 represents setosa
# 1 represents versicolor
# 2 represents virginica

CSV_COLUMN_NAMES = ['SepalLength', 'SepalWidth',
                    'PetalLength', 'PetalWidth', 'Species']
SPECIES = ['Setosa', 'Versicolor', 'Virginica']

In [0]:
def maybe_download():
    
    # https://www.tensorflow.org/api_docs/python/tf/keras/utils/get_file
    # Downloads a file from a URL if it not already in the cache.

    # By default the file at the url origin is downloaded to the cache_dir ~/.keras, placed in 
    # the cache_subdir datasets, and given the filename fname. 
    # The final location of a file example.txt would therefore be ~/.keras/datasets/example.txt.
    # get_file(
    #   fname,
    #   origin,
    # ...
    # )
    # On fait d'abord un split sur /
    # TEST_URL = "http://download.tensorflow.org/data/iris_test.csv"
    # ['http:', '', 'download.tensorflow.org', 'data', 'iris_training.csv']
    # Puis [-1] pour obtenir le dernier élément de la liste, soit iris_training.csv ici
    # Le résultat obtenu pour train_path est sur mon Mac : /Users/xxx/.keras/datasets/iris_training.csv
    train_path = tf.keras.utils.get_file(TRAIN_URL.split('/')[-1], TRAIN_URL)
    test_path = tf.keras.utils.get_file(TEST_URL.split('/')[-1], TEST_URL)

    return train_path, test_path



In [0]:
def load_data(y_name='Species'):
    """Returns the iris dataset as (train_x, train_y), (test_x, test_y)."""
    train_path, test_path = maybe_download()

    train = pd.read_csv(train_path, names=CSV_COLUMN_NAMES, header=0)
    # train : 
    #
    #     SepalLength  SepalWidth  PetalLength  PetalWidth
    # 0            6.4         2.8          5.6         2.2
    # 1            5.0         2.3          3.3         1.0
    # 2            4.9         2.5          4.5         1.7
    # 3            4.9         3.1          1.5         0.1
    # ...
    # https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.pop.html
    # Return item and drop from frame. 
    train_x, train_y = train, train.pop(y_name)

    test = pd.read_csv(test_path, names=CSV_COLUMN_NAMES, header=0)
    
    # https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.pop.html
    # Return item and drop from frame. 
    test_x, test_y = test, test.pop(y_name)

    return (train_x, train_y), (test_x, test_y)

In [0]:
def train_input_fn(features, labels, batch_size):
    """An input function for training"""
    # Convert the inputs to a Dataset.
    # Creates a Dataset whose elements are slices of the given tensors.
    # https://www.tensorflow.org/versions/r1.2/api_docs/python/tf/contrib/data/Dataset
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))

    # Shuffle, repeat, and batch the examples.
    # Randomly shuffles the elements of this dataset.
    # https://www.tensorflow.org/api_docs/python/tf/data/Dataset
    # buffer_size: A tf.int64 scalar tf.Tensor, representing the number of elements from this dataset
    # from which the new dataset will sample.
    dataset = dataset.shuffle(CONST_BATCH_SIZE).repeat().batch(batch_size)

    # Build the Iterator, and return the read end of the pipeline.
    # https://www.tensorflow.org/programmers_guide/datasets#creating_an_iterator
    # A one-shot iterator is the simplest form of iterator, which only supports iterating 
    # once through a dataset, with no need for explicit initialization. One-shot iterators handle 
    # almost all of the cases that the existing queue-based input pipelines support, 
    # but they do not support parameterization.
    
    # Creates an Iterator for enumerating the elements of this dataset.
    # https://www.tensorflow.org/versions/r1.2/api_docs/python/tf/contrib/data/Dataset
    # https://www.tensorflow.org/api_docs/python/tf/data/Dataset
    # Creates an Iterator for enumerating the elements of this dataset.
    return dataset.make_one_shot_iterator().get_next()
    # return dataset

In [0]:
def eval_input_fn(features, labels, batch_size):
    """An input function for evaluation or prediction"""
    features=dict(features)
    
    if labels is None:
        # No labels, use only features.
        inputs = features
    else:
        inputs = (features, labels)
    

    # Convert the inputs to a Dataset.
    # dataset = tf.data.Dataset.from_tensor_slices(inputs)
    # dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    dataset = tf.data.Dataset.from_tensor_slices(inputs)

    # Batch the examples
    assert batch_size is not None, "batch_size must not be None"
    dataset = dataset.batch(batch_size)

    # Return the dataset.
    # return dataset
    return dataset.make_one_shot_iterator().get_next()
    # return dataset

In [0]:
# On peut aussi lire les données avec sklearn mais je ne vais pas le faire car 
# je veux être sur que le training set et le test set restent composés comme 
# dans l'exercice
from sklearn.datasets import load_iris
data = load_iris()
data.DESCR

In [0]:
# ---------------------- C'est ici que ça commence ----------------------
#
#
# -----------------------------------------------------------------------

# Je ne trouve pas de doc assez claire sur le tf.logging.INFO
# tf.logging.set_verbosity(tf.logging.INFO)

(train_x, train_y), (test_x, test_y) = load_data() 


In [27]:
train_x.head()

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth
0,6.4,2.8,5.6,2.2
1,5.0,2.3,3.3,1.0
2,4.9,2.5,4.5,1.7
3,4.9,3.1,1.5,0.1
4,5.7,3.8,1.7,0.3


In [31]:
train_y.head()

0    2
1    1
2    2
3    0
4    0
Name: Species, dtype: int64

In [33]:
test_x.head()

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth
0,5.9,3.0,4.2,1.5
1,6.9,3.1,5.4,2.1
2,5.1,3.3,1.7,0.5
3,6.0,3.4,4.5,1.6
4,5.5,2.5,4.0,1.3


In [34]:
test_y.head()

0    1
1    2
2    0
3    1
4    1
Name: Species, dtype: int64

In [0]:

# Feature columns describe how to use the input.
# A feature column is a data structure that tells your model how to interpret the data 
# in each feature. In the Iris problem, we want the model to interpret the data in each feature 
# as its literal floating-point value; that is, we want the model to interpret an input value 
# like 5.4 as, well, 5.4. However, in other machine learning problems, it is often desirable to interpret data less literally.

my_feature_columns = []

# my_feature_columns 
# 	[
# 		_NumericColumn(key='SepalLength', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), 
# 		_NumericColumn(key='SepalWidth', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), 
# 		_NumericColumn(key='PetalLength', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), 
# 		_NumericColumn(key='PetalWidth', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)
# 	]
for key in train_x.keys():
    my_feature_columns.append(tf.feature_column.numeric_column(key=key))

In [36]:
my_feature_columns

[NumericColumn(key='SepalLength', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='SepalWidth', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='PetalLength', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='PetalWidth', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]

In [37]:
# Build 2 hidden layer DNN with 10, 10 units respectively.
# To implement a neural network, the premade_estimators.py program uses a pre-made Estimator 
# named tf.estimator.DNNClassifier. This Estimator builds a neural network that classifies examples. 
# Nous avons un Réseau de Neurons avec 3 couches 
# Chaque couche a 10 neurons
classifier = tf.estimator.DNNClassifier(
    feature_columns=my_feature_columns,
    # Two hidden layers of 10 nodes each.
    # Use the hidden_units parameter to define the number of neurons in each hidden layer of the neural network. 
    # The length of the list assigned to hidden_units identifies the number of hidden layers (2, in this case). 
    # Each value in the list represents the number of neurons in a particular hidden 
    # layer (10 in the first hidden layer and 10 in the second hidden layer). 
    # To change the number of hidden layers or neurons, simply assign a different list to the hidden_units parameter.
    
    hidden_units=[10, 10],
    # The model must choose between 3 classes.
    n_classes=3)

# Train the Model.
# classifier.train(input_fn=lambda:iris_data.train_input_fn(train_x, train_y,100),steps=1000)
classifier.train(input_fn=lambda:train_input_fn(train_x, train_y, CONST_BATCH_SIZE), steps=CONST_TRAINING_STEPS)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp9qayr03c', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f5f1b6ac160>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph wa

<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifier at 0x7f5f1b6ac780>

In [38]:
# Evaluate the model.
eval_result = classifier.evaluate(input_fn=lambda:eval_input_fn(test_x, test_y, CONST_BATCH_SIZE))
print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-04-23T15:49:00Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp9qayr03c/model.ckpt-100
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-04-23-15:49:00
INFO:tensorflow:Saving dict for global step 100: accuracy = 0.96666664, average_loss = 0.31874135, global_step = 100, loss = 9.562241
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 100: /tmp/tmp9qayr03c/model.ckpt-100

Test set accuracy: 0.967



In [0]:
# Generate predictions from the model
expected = ['Setosa', 'Versicolor', 'Virginica']

predict_x = {
    'SepalLength': [5.1, 5.9, 6.9],
    'SepalWidth': [3.3, 3.0, 3.1],
    'PetalLength': [1.7, 4.2, 5.4],
    'PetalWidth': [0.5, 1.5, 2.1],
}

# 	SepalLength		SepalWidth	PetalLength		PetalWidth
# 		5.1			3.3				1.7				0.5
# 		5.9			3.0				4.2				1.5
# 		6.9			3.1				5.4				2.1

# https://www.tensorflow.org/api_docs/python/tf/estimator/DNNClassifier
predictions = classifier.predict(input_fn=lambda:eval_input_fn(predict_x,labels=None,batch_size=CONST_BATCH_SIZE))

In [0]:
# zip
# This function returns a list of tuples, where the i-th tuple contains the i-th element from each of 
# the argument sequences or iterables. The returned list is truncated in length to the length of the 
# shortest argument sequence. When there are multiple arguments which are all of the same length, 
# zip() is similar to map() with an initial argument of None. 
# With a single sequence argument, it returns a list of 1-tuples. With no arguments, it returns an empty list.

# >>> x = [1, 2, 3]
# >>> y = [4, 5, 6]
# >>> zipped = zip(x, y)
# >>> zipped
# [(1, 4), (2, 5), (3, 6)]

In [41]:
for pred_dict, expec in zip(predictions, expected):
    template = ('\nPrediction is "{}" ({:.1f}%), expected "{}"')
    class_id = pred_dict['class_ids'][0]
    probability = pred_dict['probabilities'][class_id]
    print(template.format(SPECIES[class_id], 100 * probability, expec))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp9qayr03c/model.ckpt-100
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.

Prediction is "Setosa" (97.6%), expected "Setosa"

Prediction is "Versicolor" (59.5%), expected "Versicolor"

Prediction is "Virginica" (73.5%), expected "Virginica"
