In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers

def prepare_data(data_file_name, test_size=0.1, random_state=None):
    """
    Clean the UCI Mushroom data file and write train / test CSV files.

    The raw file comes from the UCI machine learning repository:
    http://archive.ics.uci.edu/ml/datasets/Mushroom. Rows containing a
    missing value ('?') are dropped, the 'class' column is encoded as
    0 (poisonous) / 1 (edible), every feature column is one-hot encoded
    as 0/1 integers, and the result is split into 'mushroom_train.csv'
    and 'mushroom_test.csv' in the current working directory.

    Parameters
    ----------
    data_file_name : str or path-like
        Location of the headerless, comma-separated 'agaricus-lepiota.data'
        file.
    test_size : float or int, optional
        Forwarded to ``train_test_split``; defaults to 0.1 (10% test rows).
    random_state : int or None, optional
        Forwarded to ``train_test_split``. Pass an integer to make the
        split reproducible; the default (None) keeps the split random.

    Raises
    ------
    ValueError
        If the 'class' column holds a label other than 'p' or 'e'.
    """

    # The header is formed from the 'agaricus-lepiota.name' file found on
    # http://archive.ics.uci.edu/ml/datasets/Mushroom
    header = ['class', 'cap_shape', 'cap_surface',
              'cap_color', 'bruises', 'odor', 'gill_attachment',
              'gill_spacing', 'gill_size', 'gill_color', 'stalk_shape',
              'stalk_root', 'stalk_surface_above_ring',
              'stalk_surface_below_ring', 'stalk_color_above_ring',
              'stalk_color_below_ring', 'veil_type', 'veil_color',
              'ring_number', 'ring_type', 'spore_print_color',
              'population', 'habitat']
    df = pd.read_csv(data_file_name, sep=',', names=header)

    # Entries with a '?' indicate a missing piece of data, and
    # these entries are dropped from our dataset.
    df = df.replace('?', np.nan).dropna()

    # The class of poisonous or edible is indicated in the data as
    # either 'p' or 'e' respectively. We require that this is numeric,
    # and therefore use '0' to indicate poisonous (or not edible) and
    # '1' to indicate edible. The mapping is assigned back as a whole
    # column: an in-place replace on `df['class']` only touches a
    # temporary object once pandas Copy-on-Write is active.
    labels = df['class'].map({'p': 0, 'e': 1})
    if labels.isna().any():
        unexpected = sorted(df.loc[labels.isna(), 'class'].astype(str).unique())
        raise ValueError(
            "Unexpected class label(s) in %r: %s" % (data_file_name, unexpected))
    df = df.assign(**{'class': labels.astype('int64')})

    # Since we are dealing with non-numeric feature data, or in other
    # words, categorical data, we need to replace these with numerical
    # equivalents. Pandas has a nice function called "get_dummies" that
    # performs this task. `dtype=int` keeps the indicators as 0/1;
    # recent pandas versions would otherwise emit True/False.
    cols_to_transform = header[1:]
    df = pd.get_dummies(df, columns=cols_to_transform, dtype=int)

    # We can now split the data into two separate data frames,
    # one for training, which will constitute the bulk of the
    # data, and one for testing.
    df_train, df_test = train_test_split(
        df, test_size=test_size, random_state=random_state)

    # Write each split to its own CSV file, without the index column,
    # so that later cells can load them independently.
    df_train.to_csv('mushroom_train.csv', index=False)
    df_test.to_csv('mushroom_test.csv', index=False)

    
# Raw UCI mushroom data file, expected in the working directory.
MUSHROOM_DATA_FILE = "agaricus-lepiota.data"

# Clean the raw data and emit the train / test CSV files
# that the following cells load.
prepare_data(data_file_name=MUSHROOM_DATA_FILE)

In [4]:
# Load the training split from disk and preview its first rows.
df_train = pd.read_csv('mushroom_train.csv')
print(df_train.head())

# Separate the 'class' target column from the remaining feature columns
# and materialise both as NumPy arrays.
train_label = df_train['class'].to_numpy(copy=True)
train_data = df_train.drop(columns=['class']).to_numpy(copy=True)

   class  cap_shape_b  cap_shape_c  cap_shape_f  cap_shape_k  cap_shape_s  \
0      0            1            0            0            0            0   
1      0            0            0            0            0            0   
2      1            0            0            0            0            0   
3      1            0            0            0            0            0   
4      1            0            0            1            0            0   

   cap_shape_x  cap_surface_f  cap_surface_g  cap_surface_s  ...  \
0            0              0              0              1  ...   
1            1              0              0              0  ...   
2            1              1              0              0  ...   
3            1              1              0              0  ...   
4            0              0              0              0  ...   

   population_n  population_s  population_v  population_y  habitat_d  \
0             0             0             1             

In [5]:
# Load the held-out test split from disk and preview its first rows.
df_test = pd.read_csv('mushroom_test.csv')
print(df_test.head())

# Separate the 'class' target column from the remaining feature columns
# and materialise both as NumPy arrays.
test_label = df_test['class'].to_numpy(copy=True)
test_data = df_test.drop(columns=['class']).to_numpy(copy=True)

   class  cap_shape_b  cap_shape_c  cap_shape_f  cap_shape_k  cap_shape_s  \
0      0            0            0            0            0            0   
1      1            0            0            1            0            0   
2      0            0            0            1            0            0   
3      1            0            0            1            0            0   
4      0            0            0            0            0            0   

   cap_shape_x  cap_surface_f  cap_surface_g  cap_surface_s  ...  \
0            1              1              0              0  ...   
1            0              0              0              1  ...   
2            0              0              0              0  ...   
3            0              0              0              0  ...   
4            1              0              0              0  ...   

   population_n  population_s  population_v  population_y  habitat_d  \
0             0             0             0             

In [6]:
# Sanity check: feature-matrix and label-vector shapes for each split.
print(train_data.shape, train_label.shape, sep='   ')
print(test_data.shape, test_label.shape, sep='   ')

(5079, 98)   (5079,)
(565, 98)   (565,)


In [13]:
# Fully connected classifier: two hidden ReLU layers of 32 units and a
# 2-way softmax output. The input width is taken from the feature matrix
# instead of a hard-coded 98, so the model stays in sync with whatever
# set of one-hot columns the data preparation step produced.
model = keras.Sequential([
    keras.Input(shape=(train_data.shape[1],)),
    layers.Dense(32, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(2, activation='softmax')
])

# SparseCategoricalCrossentropy consumes the integer class labels (0 / 1)
# directly against the two softmax outputs, so no one-hot label encoding
# is required.
model.compile(optimizer=keras.optimizers.Adam(),
              loss=keras.losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 32)                3168      
                                                                 
 dense_1 (Dense)             (None, 32)                1056      
                                                                 
 dense_2 (Dense)             (None, 2)                 66        
                                                                 
Total params: 4290 (16.76 KB)
Trainable params: 4290 (16.76 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [14]:
# Train for 10 epochs in mini-batches of 64, reserving 30% of the supplied
# training arrays for the per-epoch validation metrics; verbose=2 prints
# one summary line per epoch.
history = model.fit(
    x=train_data,
    y=train_label,
    batch_size=64,
    epochs=10,
    validation_split=0.3,
    verbose=2,
)

Epoch 1/10
56/56 - 1s - loss: 0.3508 - accuracy: 0.8515 - val_loss: 0.1067 - val_accuracy: 0.9829 - 934ms/epoch - 17ms/step
Epoch 2/10
56/56 - 0s - loss: 0.0478 - accuracy: 0.9958 - val_loss: 0.0197 - val_accuracy: 0.9993 - 126ms/epoch - 2ms/step
Epoch 3/10
56/56 - 0s - loss: 0.0111 - accuracy: 1.0000 - val_loss: 0.0069 - val_accuracy: 1.0000 - 121ms/epoch - 2ms/step
Epoch 4/10
56/56 - 0s - loss: 0.0045 - accuracy: 1.0000 - val_loss: 0.0034 - val_accuracy: 1.0000 - 124ms/epoch - 2ms/step
Epoch 5/10
56/56 - 0s - loss: 0.0024 - accuracy: 1.0000 - val_loss: 0.0020 - val_accuracy: 1.0000 - 124ms/epoch - 2ms/step
Epoch 6/10
56/56 - 0s - loss: 0.0015 - accuracy: 1.0000 - val_loss: 0.0014 - val_accuracy: 1.0000 - 122ms/epoch - 2ms/step
Epoch 7/10
56/56 - 0s - loss: 9.9041e-04 - accuracy: 1.0000 - val_loss: 9.6041e-04 - val_accuracy: 1.0000 - 121ms/epoch - 2ms/step
Epoch 8/10
56/56 - 0s - loss: 7.2023e-04 - accuracy: 1.0000 - val_loss: 7.2796e-04 - val_accuracy: 1.0000 - 118ms/epoch - 2ms/step

In [15]:
# Persist the trained network to disk (HDF5 file, per the '.h5' suffix)
# so it can be reloaded for evaluation.
model.save(filepath='dnn_mushroom_model.h5')

  saving_api.save_model(


In [16]:
# Reload the saved network from disk and score it on the held-out test
# arrays; with the compiled 'accuracy' metric, evaluate() returns
# [loss, accuracy].
new_model = keras.models.load_model(filepath='dnn_mushroom_model.h5')
result = new_model.evaluate(x=test_data, y=test_label)
print(result)

[0.0003744073328562081, 1.0]
