# Tensorflow Neural Network
In this notebook, I'll take the data, and preprocess it using Tensorflow, and then create a baseline neural network.

In [1]:
# Import packages and libraries
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras import layers

In [2]:
# Load training data into dataframe
X_train = pd.read_csv('../Data/Training_Features.csv')
X_train.drop(columns=['date_recorded', 'permit', 'public_meeting'], inplace=True)
# Drop columns with NaN values
X_train.dropna(axis=1, inplace=True)

y_train = pd.read_csv('../Data/Training_Labels.csv')

# Ordinally encoding the target.
y_train.replace({'functional': 1, 'non functional': 0, 'functional needs repair': 2}, inplace=True)

# Merge data into one frame using the ID column
combined_frame = pd.merge(X_train, y_train, on='id')
combined_frame

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,wpt_name,num_private,basin,region,region_code,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,69572,6000.0,1390,34.938093,-9.856322,none,0,Lake Nyasa,Iringa,11,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,1
1,8776,0.0,1399,34.698766,-2.147466,Zahanati,0,Lake Victoria,Mara,20,...,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,1
2,34310,25.0,686,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Manyara,21,...,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,1
3,67743,0.0,263,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mtwara,90,...,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,0
4,19728,0.0,0,31.130847,-1.825359,Shuleni,0,Lake Victoria,Kagera,18,...,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59395,60739,10.0,1210,37.169807,-3.253847,Area Three Namba 27,0,Pangani,Kilimanjaro,3,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,1
59396,27263,4700.0,1212,35.249991,-9.070629,Kwa Yahona Kuvala,0,Rufiji,Iringa,11,...,soft,good,enough,enough,river,river/lake,surface,communal standpipe,communal standpipe,1
59397,37057,0.0,0,34.017087,-8.750434,Mashine,0,Rufiji,Mbeya,12,...,fluoride,fluoride,enough,enough,machine dbh,borehole,groundwater,hand pump,hand pump,1
59398,31282,0.0,0,35.861315,-6.378573,Mshoro,0,Rufiji,Dodoma,1,...,soft,good,insufficient,insufficient,shallow well,shallow well,groundwater,hand pump,hand pump,1


## Preprocessing

In [3]:
# Split into train, validation, and test set
train, val, test = np.split(combined_frame.sample(frac=1), [int(0.8*len(combined_frame)), int(0.9*len(combined_frame))])

In [4]:
# Confirm expected results
print(len(train), 'training examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

47520 training examples
5940 validation examples
5940 test examples


In [5]:
# Method to convert dataframe to a Tensorflow dataset object
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    df = dataframe.copy()
    labels = df.pop('status_group')
    df = {key: value[:,tf.newaxis] for key, value in dataframe.items()}
    ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    ds = ds.prefetch(batch_size)
    return ds

In [6]:
# Set batch size and run method on training data
batch_size = 5
train_ds = df_to_dataset(train, batch_size=batch_size)

  df = {key: value[:,tf.newaxis] for key, value in dataframe.items()}
2022-02-03 13:51:22.683437: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
[(train_features, label_batch)] = train_ds.take(1)
print('Every feature:', list(train_features.keys()))
print('A batch of tsh:', train_features['amount_tsh'])
print('A batch of targets:', label_batch )

Every feature: ['id', 'amount_tsh', 'gps_height', 'longitude', 'latitude', 'wpt_name', 'num_private', 'basin', 'region', 'region_code', 'district_code', 'lga', 'ward', 'population', 'recorded_by', 'construction_year', 'extraction_type', 'extraction_type_group', 'extraction_type_class', 'management', 'management_group', 'payment', 'payment_type', 'water_quality', 'quality_group', 'quantity', 'quantity_group', 'source', 'source_type', 'source_class', 'waterpoint_type', 'waterpoint_type_group', 'status_group']
A batch of tsh: tf.Tensor(
[[  0.]
 [  0.]
 [250.]
 [  0.]
 [200.]], shape=(5, 1), dtype=float64)
A batch of targets: tf.Tensor([0 1 1 0 2], shape=(5,), dtype=int64)


In [8]:
# Create a helper function for layer normalization
def get_normalization_layer(name, dataset):
    # Create a Normalization layer for the feature.
    normalizer = layers.Normalization(axis=None)

    # Prepare a Dataset that only yields the feature.
    feature_ds = dataset.map(lambda x, y: x[name])

    # Learn the statistics of the data.
    normalizer.adapt(feature_ds)

    return normalizer

In [9]:
# Testing helper function with the "amount_tsh" column
amount_tsh_col = train_features['amount_tsh']
layer = get_normalization_layer('amount_tsh', train_ds)
layer(amount_tsh_col)

<tf.Tensor: shape=(5, 1), dtype=float32, numpy=
array([[-0.10559317],
       [-0.10559317],
       [-0.02172938],
       [-0.10559317],
       [-0.03850214]], dtype=float32)>

In [10]:
# Create a helper function for encoding categorical layers
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    # Create a layer that turns strings into integer indices.
    if dtype == 'string':
        index = layers.StringLookup(max_tokens=max_tokens)
    # Otherwise, create a layer that turns integer values into integer indices.
    else:
        index = layers.IntegerLookup(max_tokens=max_tokens)

    # Prepare a `tf.data.Dataset` that only yields the feature.
    feature_ds = dataset.map(lambda x, y: x[name])

    # Learn the set of possible values and assign them a fixed integer index.
    index.adapt(feature_ds)

    # Encode the integer indices.
    encoder = layers.CategoryEncoding(num_tokens=index.vocabulary_size())

    # Apply multi-hot encoding to the indices. The lambda function captures the
    # layer, so you can use them, or include them in the Keras Functional model later.
    return lambda feature: encoder(index(feature))


In [11]:
# Test helper function for encoding categories
test_type_col = train_features['water_quality']
test_type_layer = get_category_encoding_layer(name='water_quality',
                                              dataset=train_ds,
                                              dtype='string')
test_type_layer(test_type_col)

<tf.Tensor: shape=(5, 9), dtype=float32, numpy=
array([[0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32)>

In [12]:
# Creating new data sets with larger batch sizes.
batch_size = 256
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

  df = {key: value[:,tf.newaxis] for key, value in dataframe.items()}


In [13]:
# make the lists of columns
# num = any columns with numerical value
# ohe = any columns with object value
num_cols = []
ohe_cols = []

for c in X_train.columns:
    if X_train[c].dtype in ['float64', 'int64']:
        num_cols.append(c)
    else:
        ohe_cols.append(c)

In [14]:
all_inputs = []
encoded_features = []

# Numerical features.
for header in num_cols:
    numeric_col = tf.keras.Input(shape=(1,), name=header)
    normalization_layer = get_normalization_layer(header, train_ds)
    encoded_numeric_col = normalization_layer(numeric_col)
    all_inputs.append(numeric_col)
    encoded_features.append(encoded_numeric_col)


In [15]:
for header in ohe_cols:
    categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='string')
    encoding_layer = get_category_encoding_layer(name=header,
                                                 dataset=train_ds,
                                                 dtype='string',
                                                 max_tokens=5)
    encoded_categorical_col = encoding_layer(categorical_col)
    all_inputs.append(categorical_col)
    encoded_features.append(encoded_categorical_col)


## Modeling
All the preperation is now complete. Time to build and compile the model.

In [44]:
all_features = tf.keras.layers.concatenate(encoded_features)
x = tf.keras.layers.Dense(32, activation="relu")(all_features)
x = tf.keras.layers.Dropout(0.5)(x)
output = tf.keras.layers.Dense(1)(x)

model = tf.keras.Model(all_inputs, output)

In [46]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=["accuracy"])

In [47]:
model.fit(train_ds, epochs=10, validation_data=val_ds)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x15a45ff40>

In [43]:
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

Accuracy 0.733501672744751


### Analysis
The neural network does decently for a first time. I think the neural network would perform much better if I pruned some of the noise. Since I put every single column into the model, it is probably reading too much into some of the less useful features.

## Removing Features
Since the amount of features is relativley small, let's only choose features we think may be important this time, and see if that results in improvement

In [52]:
# Hand selected features
num_cols = ['amount_tsh', 'population', 'construction_year']
ohe_cols = ['extraction_type', 'management', 'water_quality', 'quantity', 'source', 'waterpoint_type']

In [53]:
all_inputs = []
encoded_features = []

# Numerical features.
for header in num_cols:
    numeric_col = tf.keras.Input(shape=(1,), name=header)
    normalization_layer = get_normalization_layer(header, train_ds)
    encoded_numeric_col = normalization_layer(numeric_col)
    all_inputs.append(numeric_col)
    encoded_features.append(encoded_numeric_col)

# Categorical features
for header in ohe_cols:
    categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='string')
    encoding_layer = get_category_encoding_layer(name=header,
                                                 dataset=train_ds,
                                                 dtype='string',
                                                 max_tokens=5)
    encoded_categorical_col = encoding_layer(categorical_col)
    all_inputs.append(categorical_col)
    encoded_features.append(encoded_categorical_col)

In [63]:
# Create model
all_features = tf.keras.layers.concatenate(encoded_features)
x = tf.keras.layers.Dense(32, activation="relu")(all_features)
x = tf.keras.layers.Dropout(0.5)(x)
output = tf.keras.layers.Dense(1)(x)

model = tf.keras.Model(all_inputs, output)

In [64]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=["accuracy"])

model.fit(train_ds, epochs=10, validation_data=val_ds)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x15a9d61f0>

In [65]:
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

Accuracy 0.7144781351089478


## Analysis 
While removing the other features slightly decreased the accuracy of the model, the decrease was not too significant. I think that at this point, it would be more efficent to optimize a different model, deep neur