In [1]:
# Let us begin: emit the two greeting lines, one print call per message
for greeting in (
    "Hello there world",
    "Let's get up close with Google Platform AI Notebooks.",
):
    print(greeting)

Hello there world
Let's get up close with Google Platform AI Notebooks.


In [2]:
# basic setup (per Google demo)
import pandas as pd
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

import numpy as np
import json

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from google.cloud import bigquery
from witwidget.notebook.visualization import WitWidget, WitConfigBuilder


In [3]:
# Pull a sample of the public natality table from BigQuery: birth weight
# plus four other columns, restricted to rows with year > 2000.

query="""
SELECT
  weight_pounds,
  is_male,
  mother_age,
  plurality,
  gestation_weeks
FROM
  publicdata.samples.natality
WHERE year > 2000
LIMIT 10000
"""
# Submit the query through a client built with no arguments, then
# materialise the result as a pandas DataFrame and preview it
bq_client = bigquery.Client()
query_job = bq_client.query(query)
df = query_job.to_dataframe()
df.head()


Unnamed: 0,weight_pounds,is_male,mother_age,plurality,gestation_weeks
0,7.568469,True,22,1,46.0
1,8.807467,True,39,1,42.0
2,8.313632,True,23,1,35.0
3,8.000575,False,27,1,40.0
4,6.563162,False,29,1,39.0


In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) for the
# numeric columns of the sample
df.describe()

Unnamed: 0,weight_pounds,mother_age,plurality,gestation_weeks
count,9991.0,10000.0,10000.0,9888.0
mean,7.278609,27.3653,1.0303,38.681634
std,1.354406,6.235699,0.183808,2.622498
min,0.500449,12.0,1.0,19.0
25%,6.624891,22.0,1.0,38.0
50%,7.374463,27.0,1.0,39.0
75%,8.124034,32.0,1.0,40.0
max,12.936726,51.0,4.0,47.0


In [5]:
# Count how many rows fall under each value of the boolean column is_male

df["is_male"].value_counts()

True     5190
False    4810
Name: is_male, dtype: int64

In [6]:
# Prepare for training: discard rows containing any missing value, then
# shuffle what remains with a fixed seed so the row order is repeatable

df = shuffle(df.dropna(), random_state=2)


In [7]:
# Separate the regression target from the features, and cast the boolean
# is_male column to int in the feature frame (df itself is left unchanged)

labels = df['weight_pounds']
data = (
    df.drop(columns=['weight_pounds'])
      .assign(is_male=lambda features: features['is_male'].astype(int))
)

In [8]:
# Peek at the cleaned, shuffled frame. Note this displays df, where is_male
# is still boolean; the int-converted feature copy lives in `data`.
df.head()

Unnamed: 0,weight_pounds,is_male,mother_age,plurality,gestation_weeks
39,8.875811,True,25,1,40.0
6130,8.437091,False,20,1,40.0
5986,6.250105,False,26,1,38.0
7683,6.499227,False,23,1,41.0
4914,6.874013,False,24,1,39.0


In [9]:
# Check column dtypes and non-null counts now that missing rows are dropped
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9883 entries, 39 to 7427
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   weight_pounds    9883 non-null   float64
 1   is_male          9883 non-null   bool   
 2   mother_age       9883 non-null   int64  
 3   plurality        9883 non-null   int64  
 4   gestation_weeks  9883 non-null   float64
dtypes: bool(1), float64(2), int64(2)
memory usage: 395.7 KB


In [10]:
# Hold out a test set from the features (x) and labels (y).
# random_state pins the split so that re-running the notebook yields the
# same train/test partition; without it every run evaluates on different
# rows. The value matches the seed already used for shuffle().

x,y = data,labels
x_train,x_test,y_train,y_test = train_test_split(x,y, random_state=2)

In [11]:
# Build the regression network as a Keras Sequential stack: two ReLU
# hidden layers (64 and 32 units) followed by a single-unit output layer.

# Take the input width from the frame's column count rather than from the
# length of its first row, so no row has to be read (and an empty frame
# does not raise IndexError).
n_features = x_train.shape[1]

model = Sequential([
    Dense(64, activation='relu', input_shape=(n_features,)),
    Dense(32, activation='relu'),
    Dense(1),
])

In [12]:
# Compile the regression model: RMSprop optimizer, mean-squared-error loss,
# with MAE and MSE reported as metrics

compile_options = dict(
    optimizer=tf.keras.optimizers.RMSprop(),
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=['mae', 'mse'],
)
model.compile(**compile_options)

In [13]:
# Show the layer-by-layer architecture and parameter counts
# (this describes the model's structure, not how well it performs)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 64)                320       
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 2,433
Trainable params: 2,433
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Fit the model on the training split for 10 epochs, with
# validation_split=0.1 so a tenth of the training data is used for validation

model.fit(x_train, y_train, epochs=10, validation_split=0.1)


Train on 6670 samples, validate on 742 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fda78528d50>

In [15]:
# Run the trained model on the first ten rows of the held-out test features

num_examples = 10
first_test_rows = x_test[:num_examples]
predictions = model.predict(first_test_rows)


### This is Markdown! ###


In [16]:
# Print each prediction next to the true label from the held-out TEST set
# (y_test), not the training set.
# Iterating over the predictions themselves, rather than range(num_examples),
# avoids an IndexError if fewer than num_examples predictions are available.

for i, prediction in enumerate(predictions[:num_examples]):
    # each prediction is a length-1 row; index 0 is the predicted weight
    print('Predicted val: ', prediction[0])
    print('Actual val: ', y_test.iloc[i])
    print()

Predicted val:  5.2955027
Actual val:  2.74916440714

Predicted val:  7.069784
Actual val:  8.0578956761

Predicted val:  7.0480647
Actual val:  7.68751907594

Predicted val:  6.45994
Actual val:  7.62578964258

Predicted val:  6.083092
Actual val:  8.24969784404

Predicted val:  4.9309306
Actual val:  5.74965579296

Predicted val:  7.326416
Actual val:  10.476366690239999

Predicted val:  5.8661847
Actual val:  4.87442061282

Predicted val:  7.0928397
Actual val:  7.3744626639

Predicted val:  6.965263
Actual val:  6.60945861476



In [17]:
# Day 2 of testing
# Set up the input for Google's What-If Tool: the test features and their
# true labels placed side by side in a single frame

wit_data = pd.concat(objs=[x_test, y_test], axis='columns')

In [18]:
# Preview the combined frame: feature columns followed by weight_pounds
wit_data.head()

Unnamed: 0,is_male,mother_age,plurality,gestation_weeks,weight_pounds
7957,1,24,1,31.0,2.749164
5862,1,24,1,39.0,8.057896
2640,1,20,1,39.0,7.687519
1869,1,38,1,38.0,7.62579
9072,0,33,1,37.0,8.249698


In [19]:
# Prediction hook handed to the What-If Tool: it scores whatever examples
# it is given with the trained model

def custom_predict(examples_to_infer):
    """Return the model's predictions for the given examples.

    Args:
        examples_to_infer: the examples to score; passed unchanged to
            ``model.predict``.

    Returns:
        Whatever ``model.predict`` returns for those examples.
    """
    return model.predict(examples_to_infer)

In [20]:
# Launch the What-If Tool on the first 500 rows of the combined test frame
# (features plus the ground-truth weight_pounds column), using
# custom_predict to score them as a regression model

wit_examples = wit_data[:500].values.tolist()
wit_columns = data.columns.tolist() + ['weight_pounds']

config_builder = WitConfigBuilder(wit_examples, wit_columns)
config_builder = config_builder.set_custom_predict_fn(custom_predict)
config_builder = config_builder.set_target_feature('weight_pounds')
config_builder = config_builder.set_model_type('regression')
WitWidget(config_builder, height=800)


WitWidget(config={'model_type': 'regression', 'label_vocab': [], 'feature_names': ['is_male', 'mother_age', 'p…