<a href="https://colab.research.google.com/github/aydawudu/GCP/blob/main/Tf_Building_Pipelines_draft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#import libraries
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing


In [3]:
tf.__version__

'2.8.2'

In [4]:
import pathlib

dataset_url = 'http://storage.googleapis.com/download.tensorflow.org/data/petfinder-mini.zip'
csv_file = 'gs://cloud-training/mlongcp/v3.0_MLonGC/toy_data/petfinder-mini_toy.csv'

tf.keras.utils.get_file('petfinder_mini.zip', dataset_url,
                        extract=True, cache_dir='.')

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/petfinder-mini.zip


'./datasets/petfinder_mini.zip'

In [5]:
#read in the file
df=pd.read_csv('/content/datasets/petfinder-mini/petfinder-mini.csv')
df.head()

Unnamed: 0,Type,Age,Breed1,Gender,Color1,Color2,MaturitySize,FurLength,Vaccinated,Sterilized,Health,Fee,Description,PhotoAmt,AdoptionSpeed
0,Cat,3,Tabby,Male,Black,White,Small,Short,No,No,Healthy,100,Nibble is a 3+ month old ball of cuteness. He ...,1,2
1,Cat,1,Domestic Medium Hair,Male,Black,Brown,Medium,Medium,Not Sure,Not Sure,Healthy,0,I just found it alone yesterday near my apartm...,2,0
2,Dog,1,Mixed Breed,Male,Brown,White,Medium,Medium,Yes,No,Healthy,0,Their pregnant mother was dumped by her irresp...,7,3
3,Dog,4,Mixed Breed,Female,Black,Brown,Medium,Short,Yes,No,Healthy,150,"Good guard dog, very alert, active, obedience ...",8,2
4,Dog,1,Mixed Breed,Male,Black,No Color,Medium,Short,No,No,Healthy,0,This handsome yet cute boy is up for adoption....,3,2


#Create Target Columns

In [6]:
# In the original dataset "4" indicates the pet was not adopted.
df['target']=np.where(df['AdoptionSpeed']==4, 0, 1)

#Drop un-used columes
df=df.drop(columns=['AdoptionSpeed', 'Description'])
df.head()

Unnamed: 0,Type,Age,Breed1,Gender,Color1,Color2,MaturitySize,FurLength,Vaccinated,Sterilized,Health,Fee,PhotoAmt,target
0,Cat,3,Tabby,Male,Black,White,Small,Short,No,No,Healthy,100,1,1
1,Cat,1,Domestic Medium Hair,Male,Black,Brown,Medium,Medium,Not Sure,Not Sure,Healthy,0,2,1
2,Dog,1,Mixed Breed,Male,Brown,White,Medium,Medium,Yes,No,Healthy,0,7,1
3,Dog,4,Mixed Breed,Female,Black,Brown,Medium,Short,Yes,No,Healthy,150,8,1
4,Dog,1,Mixed Breed,Male,Black,No Color,Medium,Short,No,No,Healthy,0,3,1


In [7]:
# Split dataframe into train, validation, and test
train, test=train_test_split(df, test_size=0.2)
train, val=train_test_split(train, test_size=0.2)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'text examples')

7383 train examples
1846 validation examples
2308 text examples


#Create an input pipleine using tf.data# A utility method to create a tf.data dataset from a Pandas Dataframe


In [8]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(df, shuffle=True, batch_size=32):
  df=df.copy()
  labels=df.pop('target')
  ds=tf.data.Dataset.from_tensor_slices((dict(df), labels))
  if shuffle:
    ds=ds.shuffle(buffer_size=len(df))
  ds=ds.batch(batch_size)
  ds=ds.prefetch(batch_size)
  return ds



In [9]:
batch_size=5
train_ds=df_to_dataset(df=train,shuffle=True, batch_size=batch_size )
train_ds

<PrefetchDataset element_spec=({'Type': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'Age': TensorSpec(shape=(None,), dtype=tf.int64, name=None), 'Breed1': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'Gender': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'Color1': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'Color2': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'MaturitySize': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'FurLength': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'Vaccinated': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'Sterilized': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'Health': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'Fee': TensorSpec(shape=(None,), dtype=tf.int64, name=None), 'PhotoAmt': TensorSpec(shape=(None,), dtype=tf.int64, name=None)}, TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

In [10]:
[(train_features, label_batch)]=train_ds.take(1)
print('Every feature:', list(train_features.keys()))
print('A batch of ages:', train_features['Age'])
print('A batch of target:', label_batch) #returns 5 as the batch_size

Every feature: ['Type', 'Age', 'Breed1', 'Gender', 'Color1', 'Color2', 'MaturitySize', 'FurLength', 'Vaccinated', 'Sterilized', 'Health', 'Fee', 'PhotoAmt']
A batch of ages: tf.Tensor([ 6 28  2  3  6], shape=(5,), dtype=int64)
A batch of target: tf.Tensor([1 1 1 0 1], shape=(5,), dtype=int64)


#Preprocessing Layers

- Normalization - Feature-wise normalization of the data.
- CategoryEncoding - Category encoding layer.
- StringLookup - Maps strings from a vocabulary to integer indices.
- IntegerLookup - Maps integers from a vocabulary to integer indice

#Numeric columns
For each of the Numeric feature, you will use a Normalization() layer to make sure the mean of each feature is 0 and its standard deviation is 1.

get_normalization_layer function returns a layer which applies featurewise normalization to numerical features



In [11]:
def get_normalization_layer(name, dataset):
  #create a normalization layer for our feature
  normalizer=preprocessing.Normalization(axis=None)

  #prepare a Dataset that only yields our feature.
  feature_ds=dataset.map(lambda x, y:x[name])

  #learn the statistics of the data
  normalizer.adapt(feature_ds)

  return normalizer

In [12]:
photo_count_col=train_features['PhotoAmt']
layer=get_normalization_layer('PhotoAmt', train_ds)
layer(photo_count_col)

<tf.Tensor: shape=(5,), dtype=float32, numpy=
array([-0.8276268, -0.187127 , -0.187127 , -1.1478767, -0.5073769],
      dtype=float32)>

#Categorical columns

In this dataset, Type is represented as a string (e.g. 'Dog', or 'Cat'). You cannot feed strings directly to a model. The preprocessing layer takes care of representing strings as a one-hot vector.

get_category_encoding_layer function returns a layer which maps values from a vocabulary to integer indices and one-hot encodes the features.

In [17]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
  # Create a StringLookup layer which will turn strings into integer indices
  if dtype == 'string':
    index = preprocessing.StringLookup(max_tokens=max_tokens)
  else:
    index = preprocessing.IntegerLookup(max_tokens=max_tokens)

# TODO
  # Prepare a Dataset that only yields our feature
  feature_ds = dataset.map(lambda x, y: x[name])

  # Learn the set of possible values and assign them a fixed integer index.
  index.adapt(feature_ds)

  # Create a Discretization for our integer indices.
  encoder = preprocessing.CategoryEncoding(num_tokens=index.vocabulary_size())

  # Apply one-hot encoding to our indices. The lambda function captures the
  # layer so we can use them, or include them in the functional model later.
  return lambda feature: encoder(index(feature))


In [18]:
type_col=train_features['Type']
layer=get_category_encoding_layer('Type', train_ds, 'string')
layer(type_col)

<tf.Tensor: shape=(3,), dtype=float32, numpy=array([0., 1., 1.], dtype=float32)>

Often, you don't want to feed a number directly into the model, but instead use a one-hot encoding of those inputs. Consider raw data that represents a pet's age.Often, you don't want to feed a number directly into the model, but instead use a one-hot encoding of those inputs. Consider raw data that represents a pet's age.

In [19]:
type_col=train_features['Age']
get_category_encoding_layer=get_category_encoding_layer('Age', train_ds, 'int64', 5)
get_category_encoding_layer(type_col)

<tf.Tensor: shape=(5,), dtype=float32, numpy=array([1., 1., 1., 0., 0.], dtype=float32)>

##Choosing columns to train on

In [21]:
batch_size=256
train_ds=df_to_dataset(train, batch_size=batch_size)
val_ds=df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds=df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [24]:
all_inputs=[]
encoded_features=[]

#Numeric features.
for header in['PhotoAmt', 'Fee']:
  numeric_col=tf.keras.Input(shape=(1,), name=header)
  normalization_layer=get_normalization_layer(header, train_ds)
  encoded_numeric_col=normalization_layer(numeric_col)
  all_inputs.append(numeric_col)
  encoded_features.append(encoded_numeric_col)

In [34]:
# Categorical features encoded as integers.
age_col = tf.keras.Input(shape=(1,), name='Age', dtype=None)
encoding_layer = get_category_encoding_layer('Age', train_ds, dtype=None,
                                             max_tokens=5)
encoded_age_col = encoding_layer(age_col)
all_inputs.append(age_col)
encoded_features.append(encoded_age_col)

TypeError: ignored

In [None]:
z