# Preprocesamiento de datos con TensorFlow

In [1]:
import tensorflow as tf

## 1. Obtener datos

Data API

In [2]:
X = tf.range(12)
X

<tf.Tensor: shape=(12,), dtype=int32, numpy=array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11], dtype=int32)>

In [3]:
dataset = tf.data.Dataset.from_tensors(X)
dataset

<TensorDataset element_spec=TensorSpec(shape=(12,), dtype=tf.int32, name=None)>

In [4]:
for x in dataset: print(x)

tf.Tensor([ 0  1  2  3  4  5  6  7  8  9 10 11], shape=(12,), dtype=int32)


In [5]:
dataset = tf.data.Dataset.from_tensor_slices(X)
dataset

<TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>

In [6]:
for x in dataset: print(x)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(11, shape=(), dtype=int32)


Transformaciones

In [7]:
dataset_batch = dataset.repeat(5).batch(10)
for x in dataset_batch: print(x)

tf.Tensor([0 1 2 3 4 5 6 7 8 9], shape=(10,), dtype=int32)
tf.Tensor([10 11  0  1  2  3  4  5  6  7], shape=(10,), dtype=int32)
tf.Tensor([ 8  9 10 11  0  1  2  3  4  5], shape=(10,), dtype=int32)
tf.Tensor([ 6  7  8  9 10 11  0  1  2  3], shape=(10,), dtype=int32)
tf.Tensor([ 4  5  6  7  8  9 10 11  0  1], shape=(10,), dtype=int32)
tf.Tensor([ 2  3  4  5  6  7  8  9 10 11], shape=(10,), dtype=int32)


In [8]:
dataset_batch = dataset.repeat(5).shuffle(100,reshuffle_each_iteration=True).batch(10)
for x in dataset_batch: print(x)

tf.Tensor([11 10  1  5  6  0  2  2  0  0], shape=(10,), dtype=int32)
tf.Tensor([ 8  5  9 10  1  1  8  7  6  5], shape=(10,), dtype=int32)
tf.Tensor([ 4  6  5  3 10 11  4  8  6  9], shape=(10,), dtype=int32)
tf.Tensor([ 9  3  3  8  4  1  4 10  0  2], shape=(10,), dtype=int32)
tf.Tensor([ 6  2  2  7  8  1  5 11  9 10], shape=(10,), dtype=int32)
tf.Tensor([ 0  4 11  9  3  7  3  7 11  7], shape=(10,), dtype=int32)


In [9]:
dataset2 = dataset.map(lambda x: 2*x)
for x in dataset2: print(x)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(10, shape=(), dtype=int32)
tf.Tensor(12, shape=(), dtype=int32)
tf.Tensor(14, shape=(), dtype=int32)
tf.Tensor(16, shape=(), dtype=int32)
tf.Tensor(18, shape=(), dtype=int32)
tf.Tensor(20, shape=(), dtype=int32)
tf.Tensor(22, shape=(), dtype=int32)


In [10]:
dataset.apply(lambda x: x.map(lambda y: 0.2*tf.cast(y,tf.float32)))

<MapDataset element_spec=TensorSpec(shape=(), dtype=tf.float32, name=None)>

In [11]:
dataset.filter(lambda x: x<8)

<FilterDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>

## 2. Datasets desde un CSV

In [18]:
dataset_path = '/content/sample_data/california_housing_*.csv'

In [21]:
dataset_files = tf.data.Dataset.list_files(dataset_path, shuffle=False)

In [22]:
# Preview at most the first 6 matched file paths; `take` bounds the
# iteration without a hand-maintained counter variable.
for x in dataset_files.take(6):
  print(x)

tf.Tensor(b'/content/sample_data/california_housing_test.csv', shape=(), dtype=string)
tf.Tensor(b'/content/sample_data/california_housing_train.csv', shape=(), dtype=string)


In [23]:
dataset = dataset_files.interleave(
    lambda x: tf.data.TextLineDataset(x).skip(1), cycle_length=2
)

In [25]:
# Preview at most the first 11 interleaved CSV lines; `take` bounds the
# iteration without a hand-maintained counter variable.
for x in dataset.take(11):
  print(x)

tf.Tensor(b'-122.050000,37.370000,27.000000,3885.000000,661.000000,1537.000000,606.000000,6.608500,344700.000000', shape=(), dtype=string)
tf.Tensor(b'-114.310000,34.190000,15.000000,5612.000000,1283.000000,1015.000000,472.000000,1.493600,66900.000000', shape=(), dtype=string)
tf.Tensor(b'-118.300000,34.260000,43.000000,1510.000000,310.000000,809.000000,277.000000,3.599000,176500.000000', shape=(), dtype=string)
tf.Tensor(b'-114.470000,34.400000,19.000000,7650.000000,1901.000000,1129.000000,463.000000,1.820000,80100.000000', shape=(), dtype=string)
tf.Tensor(b'-117.810000,33.780000,27.000000,3589.000000,507.000000,1484.000000,495.000000,5.793400,270500.000000', shape=(), dtype=string)
tf.Tensor(b'-114.560000,33.690000,17.000000,720.000000,174.000000,333.000000,117.000000,1.650900,85700.000000', shape=(), dtype=string)
tf.Tensor(b'-118.360000,33.820000,28.000000,67.000000,15.000000,49.000000,11.000000,6.135900,330000.000000', shape=(), dtype=string)
tf.Tensor(b'-114.570000,33.640000,14.

Preprocesamiento

In [36]:
# Placeholder normalization statistics: with mean 0 and std 1 the features
# pass through `preprocess` unscaled.
X_mean = 0
X_std = 1
# Number of comma-separated columns expected in each CSV line.
n_inputs = 9
# One float default per column, handed to tf.io.decode_csv as record defaults.
tensor_ref = [0.0 for _ in range(n_inputs)]


def preprocess(line):
  """Parse one CSV line into a (features, target) pair.

  Args:
    line: A single CSV record (bytes or scalar string tensor) with
      `n_inputs` comma-separated numeric fields.

  Returns:
    A tuple `(x, y)`: `x` stacks every column except the last and is
    standardized with the module-level `X_mean` / `X_std`; `y` is the last
    column kept as a length-1 tensor.
  """
  columns = tf.io.decode_csv(line, record_defaults=tensor_ref)
  features = tf.stack(columns[:-1])
  # Slice (not index) so the target keeps a trailing dimension of size 1.
  target = tf.stack(columns[-1:])
  scaled = (features - X_mean) / X_std
  return scaled, target

In [37]:
preprocess(b'-122.050000,37.370000,27.000000,3885.000000,661.000000,1537.000000,606.000000,6.608500,344700.000000')

(<tf.Tensor: shape=(8,), dtype=float32, numpy=
 array([-122.05  ,   37.37  ,   27.    , 3885.    ,  661.    , 1537.    ,
         606.    ,    6.6085], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([344700.], dtype=float32)>)

In [45]:
def read_csv(dataset_path, batch_size=128, n_files=10, repeat=1):
  """Build a shuffled, batched tf.data pipeline over CSV files matching a glob.

  Args:
    dataset_path: Glob pattern passed to `tf.data.Dataset.list_files`.
    batch_size: Number of rows per batch; the shuffle buffer holds
      `10 * batch_size` rows.
    n_files: Number of files `interleave` reads from concurrently (its
      `cycle_length`). Previously this argument was accepted but ignored
      and the cycle length was hard-coded to 2.
    repeat: Number of passes over the data, forwarded to `Dataset.repeat`.

  Returns:
    A dataset yielding the outputs of `preprocess`, batched with
    `drop_remainder=True` and prefetched two batches ahead.
  """
  dataset_files = tf.data.Dataset.list_files(dataset_path, shuffle=False)
  # Drop the first line of every file (presumably the CSV header) and mix
  # lines coming from up to `n_files` files.
  dataset = dataset_files.interleave(
    lambda x: tf.data.TextLineDataset(x).skip(1), cycle_length=n_files
  )
  dataset = dataset.map(preprocess)
  # Shuffle before repeating so each pass over the data is reshuffled.
  dataset = dataset.shuffle(10*batch_size, reshuffle_each_iteration=True).repeat(repeat)
  return dataset.batch(batch_size, drop_remainder=True).prefetch(2)

In [39]:
california_dataset = read_csv('./sample_data/california_housing_*')

In [46]:
for x,y in california_dataset.take(5):
  print(x.shape, y.shape)

(128, 8) (128, 1)
(128, 8) (128, 1)
(128, 8) (128, 1)
(128, 8) (128, 1)
(128, 8) (128, 1)


Prueba con Keras

In [47]:
from tensorflow.keras import layers, models

In [48]:
# Fully connected regressor: 8 input features -> four ReLU layers of 128
# units -> a 32-unit linear layer -> a single linear output.
model = models.Sequential([
  layers.Dense(128, input_shape=(8,), activation='relu'),
  layers.Dense(128, activation='relu'),
  layers.Dense(128, activation='relu'),
  layers.Dense(128, activation='relu'),
  layers.Dense(32, activation='linear'),
  layers.Dense(1, activation='linear'),
])

In [51]:
model.compile(optimizer='adam',loss='mse',metrics=['mse'])
dataset = california_dataset.take(100)

In [52]:
model.fit(dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f5bf4670450>

## 2. Formato TFRecord

In [57]:
# Write a TFRecord file containing two raw byte-string records.
# The context manager closes the writer when the block exits.
with tf.io.TFRecordWriter('record1.tfrecord') as file:
  file.write(b'Hello world from TFRecordWriter')
  file.write(b'Second line')

In [58]:
tfrecord_path = ['record1.tfrecord']
dataset = tf.data.TFRecordDataset(tfrecord_path)

In [59]:
for x in dataset: print(x)

tf.Tensor(b'Hello world from TFRecordWriter', shape=(), dtype=string)
tf.Tensor(b'Second line', shape=(), dtype=string)


In [61]:
# Compression: write the same two records with GZIP compression.
# NOTE: this targets the same path ('record1.tfrecord') as the uncompressed
# writer earlier in the notebook, so that file is overwritten.
options = tf.io.TFRecordOptions(compression_type='GZIP')
with tf.io.TFRecordWriter('record1.tfrecord', options) as file:
  file.write(b'Hello world from TFRecordWriter')
  file.write(b'Second line')

In [64]:
tfrecord_path = ['record1.tfrecord']
dataset = tf.data.TFRecordDataset(tfrecord_path, compression_type='GZIP')
for x in dataset: print(x)

tf.Tensor(b'Hello world from TFRecordWriter', shape=(), dtype=string)
tf.Tensor(b'Second line', shape=(), dtype=string)


## 3. Funciones adicionales

Normalización

In [66]:
import numpy as np
class StandardScaler(tf.keras.layers.Layer):
  """Keras layer that standardizes inputs with statistics taken from `data`.

  The mean and standard deviation are computed once, at construction time,
  along axis 0 of `data` and stored as `self.mean` and `self.std`.
  """

  def __init__(self, data, **kwargs):
    """Create the layer and compute its statistics from `data`.

    Args:
      data: Array-like sample reduced along axis 0 to obtain mean and std.
      **kwargs: Forwarded unchanged to `tf.keras.layers.Layer.__init__`.
    """
    super().__init__(**kwargs)
    self.fit_params(data)

  def fit_params(self, data):
    """Store the mean and standard deviation of `data` along axis 0."""
    # keepdims=True keeps the reduced axis with size 1 in the stored arrays.
    reduce_opts = {'axis': 0, 'keepdims': True}
    self.mean = np.mean(data, **reduce_opts)
    self.std = np.std(data, **reduce_opts)

  def call(self, inputs):
    """Return `(inputs - mean) / (std + epsilon)`."""
    centered = inputs - self.mean
    # Epsilon keeps the denominator non-zero when a column has zero std.
    denominator = self.std + tf.keras.backend.epsilon()
    return centered / denominator

One-hot encoder

In [77]:
cats = ['cat1', 'cat2', 'cat3']
idx = tf.range(len(cats), dtype=tf.int64)
table = tf.lookup.KeyValueTensorInitializer(cats,idx)
table

<tensorflow.python.ops.lookup_ops.KeyValueTensorInitializer at 0x7f5bf425a6d0>

In [78]:
# Number of out-of-vocabulary buckets appended after the known categories.
oov = 2
# Build the initializer inline from `cats` / `idx` instead of reusing the
# name `table` as input: the original rebinding (initializer -> table) made
# this cell fail when executed a second time.
table = tf.lookup.StaticVocabularyTable(
  tf.lookup.KeyValueTensorInitializer(cats, idx), oov
)

In [72]:
table

<tensorflow.python.ops.lookup_ops.StaticVocabularyTable at 0x7f5bf42e1a50>

In [75]:
# Aplicacion
categories = tf.constant(['cat1', 'cat1', 'cat3', 'cat2'])
cats_idx = table.lookup(categories)
cats_idx

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([0, 0, 2, 1])>

In [79]:
cats_ohe = tf.one_hot(cats_idx, depth=len(cats)+oov)

In [80]:
cats_ohe

<tf.Tensor: shape=(4, 5), dtype=float32, numpy=
array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.]], dtype=float32)>

## 4. Pipelines

In [86]:
! pip install tensorflow-transform

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [87]:
import tensorflow_transform as tft

In [None]:
def preprocess(inputs):
  """Apply tensorflow-transform preprocessing to the input columns.

  NOTE(review): this reuses the name `preprocess` of the CSV-parsing function
  defined earlier in the notebook; `read_csv` looks that name up when it is
  called, so running this cell changes what `read_csv` maps over the data.

  Args:
    inputs: Indexable collection of columns. Everything except the last two
      entries is scaled; the last entry is vocabulary-encoded. The
      second-to-last entry is neither transformed nor returned (the original
      code bound it to an unused local `y`; presumably the target, confirm).

  Returns:
    A tuple `(x1_mean, x2_process)`: the z-scored leading columns and the
    vocabulary indices of the last column.
  """
  x1 = inputs[:-2]
  x2 = inputs[-1]
  x1_mean = tft.scale_to_z_score(x1)
  x2_process = tft.compute_and_apply_vocabulary(x2)
  return x1_mean, x2_process