# TensorFlow tutorial on loading CSV data

2 main parts:

- load data off the disk
- preprocess into a form suitable for training


### Setup

In [1]:
import pandas as pd
import numpy as np

np.set_printoptions(precision=3, suppress=True)

import tensorflow as tf
from tensorflow.keras import layers

2023-08-02 06:54:11.714050: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-02 06:54:18.445299: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-02 06:54:18.470765: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Small datasets: load into memory as a pandas dataframe or numpy array

In [2]:
# Column names are passed explicitly via `names=` rather than read from the file.
ABALONE_TRAIN_URL = "https://storage.googleapis.com/download.tensorflow.org/data/abalone_train.csv"
ABALONE_COLUMNS = [
    "Length",
    "Diameter",
    "Height",
    "Whole weight",
    "Shucked weight",
    "Viscera weight",
    "Shell weight",
    "Age",
]

abalone_train = pd.read_csv(ABALONE_TRAIN_URL, names=ABALONE_COLUMNS)

abalone_train.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Age
0,0.435,0.335,0.11,0.334,0.1355,0.0775,0.0965,7
1,0.585,0.45,0.125,0.874,0.3545,0.2075,0.225,6
2,0.655,0.51,0.16,1.092,0.396,0.2825,0.37,14
3,0.545,0.425,0.125,0.768,0.294,0.1495,0.26,16
4,0.545,0.42,0.13,0.879,0.374,0.1695,0.23,13


task: predict age from other measurements

In [3]:
# Separate the regression target ('Age') from the remaining feature columns,
# leaving `abalone_train` itself untouched.
abalone_labels = abalone_train['Age'].copy()
abalone_features = abalone_train.drop(columns='Age')

In [4]:
# Convert the feature DataFrame into a plain NumPy array (the name is rebound to the array).
abalone_features = np.array(abalone_features)
abalone_features

array([[0.435, 0.335, 0.11 , ..., 0.136, 0.077, 0.097],
       [0.585, 0.45 , 0.125, ..., 0.354, 0.207, 0.225],
       [0.655, 0.51 , 0.16 , ..., 0.396, 0.282, 0.37 ],
       ...,
       [0.53 , 0.42 , 0.13 , ..., 0.374, 0.167, 0.249],
       [0.395, 0.315, 0.105, ..., 0.118, 0.091, 0.119],
       [0.45 , 0.355, 0.12 , ..., 0.115, 0.067, 0.16 ]])

Make a simple regression model

In [5]:
# Rows are examples, columns are the individual feature measurements.
abalone_features.shape

(3320, 7)

In [6]:
# Small fully-connected regression network.
# The hidden layer needs a non-linear activation: two stacked Dense layers
# without one collapse into a single linear map of the inputs.
abalone_model = tf.keras.Sequential([
    layers.Dense(64, activation='relu'),
    layers.Dense(1)
])

# Mean squared error is the regression loss; Adam is used with its default settings.
abalone_model.compile(loss=tf.keras.losses.MeanSquaredError(),
                      optimizer=tf.keras.optimizers.Adam())

In [7]:
# Train for 10 epochs directly on the in-memory features and labels.
abalone_model.fit(abalone_features, abalone_labels, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7fd1ffd36440>

### Preprocessing

Keras has a normalization layer

In [8]:
normalize = layers.Normalization()

# Learn the per-feature mean and variance from the training features.
# This must happen before the layer is used in a model; otherwise the layer
# has no statistics from the data to normalize with.
normalize.adapt(abalone_features)

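Use `Normalization.adapt` to compute each feature's mean and variance from the training data. I believe this is similar to scikit-learn's `fit` method. Putting the `normalize` layer at the front of the sequential model then applies the normalization (the `transform` step) every time the model is called.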

In [9]:
# Same regression network as before, with the `normalize` layer placed in
# front so that the preprocessing is part of the model itself.
# The hidden layer uses a ReLU activation; without it the two Dense layers
# would be equivalent to a single linear layer.
norm_abalone_model = tf.keras.Sequential([
    normalize,
    layers.Dense(64, activation='relu'),
    layers.Dense(1),
])

norm_abalone_model.compile(loss=tf.keras.losses.MeanSquaredError(),
                           optimizer=tf.keras.optimizers.Adam())

# The raw (un-normalized) features are passed in; normalization happens inside the model.
norm_abalone_model.fit(abalone_features, abalone_labels, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7fd1fc125ff0>

### Mixed datatypes

In [10]:
# The Titanic training CSV mixes numeric and string columns.
TITANIC_TRAIN_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
titanic = pd.read_csv(TITANIC_TRAIN_URL)
titanic.head()

Unnamed: 0,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,0,male,22.0,1,0,7.25,Third,unknown,Southampton,n
1,1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
2,1,female,26.0,0,0,7.925,Third,unknown,Southampton,y
3,1,female,35.0,1,0,53.1,First,C,Southampton,n
4,0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y


In [11]:
# Separate the 'survived' label from the feature columns without modifying `titanic`.
titanic_labels = titanic['survived'].copy()
titanic_features = titanic.drop(columns='survived')

Now we can't simply put it in a numpy array and pass it to a sequential model.

**Benefit of Keras models: you bring the preprocessing steps with you when you save the model**

Here, we'll build a model using the functional api

This operates on "symbolic" tensors. "eager" tensors have a value, symbolic tensors do not. These symbolic tensors keep track of which operations run on them, then build a representation of the calculation

example:

In [12]:
# create a symbolic input
input = tf.keras.Input(shape=(), dtype=tf.float32)

# perform a calculation
result = 2*input + 1

# result doesn't have a value
result

<KerasTensor: shape=(None,) dtype=float32 (created by layer 'tf.__operators__.add')>

In [13]:
# Wrap the recorded input -> result computation in a callable Keras model.
calc = tf.keras.Model(inputs=input, outputs=result)

In [14]:
# Calling the model on a concrete value evaluates 2*x + 1 for x = 1.
calc(1).numpy()

3.0

In [15]:
# Same computation for x = 2.
calc(2).numpy()

5.0

We'll start by building a set of symbolic objects matching the names and datatypes of the CSV columns

In [16]:
def _symbolic_dtype(column):
    """Return the Keras input dtype for a pandas column.

    Object-typed columns are treated as strings; every other column is fed
    to the model as float32.
    """
    return tf.string if column.dtype == object else tf.float32


# One symbolic input per CSV column, keyed (and named) by the column name.
inputs = {
    name: tf.keras.Input(shape=(1,), name=name, dtype=_symbolic_dtype(column))
    for name, column in titanic_features.items()
}

inputs

{'sex': <KerasTensor: shape=(None, 1) dtype=string (created by layer 'sex')>,
 'age': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'age')>,
 'n_siblings_spouses': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'n_siblings_spouses')>,
 'parch': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'parch')>,
 'fare': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'fare')>,
 'class': <KerasTensor: shape=(None, 1) dtype=string (created by layer 'class')>,
 'deck': <KerasTensor: shape=(None, 1) dtype=string (created by layer 'deck')>,
 'embark_town': <KerasTensor: shape=(None, 1) dtype=string (created by layer 'embark_town')>,
 'alone': <KerasTensor: shape=(None, 1) dtype=string (created by layer 'alone')>}

step 1: concatenate numeric inputs together and run through normalization layer

In [19]:
# Collect only the float32 (numeric) symbolic inputs, preserving column order.
numeric_inputs = {}
for feature_name, symbolic_input in inputs.items():
    if symbolic_input.dtype == tf.float32:
        numeric_inputs[feature_name] = symbolic_input

# Join the numeric inputs into one tensor and normalize them together.
x = layers.Concatenate()(list(numeric_inputs.values()))
norm = layers.Normalization()
# The normalization statistics come from the matching columns of the raw DataFrame.
norm.adapt(np.array(titanic[list(numeric_inputs)]))
all_numeric_inputs = norm(x)
all_numeric_inputs

<KerasTensor: shape=(None, 4) dtype=float32 (created by layer 'normalization_2')>

In [20]:
# Start the list of preprocessed tensors with the normalized numeric block;
# the encoded string features are appended to it afterwards.
preprocessed_inputs = [all_numeric_inputs]

For strings: use StringLookup to go from strings to integer indices, then use CategoryEncoding to convert to floats

In [21]:
# Encode every non-numeric input: string -> integer index -> one-hot floats.
# Note: this cell appends to `preprocessed_inputs`, so re-create that list
# before running this cell a second time.
for name, symbolic_input in inputs.items():
    if symbolic_input.dtype != tf.float32:
        # The vocabulary is the set of distinct values seen in this column.
        vocabulary = np.unique(titanic_features[name])
        lookup = layers.StringLookup(vocabulary=vocabulary)
        one_hot = layers.CategoryEncoding(num_tokens=lookup.vocabulary_size())

        encoded = one_hot(lookup(symbolic_input))
        preprocessed_inputs.append(encoded)

now we can concatenate all of them together

In [24]:
# Merge the numeric block and all one-hot blocks into a single feature tensor.
concatenate = layers.Concatenate()
preprocessed_inputs_cat = concatenate(preprocessed_inputs)

# A model whose only job is preprocessing: raw symbolic inputs in, features out.
titanic_preprocessing = tf.keras.Model(inputs, preprocessed_inputs_cat)

# Draw the preprocessing graph left-to-right (requires pydot and graphviz).
tf.keras.utils.plot_model(
    model=titanic_preprocessing,
    rankdir="LR",
    dpi=72,
    show_shapes=True,
)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


This is a model that just contains the preprocessing. It should expect a dictionary of tensors

In [32]:
# Turn each DataFrame column into a NumPy array, keyed by column name,
# which is the dictionary-of-tensors form the preprocessing model is called with.
titanic_features_dict = {}
for name, value in titanic_features.items():
    titanic_features_dict[name] = np.array(value)

titanic_preprocessing(titanic_features_dict)

<tf.Tensor: shape=(627, 28), dtype=float32, numpy=
array([[-0.61 ,  0.395, -0.479, ...,  0.   ,  1.   ,  0.   ],
       [ 0.669,  0.395, -0.479, ...,  0.   ,  1.   ,  0.   ],
       [-0.29 , -0.474, -0.479, ...,  0.   ,  0.   ,  1.   ],
       ...,
       [-0.85 , -0.474, -0.479, ...,  0.   ,  0.   ,  1.   ],
       [-0.13 ,  0.395,  2.045, ...,  0.   ,  1.   ,  0.   ],
       [ 0.189, -0.474, -0.479, ...,  0.   ,  0.   ,  1.   ]],
      dtype=float32)>

zoom in on the preprocessed features of a single passenger (the first row)

In [33]:
# Keep only the first row of every column (sliced, so each entry stays 1-D).
feat_dict = {}
for name, values in titanic_features_dict.items():
    feat_dict[name] = values[0:1]

titanic_preprocessing(feat_dict)

<tf.Tensor: shape=(1, 28), dtype=float32, numpy=
array([[-0.61 ,  0.395, -0.479, -0.497,  0.   ,  0.   ,  1.   ,  0.   ,
         0.   ,  0.   ,  1.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,
         0.   ,  0.   ,  0.   ,  1.   ,  0.   ,  0.   ,  0.   ,  1.   ,
         0.   ,  0.   ,  1.   ,  0.   ]], dtype=float32)>

build the model on top of this

In [35]:
def build_titanic_model(preprocessing_head, inputs):
    """Build and compile a binary classifier on top of a preprocessing head.

    Args:
        preprocessing_head: callable (here a Keras model) that maps the
            symbolic `inputs` to a single preprocessed feature tensor.
        inputs: the symbolic `tf.keras.Input` objects describing the raw
            features. These are placeholders, not the pandas data itself.

    Returns:
        A compiled `tf.keras.Model` that accepts the raw features, applies
        the preprocessing, and outputs one logit per example.
    """
    # The hidden layer needs a non-linear activation; without one the two
    # Dense layers are equivalent to a single linear layer.
    body = tf.keras.Sequential([
        layers.Dense(64, activation='relu'),
        layers.Dense(1)
    ])

    # `result` is the symbolic graph of the full computation:
    # preprocessing first, then the dense layers.
    preprocessed_inputs = preprocessing_head(inputs)
    result = body(preprocessed_inputs)
    model = tf.keras.Model(inputs, result)

    # The final layer emits raw logits, hence from_logits=True.
    model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                  optimizer=tf.keras.optimizers.Adam())

    return model

# The builder has its own name so that this cell can be re-run: previously the
# function was called `titanic_model` and was overwritten by the model it returned.
titanic_model = build_titanic_model(titanic_preprocessing, inputs)

Fit the model - remember that we're giving it a dictionary of tensors

In [36]:
# The features are passed as a dict of arrays keyed by column name, matching the model's named inputs.
titanic_model.fit(x=titanic_features_dict, y=titanic_labels, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7fd1e5dfa140>

We can save the model and reload it to get identical results

In [37]:
# Write the model (which includes the preprocessing layers) to a local directory.
titanic_model.save('load_data_tutorial_model')

INFO:tensorflow:Assets written to: load_data_tutorial_model/assets


INFO:tensorflow:Assets written to: load_data_tutorial_model/assets


In [38]:
# Load the model back from the directory written by the save step.
reloaded = tf.keras.models.load_model('load_data_tutorial_model')

In [39]:
# Use the first row of every feature column as a one-example batch.
features_dict = {name:values[:1] for name, values in titanic_features_dict.items()}

before = titanic_model(features_dict)
after = reloaded(features_dict)

# Compare with an absolute tolerance. A plain `(before - after) < 1e-3` would
# also pass when `after` is much larger than `before`, because the difference
# is then a large negative number.
np.testing.assert_allclose(before.numpy(), after.numpy(), atol=1e-3)
print(before)
print(after)

tf.Tensor([[-1.896]], shape=(1, 1), dtype=float32)
tf.Tensor([[-1.896]], shape=(1, 1), dtype=float32)


### Using tf.data

`tf.data` gives us more control over the input data pipeline and lets us work with data that doesn't easily fit into memory.

Consider the following code to manually slice up the dictionary of features from the last section:

In [40]:
import itertools

def slices(features):
    """Yield one example at a time from a dictionary of equal-length columns.

    Args:
        features: mapping from feature name to an indexable, sized sequence
            (e.g. a list or NumPy array) holding that feature's values.

    Yields:
        A dict mapping each feature name to its value at index 0, 1, 2, ...

    The generator stops after the last row instead of counting forever and
    raising IndexError once the index runs past the end of the columns. If the
    columns have different lengths, iteration stops at the shortest one. An
    empty `features` mapping yields nothing.
    """
    n_examples = min((len(values) for values in features.values()), default=0)
    for i in range(n_examples):
        # for each feature take index `i`
        yield {name: values[i] for name, values in features.items()}

now run and print first example

In [41]:
# Take just the first example from the generator and print it field by field.
for example in itertools.islice(slices(titanic_features_dict), 1):
    for name in example:
        value = example[name]
        print(f"{name:19s}: {value}")

sex                : male
age                : 22.0
n_siblings_spouses : 1
parch              : 0
fare               : 7.25
class              : Third
deck               : unknown
embark_town        : Southampton
alone              : n


Most basic dataset: `tf.data.Dataset.from_tensor_slices` - a generalized version of `slices`

In [42]:
# Build a dataset that yields one {feature name: value} dict per row.
features_ds = tf.data.Dataset.from_tensor_slices(titanic_features_dict)

# Print only the first element of the dataset.
for example in features_ds.take(1):
    for name in example:
        value = example[name]
        print(f"{name:19s}: {value}")

sex                : b'male'
age                : 22.0
n_siblings_spouses : 1
parch              : 0
fare               : 7.25
class              : b'Third'
deck               : b'unknown'
embark_town        : b'Southampton'
alone              : b'n'


can handle any structure of nested dictionaries or tuples

In [43]:
# Each dataset element is a (features dict, label) pair.
titanic_ds = tf.data.Dataset.from_tensor_slices((titanic_features_dict, titanic_labels))

to train a model: shuffle and batch the data

In [44]:
# Shuffle with a buffer as large as the whole dataset, then group into batches of 32.
titanic_batches = titanic_ds.shuffle(len(titanic_labels)).batch(32)

now we can pass the whole dataset to the model

In [45]:
# The dataset already yields (features, label) batches, so no separate `y` is passed.
titanic_model.fit(titanic_batches, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7fd1e55f5a80>

### From a single file

functions for loading csv files

In [46]:
# Download the Titanic CSV and keep the path of the local copy.
titanic_file_path = tf.keras.utils.get_file("train.csv", "https://storage.googleapis.com/tf-datasets/titanic/train.csv")

Downloading data from https://storage.googleapis.com/tf-datasets/titanic/train.csv


Now read the csv from the file and make a dataset

In [47]:
# Read the CSV straight from disk as a dataset of (features dict, label) batches.
# NOTE(review): `ignore_errors=True` triggers the deprecation message shown in
# this cell's output; kept as-is to preserve behaviour.
titanic_csv_options = dict(
    batch_size=5,            # artificially small so the printed examples stay readable
    label_name='survived',
    num_epochs=1,            # one pass only; the dataset does not repeat
    ignore_errors=True,
)
titanic_csv_ds = tf.data.experimental.make_csv_dataset(
    titanic_file_path, **titanic_csv_options
)

Instructions for updating:
Use `tf.data.Dataset.ignore_errors` instead.


Instructions for updating:
Use `tf.data.Dataset.ignore_errors` instead.


In [49]:
# Show the first batch: every feature column, then the labels.
for feature_batch, label_batch in titanic_csv_ds.take(1):
    for column_name in feature_batch:
        column_values = feature_batch[column_name]
        print(f"{column_name:20s}: {column_values}")
    print()
    print(f"{'label':20s}: {label_batch}")

sex                 : [b'male' b'female' b'female' b'male' b'male']
age                 : [32. 36. 31. 50. 28.]
n_siblings_spouses  : [0 1 1 1 0]
parch               : [0 0 0 0 0]
fare                : [ 7.925 17.4   18.    55.9    7.896]
class               : [b'Third' b'Third' b'Third' b'First' b'Third']
deck                : [b'unknown' b'unknown' b'unknown' b'E' b'unknown']
embark_town         : [b'Southampton' b'Southampton' b'Southampton' b'Southampton'
 b'Southampton']
alone               : [b'y' b'n' b'n' b'n' b'y']

label               : [0 1 0 0 0]


can also decompress files on the fly

In [50]:
# Download the gzip-compressed traffic CSV into ./traffic and keep its local path.
TRAFFIC_VOLUME_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00492/Metro_Interstate_Traffic_Volume.csv.gz"
traffic_volume_csv_gz = tf.keras.utils.get_file(
    'Metro_Interstate_Traffic_Volume.csv.gz',
    TRAFFIC_VOLUME_URL,
    cache_dir='.',
    cache_subdir='traffic',
)

Downloading data from https://archive.ics.uci.edu/ml/machine-learning-databases/00492/Metro_Interstate_Traffic_Volume.csv.gz
   8192/Unknown - 0s 0us/step

In [51]:
# `compression_type="GZIP"` lets the dataset read the .gz file directly.
traffic_volume_csv_gz_ds = tf.data.experimental.make_csv_dataset(
    traffic_volume_csv_gz,
    batch_size=256,
    label_name='traffic_volume',
    num_epochs=1,
    compression_type="GZIP",
)

# Print the first five rows of the first batch, column by column.
PREVIEW_ROWS = 5
for feature_batch, label_batch in traffic_volume_csv_gz_ds.take(1):
    for column_name in feature_batch:
        preview = feature_batch[column_name][:PREVIEW_ROWS]
        print(f"{column_name:20s}: {preview}")
    print()
    print(f"{'label':20s}: {label_batch[:PREVIEW_ROWS]}")

holiday             : [b'None' b'None' b'None' b'None' b'None']
temp                : [276.29 295.76 252.97 293.77 286.81]
rain_1h             : [0. 0. 0. 0. 0.]
snow_1h             : [0. 0. 0. 0. 0.]
clouds_all          : [90  0  1  0 90]
weather_main        : [b'Rain' b'Clear' b'Clear' b'Clear' b'Clouds']
weather_description : [b'light rain' b'Sky is Clear' b'sky is clear' b'Sky is Clear'
 b'overcast clouds']
date_time           : [b'2013-01-11 17:00:00' b'2013-09-25 15:00:00' b'2013-02-21 08:00:00'
 b'2013-08-10 20:00:00' b'2013-05-27 13:00:00']

label               : [5668 6000 6250 3333 3436]


### Caching

There's an overhead to parsing csv data. Can be a bottleneck for small models.

might want to use `Dataset.cache` or `tf.data.Dataset.snapshot` so csv is only parsed in first epoch

### Multiple files

In [52]:
# Download the fonts archive into ./fonts and extract the CSV files it contains.
FONTS_ZIP_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00417/fonts.zip"
fonts_zip = tf.keras.utils.get_file(
    'fonts.zip',
    FONTS_ZIP_URL,
    cache_dir='.',
    cache_subdir='fonts',
    extract=True,
)

Downloading data from https://archive.ics.uci.edu/ml/machine-learning-databases/00417/fonts.zip
   8192/Unknown - 0s 0us/step

In [53]:
import pathlib
# List the extracted CSV files as sorted path strings and preview the first ten.
fonts_dir = pathlib.Path('fonts')
font_csvs = sorted(map(str, fonts_dir.glob("*.csv")))

font_csvs[:10]

['fonts/AGENCY.csv',
 'fonts/ARIAL.csv',
 'fonts/BAITI.csv',
 'fonts/BANKGOTHIC.csv',
 'fonts/BASKERVILLE.csv',
 'fonts/BAUHAUS.csv',
 'fonts/BELL.csv',
 'fonts/BERLIN.csv',
 'fonts/BERNARD.csv',
 'fonts/BITSTREAMVERA.csv']

In [54]:
# Number of CSV files found in the fonts directory.
len(font_csvs)

153

You can pass a glob-style `file_pattern` to `make_csv_dataset`. Use `num_parallel_reads` to set how many files are read in parallel and interleaved together.

In [57]:
# One dataset over every fonts CSV; no label_name is given, so each element
# is just a dict of feature batches.
fonts_csv_options = dict(
    batch_size=10,
    num_epochs=1,
    num_parallel_reads=20,      # files read concurrently and interleaved
    shuffle_buffer_size=1000,
)
fonts_ds = tf.data.experimental.make_csv_dataset(
    file_pattern="fonts/*.csv", **fonts_csv_options
)

In [58]:
# Print the first 16 columns of the first batch, then the total column count.
for features in fonts_ds.take(1):
    for name, value in itertools.islice(features.items(), 16):
        print(f"{name:20s}: {value}")
print('...')
# `features` still refers to the batch from the loop above.
print(f"[total: {len(features)} features]")

font                : [b'PRISTINA' b'BANKGOTHIC' b'NIAGARA' b'SCRIPTB' b'VLADIMIR' b'BAITI'
 b'VLADIMIR' b'EUROROMAN' b'VINER' b'BAITI']
fontVariant         : [b'PRISTINA' b'BANKGOTHIC MD BT' b'NIAGARA SOLID' b'SCRIPT MT BOLD'
 b'VLADIMIR SCRIPT' b'MONGOLIAN BAITI' b'VLADIMIR SCRIPT' b'EUROROMAN'
 b'VINER HAND ITC' b'MONGOLIAN BAITI']
m_label             : [  960  8720  8482  8364   376 12309  8250 61687  8721  8221]
strength            : [0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4]
italic              : [0 0 0 0 0 0 0 0 0 0]
orientation         : [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
m_top               : [51 27 32 37 23 24 59 36 37 32]
m_left              : [22 23 20 19 30 25 27 22 21 23]
originalH           : [28 59 27 48 78 51 18  7 58 17]
originalW           : [28 34 33 39 62 13 13 21 46 25]
h                   : [20 20 20 20 20 20 20 20 20 20]
w                   : [20 20 20 20 20 20 20 20 20 20]
r0c0                : [  1 255 255   1   1  56   1   1 148   1]
r0c1                : [  1 25

# References

https://www.tensorflow.org/tutorials/load_data/csv