### Creating a File Pattern Object Using tf.io

The `tf.io` API is used to reference a distributed dataset whose files share a common naming pattern.<br>
The result is a list of the paths and names of all the dataset files you want to read.

In [1]:
import tensorflow as tf

In [2]:
# Directory holding the CSV parts, and the glob pattern matching every part's file name.
base_pattern, file_pattern = "dataset", "owid-covid-data-part*"

In [3]:
# Expand "<directory>/<pattern>" into the list of file paths that match it.
files = tf.io.gfile.glob(f"{base_pattern}/{file_pattern}")

In [4]:
# Display the list of matched file paths.
files

['dataset\\owid-covid-data-part001.csv',
 'dataset\\owid-covid-data-part0010.csv',
 'dataset\\owid-covid-data-part00100.csv',
 'dataset\\owid-covid-data-part0011.csv',
 'dataset\\owid-covid-data-part0012.csv',
 'dataset\\owid-covid-data-part0013.csv',
 'dataset\\owid-covid-data-part0014.csv',
 'dataset\\owid-covid-data-part0015.csv',
 'dataset\\owid-covid-data-part0016.csv',
 'dataset\\owid-covid-data-part0017.csv',
 'dataset\\owid-covid-data-part0018.csv',
 'dataset\\owid-covid-data-part0019.csv',
 'dataset\\owid-covid-data-part002.csv',
 'dataset\\owid-covid-data-part0020.csv',
 'dataset\\owid-covid-data-part0021.csv',
 'dataset\\owid-covid-data-part0022.csv',
 'dataset\\owid-covid-data-part0023.csv',
 'dataset\\owid-covid-data-part0024.csv',
 'dataset\\owid-covid-data-part0025.csv',
 'dataset\\owid-covid-data-part0026.csv',
 'dataset\\owid-covid-data-part0027.csv',
 'dataset\\owid-covid-data-part0028.csv',
 'dataset\\owid-covid-data-part0029.csv',
 'dataset\\owid-covid-data-part003.

This list is going to be the input for the next step. <br>
**Create a streaming dataset object based on Python generators**

In [7]:
# Convert the list of CSV files into a TensorFlow dataset object that yields
# (features, label) batches: `features` maps column names to tensors and
# `label` holds the `new_deaths` column.
csv_dataset = tf.data.experimental.make_csv_dataset(
    files,
    header=True,              # the first row of the files holds the column names
    batch_size=5,
    label_name='new_deaths',
    num_epochs=1,             # go through the files once instead of repeating
    shuffle_seed=42,          # rows are shuffled by default; a fixed seed keeps re-runs repeatable
)
# If malformed rows cause parsing errors, pass `ignore_errors=True` to the call above.
# This is only a test: the data has not been cleaned or prepared for use in a model.

In [10]:
# Print the label values and every feature column of the first batch only.
for features, target in csv_dataset.take(1):
    print(f"'Target': {target}")
    print("'Features:'")
    for column_name, column_values in features.items():
        # repr of the column name, padded to 20 characters, then its batch values
        print(f"  {column_name!r:20s}: {column_values}")

'Target': [    0. 10512.    71.    15.     0.]
'Features:'
  'iso_code'          : [b'ISL' b'OWID_WRL' b'IDN' b'ARM' b'HUN']
  'continent'         : [b'Europe' b'' b'Asia' b'Asia' b'Europe']
  'location'          : [b'Iceland' b'World' b'Indonesia' b'Armenia' b'Hungary']
  'date'              : [b'2020-06-15' b'2020-04-16' b'2020-07-13' b'2020-07-17' b'2020-07-21']
  'total_cases'       : [1.810000e+03 2.042827e+06 7.569900e+04 3.355900e+04 4.347000e+03]
  'new_cases'         : [0.0000e+00 7.8896e+04 1.6810e+03 5.5400e+02 1.4000e+01]
  'total_deaths'      : [1.00000e+01 1.39682e+05 3.60600e+03 6.07000e+02 5.96000e+02]
  'total_cases_per_million': [ 5304.029   262.076   276.755 11325.127   449.984]
  'new_cases_per_million': [  0.     10.122   6.146 186.958   1.449]
  'total_deaths_per_million': [ 29.304  17.92   13.184 204.844  61.695]
  'new_deaths_per_million': [0.    1.349 0.26  5.062 0.   ]
  'new_tests'         : [b'27.0' b'' b'9062.0' b'' b'760.0']
  'total_tests'       : [b'6318

In [11]:
# Create an iterator over the dataset and take its first (features, label) batch.
features, label = next(iter(csv_dataset))

In [12]:
# Display the label tensor of the batch just fetched (the `new_deaths` column).
label


<tf.Tensor: shape=(5,), dtype=float32, numpy=array([ 0.,  5., 21., 32.,  0.], dtype=float32)>

In [13]:
# Display the features of the same batch: a mapping from column name to tensor.
features

OrderedDict([('iso_code',
              <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'GUY', b'KWT', b'PHL', b'IDN', b'NIC'], dtype=object)>),
             ('continent',
              <tf.Tensor: shape=(5,), dtype=string, numpy=
              array([b'South America', b'Asia', b'Asia', b'Asia', b'North America'],
                    dtype=object)>),
             ('location',
              <tf.Tensor: shape=(5,), dtype=string, numpy=
              array([b'Guyana', b'Kuwait', b'Philippines', b'Indonesia', b'Nicaragua'],
                    dtype=object)>),
             ('date',
              <tf.Tensor: shape=(5,), dtype=string, numpy=
              array([b'2020-05-17', b'2020-07-07', b'2020-04-10', b'2020-06-09',
                     b'2020-06-08'], dtype=object)>),
             ('total_cases',
              <tf.Tensor: shape=(5,), dtype=float32, numpy=array([  117., 50644.,  4076., 32033.,  1309.], dtype=float32)>),
             ('new_cases',
              <tf.Tensor: shape=(5,)

In [14]:
# Calling iter() again creates a new iterator, so this takes the first batch of
# a new pass over the dataset and rebinds `features` and `label` to it.
features, label = next(iter(csv_dataset))

In [15]:
# Display the label tensor of the newly fetched batch.
label

<tf.Tensor: shape=(5,), dtype=float32, numpy=array([106.,   7.,  47.,   0.,   0.], dtype=float32)>

In [16]:
# Inspect the concrete class of the object returned by make_csv_dataset.
type(csv_dataset)

tensorflow.python.data.ops.prefetch_op._PrefetchDataset