In [15]:
# Check the Python version of the running kernel; the bare last expression is
# what the notebook displays as this cell's output.
import sys
sys.version

'3.8.2 (default, Jan 31 2023, 18:34:03) \n[GCC 12.2.0]'

In [16]:
# Upgrade pip via the %pip magic.
# NOTE(review): the upgrade is unpinned, so the resulting pip version depends on
# when this cell is run; the kernel may need a restart afterwards (see output).
%pip install --upgrade pip

Note: you may need to restart the kernel to use updated packages.


In [17]:
# Report the installed TensorFlow and TFX versions, one per line.
import tensorflow as tf
from tfx import v1 as tfx

print(tf.__version__, tfx.__version__, sep="\n")


2.11.0
1.12.0


In [26]:
# Notebook-wide configuration for the "examplegen_playground" pipeline.
import os

from absl import logging

# Name of the pipeline; also used to build the metadata and serving paths.
PIPELINE_NAME = "examplegen_playground"

# Directory that receives the artifacts generated by the pipeline.
PIPELINE_ROOT = "./artifacts"
# SQLite DB file used as the MLMD storage.
METADATA_PATH = os.path.join("metadata", PIPELINE_NAME, "metadata.db")
# Directory where models created by the pipeline will be exported.
SERVING_MODEL_DIR = os.path.join("serving_model", PIPELINE_NAME)

# Folder that holds the input data.
DATA_ROOT = "./data/"

# ExampleGen component class, kept as a global.
EXAMPLE_GEN = tfx.components.CsvExampleGen

# Default absl logging level for the notebook.
logging.set_verbosity(logging.INFO)

In [27]:
#Setup Path to Data
_data_filepath = './data/penguins_processed.csv'

#View first 5 lines
!head -5 {_data_filepath}

species,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g
0,0.2545454545454545,0.6666666666666666,0.15254237288135594,0.2916666666666667
0,0.26909090909090905,0.5119047619047618,0.23728813559322035,0.3055555555555556
0,0.29818181818181805,0.5833333333333334,0.3898305084745763,0.1527777777777778
0,0.16727272727272732,0.7380952380952381,0.3559322033898305,0.20833333333333334


In [77]:
#Test ExampleGen Artifact Output
def _create_pipeline(
  pipeline_name: str,
  pipeline_root: str,
  data_root: str,
  metadata_path: str
  ) -> tfx.dsl.Pipeline:
  """Build a TFX pipeline whose only component is a CsvExampleGen.

  Args:
    pipeline_name: Name given to the pipeline.
    pipeline_root: Root directory given to the pipeline as `pipeline_root`.
    data_root: Directory passed to CsvExampleGen as its `input_base`.
    metadata_path: Path passed to the SQLite metadata connection config.

  Returns:
    The assembled `tfx.dsl.Pipeline`.
  """
  # Brings data into the pipeline.
  example_gen = tfx.components.CsvExampleGen(input_base=data_root)

  metadata_config = (
    tfx.orchestration.metadata.sqlite_metadata_connection_config(metadata_path)
  )

  # Use the function arguments here, not the notebook-level PIPELINE_NAME /
  # PIPELINE_ROOT globals, so callers that pass different values are honoured.
  return tfx.dsl.Pipeline(
    pipeline_name=pipeline_name,
    pipeline_root=pipeline_root,
    metadata_connection_config=metadata_config,
    components=[example_gen],
  )

In [94]:
# Assemble the ExampleGen-only pipeline from the notebook configuration, then
# execute it with the local DAG runner.
examplegen_pipeline = _create_pipeline(
  pipeline_name=PIPELINE_NAME,
  pipeline_root=PIPELINE_ROOT,
  data_root=DATA_ROOT,
  metadata_path=METADATA_PATH,
)
tfx.orchestration.LocalDagRunner().run(examplegen_pipeline)

INFO:absl:Using deployment config:
 executor_specs {
  key: "CsvExampleGen"
  value {
    beam_executable_spec {
      python_executor_spec {
        class_path: "tfx.components.example_gen.csv_example_gen.executor.Executor"
      }
    }
  }
}
custom_driver_specs {
  key: "CsvExampleGen"
  value {
    python_class_executable_spec {
      class_path: "tfx.components.example_gen.driver.FileBasedDriver"
    }
  }
}
metadata_connection_config {
  database_connection_config {
    sqlite {
      filename_uri: "metadata/examplegen_playground/metadata.db"
      connection_mode: READWRITE_OPENCREATE
    }
  }
}

INFO:absl:Using connection config:
 sqlite {
  filename_uri: "metadata/examplegen_playground/metadata.db"
  connection_mode: READWRITE_OPENCREATE
}

INFO:absl:Component CsvExampleGen is running.
INFO:absl:Running launcher for node_info {
  type {
    name: "tfx.components.example_gen.csv_example_gen.component.CsvExampleGen"
  }
  id: "CsvExampleGen"
}
contexts {
  contexts {
    type {

In [99]:
# Resolve the newest CsvExampleGen output directory instead of hard-coding an
# execution id: the numbered subdirectory is expected to differ between
# pipeline runs, so a fixed value goes stale once the pipeline is re-executed.
# max() raises ValueError if no numbered output directory exists yet.
examples_root = os.path.join(PIPELINE_ROOT, 'CsvExampleGen', 'examples')
latest_execution_id = max(
    (name for name in os.listdir(examples_root) if name.isdigit()),
    key=int)
train_uri = os.path.join(examples_root, latest_execution_id, 'Split-train')

# os.listdir returns entries in arbitrary order; sort for a stable read order.
tfrecord_filenames = [os.path.join(train_uri, name)
                      for name in sorted(os.listdir(train_uri))]

# The split is read as GZIP-compressed TFRecord files.
dataset = tf.data.TFRecordDataset(tfrecord_filenames, compression_type="GZIP")

# Decode and print the first three records of the train split.
for tfrecord in dataset.take(3):
  serialized_example = tfrecord.numpy()
  example = tf.train.Example()
  example.ParseFromString(serialized_example)
  print(example)


features {
  feature {
    key: "body_mass_g"
    value {
      float_list {
        value: 0.2916666567325592
      }
    }
  }
  feature {
    key: "culmen_depth_mm"
    value {
      float_list {
        value: 0.6666666865348816
      }
    }
  }
  feature {
    key: "culmen_length_mm"
    value {
      float_list {
        value: 0.2545454502105713
      }
    }
  }
  feature {
    key: "flipper_length_mm"
    value {
      float_list {
        value: 0.1525423675775528
      }
    }
  }
  feature {
    key: "species"
    value {
      int64_list {
        value: 0
      }
    }
  }
}

features {
  feature {
    key: "body_mass_g"
    value {
      float_list {
        value: 0.3055555522441864
      }
    }
  }
  feature {
    key: "culmen_depth_mm"
    value {
      float_list {
        value: 0.511904776096344
      }
    }
  }
  feature {
    key: "culmen_length_mm"
    value {
      float_list {
        value: 0.2690909206867218
      }
    }
  }
  feature {
    key: "flipper

In [100]:
# Resolve the newest CsvExampleGen output directory instead of hard-coding an
# execution id: the numbered subdirectory is expected to differ between
# pipeline runs, so a fixed value goes stale once the pipeline is re-executed.
# max() raises ValueError if no numbered output directory exists yet.
examples_root = os.path.join(PIPELINE_ROOT, 'CsvExampleGen', 'examples')
latest_execution_id = max(
    (name for name in os.listdir(examples_root) if name.isdigit()),
    key=int)
eval_uri = os.path.join(examples_root, latest_execution_id, 'Split-eval')

# os.listdir returns entries in arbitrary order; sort for a stable read order.
tfrecord_filenames = [os.path.join(eval_uri, name)
                      for name in sorted(os.listdir(eval_uri))]

# The split is read as GZIP-compressed TFRecord files.
dataset = tf.data.TFRecordDataset(tfrecord_filenames, compression_type="GZIP")

# Decode and print the first three records of the eval split.
for tfrecord in dataset.take(3):
  serialized_example = tfrecord.numpy()
  example = tf.train.Example()
  example.ParseFromString(serialized_example)
  print(example)

features {
  feature {
    key: "body_mass_g"
    value {
      float_list {
        value: 0.2569444477558136
      }
    }
  }
  feature {
    key: "culmen_depth_mm"
    value {
      float_list {
        value: 0.5595238208770752
      }
    }
  }
  feature {
    key: "culmen_length_mm"
    value {
      float_list {
        value: 0.24727272987365723
      }
    }
  }
  feature {
    key: "flipper_length_mm"
    value {
      float_list {
        value: 0.1525423675775528
      }
    }
  }
  feature {
    key: "species"
    value {
      int64_list {
        value: 0
      }
    }
  }
}

features {
  feature {
    key: "body_mass_g"
    value {
      float_list {
        value: 0.5486111044883728
      }
    }
  }
  feature {
    key: "culmen_depth_mm"
    value {
      float_list {
        value: 0.773809552192688
      }
    }
  }
  feature {
    key: "culmen_length_mm"
    value {
      float_list {
        value: 0.2581818103790283
      }
    }
  }
  feature {
    key: "flippe