<a href="https://colab.research.google.com/github/carloslme/machine_learning_pipeline/blob/chapter2/Chapter_2_Introduction_to_TensorFlow_Extended.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Testing

In [None]:
!pip install tfx

In [6]:
import tensorflow_data_validation as tfdv
import tensorflow_transform as tft
import tensorflow_transform.beam as tft_beam

In [11]:
from tfx.components import ExampleValidator 
from tfx.components import Evaluator 
from tfx.components import Transform
from tfx.components import CsvExampleGen

### Overview of TFX Components
The generic internals of a component are always: 
* Receive some input 
* Perform an action 
* Store the final result

In TFX terms, the three internal parts of a component are called the *driver*, the *executor*, and the *publisher*. The driver handles the querying of the metadata store. The executor performs the actions of the component. And the publisher manages the saving of the output metadata in the MetadataStore.

The inputs and outputs of the components are called *artifacts*. Examples of artifacts include raw input data, preprocessed data, and trained models. Each artifact is associated with metadata stored in the MetadataStore. The artifact metadata consists of an artifact type as well as artifact properties. This artifact setup guarantees that the components can exchange data effectively.

In [12]:
!rm -rf /content/PetImages/
!rm *.zip

!wget https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_3367a.zip
!unzip -q -d /content/ /content/kagglecatsanddogs_3367a.zip

!echo "Count images"
!ls -U /content/PetImages/Cat | wc -l
!ls -U /content/PetImages/Dog | wc -l

!echo "Reduce images for demo purposes"
!cd /content/PetImages/Cat && ls -U | head -12000 | xargs rm 
!cd /content/PetImages/Dog && ls -U | head -12000 | xargs rm 

!echo "Count images after removal"
!ls -U /content/PetImages/Cat | wc -l
!ls -U /content/PetImages/Dog | wc -l

rm: cannot remove '*.zip': No such file or directory
--2021-01-24 06:20:38--  https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_3367a.zip
Resolving download.microsoft.com (download.microsoft.com)... 23.53.252.195, 2600:1406:3:496::e59, 2600:1406:3:491::e59
Connecting to download.microsoft.com (download.microsoft.com)|23.53.252.195|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 824894548 (787M) [application/octet-stream]
Saving to: ‘kagglecatsanddogs_3367a.zip’


2021-01-24 06:20:46 (102 MB/s) - ‘kagglecatsanddogs_3367a.zip’ saved [824894548/824894548]

Count images
12501
12501
Reduce images for demo purposes
Count images after removal
501
501


In [13]:
!pip install -qU tfx

import tfx 

%load_ext tfx.orchestration.experimental.interactive.notebook_extensions.skip

In [14]:
%%skip_for_export

import IPython
IPython.Application.instance().kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

This cell will be skipped during export to pipeline.


In [2]:
%%skip_for_export
%%writefile constants.py

from typing import Text

def transformed_name(key: Text) -> Text:
  """Return the name a feature carries after transformation.

  Args:
    key: Name of the original (untransformed) feature.

  Returns:
    `key` with the suffix '_xf' appended.
  """
  suffix = '_xf'
  return key + suffix

# Keys
# Names under which the label and the image input are addressed.
# NOTE(review): presumably these match the feature names of the serialized
# examples — confirm against the code that writes them.
LABEL_KEY = 'label'
INPUT_KEY = 'image/raw'

# Feature keys
# Keys of the raw features; currently only the image key.
RAW_FEATURE_KEYS = [INPUT_KEY]

# Constants
# NOTE(review): presumably the edge length, in pixels, that images are resized
# to — confirm against the preprocessing code.
IMG_SIZE = 160

UsageError: Cell magic `%%skip_for_export` not found.


In [4]:
# Keys
# NOTE(review): `INPUT_KEY` is rebound further down in this cell by
# `from tfx.components.example_gen.base_example_gen_executor import INPUT_KEY`,
# so once the imports have run it holds the imported value rather than
# 'image/raw' (unless the two happen to be equal). Confirm which value is
# intended and alias or rename the other one.
# These two assignments also repeat the definitions written to constants.py.
LABEL_KEY = 'label'
INPUT_KEY = 'image/raw'

import base64
import logging
import os
import random
import re
import sys
from typing import Any, Dict, Iterable, List, Text

import absl
import apache_beam as beam
import tensorflow as tf
import tensorflow_model_analysis as tfma
import tfx
from google.protobuf import json_format
from tensorflow_transform.beam.tft_beam_io import transform_fn_io
from tensorflow_transform.saved import saved_transform_io
from tensorflow_transform.tf_metadata import (dataset_metadata, dataset_schema,
                                              metadata_io, schema_utils)
from tfx import types
from tfx.components import (Evaluator, Pusher, ResolverNode, StatisticsGen,
                            Trainer)
from tfx.components.base import (base_component, base_driver, base_executor,
                                 executor_spec)
from tfx.components.example_gen import driver
from tfx.components.example_gen.base_example_gen_executor import (
    INPUT_KEY, BaseExampleGenExecutor)
from tfx.components.example_gen.component import FileBasedExampleGen
from tfx.components.example_gen.import_example_gen.component import \
    ImportExampleGen
from tfx.components.example_gen.utils import dict_to_example
from tfx.components.example_validator.component import ExampleValidator
from tfx.components.schema_gen.component import SchemaGen
from tfx.components.statistics_gen.component import StatisticsGen
from tfx.components.trainer.executor import GenericExecutor
from tfx.components.transform.component import Transform
from tfx.dsl.experimental import latest_blessed_model_resolver
from tfx.orchestration import data_types, metadata, pipeline
from tfx.orchestration.beam.beam_dag_runner import BeamDagRunner
from tfx.orchestration.experimental.interactive.interactive_context import \
    InteractiveContext
from tfx.proto import evaluator_pb2, example_gen_pb2, pusher_pb2, trainer_pb2
from tfx.types import (Channel, artifact_utils, channel_utils,
                       standard_artifacts)
from tfx.types.component_spec import ChannelParameter, ExecutionParameter
from tfx.types.standard_artifacts import Model, ModelBlessing
from tfx.utils import io_utils
from tfx.utils.dsl_utils import external_input

# Load the extension that provides the `%%skip_for_export` cell magic.
# NOTE(review): a cell that uses `%%skip_for_export` fails with
# "UsageError: Cell magic `%%skip_for_export` not found" when it runs in a
# fresh kernel before this line. This line is also never reached if one of the
# imports above raises, so consider loading the extension in its own cell.
%load_ext tfx.orchestration.experimental.interactive.notebook_extensions.skip

ImportError: ignored

### Interactive pipelines


In [None]:
import tensorflow as tf
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext

In [9]:
# Import everything this cell needs right here: the kernel restart earlier in
# the notebook discards the `CsvExampleGen` import that was made before it.
from tfx.components import CsvExampleGen
from tfx.utils.dsl_utils import external_input

# NOTE(review): `data_path` is not assigned anywhere in the visible notebook.
# It has to be set to the location of the CSV input data before this cell
# runs, otherwise the line below raises NameError.
example_gen = CsvExampleGen(input=external_input(data_path))

NameError: ignored

In [None]:
from tfx.components import StatisticsGen

context = InteractiveContext()

# A component's output artifacts only exist once the component has been
# executed, so run ExampleGen through the context before StatisticsGen
# consumes its 'examples' output.
context.run(example_gen)

statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
context.run(statistics_gen)



NameError: ignored

# Apache Beam Word Count Example
This section shows how to carry out a simple data transformation with Apache Beam: counting how often each word occurs in a text file.

In [None]:
!pip install -q apache_beam[gcp]

In [1]:

import re

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions

# Text file the pipeline reads with ReadFromText (a gs:// object).
input_file = "gs://dataflow-samples/shakespeare/kinglear.txt"
# Path handed to WriteToText; the result is inspected afterwards through the
# glob `/content/output.txt*`.
output_file = "/content/output.txt"

In [2]:

# Runtime configuration of the pipeline. No flags are passed here, so Beam
# falls back to its default settings.
pipeline_options = PipelineOptions()
# Left disabled:
# pipeline_options.view_as(SetupOptions).save_main_session = True


def format_result(word_count):
    """Render one (word, count) pair as the output line 'word: count'."""
    word, count = word_count
    return '%s: %s' % (word, count)


# The pipeline is defined inside the context manager.
with beam.Pipeline(options=pipeline_options) as p:

    # Read the input text into a PCollection.
    lines = p | ReadFromText(input_file)

    # Break every line into words (runs of letters and apostrophes).
    words = lines | 'Split' >> (
        beam.FlatMap(lambda line: re.findall(r'[A-Za-z\']+', line)))

    # Count the occurrences of each word: emit (word, 1), then sum per word.
    counts = (
        words
        | 'PairWithOne' >> beam.Map(lambda word: (word, 1))
        | 'GroupAndSum' >> beam.CombinePerKey(sum))

    # Turn every (word, count) pair into a line of text.
    output = counts | 'Format' >> beam.Map(format_result)

    # Write the formatted lines out; this "Write" transform has side effects.
    output | WriteToText(output_file)

Connecting anonymously.


In [3]:
# Peek at the first lines of the pipeline output. The trailing `*` also
# matches file names that merely start with `/content/output.txt`.
!head /content/output.txt*

KING: 243
LEAR: 236
DRAMATIS: 1
PERSONAE: 1
king: 65
of: 447
Britain: 2
OF: 15
FRANCE: 10
DUKE: 3
