In [None]:
### All modules

## generic
import os
import pprint # to pretty-print decoded TFRecord examples
import tempfile

import pandas as pd
import tensorflow as tf
from google.protobuf.json_format import MessageToDict
from sklearn.model_selection import train_test_split
from tensorflow.python.lib.io import file_io # for schema freezing
from tensorflow_metadata.proto.v0 import schema_pb2
from tensorflow_metadata.proto.v0.statistics_pb2 import DatasetFeatureStatisticsList, DatasetFeatureStatistics
# DatasetFeatureStatisticsList is the container type that tfdv.visualize_statistics expects


## Tensorflow Extended Overall
from tfx import v1 as tfx
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext


## TF Data validation
import tensorflow_data_validation as tfdv
from tensorflow_data_validation.utils import slicing_util # to split the data for further representativeness verifications


## TF Transform
import tensorflow_transform as tft
import tensorflow_transform.beam as tft_beam
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import schema_utils


## Local helpers
from util import add_extra_rows

![title](TFX_comp11.png)

In [None]:
######################################################################################
### Data Validation not belonging to StatisticsGen SchemaGen and Example Validator ###
######################################################################################

### Subject 1: to create (descriptive) statistics from a dataframe

# train_df / eval_df are pandas DataFrames prepared before this cell
# NOTE(review): assumes eval_df is the held-out split of the same data -- confirm where it is created
train_stats = tfdv.generate_statistics_from_dataframe(train_df)
# eval_stats is needed by the comparison below and by the anomaly / drift checks in later subjects
eval_stats = tfdv.generate_statistics_from_dataframe(eval_df)
# both results have the DatasetFeatureStatisticsList type (required by visualize_statistics)

## if you have only one set
tfdv.visualize_statistics(train_stats)
## if you have two sets to compare
tfdv.visualize_statistics(
    lhs_statistics=eval_stats,
    rhs_statistics=train_stats,
    lhs_name='EVAL_DATASET',
    rhs_name='TRAIN_DATASET'
)

### Subject 2: To create and display a schema from dataset statistics

# keep the inferred schema: every later subject reads (and updates in place) the `schema` variable
schema = tfdv.infer_schema(statistics = train_stats)
tfdv.display_schema(schema)


### Subject 3: To capture (and display) anomalies (done by validate_statistics)

anomalies = tfdv.validate_statistics(statistics = eval_stats, schema = schema) # eval_stats makes sense since schema is defined for train_stats
tfdv.display_anomalies(anomalies)

## If you want to relax the minimum fraction of values that must come from the feature's domain
## (min_domain_mass defaults to 1.0, i.e. every value must be in the domain)
example_feature_name = tfdv.get_feature(schema, 'example_feature_name')
example_feature_name.distribution_constraints.min_domain_mass = 0.9 # any value between 0 and 1
# This is good when you have multiple categories with small fractions

## If you want to accept a new category that is currently flagged as an anomaly
# get_domain returns the domain object stored in the schema, so appending to it updates the schema in place
example_domain = tfdv.get_domain(schema, 'example_feature_name')
example_domain.value.append('Example_category')

## If we want to set the range for a continuous variable
tfdv.set_domain(schema, 'cont_var', schema_pb2.IntDomain(name = 'cont_var', min = 17, max = 90)) # 17 and 90 are example minimum and maximum values

## See schema after all updates (updates performed in place) and check anomalies again
tfdv.display_schema(schema)
new_anomalies = tfdv.validate_statistics(statistics = eval_stats, schema = schema)
tfdv.display_anomalies(new_anomalies)

### Subject 4: Splitting the data per category to verify the representativeness (e.g., sex)

# Build a slicing function. With None, one slice is produced per distinct value of 'sex';
# pass an iterable of values instead of None to slice only on those values.
slice_fn = slicing_util.get_feature_value_slicer(features = {'sex': None})
# the slicing function only takes effect through a StatsOptions object
slice_stats_options = tfdv.StatsOptions(schema = schema,
                                           slice_functions = [slice_fn],
                                           infer_type_from_schema = True)

# If we want to print all the features kept by the options.
# feature_allowlist is None unless it was passed to StatsOptions, hence the `or []` guard.
for feature in slice_stats_options.feature_allowlist or []:
    print(feature)

# StatsOptions can also take feature_allowlist with a list of columns to be included in the model (see the Assignment)
train_df.to_csv(CSV_PATH) # CSV_PATH is the path of the csv file required for the line below
sliced_stats = tfdv.generate_statistics_from_csv(CSV_PATH, stats_options = slice_stats_options) # requires a CSV file
# sliced_stats and train_stats (at the beginning of the code) have the same type (DatasetFeatureStatisticsList)
# sliced_stats.datasets holds one entry per slice, each with its own name and statistics (See Lab)
# for example the following line gives the names of the slices.
print([sliced.name for sliced in sliced_stats.datasets])

## To visualize the sliced datasets
# NOTE(review): assumes datasets[1] / datasets[2] are the male / female slices -- check the names printed above
male_stats_list = DatasetFeatureStatisticsList() # visualize_statistics needs a DatasetFeatureStatisticsList
male_stats_list.datasets.extend([sliced_stats.datasets[1]]) # fill it with the sliced dataset
male_stats_name = sliced_stats.datasets[1].name

female_stats_list = DatasetFeatureStatisticsList()
female_stats_list.datasets.extend([sliced_stats.datasets[2]])
female_stats_name = sliced_stats.datasets[2].name

tfdv.visualize_statistics(
    lhs_statistics = male_stats_list,
    rhs_statistics = female_stats_list,
    lhs_name = male_stats_name,
    rhs_name = female_stats_name
    )


### Subject 5: Skew & drift detection between datasets

# Set the same L-infinity threshold on both comparators of one feature
example_feature = tfdv.get_feature(schema, 'example_feature')
for comparator in (example_feature.skew_comparator, example_feature.drift_comparator):
    comparator.infinity_norm.threshold = 0.03

# Validate train_stats against the schema, also passing eval_stats as the
# previous statistics and serving_stats as the serving statistics
skew_drift_anomalies = tfdv.validate_statistics(
    statistics=train_stats,
    schema=schema,
    previous_statistics=eval_stats,
    serving_statistics=serving_stats,
)

tfdv.display_anomalies(skew_drift_anomalies)


### Subject 6: Schema Freezing
# Write the (curated) schema to disk as a text protobuf
OUTPUT_DIR = 'output'
file_io.recursive_create_dir(OUTPUT_DIR) # create the output directory before writing into it
schema_file = os.path.join(OUTPUT_DIR, 'schema.pbtxt')
tfdv.write_schema_text(schema, schema_file) # two arguments: the schema and the file in which to store it

![title](TFX_comp_transform.png)

In [None]:
############################
### Tensorflow Transform ###
############################

### Subject 1: Defining a schema using DatasetMetadata
# Feature spec first: two scalar float features and one scalar string feature
raw_feature_spec = {
    'y': tf.io.FixedLenFeature([], tf.float32),
    'x': tf.io.FixedLenFeature([], tf.float32),
    's': tf.io.FixedLenFeature([], tf.string),
}
# Convert the spec to a schema and wrap it in DatasetMetadata
raw_data_metadata = dataset_metadata.DatasetMetadata(
    schema_utils.schema_from_feature_spec(raw_feature_spec))


### Subject 2: A preprocessing function that transforms/scales the raw data
def preprocessing_fn(inputs):
    """Map the raw columns 'x', 'y' and 's' to their transformed columns.

    Args:
        inputs: mapping from raw feature name to feature values; the keys
            'x', 'y' and 's' are read.

    Returns:
        Dict with the keys 'x_centered', 'y_normalized', 's_integerized'
        and 'x_centered_times_y_normalized'.
    """
    x, y, s = inputs['x'], inputs['y'], inputs['s']

    outputs = {}
    # subtract the mean of x computed by tft
    outputs['x_centered'] = x - tft.mean(x)
    # rescale y with tft.scale_to_0_1
    outputs['y_normalized'] = tft.scale_to_0_1(y)
    # replace each string by its vocabulary index
    outputs['s_integerized'] = tft.compute_and_apply_vocabulary(s)
    # product of the two transformed numeric columns
    outputs['x_centered_times_y_normalized'] = (
        outputs['x_centered'] * outputs['y_normalized'])
    return outputs

## Note: the input data must be a list of dictionaries, one dictionary per example
## (its schema is described separately by a DatasetMetadata object)

raw_data = [
    {'x': x_value, 'y': y_value, 's': s_value}
    for x_value, y_value, s_value in [
        (1, 1, 'hello'),
        (2, 2, 'world'),
        (3, 3, 'hello'),
    ]
]

# DatasetMetadata looks like a dictionary of dictionaries. Its only key is '_schema', and the value lists the features with their schema info.
# This is what it looks like:

'''
{'_schema': feature {
  name: "s"
  type: BYTES
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "x"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
feature {
  name: "y"
  type: FLOAT
  presence {
    min_fraction: 1.0
  }
  shape {
  }
}
}
'''


### Subject 3: Applying the preprocessing function to raw_data

## Only log errors (hides the TF warnings)
tf.get_logger().setLevel('ERROR')

## Run the analysis + transformation inside a tft_beam context backed by a fresh temp dir
with tft_beam.Context(temp_dir = tempfile.mkdtemp()):
    # Beam syntax: pipe the (data, metadata) pair into the transform
    analyze_and_transform = tft_beam.AnalyzeAndTransformDataset(preprocessing_fn)
    transformed_dataset, transform_fn = (raw_data, raw_data_metadata) | analyze_and_transform

# transformed_dataset is itself a (data, metadata) pair; the data is a list of dictionaries like raw_data
transformed_data, transformed_metadata = transformed_dataset

## AnalyzeAndTransformDataset returns two values: the transformed dataset and transform_fn.
## It does the work of both AnalyzeDataset and TransformDataset, which each return one value.



### Subject 4: Another way to use TFT (after SchemaGen, StatisticsGen and ExampleValidator). See the next cells

# Filename of the constants module; the next cell writes it to disk with %%writefile
_census_constants_module_file = 'census_constants.py'


In [None]:
%%writefile {_census_constants_module_file}

# String-typed features; each one is converted to vocabulary indices
CATEGORICAL_FEATURE_KEYS = [
    'education',
    'marital-status',
    'occupation',
    'race',
    'relationship',
    'workclass',
    'sex',
    'native-country',
]

# Continuous numerical features
NUMERIC_FEATURE_KEYS = [
    'fnlwgt',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]

# Features that are grouped into buckets
BUCKET_FEATURE_KEYS = ['age']

# Number of buckets tf.transform uses for each bucketized feature
FEATURE_BUCKET_COUNT = {'age': 4}

# Name of the feature the model predicts
LABEL_KEY = 'label'

# Suffix that marks the transformed version of a feature
_TRANSFORMED_SUFFIX = '_xf'

def transformed_name(key):
    """Return the name of the transformed feature for the raw feature `key`.

    Args:
        key: raw feature name (a string).

    Returns:
        `key` with '_xf' appended.
    """
    return key + _TRANSFORMED_SUFFIX

In [None]:
# Filename of the module that holds preprocessing_fn: the next cell writes it
# with %%writefile and the Transform component receives its path as module_file
_census_transform_module_file = 'census_transform.py'

In [None]:
%%writefile {_census_transform_module_file}

import tensorflow as tf
import tensorflow_transform as tft

import census_constants

# Bind the contents of the constants module to module-private aliases
# so preprocessing_fn below can refer to them by short names
_NUMERIC_FEATURE_KEYS = census_constants.NUMERIC_FEATURE_KEYS
_CATEGORICAL_FEATURE_KEYS = census_constants.CATEGORICAL_FEATURE_KEYS
_BUCKET_FEATURE_KEYS = census_constants.BUCKET_FEATURE_KEYS
_FEATURE_BUCKET_COUNT = census_constants.FEATURE_BUCKET_COUNT
_LABEL_KEY = census_constants.LABEL_KEY
_transformed_name = census_constants.transformed_name


# Define the transformations
def preprocessing_fn(inputs):
    """Callback used by tf.transform to preprocess the raw inputs.

    Args:
        inputs: map from feature keys to raw, not-yet-transformed features.

    Returns:
        Map from transformed feature name (see `_transformed_name`) to the
        transformed feature operation.
    """
    # Numeric features: scaled with tft.scale_to_0_1
    transformed = {
        _transformed_name(name): tft.scale_to_0_1(inputs[name])
        for name in _NUMERIC_FEATURE_KEYS
    }

    # Bucket features: bucketized with the per-feature bucket count
    transformed.update(
        (_transformed_name(name),
         tft.bucketize(inputs[name], _FEATURE_BUCKET_COUNT[name]))
        for name in _BUCKET_FEATURE_KEYS
    )

    # Categorical features: strings replaced by vocabulary indices
    transformed.update(
        (_transformed_name(name),
         tft.compute_and_apply_vocabulary(inputs[name]))
        for name in _CATEGORICAL_FEATURE_KEYS
    )

    # The label strings are converted to an index the same way
    label_name = _transformed_name(_LABEL_KEY)
    transformed[label_name] = tft.compute_and_apply_vocabulary(inputs[_LABEL_KEY])

    return transformed

In [None]:
# Ignore TF warning messages
tf.get_logger().setLevel('ERROR')

# NOTE: this cell needs `context`, `example_gen` and `schema_gen`, which are created in the
# InteractiveContext / ExampleGen / SchemaGen cells further down -- run those cells first.


def get_records(dataset, num_records):
    """Decode the first records of a dataset of serialized tf.train.Example protos.

    Args:
        dataset: a tf.data dataset whose elements are serialized tf.train.Example strings
            (e.g. a TFRecordDataset).
        num_records: number of records to take from the start of the dataset.

    Returns:
        A list with one dictionary per record, produced by MessageToDict.
    """
    records = []
    for tfrecord in dataset.take(num_records):
        example = tf.train.Example()
        example.ParseFromString(tfrecord.numpy())
        records.append(MessageToDict(example))
    return records


# Pretty printer used to show the decoded records
pp = pprint.PrettyPrinter()

# Instantiate the Transform component
transform = tfx.components.Transform(
    examples=example_gen.outputs['examples'],
    schema=schema_gen.outputs['schema'],
    module_file=os.path.abspath(_census_transform_module_file))

# Run the component
context.run(transform)


# Get the uri of the transform graph
transform_graph_uri = transform.outputs['transform_graph'].get()[0].uri

# List the subdirectories under the uri (printed: a bare expression in the middle of a cell shows nothing)
print(os.listdir(transform_graph_uri))

# Get the URI of the output artifact representing the transformed examples
train_uri = os.path.join(transform.outputs['transformed_examples'].get()[0].uri, 'Split-train')

# Get the list of files in this directory (all compressed TFRecord files)
tfrecord_filenames = [os.path.join(train_uri, name)
                      for name in os.listdir(train_uri)]

# Create a `TFRecordDataset` to read these files
transformed_dataset = tf.data.TFRecordDataset(tfrecord_filenames, compression_type="GZIP")

# Get 3 records from the dataset
sample_records_xf = get_records(transformed_dataset, 3)

# Print the output
pp.pprint(sample_records_xf)

![title](TFX_comp21.png)

In [None]:
##################################################################
### Tensorflow Extended (TFX) (other than specific components) ###
##################################################################

# Initialize the InteractiveContext with a local sqlite file.
# If you leave `_pipeline_root` blank, then the db will be created in a temporary directory.
# You can safely ignore the warning about the missing config file.

# Root directory handed to InteractiveContext as pipeline_root
_pipeline_root = './pipeline/'

# `context` is the object used to run (context.run) and display (context.show) every TFX component in this notebook
context = InteractiveContext(pipeline_root = _pipeline_root)

![title](TFX_comp_ExampleGen.png)

In [None]:
######################################
### ExampleGen from Data Ingestion ###
######################################

### Subject 1: To instantiate ExampleGen with the input CSV dataset

# directory of the raw data files
_data_root = './data/census_data'

# path to the raw training data
_data_filepath = os.path.join(_data_root, 'adult.data')

example_gen = tfx.components.CsvExampleGen(input_base = _data_root)
context.run(example_gen) # the outcome is an artifact

# to get artifact object and read from its properties
artifact = example_gen.outputs['examples'].get()[0]
print(f'split names: {artifact.split_names}')
print(f'artifact uri: {artifact.uri}')


### Subject 2: To collect all the data
## Get the URI of the output artifact representing the training examples
train_uri = os.path.join(artifact.uri, 'Split-train')

## See the contents of the `train` folder
!ls {train_uri}

tfrecord_filenames = [os.path.join(train_uri, name)
                      for name in os.listdir(train_uri)] #makes only one element list if run from the lab

## Create a `TFRecordDataset` to read these files
dataset = tf.data.TFRecordDataset(tfrecord_filenames, compression_type = "GZIP")

# dataset has data. It is just not as visible as a simple dataframe. Run the following code, then you will realize



'''
each record  of dataset looks like this (this is the first record)


{'features': 
 {'feature': 
  {'education': 
   {'bytesList': {'value': ['IEJhY2hlbG9ycw==']}}, 
   'sex': {'bytesList': {'value': ['IE1hbGU=']}}, 
   'race': {'bytesList': {'value': ['IFdoaXRl']}}, 
   'native-country': {'bytesList': {'value': ['IFVuaXRlZC1TdGF0ZXM=']}}, 
   'relationship': {'bytesList': {'value': ['IE5vdC1pbi1mYW1pbHk=']}}, 
   'marital-status': {'bytesList': {'value': ['IE5ldmVyLW1hcnJpZWQ=']}}, 
   'hours-per-week': {'int64List': {'value': ['40']}}, 
   'age': {'int64List': {'value': ['39']}}, 
   'label': {'bytesList': {'value': ['IDw9NTBL']}}, 
   'occupation': {'bytesList': {'value': ['IEFkbS1jbGVyaWNhbA==']}}, 
   'capital-gain': {'int64List': {'value': ['2174']}}, 
   'education-num': {'int64List': {'value': ['13']}}, 
   'workclass': {'bytesList': {'value': ['IFN0YXRlLWdvdg==']}}, 
   'fnlwgt': {'int64List': {'value': ['77516']}}, 
   'capital-loss': {'int64List': {'value': ['0']}}  
      }
         }
            }
    
'''    
    

![title](TFX_comp_StatGenSchemaGenExampleVal.png)

In [None]:
##########################################
### StatisticsGen from Data Validation ###
##########################################

### Subject 1: To instantiate StatisticsGen with the ExampleGen ingested dataset

# Feed the examples produced by ExampleGen into StatisticsGen
ingested_examples = example_gen.outputs['examples']
statistics_gen = tfx.components.StatisticsGen(examples=ingested_examples)

# Run the component in the interactive context
context.run(statistics_gen)

# Display the computed statistics
# (in practice similar to tfdv.generate_statistics_from_dataframe followed by tfdv.visualize_statistics)
context.show(statistics_gen.outputs['statistics'])


######################################
### SchemaGen from Data Validation ###
######################################

### Subject 1: Instantiate SchemaGen with the StatisticsGen output
# Similar role to tfdv.infer_schema and tfdv.display_schema
computed_statistics = statistics_gen.outputs['statistics']
schema_gen = tfx.components.SchemaGen(statistics=computed_statistics)

# Execute the component
context.run(schema_gen)

# Display the resulting schema
context.show(schema_gen.outputs['schema'])


##############################################
### Example Validator from Data Validation ###
##############################################

### Subject 1: Instantiate ExampleValidator with the StatisticsGen and SchemaGen outputs
# Similar role to tfdv.validate_statistics and tfdv.display_anomalies()
validator_inputs = {
    'statistics': statistics_gen.outputs['statistics'],
    'schema': schema_gen.outputs['schema'],
}
example_validator = tfx.components.ExampleValidator(**validator_inputs)

# Execute the component
context.run(example_validator)

# Display the detected anomalies
context.show(example_validator.outputs['anomalies'])