In [None]:
pip install tensorflow-data-validation

In [None]:
import tensorflow_data_validation as tfdv
import tensorflow as tf

In [None]:
# Locations of the CSV files that the rest of the notebook reads.
(TRAIN_DATA, TEST_DATA, ANOMALOUS_DATA) = (
    '/content/sample_data/data/titanic_train.csv',
    '/content/sample_data/data/titanic_test.csv',
    '/content/sample_data/data/titanic_test_anomalies.csv',
)

In [None]:
# Compute descriptive statistics over the training CSV and render them.
train_stats = tfdv.generate_statistics_from_csv(data_location=TRAIN_DATA)
tfdv.visualize_statistics(lhs_statistics=train_stats)

In [None]:
# Derive a schema from the training statistics and show it.
schema = tfdv.infer_schema(train_stats)
tfdv.display_schema(schema)

In [None]:
# Compute statistics for the evaluation data (the CSV at ANOMALOUS_DATA) and
# plot them side by side with the training statistics.
eval_stats = tfdv.generate_statistics_from_csv(data_location=ANOMALOUS_DATA)

tfdv.visualize_statistics(
    lhs_statistics=train_stats,
    rhs_statistics=eval_stats,
    lhs_name="Training Data",
    rhs_name="Evaluation Data",
)

In [None]:
# Check the evaluation statistics against the inferred schema and list the
# anomalies that are reported.
anomalies = tfdv.validate_statistics(eval_stats, schema)
tfdv.display_anomalies(anomalies)

In [None]:
# Fixing anomalies: either change the evaluation data (manually) or modify the
# schema. Here the schema is modified for the changes that are acceptable.

# Accept 'Anomaly' as a value of 'Destination'. The membership check keeps the
# cell idempotent: re-running it does not append a duplicate domain value.
destination_domain = tfdv.get_domain(schema, 'Destination')
if 'Anomaly' not in destination_domain.value:
    destination_domain.value.append('Anomaly')

# Validate again so the effect of the schema change is visible.
anomalies = tfdv.validate_statistics(statistics=eval_stats, schema=schema)
tfdv.display_anomalies(anomalies)


In [None]:
# Accept the upper-case spellings 'TRUE' / 'FALSE' for VIP and CryoSleep.

vip_domain = tfdv.get_domain(schema, 'VIP')
# Add only the values that are not present yet, so re-running this cell does
# not accumulate duplicate entries in the domain.
vip_domain.value.extend(
    [value for value in ('TRUE', 'FALSE') if value not in vip_domain.value]
)

# Set the domain of CryoSleep to the (extended) VIP domain.
tfdv.set_domain(schema, 'CryoSleep', vip_domain)

# Validate again so the effect of the schema change is visible.
anomalies = tfdv.validate_statistics(statistics=eval_stats, schema=schema)
tfdv.display_anomalies(anomalies)

In [None]:
# INT can be safely converted to FLOAT, so that mismatch can be ignored.
# Recompute the evaluation statistics with StatsOptions that carry the schema
# and set infer_type_from_schema, i.e. ask TFDV to use the schema.
options = tfdv.StatsOptions(infer_type_from_schema=True, schema=schema)
eval_stats = tfdv.generate_statistics_from_csv(
    data_location=ANOMALOUS_DATA,
    stats_options=options,
)

anomalies = tfdv.validate_statistics(eval_stats, schema)
tfdv.display_anomalies(anomalies)

In [None]:
# 'Transported' is the class label and will not be available in the evaluation
# data. To express that, two environments are declared, 'Training' and
# 'Serving', and the label is marked as not present in 'Serving'.

# Every addition is guarded by a membership check so that re-running this cell
# leaves the schema unchanged instead of appending duplicate entries.
schema.default_environment.extend(
    [name for name in ('Training', 'Serving')
     if name not in schema.default_environment]
)

label_feature = tfdv.get_feature(schema, 'Transported')
if 'Serving' not in label_feature.not_in_environment:
    label_feature.not_in_environment.append('Serving')

# Validate the evaluation statistics in the 'Serving' environment.
serving_anomalies_with_environment = tfdv.validate_statistics(
    statistics=eval_stats, schema=schema, environment='Serving')

tfdv.display_anomalies(serving_anomalies_with_environment)

In [None]:
# We are good here. Room Service is the column that is missing from the
# evaluation data. This cannot be fixed here, so the missing column has to be
# handled manually.

In [None]:
# Checking for Drifts using L-Infinity Distance

In [None]:
# Statistics for the serving data, computed from the CSV at TEST_DATA.
serving_stats = tfdv.generate_statistics_from_csv(data_location=TEST_DATA)

In [None]:
# Look up the two features that receive a comparator.
spa_analyze = tfdv.get_feature(schema, 'Spa')
CryoSleep_analyze = tfdv.get_feature(schema, 'CryoSleep')

# Skew comparator on 'Spa': L-infinity threshold of 0.01.
# NOTE(review): TFDV documents the L-infinity comparator for categorical
# features - confirm that 'Spa' is categorical, otherwise this threshold may
# have no effect.
spa_analyze.skew_comparator.infinity_norm.threshold = 0.01

# Drift comparator on 'CryoSleep': L-infinity threshold of 0.01.
CryoSleep_analyze.drift_comparator.infinity_norm.threshold = 0.01

# Validate the training statistics, passing the serving statistics and the
# evaluation statistics (as previous statistics) for comparison.
skew_anomalies = tfdv.validate_statistics(
    statistics=train_stats,
    schema=schema,
    serving_statistics=serving_stats,
    previous_statistics=eval_stats,
)
tfdv.display_anomalies(skew_anomalies)

In [None]:
# Requires retraining, as we observe drifts

In [None]:
# `os` is needed for os.path.join below; without this import the cell fails
# with a NameError on a fresh kernel.
import os

from tensorflow.python.lib.io import file_io
from google.protobuf import text_format

# Create the 'schema' directory and write the schema into it as a text file.
file_io.recursive_create_dir('schema')
schema_file = os.path.join('schema', 'schema.pbtxt')
tfdv.write_schema_text(schema, schema_file)

In [None]:
# Print the contents of the schema file through a shell escape; {schema_file}
# is interpolated from the Python variable of the same name.
!cat {schema_file}

In [None]:
# Read the schema back from the text file and display the loaded object.
loaded_schema = tfdv.load_schema_text(input_path=schema_file)
loaded_schema