In [2]:
pip install tensorflow-data-validation

Collecting tensorflow-data-validation
  Downloading tensorflow_data_validation-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.0/19.0 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
Collecting pandas<2,>=1.0 (from tensorflow-data-validation)
  Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m53.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyarrow<11,>=10 (from tensorflow-data-validation)
  Downloading pyarrow-10.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.9/35.9 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyfarmhash<0.4,>=0.2.2 (from tensorflow-data-validation)
  Downloading pyfarmhash-0.3.2.tar.gz (99 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
import tensorflow_data_validation as tfdv

In [5]:
import tensorflow as tf

In [8]:
# Locations of the CSV files used for training and for serving/testing.
TRAIN_DATA = '/content/sample_data/data/titanic_train.csv'
TEST_DATA = '/content/sample_data/data/titanic_test.csv'

In [9]:
# Compute summary statistics over the training CSV, then render them inline.
train_stats = tfdv.generate_statistics_from_csv(data_location=TRAIN_DATA)
tfdv.visualize_statistics(lhs_statistics=train_stats)

In [26]:
# Derive an initial schema from the training statistics and display it.
schema = tfdv.infer_schema(train_stats)
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'PassengerId',INT,required,,-
'HomePlanet',STRING,optional,single,'HomePlanet'
'CryoSleep',STRING,optional,single,'CryoSleep'
'Cabin',BYTES,optional,single,-
'Destination',STRING,optional,single,'Destination'
'Age',FLOAT,optional,single,-
'VIP',STRING,optional,single,'VIP'
'RoomService',FLOAT,optional,single,-
'FoodCourt',FLOAT,optional,single,-
'ShoppingMall',FLOAT,optional,single,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'HomePlanet',"'Earth', 'Europa', 'Mars'"
'CryoSleep',"'False', 'True'"
'Destination',"'55 Cancri e', 'PSO J318.5-22', 'TRAPPIST-1e'"
'VIP',"'False', 'True'"
'Transported',"'False', 'True'"


In [15]:
# Location of the evaluation CSV that is checked against the schema below.
ANOMALOUS_DATA = '/content/sample_data/data/titanic_test_anomalies.csv'

# Statistics for the evaluation data.
eval_stats = tfdv.generate_statistics_from_csv(data_location=ANOMALOUS_DATA)

# Side-by-side view: training statistics on the left, evaluation on the right.
tfdv.visualize_statistics(
    lhs_statistics=train_stats,
    rhs_statistics=eval_stats,
    lhs_name="Training Data",
    rhs_name="Evaluation Data",
)

In [16]:
# Validate the evaluation statistics against the inferred schema and list
# every anomaly that was found.
anomalies = tfdv.validate_statistics(eval_stats, schema)
tfdv.display_anomalies(anomalies)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'Transported',Column dropped,Column is completely missing
'Destination',Unexpected string values,Examples contain values missing from the schema: Anomaly (<1%).
'RoomService',Column dropped,Column is completely missing
'FoodCourt',Unexpected data type,Expected data of type: FLOAT but got INT
'Age',Unexpected data type,Expected data of type: FLOAT but got INT
'ShoppingMall',Unexpected data type,Expected data of type: FLOAT but got INT
'CryoSleep',Unexpected string values,"Examples contain values missing from the schema: FALSE (~63%), TRUE (~36%)."
'VRDeck',Unexpected data type,Expected data of type: FLOAT but got INT
'Spa',Unexpected data type,Expected data of type: FLOAT but got INT
'VIP',Unexpected string values,"Examples contain values missing from the schema: FALSE (~98%), TRUE (~1%)."


In [29]:
# Fixing anomalies: either change the evaluation data (manually) or modify the
# schema. Here the schema is modified with the changes that are acceptable.

# Accept the new value 'Anomaly' for 'Destination'. The membership check keeps
# the cell idempotent: re-running it does not add a duplicate domain value.
destination_domain = tfdv.get_domain(schema, 'Destination')
if 'Anomaly' not in destination_domain.value:
    destination_domain.value.append('Anomaly')

# Re-validate to confirm the 'Destination' anomaly is gone.
anomalies = tfdv.validate_statistics(statistics=eval_stats, schema=schema)
tfdv.display_anomalies(anomalies)


Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'Transported',Column dropped,Column is completely missing
'VIP',Unexpected string values,"Examples contain values missing from the schema: FALSE (~98%), TRUE (~1%)."
'RoomService',Column dropped,Column is completely missing
'CryoSleep',Unexpected string values,"Examples contain values missing from the schema: FALSE (~63%), TRUE (~36%)."


In [30]:
# Accept the upper-case spellings for VIP and CryoSleep.

# Extend the 'VIP' domain. Each value is added only if absent, so re-running
# the cell does not pile up duplicate entries in the schema.
vip_domain = tfdv.get_domain(schema, 'VIP')
for value in ('TRUE', 'FALSE'):
    if value not in vip_domain.value:
        vip_domain.value.append(value)

# Reuse the (extended) VIP domain as the domain of 'CryoSleep'.
tfdv.set_domain(schema, 'CryoSleep', vip_domain)

# Re-validate to confirm both string-value anomalies are resolved.
anomalies = tfdv.validate_statistics(statistics=eval_stats, schema=schema)
tfdv.display_anomalies(anomalies)



Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'Transported',Column dropped,Column is completely missing
'RoomService',Column dropped,Column is completely missing


In [31]:
# INT values can be safely treated as FLOAT, so instead of flagging the type
# mismatch we tell TFDV to take the feature types from the schema.
options = tfdv.StatsOptions(infer_type_from_schema=True, schema=schema)
eval_stats = tfdv.generate_statistics_from_csv(
    data_location=ANOMALOUS_DATA,
    stats_options=options,
)

# Validate the regenerated statistics against the schema.
anomalies = tfdv.validate_statistics(eval_stats, schema)
tfdv.display_anomalies(anomalies)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'Transported',Column dropped,Column is completely missing
'RoomService',Column dropped,Column is completely missing


In [33]:
# 'Transported' is the class label and will not be available in evaluation
# data. To express that, two environments are declared: Training and Serving.

# Register the environments. The guards make the cell idempotent, so running
# it again does not append duplicate environment names to the schema.
for environment in ('Training', 'Serving'):
    if environment not in schema.default_environment:
        schema.default_environment.append(environment)

# Mark the label as absent from the Serving environment (once).
transported_feature = tfdv.get_feature(schema, 'Transported')
if 'Serving' not in transported_feature.not_in_environment:
    transported_feature.not_in_environment.append('Serving')

# Validate the evaluation statistics as Serving data.
serving_anomalies_with_environment = tfdv.validate_statistics(
    statistics=eval_stats, schema=schema, environment='Serving')

tfdv.display_anomalies(serving_anomalies_with_environment)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'RoomService',Column dropped,Column is completely missing


In [None]:
# The remaining anomalies are resolved. 'RoomService' is the column that is
# missing from the evaluation data. This cannot be fixed here, so we need to
# come up with a manual technique to handle this issue.

In [None]:
# Checking for skew and drift using distribution-distance comparators (e.g. L-infinity distance)

In [35]:
# Statistics for the serving/test CSV, used for the skew comparison.
serving_stats = tfdv.generate_statistics_from_csv(data_location=TEST_DATA)

In [41]:
# Skew comparator (training vs. serving statistics).
# 'Spa' is a numeric (FLOAT) feature in the schema. The L-infinity norm is
# only evaluated for categorical features, so a threshold set on
# `infinity_norm` would never fire here; numeric features are compared with
# the Jensen-Shannon divergence instead.
spa_analyze = tfdv.get_feature(schema, 'Spa')
spa_analyze.skew_comparator.jensen_shannon_divergence.threshold = 0.01

# Drift comparator (current vs. previous statistics).
# 'CryoSleep' is categorical, so the L-infinity norm applies.
CryoSleep_analyze = tfdv.get_feature(schema, 'CryoSleep')
CryoSleep_analyze.drift_comparator.infinity_norm.threshold = 0.01

# Validate the training statistics, using the evaluation statistics as the
# "previous" data (drift) and the serving statistics for the skew check.
skew_anomalies = tfdv.validate_statistics(statistics=train_stats, schema=schema,
                                          previous_statistics=eval_stats,
                                          serving_statistics=serving_stats)
tfdv.display_anomalies(skew_anomalies)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'CryoSleep',High Linfty distance between current and previous,"The Linfty distance between current and previous is 0.641694 (up to six significant digits), above the threshold 0.01. The feature value with maximum difference is: False"


In [None]:
# Drift was observed, so the model requires retraining.

In [42]:
# `os` is needed for os.path.join below; it is imported here because no other
# cell in the notebook imports it (a fresh kernel would raise NameError).
import os

from tensorflow.python.lib.io import file_io
from google.protobuf import text_format

# Create the output directory and write the curated schema to
# schema/schema.pbtxt in protobuf text format.
file_io.recursive_create_dir('schema')
schema_file = os.path.join('schema', 'schema.pbtxt')
tfdv.write_schema_text(schema, schema_file)

In [43]:
# Print the contents of the schema file written in the previous cell; the
# path is interpolated from the Python variable `schema_file`.
!cat {schema_file}

feature {
  name: "PassengerId"
  type: INT
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "HomePlanet"
  value_count {
    min: 1
    max: 1
  }
  type: BYTES
  domain: "HomePlanet"
  presence {
    min_count: 1
  }
}
feature {
  name: "CryoSleep"
  value_count {
    min: 1
    max: 1
  }
  type: BYTES
  string_domain {
    name: "VIP"
    value: "False"
    value: "True"
    value: "TRUE"
    value: "FALSE"
  }
  presence {
    min_count: 1
  }
  drift_comparator {
    infinity_norm {
      threshold: 0.01
    }
  }
}
feature {
  name: "Cabin"
  value_count {
    min: 1
    max: 1
  }
  type: BYTES
  presence {
    min_count: 1
  }
}
feature {
  name: "Destination"
  value_count {
    min: 1
    max: 1
  }
  type: BYTES
  domain: "Destination"
  presence {
    min_count: 1
  }
}
feature {
  name: "Age"
  value_count {
    min: 1
    max: 1
  }
  type: FLOAT
  presence {
    min_count: 1
  }
}
feature {
  name: "

In [44]:
# Read the schema back from its text file and show it as the cell output.
loaded_schema = tfdv.load_schema_text(input_path=schema_file)
loaded_schema

feature {
  name: "PassengerId"
  type: INT
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "HomePlanet"
  value_count {
    min: 1
    max: 1
  }
  type: BYTES
  domain: "HomePlanet"
  presence {
    min_count: 1
  }
}
feature {
  name: "CryoSleep"
  value_count {
    min: 1
    max: 1
  }
  type: BYTES
  string_domain {
    name: "VIP"
    value: "False"
    value: "True"
    value: "TRUE"
    value: "FALSE"
  }
  presence {
    min_count: 1
  }
  drift_comparator {
    infinity_norm {
      threshold: 0.01
    }
  }
}
feature {
  name: "Cabin"
  value_count {
    min: 1
    max: 1
  }
  type: BYTES
  presence {
    min_count: 1
  }
}
feature {
  name: "Destination"
  value_count {
    min: 1
    max: 1
  }
  type: BYTES
  domain: "Destination"
  presence {
    min_count: 1
  }
}
feature {
  name: "Age"
  value_count {
    min: 1
    max: 1
  }
  type: FLOAT
  presence {
    min_count: 1
  }
}
feature {
  name: "