In [None]:
# Docs
# https://cloud.google.com/architecture/ml-modeling-monitoring-analyzing-training-server-skew-in-ai-platform-prediction-with-tfdv
# https://towardsdatascience.com/how-to-compare-two-or-more-distributions-9b06ee4d30bf

In [None]:
import tensorflow as tf
import tensorflow_data_validation as tfdv
from tensorflow_data_validation.utils import slicing_util

import pandas as pd
import numpy as np
from sklearn.datasets import make_classification

In [None]:
# set parameters
P = ! gcloud config list --format 'value(core.project)'
PROJECT_ID = P[0]
P = ! gcloud projects list --filter="$(gcloud config get-value project)" --format="value(PROJECT_NUMBER)"
PROJECT_NUMBER = P[0]
REGION = "us-central1"
SERVICE_ACCOUNT = f"sa-vertex-pipelines@{PROJECT_ID}.iam.gserviceaccount.com"

In [None]:
#####################################################################
#
# for this exercise, generate synthetic data using sklearn
#
#####################################################################

In [None]:
# Settings passed to make_classification for both the training and the eval
# split, so the two datasets only differ by their random seed.
n_classes = 2
n_samples = 10_000
n_features = 10
n_informative = 10
n_redundant = n_repeated = 0

# One seed per split.
RANDOM_STATE_TRAIN = 34098  # training
RANDOM_STATE_EVAL = 78392   # eval

In [None]:
# Generate the synthetic training split with the training seed.
x, y = make_classification(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=n_informative,
    n_redundant=n_redundant,
    n_repeated=n_repeated,
    n_classes=n_classes,
    random_state=RANDOM_STATE_TRAIN,
)

# Append the labels as the last column of the feature matrix.
training_data = np.concatenate((x, y.reshape(-1, 1)), axis=1)
num_cols = x.shape[1]
# CSV header: x_0 .. x_{num_cols-1} followed by 'label'.
col_names = ','.join([*(f"x_{i}" for i in range(num_cols)), 'label'])
col_names

In [None]:
# Write the training split to a CSV file in the working directory.
TRAIN_DATA = 'train.csv'
np.savetxt(
    TRAIN_DATA,
    training_data,
    fmt='%f',
    delimiter=',',
    header=col_names,
    comments="",  # empty prefix: the header is written as a plain CSV header row
)

In [None]:
# Generate the synthetic eval split: same settings as training, different seed.
x, y = make_classification(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=n_informative,
    n_redundant=n_redundant,
    n_repeated=n_repeated,
    n_classes=n_classes,
    random_state=RANDOM_STATE_EVAL,
)

# Append the labels as the last column of the feature matrix.
eval_data = np.concatenate((x, y.reshape(-1, 1)), axis=1)
num_cols = x.shape[1]
# CSV header: x_0 .. x_{num_cols-1} followed by 'label'.
col_names = ','.join([*(f"x_{i}" for i in range(num_cols)), 'label'])
col_names

In [None]:
# Write the eval split to a CSV file in the working directory.
EVAL_DATA = 'eval.csv'
np.savetxt(
    EVAL_DATA,
    eval_data,
    fmt='%f',
    delimiter=',',
    header=col_names,
    comments="",  # empty prefix: the header is written as a plain CSV header row
)

In [None]:
#####################################################################
#
# compute and viz
#
#####################################################################

In [None]:
# Let TFDV compute dataset statistics from each CSV written above:
# first the training split, then the evaluation split.
train_stats = tfdv.generate_statistics_from_csv(
    data_location=TRAIN_DATA,
)
eval_stats = tfdv.generate_statistics_from_csv(
    data_location=EVAL_DATA,
)

In [None]:
# Render the statistics of a single dataset (training only, no comparison).
tfdv.visualize_statistics(lhs_statistics=train_stats)

In [None]:
# Side-by-side view: eval statistics on the left, training on the right.
tfdv.visualize_statistics(
    lhs_statistics=eval_stats,
    lhs_name='EVAL_DATASET',
    rhs_statistics=train_stats,
    rhs_name='TRAIN_DATASET',
)

In [None]:
#####################################################################
#
#  Jensen-Shannon divergence: Skew
#
#####################################################################

In [None]:
# Infer a schema from the training statistics, then show it.
schema = tfdv.infer_schema(
    statistics=train_stats,
)
tfdv.display_schema(schema=schema)

In [None]:
# Validate the eval statistics against the schema inferred from training
# and display whatever anomalies TFDV reports.
anomalies = tfdv.validate_statistics(
    statistics=eval_stats,
    schema=schema,
)
tfdv.display_anomalies(anomalies)

In [None]:
# Skew check (training versus serving) on feature x_0: set a Jensen-Shannon
# divergence threshold on its skew comparator inside the schema.
x_0 = tfdv.get_feature(schema, 'x_0')
x_0.skew_comparator.jensen_shannon_divergence.threshold = 0.01

# The skew comparator configured above applies to the dataset passed as
# serving_statistics; here the eval split plays the serving role.
skew_anomalies = tfdv.validate_statistics(
    statistics=train_stats,
    schema=schema,
    serving_statistics=eval_stats,
)
tfdv.display_anomalies(skew_anomalies)

In [None]:
#####################################################################
#
#  experiment with the threshold
#
#####################################################################

In [None]:
# Import seaborn
import seaborn as sns

In [None]:
# Pick one column of the generated arrays and tag each row with its split.
col_index = 8
df_train = pd.DataFrame(
    training_data[:, col_index], columns=['data']
).assign(partition="train")
df_eval = pd.DataFrame(
    eval_data[:, col_index], columns=['data']
).assign(partition="eval")

# Stack both splits and overlay their histograms, coloured by split.
df = pd.concat([df_train, df_eval], axis=0, ignore_index=True)
sns.histplot(data=df, x='data', hue='partition', bins=50);

In [None]:
# Re-infer the schema so earlier comparator settings are discarded and the
# anomaly table below reflects only the threshold set in this cell.
schema = tfdv.infer_schema(statistics=train_stats)

# Skew comparator for the column selected via col_index.
x_n = tfdv.get_feature(schema, f"x_{col_index}")
x_n.skew_comparator.jensen_shannon_divergence.threshold = 0.001

# The skew comparator configured above applies to the dataset passed as
# serving_statistics; here the eval split plays the serving role.
skew_anomalies = tfdv.validate_statistics(
    statistics=train_stats,
    schema=schema,
    serving_statistics=eval_stats,
)
tfdv.display_anomalies(skew_anomalies)