# Distributed Profiling of Model Features with Whylogs & Fugue

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#sns.set_style("whitegrid")
#plt.style.use('bmh')
#plt.style.use('seaborn-whitegrid')

# this allows plots to appear directly in the notebook
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
import pandas as pd

pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', 100)

In [3]:
import pandas as pd

In [4]:
demo_df = pd.read_parquet('ad_demo/demo_raw_data.parquet')

## Load Model Feature and Prediction Logs

In [5]:
demo_df.head(5)

Unnamed: 0,occurred_at,model_name,version,predictions,features
0,2023-02-10 05:33:01.065,demo_model,1.0.1,59.753181,"{""feature_5"":10087.0,""feature_6"":0.5283935351,""feature_1"":0.0,""feature_3"":-16.5737745071,""featur..."
1,2023-02-10 05:05:57.562,demo_model,1.0.1,15.915874,"{""feature_5"":44.0,""feature_6"":-4.9304880877,""feature_1"":0.0,""feature_3"":-7.376023565,""feature_2""..."
2,2023-02-10 05:20:57.750,demo_model,1.0.1,25.590763,"{""feature_5"":495.0,""feature_6"":-4.5830756571,""feature_1"":1.0,""feature_3"":-7.376023565,""feature_2..."
3,2023-02-10 05:15:05.361,demo_model,1.0.1,40.450287,"{""feature_5"":3025.0,""feature_6"":-4.8169581803,""feature_1"":0.0,""feature_3"":-7.376023565,""feature_..."
4,2023-02-10 05:36:09.118,demo_model,1.0.1,24.397123,"{""feature_5"":341.0,""feature_6"":-4.2160377928,""feature_1"":1.0,""feature_3"":-7.376023565,""feature_2..."


In [6]:
demo_df.shape

(1666314, 5)

### Extract Features and Predictions from model logs

In [7]:
import json
import pandas as pd

def extract_features(df: pd.DataFrame) -> pd.DataFrame:
    json_str = "[" + (",".join(df.features)) + "]"
    feature_df = pd.DataFrame(json.loads(json_str))
    #feature_df = feature_df.reset_index(drop=True)
    return feature_df[sorted(feature_df.columns)]

In [8]:
feature_df = extract_features(demo_df)

In [9]:
feature_df.head(5)

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6
0,0.0,1.904762,-16.573775,-231.864749,10087.0,0.528394
1,0.0,0.9375,-7.376024,-354.924267,44.0,-4.930488
2,1.0,5.909091,-7.376024,-367.064925,495.0,-4.583076
3,0.0,500000000.0,-7.376024,-350.691669,3025.0,-4.816958
4,1.0,0.3448276,-7.376024,-356.517885,341.0,-4.216038


In [10]:
feature_df.shape

(1666314, 6)

In [11]:
pd.concat([demo_df[['occurred_at', 'model_name', 'version', 'predictions']], feature_df], axis=1)

Unnamed: 0,occurred_at,model_name,version,predictions,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6
0,2023-02-10 05:33:01.065,demo_model,1.0.1,59.753181,0.0,1.904762e+00,-16.573775,-231.864749,10087.0,0.528394
1,2023-02-10 05:05:57.562,demo_model,1.0.1,15.915874,0.0,9.375000e-01,-7.376024,-354.924267,44.0,-4.930488
2,2023-02-10 05:20:57.750,demo_model,1.0.1,25.590763,1.0,5.909091e+00,-7.376024,-367.064925,495.0,-4.583076
3,2023-02-10 05:15:05.361,demo_model,1.0.1,40.450287,0.0,5.000000e+08,-7.376024,-350.691669,3025.0,-4.816958
4,2023-02-10 05:36:09.118,demo_model,1.0.1,24.397123,1.0,3.448276e-01,-7.376024,-356.517885,341.0,-4.216038
...,...,...,...,...,...,...,...,...,...,...
1666309,2023-02-21 05:02:40.741,demo_model,1.0.1,21.680845,1.0,7.692308e+00,13.291135,-368.704926,561.0,-5.019625
1666310,2023-02-21 05:23:18.348,demo_model,1.0.1,65.978500,1.0,2.647059e+00,13.291135,-264.067607,10637.0,-1.634199
1666311,2023-02-21 05:10:00.133,demo_model,1.0.1,25.258211,1.0,8.437500e+00,16.573775,-222.205785,44.0,0.279922
1666312,2023-02-21 05:28:14.204,demo_model,1.0.1,64.243332,0.0,3.181818e+00,13.291135,-261.132746,10230.0,-1.482410


In [12]:
import json
import pandas as pd

def extract_features(model_logs_df: pd.DataFrame) -> pd.DataFrame:
    json_str = "[" + (",".join(model_logs_df.features)) + "]"
    feature_df = pd.DataFrame(json.loads(json_str))
    feature_df = feature_df[sorted(feature_df.columns)]
    #feature_df = feature_df.reset_index(drop=True)
    model_logs_df['occurred_at'] = model_logs_df['occurred_at'].apply(lambda x: x.replace(microsecond=0))
    model_logs_df['ds'] = model_logs_df['occurred_at'].apply(lambda x: x.strftime("%Y-%m-%d"))
    model_logs_df['hour'] = model_logs_df['occurred_at'].apply(lambda x: x.hour)
    return pd.concat([model_logs_df[['occurred_at', 'ds', 'hour', 'model_name', 'version', 'predictions']], feature_df], axis=1)

In [13]:
features_df = extract_features(demo_df)

In [14]:
features_df.head(5)

Unnamed: 0,occurred_at,ds,hour,model_name,version,predictions,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6
0,2023-02-10 05:33:01,2023-02-10,5,demo_model,1.0.1,59.753181,0.0,1.904762,-16.573775,-231.864749,10087.0,0.528394
1,2023-02-10 05:05:57,2023-02-10,5,demo_model,1.0.1,15.915874,0.0,0.9375,-7.376024,-354.924267,44.0,-4.930488
2,2023-02-10 05:20:57,2023-02-10,5,demo_model,1.0.1,25.590763,1.0,5.909091,-7.376024,-367.064925,495.0,-4.583076
3,2023-02-10 05:15:05,2023-02-10,5,demo_model,1.0.1,40.450287,0.0,500000000.0,-7.376024,-350.691669,3025.0,-4.816958
4,2023-02-10 05:36:09,2023-02-10,5,demo_model,1.0.1,24.397123,1.0,0.3448276,-7.376024,-356.517885,341.0,-4.216038


In [15]:
features_df.tail(5)

Unnamed: 0,occurred_at,ds,hour,model_name,version,predictions,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6
1666309,2023-02-21 05:02:40,2023-02-21,5,demo_model,1.0.1,21.680845,1.0,7.692308,13.291135,-368.704926,561.0,-5.019625
1666310,2023-02-21 05:23:18,2023-02-21,5,demo_model,1.0.1,65.9785,1.0,2.647059,13.291135,-264.067607,10637.0,-1.634199
1666311,2023-02-21 05:10:00,2023-02-21,5,demo_model,1.0.1,25.258211,1.0,8.4375,16.573775,-222.205785,44.0,0.279922
1666312,2023-02-21 05:28:14,2023-02-21,5,demo_model,1.0.1,64.243332,0.0,3.181818,13.291135,-261.132746,10230.0,-1.48241
1666313,2023-02-21 05:12:56,2023-02-21,5,demo_model,1.0.1,19.288034,1.0,8.431373,16.573775,-222.481718,2321.0,0.279922


In [16]:
features_df.dtypes

occurred_at    datetime64[ns]
ds                     object
hour                    int64
model_name             object
version                object
predictions           float32
feature_1             float64
feature_2             float64
feature_3             float64
feature_4             float64
feature_5             float64
feature_6             float64
dtype: object

In [17]:
len(features_df.ds.unique())

88

In [18]:
features_df.hour.unique()

array([ 5,  1,  7, 11, 10, 18,  6, 20,  2, 21,  0, 23,  8,  3, 17, 22, 16,
       12, 15, 19,  4,  9, 13, 14])

In [19]:
features_df[(features_df['ds'] == '2023-02-10') & (features_df['hour'] == 5)]

Unnamed: 0,occurred_at,ds,hour,model_name,version,predictions,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6
0,2023-02-10 05:33:01,2023-02-10,5,demo_model,1.0.1,59.753181,0.0,1.904762e+00,-16.573775,-231.864749,10087.0,0.528394
1,2023-02-10 05:05:57,2023-02-10,5,demo_model,1.0.1,15.915874,0.0,9.375000e-01,-7.376024,-354.924267,44.0,-4.930488
2,2023-02-10 05:20:57,2023-02-10,5,demo_model,1.0.1,25.590763,1.0,5.909091e+00,-7.376024,-367.064925,495.0,-4.583076
3,2023-02-10 05:15:05,2023-02-10,5,demo_model,1.0.1,40.450287,0.0,5.000000e+08,-7.376024,-350.691669,3025.0,-4.816958
4,2023-02-10 05:36:09,2023-02-10,5,demo_model,1.0.1,24.397123,1.0,3.448276e-01,-7.376024,-356.517885,341.0,-4.216038
...,...,...,...,...,...,...,...,...,...,...,...,...
995,2023-02-10 05:35:44,2023-02-10,5,demo_model,1.0.1,17.575371,1.0,7.142857e-01,-16.573775,-223.450658,88.0,1.084640
996,2023-02-10 05:37:26,2023-02-10,5,demo_model,1.0.1,22.727821,1.0,4.761905e-01,-7.376024,-356.616071,1034.0,-4.216038
997,2023-02-10 05:12:26,2023-02-10,5,demo_model,1.0.1,24.809574,1.0,8.437500e+00,-7.376024,-367.860209,2519.0,-4.862648
998,2023-02-10 05:12:02,2023-02-10,5,demo_model,1.0.1,38.028652,0.0,0.000000e+00,-7.376024,-285.100373,4444.0,-1.694700


### Generate Whylogs Profiles

In [20]:
import json
import numpy as np

import whylogs as why
from whylogs import DatasetProfileView

In [21]:
feb_test_df = features_df[(features_df['ds'] == '2023-02-10') & (features_df['hour'] == 5)]

In [22]:
feb_whylogs_prof = why.log(feb_test_df[['feature_5', 'feature_6']]).view()

In [23]:
mar_test_df = features_df[(features_df['ds'] == '2023-03-10') & (features_df['hour'] == 5)]

In [24]:
mar_whylogs_prof = why.log(mar_test_df[['feature_5', 'feature_6']]).view()

In [25]:
feb_whylogs_prof.to_pandas()

Unnamed: 0_level_0,cardinality/est,cardinality/lower_1,cardinality/upper_1,counts/inf,counts/n,counts/nan,counts/null,distribution/max,distribution/mean,distribution/median,distribution/min,distribution/n,distribution/q_01,distribution/q_05,distribution/q_10,distribution/q_25,distribution/q_75,distribution/q_90,distribution/q_95,distribution/q_99,distribution/stddev,type,types/boolean,types/fractional,types/integral,types/object,types/string,types/tensor
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
feature_5,425.569275,420.134311,431.139744,0,1000,0,0,38148.0,2184.743,715.0,0.0,1000,11.0,22.0,33.0,55.0,3190.0,6248.0,9141.0,14388.0,3310.736242,SummaryType.COLUMN,0,1000,0,0,0,0
feature_6,233.000134,233.0,233.011768,0,1000,0,0,1.785209,-1.798066,-1.543214,-6.845155,1000,-5.256787,-4.975246,-4.793975,-4.1657,0.373155,1.207462,1.48241,1.724903,2.350622,SummaryType.COLUMN,0,1000,0,0,0,0


### Visualize Whylogs Profiles

In [26]:
from whylogs.viz import NotebookProfileVisualizer

from whylogs.viz.utils.histogram_calculations import histogram_from_view
from whylogs.viz.utils.frequent_items_calculations import frequent_items_from_view

In [27]:
visualization = NotebookProfileVisualizer()
visualization.set_profiles(target_profile_view=feb_whylogs_prof, reference_profile_view=mar_whylogs_prof)

In [28]:
visualization.double_histogram(feature_name="feature_6")

### Serialize Whylogs Profiles

In [29]:
feb_whylogs_prof.serialize()[0:100]

b'WHY1\x00\xc2\x02\n\x0e \xaf\x9a\xa1\xfb\xf70(\xaf\x9a\xa1\xfb\xf70\x12\x10\n\tfeature_5\x12\x03\n\x01\x00\x12\x11\n\tfeature_6\x12\x04\n\x02\xa6P \xd4\x97\x01*\x10\x08\x0b\x12\x0ctypes/object*\x14\x08\x05\x12\x10distribution'

### Generate Hourly Profiles using Fugue

In [30]:
import json
import pandas as pd

def profile_features(features_df: pd.DataFrame) -> pd.DataFrame:
    #json_str = "[" + (",".join(df.features_json)) + "]"
    #fdf = pd.DataFrame(json.loads(json_str))
    #pdf = df.reset_index(drop=True)
    features_buf = why.log(features_df[['feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6']]).view().serialize()
    predictions_buf = why.log(features_df[['predictions']]).view().serialize()
    features_df.drop(['occurred_at'], axis=1, inplace=True)
    return features_df.head(1).assign(features_profile=features_buf, predictions_profile = predictions_buf, sample_records=len(features_df))

In [31]:
feb_test_df.shape

(1000, 12)

In [32]:
profile_features(feb_test_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features_df.drop(['occurred_at'], axis=1, inplace=True)


Unnamed: 0,ds,hour,model_name,version,predictions,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,features_profile,predictions_profile,sample_records
0,2023-02-10,5,demo_model,1.0.1,59.753181,0.0,1.904762,-16.573775,-231.864749,10087.0,0.528394,b'WHY1\x00\x92\x03\n\x0e \xee\x9e\xa1\xfb\xf70(\xee\x9e\xa1\xfb\xf70\x12\x12\n\tfeature_5\x12\x0...,b'WHY1\x00\xb0\x02\n\x0e \x84\x9f\xa1\xfb\xf70(\x84\x9f\xa1\xfb\xf70\x12\x12\n\x0bpredictions\x1...,1000


In [33]:
from fugue import transform

hourly_feature_profile_df = transform(
    df=features_df, 
    using=profile_features, 
    schema="*-occurred_at+features_profile:binary,predictions_profile:binary,sample_records:long",
    partition=dict(by=['ds', 'hour', 'model_name', 'version']), 
    engine=None
)

In [34]:
hourly_feature_profile_df

Unnamed: 0,ds,hour,model_name,version,predictions,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,features_profile,predictions_profile,sample_records
0,2023-01-01,0,demo_model,1.0.1,28.874601,0.0,5.000000e+08,-13.291135,-303.546123,957.0,-6.954191,b'WHY1\x00\x92\x03\n\x0e \xfe\xad\xa1\xfb\xf70(\xfe\xad\xa1\xfb\xf70\x12\x10\n\tfeature_1\x12\x0...,b'WHY1\x00\xb0\x02\n\x0e \x8c\xae\xa1\xfb\xf70(\x8c\xae\xa1\xfb\xf70\x12\x12\n\x0bpredictions\x1...,1000
1,2023-01-01,1,demo_model,1.0.1,34.759167,1.0,2.619048e+00,-13.291135,-226.094488,1111.0,-5.675494,b'WHY1\x00\x92\x03\n\x0e \x91\xae\xa1\xfb\xf70(\x91\xae\xa1\xfb\xf70\x12\x12\n\tfeature_5\x12\x0...,b'WHY1\x00\xb0\x02\n\x0e \x9e\xae\xa1\xfb\xf70(\x9e\xae\xa1\xfb\xf70\x12\x12\n\x0bpredictions\x1...,1000
2,2023-01-01,2,demo_model,1.0.1,31.434237,0.0,5.000000e+08,-13.291135,-257.492325,1969.0,-3.698853,b'WHY1\x00\x92\x03\n\x0e \xa3\xae\xa1\xfb\xf70(\xa3\xae\xa1\xfb\xf70\x12\x12\n\tfeature_4\x12\x0...,b'WHY1\x00\xb0\x02\n\x0e \xb0\xae\xa1\xfb\xf70(\xb0\xae\xa1\xfb\xf70\x12\x12\n\x0bpredictions\x1...,1000
3,2023-01-01,3,demo_model,1.0.1,26.973177,0.0,0.000000e+00,-13.291135,-260.120910,990.0,-3.830946,b'WHY1\x00\x92\x03\n\x0e \xb5\xae\xa1\xfb\xf70(\xb5\xae\xa1\xfb\xf70\x12\x12\n\tfeature_3\x12\x0...,b'WHY1\x00\xb0\x02\n\x0e \xc2\xae\xa1\xfb\xf70(\xc2\xae\xa1\xfb\xf70\x12\x12\n\x0bpredictions\x1...,1000
4,2023-01-01,4,demo_model,1.0.1,18.229908,0.0,2.590361e+00,-13.291135,-368.447875,55.0,-6.030039,b'WHY1\x00\x92\x03\n\x0e \xc7\xae\xa1\xfb\xf70(\xc7\xae\xa1\xfb\xf70\x12\x12\n\tfeature_5\x12\x0...,b'WHY1\x00\xb0\x02\n\x0e \xd3\xae\xa1\xfb\xf70(\xd3\xae\xa1\xfb\xf70\x12\x12\n\x0bpredictions\x1...,1000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2091,2023-03-29,3,demo_model,1.0.1,16.853651,0.0,1.739130e+00,16.573775,-226.308186,22.0,-0.404220,b'WHY1\x00\x8d\x03\n\x0e \xb3\xbe\xa3\xfb\xf70(\xb3\xbe\xa3\xfb\xf70\x12\x11\n\tfeature_3\x12\x0...,b'WHY1\x00\xb0\x02\n\x0e \xbb\xbe\xa3\xfb\xf70(\xbb\xbe\xa3\xfb\xf70\x12\x12\n\x0bpredictions\x1...,188
2092,2023-03-29,4,demo_model,1.0.1,30.805862,1.0,4.736842e+00,16.573775,-355.970607,44.0,-4.701136,b'WHY1\x00\x8d\x03\n\x0e \xbf\xbe\xa3\xfb\xf70(\xbf\xbe\xa3\xfb\xf70\x12\x11\n\tfeature_6\x12\x0...,b'WHY1\x00\xb0\x02\n\x0e \xc5\xbe\xa3\xfb\xf70(\xc5\xbe\xa3\xfb\xf70\x12\x12\n\x0bpredictions\x1...,125
2093,2023-03-29,5,demo_model,1.0.1,9.924586,1.0,3.730159e+00,16.573775,-356.026755,22.0,-2.173682,b'WHY1\x00\x8d\x03\n\x0e \xc8\xbe\xa3\xfb\xf70(\xc8\xbe\xa3\xfb\xf70\x12\x11\n\tfeature_3\x12\x0...,b'WHY1\x00\xb0\x02\n\x0e \xce\xbe\xa3\xfb\xf70(\xce\xbe\xa3\xfb\xf70\x12\x12\n\x0bpredictions\x1...,67
2094,2023-03-29,6,demo_model,1.0.1,30.811672,0.0,2.500000e+00,7.376024,-264.412372,33.0,3.319832,b'WHY1\x00\x8d\x03\n\x0e \xd1\xbe\xa3\xfb\xf70(\xd1\xbe\xa3\xfb\xf70\x12\x11\n\tfeature_2\x12\x0...,b'WHY1\x00\xb0\x02\n\x0e \xd7\xbe\xa3\xfb\xf70(\xd7\xbe\xa3\xfb\xf70\x12\x12\n\x0bpredictions\x1...,74


### Merge Whylogs Profiles

In [35]:
type(feb_whylogs_prof)

whylogs.core.view.dataset_profile_view.DatasetProfileView

In [36]:
feb_whylogs_prof.to_pandas()

Unnamed: 0_level_0,cardinality/est,cardinality/lower_1,cardinality/upper_1,counts/inf,counts/n,counts/nan,counts/null,distribution/max,distribution/mean,distribution/median,distribution/min,distribution/n,distribution/q_01,distribution/q_05,distribution/q_10,distribution/q_25,distribution/q_75,distribution/q_90,distribution/q_95,distribution/q_99,distribution/stddev,type,types/boolean,types/fractional,types/integral,types/object,types/string,types/tensor
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
feature_5,425.569275,420.134311,431.139744,0,1000,0,0,38148.0,2184.743,715.0,0.0,1000,11.0,22.0,33.0,55.0,3190.0,6248.0,9141.0,14388.0,3310.736242,SummaryType.COLUMN,0,1000,0,0,0,0
feature_6,233.000134,233.0,233.011768,0,1000,0,0,1.785209,-1.798066,-1.543214,-6.845155,1000,-5.256787,-4.975246,-4.793975,-4.1657,0.373155,1.207462,1.48241,1.724903,2.350622,SummaryType.COLUMN,0,1000,0,0,0,0


In [37]:
mar_whylogs_prof.to_pandas()

Unnamed: 0_level_0,cardinality/est,cardinality/lower_1,cardinality/upper_1,counts/inf,counts/n,counts/nan,counts/null,distribution/max,distribution/mean,distribution/median,distribution/min,distribution/n,distribution/q_01,distribution/q_05,distribution/q_10,distribution/q_25,distribution/q_75,distribution/q_90,distribution/q_95,distribution/q_99,distribution/stddev,type,types/boolean,types/fractional,types/integral,types/object,types/string,types/tensor
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
feature_5,13.0,13.0,13.000649,0,267,0,0,154.0,39.426966,33.0,11.0,267,11.0,22.0,22.0,22.0,44.0,55.0,110.0,132.0,24.119169,SummaryType.COLUMN,0,267,0,0,0,0
feature_6,134.000044,134.0,134.006735,0,267,0,0,1.815312,-1.352451,-0.373155,-5.019625,267,-5.019625,-4.793975,-4.630565,-3.987045,0.838042,1.48241,1.573573,1.785209,2.434174,SummaryType.COLUMN,0,267,0,0,0,0


In [38]:
merged_prof_view = feb_whylogs_prof.merge(mar_whylogs_prof)
merged_prof_view.to_pandas()

Unnamed: 0_level_0,cardinality/est,cardinality/lower_1,cardinality/upper_1,counts/inf,counts/n,counts/nan,counts/null,distribution/max,distribution/mean,distribution/median,distribution/min,distribution/n,distribution/q_01,distribution/q_05,distribution/q_10,distribution/q_25,distribution/q_75,distribution/q_90,distribution/q_95,distribution/q_99,distribution/stddev,type,types/boolean,types/fractional,types/integral,types/object,types/string,types/tensor
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
feature_5,425.569275,420.134311,431.139744,0,1267,0,0,38148.0,1732.651934,220.0,0.0,1267,11.0,22.0,22.0,44.0,2310.0,5500.0,7546.0,13640.0,3068.471691,SummaryType.COLUMN,0,1267,0,0,0,0
feature_6,235.000137,235.0,235.01187,0,1267,0,0,1.815312,-1.70416,-1.390994,-6.845155,1267,-5.214552,-4.952914,-4.770901,-4.140412,0.528394,1.268737,1.512827,1.755073,2.37447,SummaryType.COLUMN,0,1267,0,0,0,0


In [39]:
merge_test_df = features_df[((features_df['ds'] == '2023-02-10') | (features_df['ds'] == '2023-03-10')) & (features_df['hour'] == 5)]

In [40]:
merge_test_df['ds'].unique()

array(['2023-02-10', '2023-03-10'], dtype=object)

In [41]:
merged_whylogs_prof = why.log(merge_test_df[['feature_5', 'feature_6']]).view()

In [42]:
merged_whylogs_prof.to_pandas()

Unnamed: 0_level_0,cardinality/est,cardinality/lower_1,cardinality/upper_1,counts/inf,counts/n,counts/nan,counts/null,distribution/max,distribution/mean,distribution/median,distribution/min,distribution/n,distribution/q_01,distribution/q_05,distribution/q_10,distribution/q_25,distribution/q_75,distribution/q_90,distribution/q_95,distribution/q_99,distribution/stddev,type,types/boolean,types/fractional,types/integral,types/object,types/string,types/tensor
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
feature_5,425.569275,420.134311,431.139744,0,1267,0,0,38148.0,1732.651934,220.0,0.0,1267,11.0,22.0,22.0,44.0,2310.0,5500.0,7546.0,13640.0,3068.471691,SummaryType.COLUMN,0,1267,0,0,0,0
feature_6,235.000137,235.0,235.01187,0,1267,0,0,1.815312,-1.70416,-1.390994,-6.845155,1267,-5.235719,-4.952914,-4.770901,-4.140412,0.528394,1.268737,1.512827,1.724903,2.37447,SummaryType.COLUMN,0,1267,0,0,0,0


### Generate Daily Profiles

In [43]:
from functools import reduce
def profile_reduce(profiles_df: pd.DataFrame) -> pd.DataFrame:
    features_buf = reduce(
        lambda acc, x: acc.merge(x),
        profiles_df.features_profile.apply(DatasetProfileView.deserialize),
    ).serialize()
    predictions_buf = reduce(
        lambda acc, x: acc.merge(x),
        profiles_df.predictions_profile.apply(DatasetProfileView.deserialize),
    ).serialize()
    records = profiles_df.sample_records.sum()
    return profiles_df.head(1).assign(features_profile=features_buf, predictions_profile = predictions_buf, sample_records=records)

In [44]:
from fugue import transform

daily_feature_profile_df = transform(
    df=hourly_feature_profile_df, 
    using=profile_reduce, 
    schema="*",
    partition=dict(by=['ds', 'model_name', 'version']), 
    engine=None
)

In [45]:
daily_feature_profile_df

Unnamed: 0,ds,hour,model_name,version,predictions,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,features_profile,predictions_profile,sample_records
0,2023-01-01,0,demo_model,1.0.1,28.874601,0.0,5.000000e+08,-13.291135,-303.546123,957.0,-6.954191,b'WHY1\x00\x93\x03\n\x0e \xfe\xad\xa1\xfb\xf70(\xfe\xad\xa1\xfb\xf70\x12\x12\n\tfeature_4\x12\x0...,b'WHY1\x00\xb1\x02\n\x0e \x8c\xae\xa1\xfb\xf70(\x8c\xae\xa1\xfb\xf70\x12\x12\n\x0bpredictions\x1...,24001
1,2023-01-02,0,demo_model,1.0.1,17.241968,1.0,3.750000e+00,-0.000000,-226.315937,44.0,-6.295416,b'WHY1\x00\x93\x03\n\x0e \xa9\xb1\xa1\xfb\xf70(\xa9\xb1\xa1\xfb\xf70\x12\x12\n\tfeature_5\x12\x0...,b'WHY1\x00\xb1\x02\n\x0e \xb5\xb1\xa1\xfb\xf70(\xb5\xb1\xa1\xfb\xf70\x12\x12\n\x0bpredictions\x1...,23999
2,2023-01-03,0,demo_model,1.0.1,5.838543,1.0,5.000000e+08,13.291135,-222.223722,33.0,-6.190258,b'WHY1\x00\x93\x03\n\x0e \xec\xb4\xa1\xfb\xf70(\xec\xb4\xa1\xfb\xf70\x12\x12\n\tfeature_6\x12\x0...,b'WHY1\x00\xb1\x02\n\x0e \x81\xb5\xa1\xfb\xf70(\x81\xb5\xa1\xfb\xf70\x12\x12\n\x0bpredictions\x1...,24000
3,2023-01-04,0,demo_model,1.0.1,15.285272,1.0,2.631579e+00,16.573775,-355.747054,363.0,-6.324387,b'WHY1\x00\x93\x03\n\x0e \xa5\xb9\xa1\xfb\xf70(\xa5\xb9\xa1\xfb\xf70\x12\x10\n\tfeature_1\x12\x0...,b'WHY1\x00\xb1\x02\n\x0e \xb1\xb9\xa1\xfb\xf70(\xb1\xb9\xa1\xfb\xf70\x12\x12\n\x0bpredictions\x1...,24000
4,2023-01-05,0,demo_model,1.0.1,5.200397,1.0,5.000000e+08,7.376024,-223.297205,33.0,-6.862306,b'WHY1\x00\x93\x03\n\x0e \xa8\xbc\xa1\xfb\xf70(\xa8\xbc\xa1\xfb\xf70\x12\x12\n\tfeature_2\x12\x0...,b'WHY1\x00\xb1\x02\n\x0e \xb4\xbc\xa1\xfb\xf70(\xb4\xbc\xa1\xfb\xf70\x12\x12\n\x0bpredictions\x1...,24000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,2023-03-25,0,demo_model,1.0.1,20.648773,0.0,8.088235e-01,-16.573775,-368.562824,22.0,-7.097175,b'WHY1\x00\x92\x03\n\x0e \x9f\xb6\xa3\xfb\xf70(\x9f\xb6\xa3\xfb\xf70\x12\x12\n\tfeature_4\x12\x0...,b'WHY1\x00\xb1\x02\n\x0e \xa7\xb6\xa3\xfb\xf70(\xa7\xb6\xa3\xfb\xf70\x12\x12\n\x0bpredictions\x1...,4248
84,2023-03-26,0,demo_model,1.0.1,13.163588,1.0,2.500000e+00,-13.291135,-263.689394,33.0,-6.689304,b'WHY1\x00\x93\x03\n\x0e \xaa\xb8\xa3\xfb\xf70(\xaa\xb8\xa3\xfb\xf70\x12\x12\n\tfeature_6\x12\x0...,b'WHY1\x00\xb1\x02\n\x0e \xb2\xb8\xa3\xfb\xf70(\xb2\xb8\xa3\xfb\xf70\x12\x12\n\x0bpredictions\x1...,3784
85,2023-03-27,0,demo_model,1.0.1,29.717026,1.0,2.500000e+00,-0.000000,-213.566814,110.0,-5.962720,b'WHY1\x00\x93\x03\n\x0e \x9b\xba\xa3\xfb\xf70(\x9b\xba\xa3\xfb\xf70\x12\x12\n\tfeature_4\x12\x0...,b'WHY1\x00\xb1\x02\n\x0e \xa2\xba\xa3\xfb\xf70(\xa2\xba\xa3\xfb\xf70\x12\x12\n\x0bpredictions\x1...,3686
86,2023-03-28,0,demo_model,1.0.1,11.663711,1.0,5.520833e+00,13.291135,-263.896567,44.0,-6.435433,b'WHY1\x00\x92\x03\n\x0e \x82\xbc\xa3\xfb\xf70(\x82\xbc\xa3\xfb\xf70\x12\x12\n\tfeature_6\x12\x0...,b'WHY1\x00\xb1\x02\n\x0e \x88\xbc\xa3\xfb\xf70(\x88\xbc\xa3\xfb\xf70\x12\x12\n\x0bpredictions\x1...,5290


### Scaling up with Fugue & Spark [For Reference only. This docker does not have Spark installed]