### Cerberus Example

In [1]:
from cerberus import Validator
from datetime import datetime
import pandas as pd

In [2]:
# Smallest possible demo: a two-field schema, one matching document.
example_scheme = dict(
    letter=dict(type='string'),
    number=dict(type='integer'),
)
example_event = dict(letter='A', number=1)

# require_all=True: every field named in the schema must be present.
v = Validator(example_scheme, require_all=True)

# Print the boolean verdict, then let the cell display the error mapping.
is_valid = v.validate(example_event)
print(is_valid)
v.errors

True


{}

### Single Scheme

In [3]:
# Full schema of a single event type. `name` is pinned to exactly one
# allowed value, so an event carrying any other name fails validation.
payment_btn_click_scheme = dict(
    dt=dict(type='datetime'),
    device_id=dict(type='string'),
    user_id=dict(type='integer'),
    name=dict(type='string', allowed=['payment_btn_click']),
    order_id=dict(type='integer'),
)

# require_all=True turns every schema field into a required one.
v = Validator(payment_btn_click_scheme, require_all=True)

In [4]:
# One valid event followed by one event per failure mode.
payment_btn_click_events = [
    # ok
    dict(
        dt=datetime(2023, 1, 10, 12, 23, 1),
        device_id='abc123-efg456',
        user_id=1231231,
        name='payment_btn_click',
        order_id=101010,
    ),
    # missing order_id
    dict(
        dt=datetime(2023, 1, 10, 12, 23, 1),
        device_id='abc123-efg456',
        user_id=1231231,
        name='payment_btn_click',
    ),
    # wrong event name
    dict(
        dt=datetime(2023, 1, 10, 12, 23, 1),
        device_id='abc123-efg456',
        user_id=1231231,
        name='payment_btn',
        order_id=101010,
    ),
    # wrong user_id type
    dict(
        dt=datetime(2023, 1, 10, 12, 23, 1),
        device_id='abc123-efg456',
        user_id='1231231',
        name='payment_btn_click',
        order_id=101010,
    ),
]

# Report one line per event: its position plus either 'ok' or the error mapping.
for idx, event in enumerate(payment_btn_click_events):
    outcome = 'ok' if v.validate(event) else v.errors
    print(f'{idx}: {outcome}')

0: ok
1: {'order_id': ['required field']}
2: {'name': ['unallowed value payment_btn']}
3: {'user_id': ['must be of integer type']}


### Multiple Schemes

In [5]:
# Event-specific schemas: each one pins `name` to its own event and adds
# the field that only this event carries.
payment_btn_click_scheme = {
    'name': {'type': 'string', 'allowed': ['payment_btn_click']},
    'order_id': {'type': 'integer'}
}

main_banner_click_scheme = {
    'name': {'type': 'string', 'allowed': ['main_banner_click']},
    'banner_id': {'type': 'integer'}
}

# Registry: event name -> schema with that event's specific fields.
events_specs = {
    'payment_btn_click': payment_btn_click_scheme,
    'main_banner_click': main_banner_click_scheme
}

# Derived from the registry so the allowed names cannot drift out of sync
# with the schemas that actually exist.
known_events = list(events_specs)

# Fields shared by every event, whatever its name.
common_fields_scheme = {
    'dt': {
        'type': 'datetime',
        # Parse 'YYYY-MM-DD HH:MM:SS' text. A value that is already a
        # datetime is passed through unchanged, because strptime raises
        # TypeError for non-str input.
        'coerce': lambda x: (
            x if isinstance(x, datetime)
            else datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
        )
    },
    'device_id': {'type': 'string'},
    'user_id': {'type': 'integer', 'nullable': True},
    'name': {'type': 'string', 'allowed': known_events}
}

In [6]:
# Raw events for the multi-schema check; `dt` is plain text here.
events = [
    # payment_btn_click without an order_id
    dict(
        dt='2023-01-10 12:23:01',
        device_id='abc123-efg456',
        user_id=1231231,
        name='payment_btn_click',
    ),
    # name that is not in the known-events list
    dict(
        dt='2023-01-10 12:23:01',
        device_id='abc123-efg456',
        user_id=1231231,
        name='payment_btn',
        order_id=101010,
    ),
    # user_id and order_id given as strings
    dict(
        dt='2023-01-10 12:23:01',
        device_id='abc123-efg456',
        user_id='1231231',
        name='payment_btn_click',
        order_id='101010',
    ),
    # main_banner_click without a banner_id, string user_id
    dict(
        dt='2023-01-10 12:23:01',
        device_id='abc123-efg456',
        user_id='1231231',
        name='main_banner_click',
    ),
    # same shape as the previous event
    dict(
        dt='2023-01-10 12:23:01',
        device_id='abc123-efg456',
        user_id='1231231',
        name='main_banner_click',
    ),
    # name is an integer rather than a string
    dict(
        dt='2023-01-10 12:23:01',
        device_id='abc123-efg456',
        user_id='1231231',
        name=890,
    ),
]

# Schema-less validator: the schema is supplied per validate() call.
# allow_unknown lets each pass ignore fields owned by the other schema.
v = Validator(allow_unknown=True, require_all=True)

# Pass 1: check the fields every event shares.
for event in events:
    if v.validate(event, common_fields_scheme):
        continue
    print(f'{event.get("name")}: {v.errors}')

# Pass 2: check event-specific fields, grouped by event name. The printed
# index is the event's position within its own group, not within `events`.
for e_name, spec in events_specs.items():
    same_name_events = [ev for ev in events if ev.get('name') == e_name]
    for pos, ev in enumerate(same_name_events):
        if not v.validate(ev, spec):
            print(f'{e_name}, {pos}: {v.errors}')

payment_btn: {'name': ['unallowed value payment_btn']}
payment_btn_click: {'user_id': ['must be of integer type']}
main_banner_click: {'user_id': ['must be of integer type']}
main_banner_click: {'user_id': ['must be of integer type']}
890: {'name': ['must be of string type'], 'user_id': ['must be of integer type']}
payment_btn_click, 0: {'order_id': ['required field']}
payment_btn_click, 1: {'order_id': ['must be of integer type']}
main_banner_click, 0: {'banner_id': ['required field']}
main_banner_click, 1: {'banner_id': ['required field']}


### Errors Summary

In [7]:
# Raw events for the error summary; `dt` is plain text here.
events = [
    # payment_btn_click without an order_id
    dict(
        dt='2023-01-10 12:23:01',
        device_id='abc123-efg456',
        user_id=1231231,
        name='payment_btn_click',
    ),
    # name that is not in the known-events list
    dict(
        dt='2023-01-10 12:23:01',
        device_id='abc123-efg456',
        user_id=1231231,
        name='payment_btn',
        order_id=101010,
    ),
    # user_id and order_id given as strings
    dict(
        dt='2023-01-10 12:23:01',
        device_id='abc123-efg456',
        user_id='1231231',
        name='payment_btn_click',
        order_id='101010',
    ),
    # main_banner_click without a banner_id, string user_id
    dict(
        dt='2023-01-10 12:23:01',
        device_id='abc123-efg456',
        user_id='1231231',
        name='main_banner_click',
    ),
    # same shape as the previous event
    dict(
        dt='2023-01-10 12:23:01',
        device_id='abc123-efg456',
        user_id='1231231',
        name='main_banner_click',
    ),
    # string name that is not a known event
    dict(
        dt='2023-01-10 12:23:01',
        device_id='abc123-efg456',
        user_id='1231231',
        name='890',
    ),
    # no name at all
    dict(
        dt='2023-01-10 12:23:01',
        device_id='abc123-efg456',
        user_id='1231231',
    ),
]

# Schema-less validator; the schema is passed to each validate() call.
v = Validator(allow_unknown=True, require_all=True)

def gather_errors(errs, event, validate_errors):
    """Flatten an error mapping into `errs`, one row per message.

    Args:
        errs: list extended in place with `[event, field, message]` rows.
        event: label stored in the first column of every appended row.
        validate_errors: mapping of field name -> iterable of messages.

    Returns:
        None; the result is the mutation of `errs`.
    """
    #todo: use custom error handler?
    errs.extend(
        [event, field, msg]
        for field, field_msgs in validate_errors.items()
        for msg in field_msgs
    )

# ev_cnt: event name -> number of events seen under that name.
# errs:   flat rows of [event, field, message] from both validation passes.
ev_cnt = {}
errs = []
for event in events:
    # Events without a name are grouped under a placeholder label.
    e_name = event.get("name", "missing_name")

    v.validate(event, common_fields_scheme)
    gather_errors(errs, e_name, v.errors)

    # The event-specific pass only runs for names that have a schema.
    spec = events_specs.get(e_name)
    if spec:
        v.validate(event, spec)
        gather_errors(errs, e_name, v.errors)

    ev_cnt.setdefault(e_name, 0)
    ev_cnt[e_name] += 1

# How many events were processed per name.
df_evcnt = pd.DataFrame(list(ev_cnt.items()), columns=['event', 'processed_events'])
display(df_evcnt)

# Count identical (event, field, message) rows.
df_errs = pd.DataFrame(errs, columns=['event', 'field', 'msg']).assign(err_cnt=1)
df_errs_cnt = df_errs.groupby(['event', 'field', 'msg'], as_index=False)['err_cnt'].sum()
display(df_errs_cnt)

# Put each error count next to the number of events it is out of.
df_errs_cnt.merge(df_evcnt, on='event', how='left')

Unnamed: 0,event,processed_events
0,payment_btn_click,2
1,payment_btn,1
2,main_banner_click,2
3,890,1
4,missing_name,1


Unnamed: 0,event,field,msg,err_cnt
0,890,name,unallowed value 890,1
1,890,user_id,must be of integer type,1
2,main_banner_click,banner_id,required field,2
3,main_banner_click,user_id,must be of integer type,2
4,missing_name,name,required field,1
5,missing_name,user_id,must be of integer type,1
6,payment_btn,name,unallowed value payment_btn,1
7,payment_btn_click,order_id,must be of integer type,1
8,payment_btn_click,order_id,required field,1
9,payment_btn_click,user_id,must be of integer type,1


Unnamed: 0,event,field,msg,err_cnt,processed_events
0,890,name,unallowed value 890,1,1
1,890,user_id,must be of integer type,1,1
2,main_banner_click,banner_id,required field,2,2
3,main_banner_click,user_id,must be of integer type,2,2
4,missing_name,name,required field,1,1
5,missing_name,user_id,must be of integer type,1,1
6,payment_btn,name,unallowed value payment_btn,1,1
7,payment_btn_click,order_id,must be of integer type,1,2
8,payment_btn_click,order_id,required field,1,2
9,payment_btn_click,user_id,must be of integer type,1,2


### Nested Fields

In [8]:
# Schemas for the nested `params` dict, one per event name.
payment_btn_click_params_scheme = {
    'order_id': {'type': 'integer'},
    'exp_id': {'type': 'integer'}
}

main_banner_click_params_scheme = {
    'banner_id': {'type': 'integer'},
    'exp_id': {'type': 'integer'}
}

# Registry: event name -> schema of that event's `params`.
event_params_specs = {
    'payment_btn_click': payment_btn_click_params_scheme,
    'main_banner_click': main_banner_click_params_scheme
}

# Derived from the registry so the allowed names cannot drift out of sync
# with the params schemas that actually exist.
known_events = list(event_params_specs)

# Top-level fields shared by every event; `params` is only checked for
# being a dict here, its content is described by the schemas above.
common_fields_scheme = {
    'dt': {
        'type': 'datetime',
        # Parse 'YYYY-MM-DD HH:MM:SS' text. A value that is already a
        # datetime is passed through unchanged, because strptime raises
        # TypeError for non-str input.
        'coerce': lambda x: (
            x if isinstance(x, datetime)
            else datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
        )
    },
    'device_id': {'type': 'string'},
    'user_id': {'type': 'integer', 'nullable': True},
    'name': {'type': 'string', 'allowed': known_events},
    'params': {'type': 'dict'}
}

#todo: combine common_fields_scheme and the parameter schemes into single schemes?

In [9]:
# Raw events with event-specific data nested under `params`.
events = [
    # no params at all
    dict(
        dt='2023-01-10 12:23:01',
        device_id='abc123-efg456',
        user_id=1231231,
        name='payment_btn_click',
    ),
    # params present, but order_id is a string and exp_id is absent
    dict(
        dt='2023-01-10 12:23:01',
        device_id='abc123-efg456',
        user_id='1231231',
        name='payment_btn_click',
        params=dict(order_id='101010'),
    ),
    # complete params, string user_id
    dict(
        dt='2023-01-10 12:23:01',
        device_id='abc123-efg456',
        user_id='1231231',
        name='payment_btn_click',
        params=dict(order_id=101010, exp_id=9990),
    ),
    # no params at all, string user_id
    dict(
        dt='2023-01-10 12:23:01',
        device_id='abc123-efg456',
        user_id='1231231',
        name='main_banner_click',
    ),
    # complete params, string user_id
    dict(
        dt='2023-01-10 12:23:01',
        device_id='abc123-efg456',
        user_id='1231231',
        name='main_banner_click',
        params=dict(banner_id=1789789, exp_id=9990),
    ),
]

# Schema-less validator; the schema is passed to each validate() call.
v = Validator(allow_unknown=True, require_all=True)

def gather_errors(errs, event, validate_errors, position_prefix=''):
    """Flatten an error mapping into `errs`, one row per message.

    A message that is itself a dict is treated as the error mapping of a
    nested document and flattened recursively, so it shows up as rows with
    a dotted path ('params.order_id') instead of as a dict in the message
    column.

    Args:
        errs: list extended in place with `[event, location, message]` rows.
        event: label stored in the first column of every appended row.
        validate_errors: mapping of field name -> list of messages; a list
            item may be a nested mapping of the same shape.
        position_prefix: text prepended to every field name, e.g. 'params.'
            when the validated document was a sub-dict of the event.

    Returns:
        None; the result is the mutation of `errs`.
    """
    #todo: use custom error handler?
    #use document_error_tree for nodes location?
    for field, msgs in validate_errors.items():
        # f-string rather than `+` so non-string keys can be joined too.
        location = f'{position_prefix}{field}'
        for m in msgs:
            if isinstance(m, dict):
                gather_errors(errs, event, m, position_prefix=f'{location}.')
            else:
                errs.append([event, location, m])

# ev_cnt: event name -> number of events seen under that name.
# errs:   flat rows of [event, field, message]; rows from the params pass
#         carry a 'params.' prefix on the field.
ev_cnt = {}
errs = []
for e in events:
    # Events without a name are grouped under a placeholder label.
    e_name = e.get("name", "missing_name")

    # Pass 1: top-level fields, including the presence and type of `params`.
    v.validate(e, common_fields_scheme)
    gather_errors(errs, e_name, v.errors)

    # Pass 2: content of `params` against the schema for this event name.
    s = event_params_specs.get(e_name)
    pars = e.get('params')
    # Gate on the type rather than on truthiness: an empty `params` dict is
    # still validated, so its missing required fields are reported. A
    # missing or non-dict `params` is skipped here because validate() needs
    # a mapping; pass 1 covers that case through 'params': {'type': 'dict'}.
    if s and isinstance(pars, dict):
        v.validate(pars, s)
        gather_errors(errs, e_name, v.errors, position_prefix='params.')
    ev_cnt[e_name] = ev_cnt.get(e_name, 0) + 1


# How many events were processed per name.
df_evcnt = pd.DataFrame({'event': list(ev_cnt.keys()), 'processed_events': list(ev_cnt.values())})
#display(df_evcnt)

# Count identical (event, field, message) rows.
df_errs = pd.DataFrame(errs, columns=['event', 'field', 'msg'])
df_errs['err_cnt'] = 1
df_errs_cnt = df_errs.groupby(['event', 'field', 'msg'])['err_cnt'].sum().reset_index()
#display(df_errs_cnt)

# Put each error count next to the number of events it is out of.
pd.merge(df_errs_cnt, df_evcnt, on='event', how='left')

Unnamed: 0,event,field,msg,err_cnt,processed_events
0,main_banner_click,params,required field,1,2
1,main_banner_click,user_id,must be of integer type,2,2
2,payment_btn_click,params,required field,1,3
3,payment_btn_click,params.exp_id,required field,1,3
4,payment_btn_click,params.order_id,must be of integer type,1,3
5,payment_btn_click,user_id,must be of integer type,2,3
