In [None]:
from io import StringIO
from os import listdir
from os.path import isfile, join

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Directory that holds the production files read by this notebook.
MYPATH = '../PROD/'
# Preview the regular files that will be loaded (the entry named 'tmp' is excluded).
[entry for entry in listdir(MYPATH) if entry != 'tmp' and isfile(join(MYPATH, entry))]

In [None]:
# Concatenate the production files into one flat Python list of raw text lines.
# `listdir` returns entries in arbitrary order, so the names are sorted to make
# the order of the concatenated lines reproducible from run to run.
onlyfiles = sorted(
    name for name in listdir(MYPATH)
    if name != 'tmp' and isfile(join(MYPATH, name))
)
data = []
for file in onlyfiles:
    # The files hold JSON text, so read them as UTF-8 rather than relying on
    # the platform's locale encoding.
    with open(join(MYPATH, file), 'r', encoding='utf-8') as handle:
        data.extend(handle.readlines())

In [None]:
# Remove the trailing newline/whitespace from each raw line.
# A new list is built instead of rebinding `data` to a one-shot `map`
# iterator, so this cell can be re-run without reloading the files first.
json_lines = [line.rstrip() for line in data]

# Each element of `data` is an individual JSON object. Wrap all of them in
# square brackets so that the whole thing is one JSON array.
# NOTE(review): this relies on every record line already ending with its own
# separator comma (the lines are joined with '') - confirm against the files.
data_json_body = ''.join(json_lines)
# Only the dangling comma after the last record must go; if the final record
# has no trailing comma, leave its closing brace alone.
if data_json_body.endswith(','):
    data_json_body = data_json_body[:-1]
data_json_str = "[" + data_json_body + "]"

# Load it into pandas. The text is wrapped in StringIO because passing a
# literal JSON string to `read_json` is deprecated in recent pandas.
data_df = pd.read_json(StringIO(data_json_str))

# Promote selected keys of the nested `properties` object to real columns;
# a key missing from a record becomes an empty string.
new_col_names = ['agent_id', 'what', 'class_name', 'feat_name', 'stat']
for col in new_col_names:
    data_df[col] = data_df['properties'].apply(lambda props, key=col: props.get(key, ''))

# Identifier of a time series: <what>_<class_name><feat_name>_<stat>.
data_df['id'] = (
    data_df['what'] + '_' + data_df['class_name'] + data_df['feat_name'] + '_' + data_df['stat']
)

# Drop the columns that are no longer needed once `properties` is unpacked.
df = data_df.drop(['properties', 'tags'], axis=1)

In [None]:
# GENERAL INFO

In [None]:
def display_info(df):
    """Print a summary of the metrics frame, overall and per agent.

    Reads the columns ``agent_id``, ``id``, ``feat_name``, ``what`` and
    ``timestamp`` of ``df``. For every distinct agent it prints the number
    of distinct series ids (total, with a non-empty ``feat_name``, whose
    ``what`` contains 'prediction', and whose ``what`` contains neither
    'prediction' nor 'feature') followed by the min and max ``timestamp``.
    Returns None.
    """
    def count_series(frame):
        # Number of distinct time-series ids present in `frame`.
        return len(frame['id'].unique())

    agents = df['agent_id'].unique()
    print('Number of data sources (agents) : {}'.format(len(agents)))
    print('Name of agents : {} \n'.format(list(agents)))

    for agent_name in agents:
        df_agent = df[df['agent_id'] == agent_name]
        kinds = df_agent['what'].values
        is_prediction = ['prediction' in kind for kind in kinds]
        # A "label" row is anything that is neither a prediction nor a feature.
        is_label = [
            not (predicted or 'feature' in kind)
            for kind, predicted in zip(kinds, is_prediction)
        ]
        has_feature = df_agent['feat_name'] != ''
        timestamps = df_agent['timestamp']

        print('-- {} --'.format(agent_name))
        print('Number of time series : {}'.format(count_series(df_agent)))
        print('Number of feature time series analytics = {}'.format(count_series(df_agent[has_feature])))
        print('Number of prediction time series analytics = {}'.format(count_series(df_agent[is_prediction])))
        print('Number of label time series analytics = {}'.format(count_series(df_agent[is_label])))
        print('Start date : {}'.format(timestamps.min()))
        print('End date : {}\n'.format(timestamps.max()))

In [None]:
# Print the overall and per-agent summary (series counts and time range) for the loaded frame.
display_info(df)

# SELECT A DATA SOURCE TO ANALYZE

In [None]:
# Narrow the analysis to a single agent. `df` is rebound here, so every cell
# below works on this agent's rows only.
df = df.loc[df['agent_id'].eq('MNIST_Multiclass_wrong_inputs')]

# CLASS FREQUENCY DRIFT GRAPH

In [None]:
# Rows whose `what` is exactly 'prediction_freq'; these feed the stacked class-frequency plot.
df_prediction = df.loc[df['what'].eq('prediction_freq')]

In [None]:
# Classes in order of first appearance; this fixes the stacking and legend order.
classes_names = df_prediction['class_name'].unique()

# Build a (timestamp x class) table instead of stacking the raw per-class value
# arrays: the table stays rectangular and aligned on timestamp even when a
# class has no row for some timestamp (filled with 0) or the rows are not in
# chronological order (the pivot sorts the timestamp index).
freq_by_class = (
    df_prediction
    .pivot_table(index='timestamp', columns='class_name', values='value', aggfunc='sum')
    .reindex(columns=classes_names)
    .fillna(0)
)

x = freq_by_class.index.values
# stackplot expects one row per stacked series, hence the transpose.
y = freq_by_class.values.T

fig, ax = plt.subplots(figsize=(20, 10))
ax.stackplot(x, y, labels=classes_names)
ax.set_xlabel('timestamp')
ax.set_ylabel('value')
ax.legend(loc='upper left')
plt.show()

# BATCH SIZE GRAPH

In [None]:
# Sum of `value` over all 'prediction_count' rows, per timestamp.
df_batchsize = df[df['what'] == 'prediction_count'].groupby('timestamp')['value'].sum()
# The groupby result is a Series indexed by timestamp, so the index already is
# the x axis; the former `x='timestamp'` keyword had no meaning for a Series.
df_batchsize.plot(figsize=(15, 2), title='batch_size')
print('BATCH SIZE GRAPH')

# FEATURE METRICS DISTRIBUTION GRAPH

In [None]:
# One small line chart per feature metric found among ids 1000..1049.
# NOTE(review): the slice starts at 1000 while the printed text says "first 50",
# and the printed count is the total number of feature ids, not the number of
# graphs drawn - confirm which one is intended.
all_metric_ids = df['id'].unique()
for metric in all_metric_ids[1000:1050]:
    if 'feature' not in metric:
        continue
    df_ = df.loc[df['id'] == metric]
    df_.plot(x='timestamp', y='value', figsize=(15, 2), title=str(metric))
print('INPUT METRIX GRAPHS(first 50 metrix) : {} graphs'.format(sum('feature' in x for x in all_metric_ids)))

# PREDICTION METRICS DISTRIBUTION GRAPH

In [None]:
# One small line chart per metric whose id contains 'prediction'.
prediction_metric_ids = [m for m in df['id'].unique() if 'prediction' in m]
for metric in prediction_metric_ids:
    df_ = df.loc[df['id'] == metric]
    df_.plot(x='timestamp', y='value', figsize=(15, 2), title=str(metric))
print('OUTPUT METRIX GRAPHS (all metrix) : {} graphs'.format(len(prediction_metric_ids)))

# LABEL METRICS DISTRIBUTION GRAPH

In [None]:
# One small line chart per metric whose id contains neither 'prediction' nor 'feature'.
label_metric_ids = [
    m for m in df['id'].unique()
    if not ('prediction' in m or 'feature' in m)
]
for metric in label_metric_ids:
    df_ = df.loc[df['id'] == metric]
    df_.plot(x='timestamp', y='value', figsize=(15, 2), title=str(metric))
print('LABELS METRIX GRAPHS (all metrix) : {} graphs'.format(len(label_metric_ids)))