## Usage

This is a helper notebook. Run it from another notebook as:

```
%run ../common/benchmark_analysis.ipynb 
```

## Input

Before running this notebook, set `data_absolute_path` to the absolute path of the JSON file to be analyzed.
The data should be generated with `notsofine::benchmark_run` from `harness`.

In [None]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats

# The seaborn-derived styles were renamed with a 'seaborn-v0_8-' prefix in
# matplotlib 3.6 and the old names were removed in a later release. Apply
# whichever spelling this installation provides (newest first); if neither is
# available the list is empty and the current style is left untouched.
plt.style.use([
    name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
    if name in plt.style.available
][:1])

In [None]:
# Uncomment to run this notebook standalone for debugging; normally the calling notebook sets data_absolute_path.

# import os
# data_absolute_path = os.path.join(os.getcwd(), '..', 'benchmark_trivial', 'data.json')

In [None]:
# `data_absolute_path` is not defined in this notebook: it must be set by the
# calling notebook (or by the debugging cell above) before this cell runs.
# An f-string is used so that both str and pathlib.Path values are accepted.
print(f"### Loading data from {data_absolute_path}")

# Decode explicitly as UTF-8 (the standard JSON interchange encoding) instead of
# relying on the platform's locale-dependent default.
with open(data_absolute_path, encoding="utf-8") as f:
    data = json.load(f)

# Flatten to one row per entry of each iteration's 'runs' list, carrying the
# enclosing iteration's 'i' value along as a column.
df = pd.json_normalize(data['iterations'], 'runs', ['i'])

In [None]:
# Combine the split duration columns (whole seconds + nanoseconds) into a single
# fractional-millisecond column.
df['duration.total_ms'] = (
    df['duration.secs'].mul(1e3) + df['duration.nanos'].div(1e6)
)

# NOTE(review): 1.0e6 / ms is the same as 1000 / seconds, so this presumably
# assumes 1000 queries per run -- confirm against the benchmark harness.
df['qps'] = df['duration.total_ms'].rdiv(1.0e6)

# One row per iteration 'i', one column per program. Passing `values` as a list
# yields two-level column labels of the form ('qps', <program>).
series = df.pivot(index='i', columns='program', values=['qps'])

print('### Statistics: Raw data')
print(series.describe())
series.plot(
    y='qps',
    kind='line',
    title='Measured queries per second in all benchmark runs',
)


# 

In [None]:
# Remove outliers: keep only the rows in which _every_ column value is less than
# 2 SDs away from its column mean.
#
# nan_policy='omit' makes the per-column mean/SD ignore missing values. With the
# default ('propagate') a single NaN turns the whole column's z-scores into NaN,
# the mask becomes all-False and every row is silently dropped. A row that itself
# contains a NaN still fails the `< 2` comparison and is therefore excluded.
mask = (
    np.abs(stats.zscore(series.to_numpy(), nan_policy='omit')) < 2
).all(axis=1)
series_sans_outliers = series[mask]

print('### Statistics: After outlier removal')
print(series_sans_outliers.describe())
series_sans_outliers.plot(y='qps', kind='line',
                          title='Queries per second in all benchmark runs after outlier removal')


In [None]:
# The first column is used as the baseline; its label is a ('qps', <program>)
# pair, so element [1] is the program label.
baseline = series_sans_outliers.columns[0][1]

# Divide every column row-wise by the baseline column, then drop the baseline
# column itself (it is 1.0 wherever it is defined).
ratio = series_sans_outliers.divide(series_sans_outliers.iloc[:, 0], axis=0)
ratio = ratio.iloc[:, 1:]

# f-strings keep this working when the program label is not a str.
print(f'## Ratio of qps compared to {baseline}')
if ratio.shape[1] == 0:
    # Only one program was benchmarked: describe()/plot() would raise on a
    # frame without columns, so report the situation instead.
    print('No other programs to compare against the baseline.')
else:
    print(ratio.describe())
    ratio.plot(y='qps', kind='line',
               title=f'Ratio of queries per second w.r.t. {baseline}')
