In [None]:
%matplotlib inline

In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Statistics

This notebook provides examples for extracting useful statistics from processed hdf5 data.

In [None]:
from e11 import H5Data
from e11.stats import statistics
from e11.tools import add_level
from e11.process import vrange

In [None]:
# read file: the example data lives in ./example_data, relative to the
# current working directory (os is imported in the top import cell)
fil = os.path.join(os.getcwd(), 'example_data', 'array_data.h5')
data = H5Data(fil)

In [None]:
# display the log attribute of the H5Data object opened above
data.log

## Vrange

Here, we are applying the vrange function to measure the vertical range of array data.

In [None]:
# apply vrange to the 'OSC_0' array data for the squids listed in data.squids
rng = data.apply(vrange, data.squids, 'OSC_0')
# preview the first few rows of the result
rng.head()

In [None]:
# summarise the vrange measurements
rng.describe()

##  Statistics

The `statistics()` function can be used to find the average vrange value per squid.

In [None]:
# statistics of the vrange measurements; the result is read further down
# through its ('vrange', 'mean') and ('vrange', 'err') columns
av = statistics(rng, mode='full')
av.head()

In [None]:
# map measurements to var values
# add_level places the var columns under a top-level 'VAR' label (they are
# addressed as ('VAR', 'PAUSE') when plotting); the statistics are then joined
# on -- presumably matched on the shared squid index, TODO confirm
df = add_level(data.var, 'VAR').join(av)
df.head()

In [None]:
# plot the mean vrange against VAR:PAUSE, with its error as the error bar
fig, ax = plt.subplots()
ax.errorbar(
    df[('VAR', 'PAUSE')],
    df[('vrange', 'mean')],
    yerr=df[('vrange', 'err')],
    marker='o',
)

# format
ax.set_xlabel('Pause (ms)')
ax.set_ylabel('signal (arb. units)')

# output
plt.show()

## Loops

It's not unusual for different squids to be run using the same experimental conditions (VARS), e.g., if looping an experiment script.

The plot above indicates that there were 2 loops recorded using the same values for VAR:PAUSE.  These can simply be grouped together to find the overall statistics for unique VAR values.

In [None]:
# map measurements to var values
# here the vrange measurements themselves (not their per-squid statistics)
# are joined onto the var values, so they can be regrouped afterwards
df2 = data.var.join(rng)
df2.head()

In [None]:
# then evaluate the statistics when grouped by var values
# grouping by every var column pools the measurements that share the same
# settings; the result is indexed by the var values (e.g. a 'PAUSE' level)
st = statistics(df2, groupby=data.var.columns, mode='full')
st.head()

In [None]:
# plot the grouped mean vrange against the 'PAUSE' level of the index
fig, ax = plt.subplots()
ax.errorbar(
    st.index.get_level_values('PAUSE'),
    st[('vrange', 'mean')],
    yerr=st[('vrange', 'err')],
    marker='o',
)

# format
ax.set(xlabel='pause (ms)', ylabel='signal (arb. units)')

# output
plt.show()

## Combining data from multiple runs

Sometimes it's useful to combine data from different runs using `pandas.concat`.

In [None]:
# import data: one H5Data object per run ID
run_ids = ['20180720_000', '20180720_001']
all_data = {}
for rid in run_ids:
    # A real analysis would locate each run's file with run_file(), e.g.
    # >>> fil = run_file(base=r"Q:\E11_atmos\data", rid=rid)
    # Here the bundled example file stands in for every run, so both
    # entries end up holding the same data.
    fil = os.path.join(os.getcwd(), 'example_data', 'array_data.h5')
    all_data[rid] = H5Data(fil)

### workflow #1

Separate results for each run -- e.g., for comparing runs.

In [None]:
# per-run statistics, keyed by run ID
st_by_run = dict()
for rid in run_ids:
    # calculate vertical range
    data = all_data[rid]
    rng = data.apply(vrange, data.squids, 'OSC_0', tqdm_disable=True)
    # map this run's measurements to its var values
    df = data.var.join(rng)
    # calculate statistics from this run's own frame (df), not from the df2
    # left over from the Loops section
    st = statistics(df, groupby=data.var.columns, mode='full', tqdm_disable=True)
    st_by_run[rid] = st
# result: the dict keys (run IDs) become an outer index level named 'rid'
all_st = pd.concat(st_by_run, names=['rid'])
all_st

### workflow #2

Combine results -- e.g., to improve statistics (don't actually do this using identical data!).

In [None]:
# gather each run's vrange measurements (joined to its var values) by run ID
df_by_run = {}
for rid in run_ids:
    data = all_data[rid]
    rng = data.apply(vrange, data.squids, 'OSC_0', tqdm_disable=True)
    df_by_run[rid] = data.var.join(rng)
# combine vertical range measurements; the run IDs form an outer 'rid' index level
all_df = pd.concat(df_by_run, names=['rid'])
all_df.head()

In [None]:
# calculate statistics
# NOTE(review): `data` is whichever H5Data object the preceding loop left
# bound (the last run in run_ids); this assumes every run has the same var
# columns -- confirm before using with real, differing runs
st = statistics(all_df, groupby=data.var.columns, mode='full')
st