In [1]:
%matplotlib inline

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Combining data from multiple runs

Often it's useful to combine data from different experimental runs. 

This notebook provides example workflows for dealing with multiple hdf5 data files using `pandas.concat()`.

In [3]:
from e11 import H5Data
from e11.stats import statistics
from e11.process import vrange

In [4]:
# Open every run and keep the H5Data handles in a dict keyed by run ID.
run_ids = ['20180720_000', '20180720_001']
# Normally each run's file would be located with run_file(), e.g.
# >>> fil = run_file(base=r"Q:\E11_atmos\data", rid=rid)
# but for this example every run ID points at the same bundled example file,
# so the "two runs" below hold identical data.
fil = os.path.join(os.getcwd(), 'example_data', 'array_data.h5')
all_data = {}
for rid in run_ids:
    # update_log=True presumably refreshes the file's log on open -- see the e11 docs
    data = H5Data(fil, update_log=True)
    all_data[rid] = data

100%|██████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 1998.08it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 1203.24it/s]


### workflow #1

Separate results for each run -- e.g., for comparing runs.

In [5]:
# Per-run statistics: one table per run, stacked under an outer 'rid' index level.
st_by_run = {}
for rid in run_ids:
    data = all_data[rid]
    # vertical range of the 'OSC_0' data for each squid, joined onto the run's VAR values
    df = data.var.join(data.apply(vrange, data.squids, 'OSC_0', tqdm_disable=True))
    # group by the VAR columns so each row summarises one VAR setting
    st_by_run[rid] = statistics(df, groupby=data.var.columns, mode='full', tqdm_disable=True)
# the dict keys (run IDs) become the outermost index level, named 'rid'
all_st = pd.concat(st_by_run, names=['rid'])
all_st

Unnamed: 0_level_0,Unnamed: 1_level_0,vrange,vrange,vrange,vrange,vrange,vrange,vrange,vrange
Unnamed: 0_level_1,Unnamed: 1_level_1,count,err,max,mean,median,min,range,std
rid,PAUSE,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
20180720_000,300,50,0.00226,1.083854,1.041386,1.040635,1.000539,0.083316,0.015981
20180720_000,600,50,0.002367,1.06408,1.037024,1.042134,0.997321,0.066759,0.016739
20180720_000,900,51,0.001702,1.069203,1.040724,1.041619,1.012878,0.056325,0.012155
20180720_001,300,50,0.00226,1.083854,1.041386,1.040635,1.000539,0.083316,0.015981
20180720_001,600,50,0.002367,1.06408,1.037024,1.042134,0.997321,0.066759,0.016739
20180720_001,900,51,0.001702,1.069203,1.040724,1.041619,1.012878,0.056325,0.012155


### workflow #2

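Combine results -- e.g., to improve statistics (don't actually do this using identical data!). In this example the same file is loaded twice, so `count` doubles and `err` shrinks artificially compared with workflow #1.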

In [6]:
# Gather the vertical-range measurements of every run, next to that run's VAR values.
df_by_run = {}
for rid in run_ids:
    data = all_data[rid]
    rng = data.apply(vrange, data.squids, 'OSC_0', tqdm_disable=True)
    df_by_run[rid] = data.var.join(rng)
# Stack the runs into a single frame; the dict keys (run IDs) become the
# outermost index level, named 'rid'.
all_df = pd.concat(df_by_run, names=['rid'])
all_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,PAUSE,vrange
rid,squid,measurement,Unnamed: 3_level_1,Unnamed: 4_level_1
20180720_000,1,0,300,1.033028
20180720_000,1,1,300,1.012191
20180720_000,1,2,300,1.032403
20180720_000,1,3,300,1.070328
20180720_000,1,4,300,1.062143


In [7]:
# Calculate statistics over the combined measurements of all runs.
# The grouping columns are the VAR columns, read explicitly from the last run
# instead of relying on a `data` loop variable left over from an earlier cell
# (all runs being combined are assumed to share the same VAR columns).
var_columns = all_data[run_ids[-1]].var.columns
st = statistics(all_df, groupby=var_columns, mode='full')
st

Unnamed: 0_level_0,vrange,vrange,vrange,vrange,vrange,vrange,vrange,vrange
Unnamed: 0_level_1,count,err,max,mean,median,min,range,std
PAUSE,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
300,100,0.00159,1.083854,1.041386,1.040635,1.000539,0.083316,0.015901
600,100,0.001665,1.06408,1.037024,1.042134,0.997321,0.066759,0.016654
900,102,0.001198,1.069203,1.040724,1.041619,1.012878,0.056325,0.012095
