# Table of Contents
 <p><div class="lev2 toc-item"><a href="#Solution-to-problem-2" data-toc-modified-id="Solution-to-problem-2-01"><span class="toc-item-num">0.1&nbsp;&nbsp;</span>Solution to problem 2</a></div><div class="lev2 toc-item"><a href="#Sanity-Check" data-toc-modified-id="Sanity-Check-02"><span class="toc-item-num">0.2&nbsp;&nbsp;</span>Sanity Check</a></div>

## Solution to problem 2

In [1]:
import pandas as pd
import numpy as np
import h5py


In [12]:
class Converter():
    """Visitor for ``h5py`` ``visititems`` that flattens datasets into one DataFrame.

    An instance is passed as the callback to ``visititems``. Each visited
    item is recorded for a directory-style tree printout (``str(instance)``),
    and every ``h5py.Dataset`` is remembered so that ``to_dataframe`` can
    later place all datasets side by side in a single ``pandas.DataFrame``.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        self.datasets = {}   # full item path -> dataset node, in visit order
        self.tree = []       # one indented, newline-terminated line per visited item
        self.df = None       # cached result of to_dataframe(); None until built
        self.checksum = {}   # full item path -> per-column sums, for sanity check

    def __call__(self, name, node):
        """Record one visited item.

        Args:
            name: Slash-separated path of the item relative to the visit root.
            node: The visited object; only ``h5py.Dataset`` instances are
                kept in ``self.datasets``.

        Returns:
            None, so that ``visititems`` continues the traversal.
        """
        if isinstance(node, h5py.Dataset):
            self.datasets[name] = node
        # Save to print like a directory tree: indent by nesting depth.
        shift = name.count('/') * '    '
        item_name = name.split("/")[-1]
        self.tree.append(shift + item_name + "\n")
        return None

    def __str__(self):
        """Return the visited items as an indented tree, one item per line."""
        return "".join(self.tree)

    def to_dataframe(self):
        """Combine all collected datasets column-wise into one DataFrame.

        Each dataset becomes one or more columns named
        ``<last path component>_<column index>``. The per-column sums of every
        dataset are stored in ``self.checksum`` under the dataset's full path.
        The result is cached, so repeated calls return the same object.

        NOTE: the dataset nodes are only read here, not when they are
        visited; presumably this must run while the source file is still open.

        Returns:
            The combined ``pandas.DataFrame``. If no dataset has been
            collected yet, an empty DataFrame is returned and nothing is cached.
        """
        # `if self.df:` would raise ValueError once a DataFrame is cached,
        # because a DataFrame has no unambiguous truth value.
        if self.df is not None:
            return self.df
        if not self.datasets:
            # pd.concat([]) raises; do not cache so a later visit still works.
            return pd.DataFrame()
        dfs = []
        for name, node in self.datasets.items():
            df = pd.DataFrame(node, dtype=None) # infer float/int 64
            self.checksum[name] = df.sum(axis=0)
            prefix = name.split('/')[-1]
            new_cols = [prefix + "_" + str(col) for col in df.columns]
            df = df.rename(columns=dict(zip(df.columns, new_cols)))
            dfs.append(df)
        self.df = pd.concat(dfs, axis=1)
        return self.df

In [13]:
# Path of the HDF5 file that the following cell opens and converts.
filename = "sample.h5"

In [53]:
# Walk the file once with a Converter visitor, print the item tree it
# recorded, and build the combined DataFrame while the file is still open.
with h5py.File(filename, "r") as h5_file:
    data = Converter()
    h5_file.visititems(data)
    print("Tree hierachy:", data, sep="\n")
    df = data.to_dataframe()
    


Tree hierachy:
tick
    ask_px
    ask_vol
    bid_px
    bid_vol
    high_px
    last_px
    low_px
    open_interest
    status
    time
    turnover
    unix
    volume



In [15]:
# Preview the first three rows of the combined frame.
df.head(3)

Unnamed: 0,ask_px_0,ask_px_1,ask_px_2,ask_px_3,ask_px_4,ask_vol_0,ask_vol_1,ask_vol_2,ask_vol_3,ask_vol_4,...,bid_vol_4,high_px_0,last_px_0,low_px_0,open_interest_0,status_0,time_0,turnover_0,unix_0,volume_0
0,,,,,,0,0,0,0,0,...,0,,3671.4,,138633,2,1420528084100000000,0.0,1420499284100000000,0
1,3664.0,3665.0,3665.8,3666.0,3666.6,13,8,1,25,3,...,5,,3664.0,,138768,2,1420535640100000000,877161600.0,1420506840100000000,798
2,3663.0,3663.8,3664.0,3664.6,3664.8,5,8,14,1,1,...,5,,3663.0,,138790,2,1420535700100000000,957376200.0,1420506900100000000,871


In [16]:
# List the flattened column names (<dataset name>_<column index>).
df.columns

Index(['ask_px_0', 'ask_px_1', 'ask_px_2', 'ask_px_3', 'ask_px_4', 'ask_vol_0',
       'ask_vol_1', 'ask_vol_2', 'ask_vol_3', 'ask_vol_4', 'bid_px_0',
       'bid_px_1', 'bid_px_2', 'bid_px_3', 'bid_px_4', 'bid_vol_0',
       'bid_vol_1', 'bid_vol_2', 'bid_vol_3', 'bid_vol_4', 'high_px_0',
       'last_px_0', 'low_px_0', 'open_interest_0', 'status_0', 'time_0',
       'turnover_0', 'unix_0', 'volume_0'],
      dtype='object')

In [17]:
# Write the combined frame to CSV; index=False leaves out the row index.
df.to_csv("sample.csv", index=False)

## Sanity Check

In [48]:
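# data.checksum holds the per-column sums of each HDF5 dataset, recorded
# while the datasets were converted; df.sum() returns the per-column sums
# of the combined DataFrame. The two must agree column for column.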

In [44]:
# Flatten the per-dataset column sums into one sequence, keeping dataset
# order and, within each dataset, column order.
column_sums = []
for dataset_sums in data.checksum.values():
    column_sums.extend(dataset_sums.values)
checksum = pd.Series(column_sums)

In [52]:
# Every column of the combined frame must sum to exactly the value recorded
# from its source dataset. assert_array_equal also checks that both sides
# have the same shape, so no column count needs to be hard-coded, and it
# reports the mismatching entries on failure (NaNs compare as equal).
np.testing.assert_array_equal(checksum.values, df.sum().values)