In [67]:
import ast
import traceback

from itertools import islice
import pandas as pd
import numpy as np
# import math

# pd.set_option('display.max_rows', 500)
# pd.set_option('display.max_columns', 4)
# pd.set_option('display.width', 1000)

# Analysis Steps
1. Split each log into 3 parts: sim_write, agg_read, and agg_write.
2. Split each part into 2 dataframes: df_vol and df_vfd.
3. Apply the analysis-based mapping and merge df_vol with df_vfd.
4. Save df_sim_merged, df_agg_read_merged, and df_agg_write_merged (currently written as CSV; Parquet output is the planned format).

In [68]:
def check_empty(df,p=True):
    """Return the index labels of rows that contain at least one missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    p : bool, default True
        When True, print the offending rows.

    Returns
    -------
    list
        Index labels of every row with a NaN/None in any column. An empty
        list is returned when no value is missing, so callers can iterate
        over the result unconditionally.
    """
    # DataFrame.isna and DataFrame.isnull are aliases, so a single pass is
    # enough to detect both NaN and None.
    nan_df = df[df.isna().any(axis=1)]
    if nan_df.empty:
        return []

    if p:
        print(f"NaN rows: {nan_df}") # check any nan
    return list(nan_df.index)
    
def rec_to_df(records):
    """Turn a ``{row_id: record_dict}`` mapping into a DataFrame.

    Two fixed path prefixes are removed from every string cell, and missing
    values in ``logical_addr`` / ``access_size`` (when those columns exist)
    are replaced by ``-1.0`` / ``0`` respectively.
    """
    frame = pd.DataFrame.from_dict(records, orient='index')

    # Strip the prefixes one after the other, in this order.
    for prefix in ('/mnt/ssd/mtang11/', 'molecular_dynamics_runs/stage0000/'):
        frame.replace(prefix, '', regex=True, inplace=True)

    # Placeholder used for each optional column when a value is missing.
    placeholders = {'logical_addr': -1.0, 'access_size': 0}
    for column, placeholder in placeholders.items():
        if column in frame.columns:
            frame[column] = frame[column].fillna(placeholder)

    return frame

def df_to_csv(df,file_name,suffix=''):
    """Write ``df`` as CSV (no index column) beside its source log.

    The output path is ``file_name`` with every ``.log`` replaced by
    ``<suffix>.csv``.
    """
    # Splitting on '.log' and re-joining is the same as str.replace.
    csv_path = f'{suffix}.csv'.join(file_name.split('.log'))
    df.to_csv(csv_path, index=False)

def df_to_parquet(df,file_name,suffix=''):
    """Best-effort export of ``df`` to Parquet beside its source log.

    The output path is ``file_name`` with ``.log`` replaced by
    ``<suffix>.parquet``. Any ordinary error raised by the export is printed
    as a traceback and otherwise ignored, so a failed export does not stop
    the notebook. ``KeyboardInterrupt`` and ``SystemExit`` are NOT swallowed.
    """
    out_parquet=file_name.replace('.log',f'{suffix}.parquet')
    try:
        df.to_parquet(out_parquet, engine='pyarrow') # compression='gzip'
        # pyarrow has error with streamlit, downgrade to streamlit==0.84.2
    except Exception:
        # A bare `except:` would also trap Ctrl-C / interpreter shutdown.
        traceback.print_exc()

def print_list_diff(list1, list2):
    """Print whether two lists are equal and, if not, every differing position.

    Each differing position is printed as ``<index>:\\t[<a>]\\t[<b>]``, matching
    the printed header. When the lists have different lengths, the positions
    that exist in only one list are reported with ``<missing>`` on the other
    side instead of being silently dropped.
    """
    from itertools import zip_longest

    if list1 == list2:
        print("same lists")
        return

    print("different lists\nindex:\t[list1]\t[list2]")
    absent = object()  # sentinel: distinguishes "no element" from a real None
    for index, (first, second) in enumerate(zip_longest(list1, list2, fillvalue=absent)):
        if first is absent or second is absent or first != second:
            left = '<missing>' if first is absent else first
            right = '<missing>' if second is absent else second
            print(f"{index}:\t[{left}]\t[{right}]")


def split_vfd_vol_rec(fname,mode='r'):
    """Parse a provenance log and split its records into VOL and VFD groups.

    Every line of ``fname`` is parsed with ``ast.literal_eval`` and is expected
    to be a dict literal with a ``func_name`` entry. A record is kept when its
    ``func_name`` contains one of the known read, write, or get/create/close/open
    markers; an ``operation`` entry is added to it:

    * ``'read'``  for read functions,
    * ``'write'`` for write functions,
    * the text after the last ``_`` of ``func_name`` for the other functions.

    Records whose ``func_name`` contains ``'hermes'`` go to the VFD group, all
    others to the VOL group. Lines that cannot be parsed or classified are
    printed and skipped.

    Parameters
    ----------
    fname : str
        Path of the log file.
    mode : str, default 'r'
        Mode passed to ``open``.

    Returns
    -------
    (dict, dict)
        ``(vol_rec, vfd_rec)``, each mapping a running 0-based integer to the
        record dict.
    """
    vol_r_ops = ['H5VLdataset_read', 'H5VLblob_get', 'H5FD__hermes_read']
    vol_w_ops = ['H5VLdataset_write', 'H5VLblob_put', 'H5FD__hermes_write']
    other_ops = ['_get','_create', '_close', '_open',]

    vol_rec = {}
    vfd_rec = {}
    with open(fname, mode) as f:
        for line in f:
            try:
                rec = ast.literal_eval(line)
                func_name = rec['func_name']

                # Read markers are tested first, so 'H5VLblob_get' counts as a
                # read rather than as the generic '_get' operation.
                if any(op in func_name for op in vol_r_ops):
                    operation = 'read'
                elif any(op in func_name for op in vol_w_ops):
                    operation = 'write'
                elif any(op in func_name for op in other_ops):
                    operation = func_name.split('_')[-1]
                else:
                    continue  # function of no interest: not recorded

                rec['operation'] = operation
                # The next free key equals the number of records kept so far.
                target = vfd_rec if 'hermes' in func_name else vol_rec
                target[len(target)] = rec

            except Exception:
                print("Erro line:")
                print(line)
                # break
    return vol_rec, vfd_rec

def add_all_op_type(dfvol, dfvfd, vol_map=None, vfd_map=None):
    """Add an ``op_type`` column to both the VOL and the VFD frame, in place.

    Parameters
    ----------
    dfvol : pandas.DataFrame
        Must have an ``operation`` column; its values are looked up in
        ``vol_map``.
    dfvfd : pandas.DataFrame
        Must have a ``mem_type`` column; missing values are first replaced by
        ``'H5FD_MEM_NTYPES'`` and the values are then looked up in ``vfd_map``.
    vol_map : dict, optional
        ``operation -> op_type``. Defaults to ``read``/``write`` -> ``'data'``
        when omitted or empty.
    vfd_map : dict, optional
        ``mem_type -> op_type``. Defaults to ``H5FD_MEM_DRAW`` -> ``'data'``
        and ``H5FD_MEM_LHEAP`` -> ``'lheap'`` when omitted or empty.

    Anything not found in the respective map becomes ``'meta'``.
    Returns None.
    """
    # Each map falls back to its default independently, so passing only one
    # of them neither discards it nor leaves the other one empty.
    if not vol_map:
        vol_map = {'read':'data', 'write': 'data'}
    if not vfd_map:
        vfd_map = {'H5FD_MEM_DRAW':'data', 'H5FD_MEM_LHEAP': 'lheap'}

    dfvol['op_type'] = dfvol['operation'].map(vol_map).fillna('meta')

    dfvfd['mem_type'] = dfvfd['mem_type'].fillna('H5FD_MEM_NTYPES')
    dfvfd['op_type'] = dfvfd['mem_type'].map(vfd_map).fillna('meta')

def add_vol_op_type(dfvol, vol_map={}):
    """Add an ``op_type`` column to the VOL frame, in place.

    ``operation`` values are looked up in ``vol_map`` (``read``/``write`` ->
    ``'data'`` when the map is empty); anything not found becomes ``'meta'``.
    Returns None.
    """
    lookup = vol_map if vol_map else {'read':'data', 'write': 'data'}
    dfvol['op_type'] = dfvol['operation'].map(lookup).fillna('meta')


In [71]:
# read in simulation data
def vol_data_label(df):
    """Add a ``data_label`` column to a VOL frame, in place.

    Each row is labelled ``vol-<op_type>-<dset_name>``; rows whose ``op_type``
    is ``'data'`` additionally get ``-<io_access_idx>`` appended.

    Only the ``op_type`` and ``dset_name`` columns are required, plus
    ``io_access_idx`` when at least one ``'data'`` row is present.
    Returns None.
    """
    labels = []
    for _, row in df.iterrows():
        op_type = row['op_type']
        label = f"vol-{op_type}-{row['dset_name']}"
        if op_type == 'data':
            # io_access_idx is only read for data rows.
            label = f"{label}-{row['io_access_idx']}"
        labels.append(label)

    df['data_label'] = labels
    
def vfd_data_label_sim_write(df):
    """Add a ``data_label`` column to the simulation-write VFD frame, in place.

    Rows whose ``op_type`` is ``'data'`` are labelled
    ``vfd-data-<dset>-<n>`` with a running counter ``n``; every other row is
    labelled ``vfd-<op_type>-<dset>``. ``<dset>`` is drawn from the distinct
    ``file_no`` values of ``df``.

    Returns None; ``df`` is modified.
    """
    data_idx = 0
    data_idx_list = []
    # NOTE(review): a set has no guaranteed order, so the sequence in which
    # the file_no values are consumed below is not tied to the row order —
    # confirm this is acceptable for the logs being analysed.
    dsets = list(set(df['file_no']))
    curr_dset = dsets.pop(0)
    dset_changed = False  # never read afterwards
    for index, row in df.iterrows():
        _pat = row['op_type']
        
        if _pat =='data':
            data_idx_list.append(f"vfd-{_pat}-{curr_dset}-{data_idx}")
            data_idx+=1
        else:
            data_idx_list.append(f"vfd-{_pat}-{curr_dset}")
            
        # The comparison happens AFTER the row was labelled, so the row whose
        # file_no differs still carries the previous curr_dset in its label.
        # NOTE(review): the next label source is popped from `dsets`, not taken
        # from row['file_no']; pop(0) raises IndexError once the list is empty
        # (e.g. if file_no values alternate) — confirm intended.
        if row['file_no'] != curr_dset:
            data_idx = 0
            curr_dset = dsets.pop(0)

    df['data_label'] = data_idx_list

def fill_dset_offset(dfvol, dfvfd, fno2dset_dict={1:'contact_map', 2:'point_cloud'}):
    """Add an ``offset`` column to ``dfvol`` from the VFD start addresses.

    For every ``file_no -> dset_name`` pair, the ``start_addr`` of the first
    VFD row whose ``data_label`` matches ``vfd-data-<file_no>-0`` becomes the
    offset of that dataset. ``dfvol['offset']`` is then filled by looking up
    ``dset_name``; names without an entry get NaN.

    Returns ``dfvol`` (modified in place).
    """
    def _first_start_addr(file_no):
        # idxmax() yields the label of the first True in the match mask.
        hit = dfvfd.data_label.str.match(f'vfd-data-{file_no}-0').idxmax()
        return dfvfd.iloc[hit]['start_addr']

    start_by_dset = {
        dset_name: _first_start_addr(file_no)
        for file_no, dset_name in fno2dset_dict.items()
    }

    dfvol['offset'] = dfvol['dset_name'].map(start_by_dset)
    return dfvol

def hard_code_start_addr(dfvol, dfvfd):
    """Patch ``logical_addr`` of selected VOL rows using VFD addresses.

    * ``dfvol`` first receives an ``offset`` column via ``fill_dset_offset``.
    * The first ``H5VLblob_put`` row gets the end address
      (``access_size + start_addr``) of the first VFD row whose ``op_type``
      matches ``data``.
    * Every ``H5VLdataset_write`` row gets its own ``offset`` value.

    Returns ``dfvol``.
    """
    dfvol = fill_dset_offset(dfvol, dfvfd)

    # Blob start address: taken from the first VFD data access.
    first_data_pos = dfvfd.op_type.str.match('data').idxmax()
    first_data_row = dfvfd.iloc[first_data_pos]
    blob_start_addr = first_data_row['access_size'] + first_data_row['start_addr']

    first_blob_put = dfvol.func_name.str.match('H5VLblob_put').idxmax()
    dfvol.loc[first_blob_put, ['logical_addr']] = blob_start_addr

    # Dataset start addresses: one per H5VLdataset_write row.
    is_dset_write = dfvol['func_name'] == 'H5VLdataset_write'
    for row_label in dfvol.index[is_dset_write].tolist():
        dfvol.loc[row_label, ['logical_addr']] = dfvol.iloc[row_label]['offset']

    return dfvol

def merge_df_sim(dfvol, dfvfd):
    """Join the simulation-write VOL and VFD frames on a shared address key.

    Both sides get an integer ``logical_addr_map`` column (VOL: from
    ``logical_addr``; VFD: from ``start_addr`` with missing values as 0), and
    every ``'meta'`` row on either side is keyed to 0. The frames are then
    inner-joined on that key; clashing column names receive ``_vol`` /
    ``_vfd`` suffixes.

    Returns
    -------
    (pandas.DataFrame, dict)
        The merged frame and a ``data_label -> logical_addr`` dict built from
        the VOL side.
    """
    vol_columns = ['data_label', 'io_access_idx','dset_name', 'access_size', 'op_type', 'logical_addr',
                   'operation', 'n_elements','dimension_cnt','dimensions','file_intent','layout','time(us)']
    vfd_columns = ['data_label', 'io_access_idx','access_size', 'op_type','start_addr',
                   'file_name', 'time(us)','operation']

    # VOL side
    vol_side = dfvol[vol_columns].copy()
    label_to_addr = dict(zip(vol_side['data_label'], vol_side['logical_addr']))
    vol_side['logical_addr_map'] = vol_side['logical_addr'].astype(int)
    vol_side.loc[vol_side['op_type'] == 'meta', 'logical_addr_map'] = 0

    # VFD side
    vfd_side = dfvfd[vfd_columns].copy()
    vfd_side['start_addr'] = vfd_side['start_addr'].fillna(0)
    vfd_side['logical_addr_map'] = vfd_side['start_addr'].astype(int)
    vfd_side.loc[vfd_side['op_type'] == 'meta', 'logical_addr_map'] = 0
    vfd_side = vfd_side.rename(columns={'start_addr':'logical_addr_vfd'})

    merged = vol_side.merge(vfd_side, how='inner', on=['logical_addr_map'], suffixes=['_vol','_vfd'])

    # Rows without a file name are attributed to the default residue file.
    merged['file_name'] = merged['file_name'].fillna('task0000/residue_100.h5')

    return merged, label_to_addr

# --- Simulation-write phase: parse the log, label, map addresses, merge, export ---
fsim='../save_outputs/vol-vfd/prov-vfd-sim.log'
mode='r'

vol_sim_rec, vfd_sim_rec = split_vfd_vol_rec(fsim, mode)

vol_sdf = rec_to_df(vol_sim_rec)
vfd_sdf = rec_to_df(vfd_sim_rec)

# add category
vol_sdf['cat'] = 'sim-write'
vfd_sdf['cat'] = 'sim-write'

# # bfill_cols = ['layout', 'type_size', 'n_elements', 'dimension_cnt', 'dimensions', 'dset_name', 'file_intent']

add_all_op_type(vol_sdf,vfd_sdf)
# # add data_label for simulation
# Back-fill first, then forward-fill whatever is still missing at the tail.
# (.bfill()/.ffill() replace the deprecated fillna(method=...) spelling.)
vol_sdf['dset_name'] = vol_sdf['dset_name'].bfill().ffill()

vfd_data_label_sim_write(vfd_sdf)
vol_data_label(vol_sdf)
vol_sdf.loc[vol_sdf['op_type'] == 'meta', 'io_access_idx'] = -1
# check_empty(vol_sdf)

vol_sdf = hard_code_start_addr(vol_sdf,vfd_sdf)

# check_empty(vfd_sdf)
df_to_csv(vol_sdf,fsim,suffix='-vol')
df_to_csv(vfd_sdf,fsim,suffix='-vfd')

df_merged, vol_idx2addr_map = merge_df_sim(vol_sdf,vfd_sdf)
print(df_merged.columns)

# # TODO: use parquet for faster load in later analysis
# df_to_parquet(df_merged,fsim,suffix='-merged')
df_to_csv(df_merged,fsim,suffix='-merged') # output merged to csv (written once)

Index(['data_label_vol', 'io_access_idx_vol', 'dset_name', 'access_size_vol',
       'op_type_vol', 'logical_addr', 'operation_vol', 'n_elements',
       'dimension_cnt', 'dimensions', 'file_intent', 'layout', 'time(us)_vol',
       'logical_addr_map', 'data_label_vfd', 'io_access_idx_vfd',
       'access_size_vfd', 'op_type_vfd', 'logical_addr_vfd', 'file_name',
       'time(us)_vfd', 'operation_vfd'],
      dtype='object')


In [72]:
# map only aggregation read

# read in aggregation data
def vfd_op_data_label_agg_read(df, dset_offsets):
    """Add ``data_label`` and ``op_type`` columns to the aggregation-read VFD frame.

    Rows are walked in order and classified from ``mem_type``,
    ``access_size``, ``start_addr`` and ``next_addr``:

    * ``H5FD_MEM_DRAW`` with ``access_size == 4096``      -> op_type ``'loc'``
    * ``H5FD_MEM_DRAW`` starting at the remembered address -> op_type ``'data'``
    * any other ``H5FD_MEM_DRAW`` row                      -> op_type ``'data'``,
      labelled with index 0
    * every other ``mem_type``                             -> op_type ``'meta'``

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns listed above.
    dset_offsets : sequence
        Indexed by the running dataset number; a row whose ``start_addr``
        equals the current entry advances to the next dataset.
        NOTE(review): indexing starts at position 1, so at least two entries
        are required — confirm the expected ordering with the caller.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the two columns added.
    """

    data_label_list = []
 
    curr_dset = 1
    next_offset = dset_offsets[curr_dset]
    dset_changed = False  # only ever assigned, never read
    
    data_idx = 1
    last_addr = -1 # vfd has no -1 address
    pattern_list = []
    
    for index, row in df.iterrows(): #islice(df.iterrows(), start_idx, None):
        if row['mem_type'] == "H5FD_MEM_DRAW":
            if row['access_size'] == 4096:
                # Remember where this access ends; a following row starting
                # there is treated as its data.
                last_addr = row['next_addr']
                data_label_list.append(f'vfd-data-{curr_dset}-{data_idx}')
                pattern_list.append('loc')
            elif row['start_addr'] == last_addr:
                data_label_list.append(f'vfd-data-{curr_dset}-{data_idx}')
                pattern_list.append('data')
                data_idx+=1
                last_addr = 0
            else:
                data_label_list.append(f'vfd-data-{curr_dset}-{0}') # first access
                pattern_list.append('data')
        else:
            data_label_list.append(f'vfd-meta-{curr_dset}')
            last_addr = row['next_addr']
            pattern_list.append('meta')
            
        # Checked after the row was labelled: the row that starts at
        # next_offset keeps the previous dataset number in its label.
        if row['start_addr'] == next_offset:
            data_idx = 0
            curr_dset+=1
            if len(dset_offsets) <= curr_dset:
                next_offset = 0
            else:
                next_offset = dset_offsets[curr_dset]
            dset_changed = True
    
    # The label of the final row is always overwritten with the current
    # dataset/index pair (raises IndexError when df has no rows).
    data_label_list[-1] = f'vfd-data-{curr_dset}-{data_idx}' # last idx for point_cloud
    
    df['data_label'] = data_label_list
    df['op_type'] = pattern_list
    
    return df

def merge_df_agg_read(dfvol, dfvfd,vol_idx2addr_map):
    """Join the aggregation-read VOL and VFD frames on a shared address key.

    Key (``logical_addr_map``) construction:

    * VOL rows: ``data_label`` looked up in ``vol_idx2addr_map``; ``'meta'``
      rows are keyed to 0.
    * VFD ``'meta'`` rows: 0.
    * VFD ``'loc'`` rows: their own ``start_addr``.
    * VFD ``'data'`` rows: the ``start_addr`` of the ``'loc'`` row whose
      ``next_addr`` equals this row's ``start_addr``.

    Parameters
    ----------
    dfvol, dfvfd : pandas.DataFrame
        Labelled VOL / VFD frames for the read phase.
    vol_idx2addr_map : dict
        ``data_label -> logical_addr`` as returned by ``merge_df_sim``.

    Returns
    -------
    pandas.DataFrame
        Inner join of both sides; clashing columns get ``_vol`` / ``_vfd``.
    """
    df_agg2vol = dfvol[['data_label', 'io_access_idx','dset_name', 'access_size', 'op_type','logical_addr',
                        'operation', 'n_elements','dimension_cnt','dimensions','file_intent','layout','time(us)']].copy()
    df_vfd2res = dfvfd[['data_label', 'io_access_idx','access_size', 'op_type', 'next_addr','start_addr',
                        'file_name','time(us)','operation']].copy()

    # address map for vol
    df_agg2vol['logical_addr_map'] = df_agg2vol['data_label'].map(vol_idx2addr_map)
    df_agg2vol.loc[df_agg2vol['op_type'] == 'meta', 'logical_addr_map'] = 0 # all meta map to 0

    # split by op_type
    df_vfd2res_meta = df_vfd2res[df_vfd2res['op_type'] == 'meta'].copy() # use 0 for meta
    df_vfd2res_loc = df_vfd2res[df_vfd2res['op_type'] == 'loc'].copy() # this match vol with start_addr
    df_vfd2res_data = df_vfd2res[df_vfd2res['op_type'] == 'data'].copy() # this match loc with next_addr

    # address map for vfd-meta / vfd-loc
    df_vfd2res_meta['logical_addr_map'] = 0
    df_vfd2res_loc['logical_addr_map'] = df_vfd2res_loc['start_addr']

    # address map for vfd-data
    df_vfd2res_data_map  = dict(zip(df_vfd2res_loc['next_addr'], df_vfd2res_loc['start_addr']))
    df_vfd2res_data['logical_addr_map'] = df_vfd2res_data['start_addr'].map(df_vfd2res_data_map)

    # Data rows with a missing value fall back to their own start address.
    # `or []` keeps the loop safe when check_empty reports nothing (it may
    # return None in that case). The fallback value is read by index label,
    # matching the labels check_empty returns.
    # NOTE(review): check_empty flags a row when ANY column is missing, not
    # only logical_addr_map — confirm that is the intended trigger.
    nan_idx = check_empty(df_vfd2res_data,p=False) or []
    for idx in nan_idx:
        df_vfd2res_data.loc[idx, ['logical_addr_map']] = df_vfd2res.loc[idx, 'start_addr']

    df_vfd2res = pd.concat([df_vfd2res_meta,df_vfd2res_loc, df_vfd2res_data], axis=0) #.set_index('io_access_idx')

    df_vfd2res = df_vfd2res.rename(columns={'start_addr':'logical_addr'})
    # -1 marks a missing start address upstream; report it as 0 in the output.
    df_vfd2res['logical_addr'] = df_vfd2res['logical_addr'].replace(-1,0)

    df_merged = df_agg2vol.merge(df_vfd2res, how='inner', on=['logical_addr_map'], suffixes=['_vol','_vfd'])

    return df_merged

# --- Aggregation phase, read side: parse the log, label, merge, export ---
fagg='../save_outputs/vol-vfd/prov-vfd-agg.log'

vol_agg_rec, vfd_agg_rec = split_vfd_vol_rec(fagg)

vol_adf = rec_to_df(vol_agg_rec)
vfd_adf = rec_to_df(vfd_agg_rec)

# add category, split read write based on filename
vol_adf['cat'] = np.where(vol_adf['file_name']== 'aggregate.h5', 'agg-write', 'agg-read')
vfd_adf['cat'] = np.where(vfd_adf['file_name']== 'aggregate.h5', 'agg-write', 'agg-read')

# add columns for mapping
vfd_adf['start_addr'] = vfd_adf['start_addr'].fillna(-1.0)
vfd_adf['access_size'] = vfd_adf['access_size'].fillna(0.0)
vfd_adf['next_addr'] = vfd_adf['start_addr'] + vfd_adf['access_size']
vfd_adf['next_addr'] = vfd_adf['next_addr'].astype(int)

# add mapping for data_label
vol_adf_read = vol_adf[vol_adf['cat'] == 'agg-read'].copy().reset_index()
vfd_adf_read = vfd_adf[vfd_adf['cat'] == 'agg-read'].copy().reset_index()

# get dset offsets from vol df
# NOTE(review): list(set(...)) has no guaranteed order, while the labelling
# function indexes this list by dataset number — confirm the order is right.
dset_offsets = list(set(vol_adf_read[vol_adf_read['func_name'] == 'H5VLdataset_read']['offset']))
# print(dset_offsets)

# # add data_label for vol
# Back-fill first, then forward-fill whatever is still missing at the tail.
# (.bfill()/.ffill() replace the deprecated fillna(method=...) spelling.)
vol_adf_read['dset_name'] = vol_adf_read['dset_name'].bfill().ffill()
add_vol_op_type(vol_adf_read)
vol_data_label(vol_adf_read)
vol_adf_read.loc[vol_adf_read['op_type'] == 'meta', 'io_access_idx'] = -1

## add data_label for vfd
vfd_adf_read = vfd_op_data_label_agg_read(vfd_adf_read,dset_offsets)
# # df_to_csv(vfd_adf_read,fagg,suffix='-vfd-read')


df_to_csv(vol_adf_read,fagg,suffix='-vol-read')
df_to_csv(vfd_adf_read,fagg,suffix='-vfd-read')

df_agg_read_merged = merge_df_agg_read(vol_adf_read,vfd_adf_read,vol_idx2addr_map)
print(df_agg_read_merged.columns)

df_to_csv(df_agg_read_merged,fagg,suffix='-read-merged')
# df_to_parquet(df_agg_read_merged,fagg,suffix='-read-merged')

Index(['data_label_vol', 'io_access_idx_vol', 'dset_name', 'access_size_vol',
       'op_type_vol', 'logical_addr_vol', 'operation_vol', 'n_elements',
       'dimension_cnt', 'dimensions', 'file_intent', 'layout', 'time(us)_vol',
       'logical_addr_map', 'data_label_vfd', 'io_access_idx_vfd',
       'access_size_vfd', 'op_type_vfd', 'next_addr', 'logical_addr_vfd',
       'file_name', 'time(us)_vfd', 'operation_vfd'],
      dtype='object')


## Create labels for AGG_WRITE phase
1. Create `op_type` for all 16B and 3KB I/O, for metadata, and for H5FD_MEM_LHEAP, using the map below:
``` 
op_type_map = { # for notes only now
    'mem_type' : {
        'H5FD_MEM_DRAW' : { 'access_size':{ 16 : 'loc', 'others' : 'data'} },
        'H5FD_MEM_LHEAP' : 'lheap','H5FD_MEM_OHDR': 'meta','H5FD_MEM_SUPER': 'meta'}
}
```
2. create data_label for all H5FD_MEM_DRAW that are not 16B (location data)
3. map data_label with start_addr-3KB to end_addr-16B

In [73]:
from collections import Counter

# map only write

def vfd_op_type_agg_write(df):
    """Add an ``op_type`` column to the aggregation-write VFD frame.

    Classification per row:

    * ``H5FD_MEM_DRAW`` with ``access_size == 16`` -> ``'loc'``
    * ``H5FD_MEM_DRAW`` with any other size        -> ``'data'``
    * ``H5FD_MEM_LHEAP``                            -> ``'lheap'``
    * anything else                                 -> ``'meta'``

    Returns ``df`` itself, modified in place.
    """
    def _classify(row):
        mem_type = row['mem_type']
        if mem_type == 'H5FD_MEM_LHEAP':
            return 'lheap'
        if mem_type != 'H5FD_MEM_DRAW':
            return 'meta'
        # Raw data: a 16-byte access is a location record.
        return 'loc' if row['access_size'] == 16 else 'data'

    df['op_type'] = [_classify(row) for _, row in df.iterrows()]
    return df

        
def vfd_data_label_agg_write(df, PC_IO_SIZE):
    """Add a ``data_label`` column to the aggregation-write VFD frame, in place.

    Labels are assigned from ``op_type`` in a single pass, then the rows that
    fell into the final ``else`` branch (i.e. whose ``op_type`` is none of
    ``'data'``, ``'meta'``, ``'lheap'``) are relabelled in two follow-up steps.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``op_type``, ``access_size``, ``start_addr``, ``next_addr`` and
        ``io_access_idx``; must contain at least one ``'data'`` row.
    PC_IO_SIZE : number
        ``access_size`` that identifies a point_cloud data access.

    Returns None; ``df`` is modified.
    """
    # TODO: needs improvement

    data_idx = 0
    pc_idx = 0
    lheap_idx = 0
        
    data_label_list = []
    loc_map = {}          # next_addr of a placeholder row -> int(io_access_idx)
    data_label_map  = {}  # start_addr of a contact_map data row -> its label
    
    first_chunk_index = 0 # record index at dataframe
    prev_address = df[df['op_type'] == 'data'].iloc[0]['start_addr'] # first data address
    
    for index, row in df.iterrows():
        _pat = row['op_type']
        if _pat == 'data':
            if row['access_size'] == PC_IO_SIZE:
                data_label_list.append(f'vfd-data-point_cloud-{pc_idx}')
                pc_idx +=1
            else:
                _str = f'vfd-data-contact_map-{data_idx}'
                data_label_list.append(_str)
                # Until first_chunk_index is set, look for the first row that
                # does not start where the previous one ended; afterwards,
                # remember each row's label under its start address.
                if first_chunk_index == 0:
                    if row['start_addr'] != prev_address:
                        first_chunk_index = index
                else:
                    data_label_map[row['start_addr']] = _str
                data_idx +=1
                prev_address = row['next_addr']
                
        elif _pat == 'meta':
            data_label_list.append(f'vfd-{_pat}-1')
        elif _pat == 'lheap':
            data_label_list.append(f'vfd-{_pat}-1-{lheap_idx}')
            lheap_idx+=1
        else: 
            # The first such row is labelled 'vfd-lheap-1'; later ones get the
            # placeholder '0', which is replaced after the loop.
            if loc_map:
                data_label_list.append('0')
            else:
                data_label_list.append(f'vfd-lheap-1')
            loc_map[row['next_addr']] = int(row['io_access_idx'])
    
    # map some 16B with data_label
    # NOTE(review): io_access_idx is used here as a position in
    # data_label_list, and a start address without a loc_map entry raises
    # KeyError — confirm both assumptions hold for the input.
    for k in data_label_map.keys():
        data_label_list[loc_map[k]] = data_label_map[k]
    
    # find and map the rest 16B with initial chunk
    indices = [i for i, x in enumerate(data_label_list) if x == '0']
    data_idx = 0
    for idx in indices:
        data_label_list[idx] = f'vfd-data-contact_map-{data_idx}'
        data_idx+=1

    df['data_label'] = data_label_list

def get_point_cloud_io_size(dfvol):
    """Return the per-entry I/O size of the first ``point_cloud`` row.

    The value is that row's ``access_size`` divided by the first element of
    its ``dimensions``.
    """
    # idxmax() yields the first row whose dset_name matches 'point_cloud'.
    pc_row = dfvol.iloc[dfvol.dset_name.str.match('point_cloud').idxmax()]
    return pc_row['access_size'] / pc_row['dimensions'][0]

def hard_code_start_addr_agg(dfvol, dfvfd):
    """Copy dataset start addresses from the VFD frame into the VOL frame.

    For each of ``contact_map`` and ``point_cloud``, the ``start_addr`` of the
    first VFD row labelled ``vfd-data-<dset>-0`` is written to
    ``logical_addr`` of the first VOL rows labelled ``vol-data-<dset>-0`` and
    ``vol-data-<dset>-1``.

    A label that does not occur is skipped: nothing is written for it. The
    presence test is made on the match mask itself, so a genuine match on
    the row with index label 0 is handled like any other row.

    Returns ``dfvol`` (modified in place).
    """
    for dset in ['contact_map', 'point_cloud']:
        vfd_hits = dfvfd.data_label.str.match(f'vfd-data-{dset}-0', na=False)
        if not vfd_hits.any():
            continue  # no address available for this dataset
        start_addr = dfvfd.loc[vfd_hits.idxmax(), 'start_addr']

        for access_idx in (0, 1):
            vol_hits = dfvol.data_label.str.match(f'vol-data-{dset}-{access_idx}', na=False)
            if vol_hits.any():
                # idxmax() on a boolean mask gives the label of the first match.
                dfvol.loc[vol_hits.idxmax(), 'logical_addr'] = start_addr

    return dfvol

def merge_df_agg_write(dfvol, dfvfd):
    """Join the aggregation-write VOL and VFD frames on a shared address key.

    Key (``logical_addr_map``) construction:

    * VOL rows: integer ``logical_addr``; rows whose ``op_type`` matches
      ``meta`` are keyed to 0.
    * VFD ``'data'`` rows: integer ``start_addr``, except that every row whose
      label contains ``point_cloud`` shares the ``logical_addr`` of the first
      VOL row labelled ``vol-data-point_cloud-0``.
    * VFD ``'loc'`` rows: the ``start_addr`` of the data row with the same
      ``data_label``.
    * VFD ``'lheap'`` rows: their ``next_addr`` looked up in a
      ``start_addr -> next_addr`` dict built from the data rows.
    * VFD ``'meta'`` rows: 0.

    Returns
    -------
    pandas.DataFrame
        Inner join of both sides; clashing columns get ``_vol`` / ``_vfd``.
    """
    df_agg2vol = dfvol[['data_label', 'io_access_idx','dset_name', 'access_size','logical_addr','op_type', 
                        'operation', 'n_elements','dimension_cnt','dimensions','file_intent','layout','time(us)']].copy()
    df_vol2vfd = dfvfd[['data_label', 'io_access_idx', 'access_size', 'start_addr', 'next_addr','op_type',
                        'file_name', 'time(us)','operation']].copy()

    # vol map # set 0 for meta
    df_agg2vol['logical_addr_map'] = df_agg2vol['logical_addr'].astype(int)
    df_agg2vol.loc[df_agg2vol['op_type'].str.match('meta'), 'logical_addr_map'] = 0
    
    # data map
    df_vfd2res_data = df_vol2vfd[df_vol2vfd['op_type'] == 'data'].copy()
    df_vfd2res_data['logical_addr_map'] = df_vfd2res_data['start_addr'].astype(int)
    # modify point_cloud to use only 1 address for mapping 
    # NOTE(review): idxmax() returns an index label but is used with .iloc;
    # this assumes a default 0..n-1 index on dfvol — confirm.
    idx_pc = df_agg2vol.data_label.str.match(f'vol-data-point_cloud-0').idxmax()
    PC_START_ADDR = int(df_agg2vol.iloc[idx_pc]['logical_addr'])
    df_vfd2res_data.loc[df_vfd2res_data['data_label'].str.contains('point_cloud'), 'logical_addr_map'] = PC_START_ADDR
    
    # loc maps by data_label 
    df_vfd2res_loc = df_vol2vfd[df_vol2vfd['op_type'] == 'loc'].copy()
    vfd2res_loc_map  = dict(zip(df_vfd2res_data['data_label'], df_vfd2res_data['start_addr']))
    df_vfd2res_loc['logical_addr_map'] = df_vfd2res_loc['data_label'].map(vfd2res_loc_map)
    
    # lheap maps by address
    df_vfd2res_lheap = df_vol2vfd[df_vol2vfd['op_type'] == 'lheap'].copy()
    vfd2res_lheap_map = dict(zip(df_vfd2res_data['start_addr'], df_vfd2res_data['next_addr']))
    df_vfd2res_lheap['logical_addr_map'] = df_vfd2res_lheap['next_addr'].map(vfd2res_lheap_map)
    
    # meta maps to 0
    df_vfd2res_meta = df_vol2vfd[df_vol2vfd['op_type'] == 'meta'].copy()
    df_vfd2res_meta['logical_addr_map'] = 0
    
    # Row order of the VFD side after this concat: data, loc, lheap, meta.
    df_vfd2res = pd.concat([df_vfd2res_data,df_vfd2res_loc,df_vfd2res_lheap,df_vfd2res_meta], axis=0) #.set_index('io_access_idx')
    df_vfd2res = df_vfd2res.rename(columns={'start_addr':'logical_addr'})
    df_merged = df_agg2vol.merge(df_vfd2res, how='inner', on=['logical_addr_map'], suffixes=['_vol','_vfd'])

    # populate file_name
    df_merged['file_name'] = df_merged['file_name'].fillna('aggregate.h5')
    # df_merged['access_size_vol'] = df_merged['access_size_vol'].fillna(0)
    # df_merged['access_size_vol'] = df_merged['access_size_vol'].replace(np.nan, 0)
    
    
    return df_merged

# --- Aggregation phase, write side: label, map addresses, merge, export ---
vfd_adf_write = vfd_adf[vfd_adf['cat'] == 'agg-write'].copy().reset_index()
vol_adf_write = vol_adf[vol_adf['cat'] == 'agg-write'].copy().reset_index()

# check_empty(vol_adf_write)

# add data_label for vol
# Back-fill first, then forward-fill whatever is still missing at the tail.
# (.bfill()/.ffill() replace the deprecated fillna(method=...) spelling.)
vol_adf_write['dset_name'] = vol_adf_write['dset_name'].bfill().ffill()
add_vol_op_type(vol_adf_write)
vol_data_label(vol_adf_write)
vol_adf_write.loc[vol_adf_write['op_type'] == 'meta', 'io_access_idx'] = -1

vfd_adf_write = vfd_op_type_agg_write(vfd_adf_write)
PC_IO_SIZE = get_point_cloud_io_size(vol_adf_write)
vfd_data_label_agg_write(vfd_adf_write, PC_IO_SIZE)


# get logical address from vfd to vol
vol_adf_write = hard_code_start_addr_agg(vol_adf_write,vfd_adf_write)

df_to_csv(vol_adf_write,fagg,suffix='-vol-write')
df_to_csv(vfd_adf_write,fagg,suffix='-vfd-write')
# df_to_parquet(vol_adf_write,fagg,suffix='-vol-write')
# df_to_parquet(vfd_adf_write,fagg,suffix='-vfd-write')

df_agg_write_merged = merge_df_agg_write(vol_adf_write,vfd_adf_write)
print(df_agg_write_merged.columns)

df_to_csv(df_agg_write_merged,fagg,suffix='-write-merged')
# df_to_parquet(df_agg_write_merged,fagg,suffix='-write-merged')

Index(['data_label_vol', 'io_access_idx_vol', 'dset_name', 'access_size_vol',
       'logical_addr_vol', 'op_type_vol', 'operation_vol', 'n_elements',
       'dimension_cnt', 'dimensions', 'file_intent', 'layout', 'time(us)_vol',
       'logical_addr_map', 'data_label_vfd', 'io_access_idx_vfd',
       'access_size_vfd', 'logical_addr_vfd', 'next_addr', 'op_type_vfd',
       'file_name', 'time(us)_vfd', 'operation_vfd'],
      dtype='object')
