In [1]:
import json
import os
import uuid
from copy import deepcopy

import jmespath
import pandas as pd
from json_flatten import flatten

# Current working directory at the time this cell runs; the example cell
# below loads its sample JSON file from this directory.
dir_workspace = os.getcwd()

In [2]:
# Helper method to add list object in child data list by creating id reference
def add_to_child_data_list(id_val,parent,child,list_data,list_child_data_dict):
    """Record one extracted child list together with its linking metadata.

    A dict with the keys ``ID``, ``PARENT``, ``CHILD`` and ``DATA`` (holding
    ``id_val``, ``parent``, ``child`` and ``list_data`` respectively) is
    appended to ``list_child_data_dict``, which is mutated in place.
    ``list_data`` is stored by reference, not copied. Returns ``None``.
    """
    list_child_data_dict.append({
        'ID': id_val,
        'PARENT': parent,
        'CHILD': child,
        'DATA': list_data,
    })

In [3]:
# Helper method to remove list object from given dictionary recursively and add it into child data list
def remove_list_and_inject_pk(dict_data,parent,id_val,list_child_data_dict):
    """Replace every list inside ``dict_data`` with an ID and record the list.

    ``dict_data`` is modified in place. For each key whose value is a list,
    the value is overwritten with ``id_val`` and the removed list is
    registered through ``add_to_child_data_list`` under ``parent`` / key.
    Nested dicts are scanned with the same ``parent`` and ``id_val``.

    Each dict element of a removed list is then processed recursively, with
    the list's key as its parent name and a freshly generated
    ``'#ID#' + uuid1`` value as its own ID.

    Parameters:
        dict_data: dict to scan and rewrite in place.
        parent: name recorded as ``PARENT`` for lists found at this level.
        id_val: ID string written in place of each list found at this level.
        list_child_data_dict: list that collects the extracted list entries.

    Returns ``None``.
    """
    # Only values are reassigned below (no keys added or removed), so
    # iterating over the live key view is safe.
    for key in dict_data.keys():
        value = dict_data[key]
        if isinstance(value, list):
            # Remove the list and leave the ID in its place.
            dict_data[key] = id_val
            add_to_child_data_list(id_val, parent, key, value, list_child_data_dict)

            for element in value:
                # JSON arrays may contain scalars or nested arrays; those have
                # no keys to scan, so only dict elements are recursed into.
                # They still remain part of the recorded list.
                if isinstance(element, dict):
                    remove_list_and_inject_pk(
                        element, key, '#ID#' + str(uuid.uuid1()), list_child_data_dict
                    )

        elif isinstance(value, dict):
            remove_list_and_inject_pk(value, parent, id_val, list_child_data_dict)

In [4]:
# Helper method to process list of dictionary object and assign PK and FK
def process_list(list_data,pk_col,fk_col=None,fk_col_id=None):
    """Flatten each record of ``list_data`` and attach PK / FK columns.

    Every record is passed through ``flatten``. For each flattened entry:

    * if the value contains the marker ``'#ID#'`` it is stored under
      ``pk_col`` (the original key is discarded);
    * otherwise it is stored under the part of the flattened key that follows
      the last ``'.'``. Entries whose trailing key segments coincide
      overwrite one another.

    When both ``fk_col`` and ``fk_col_id`` are given, every output row also
    gets ``fk_col`` set to ``fk_col_id``.

    Returns a new list with one dict per input record.
    """
    # NOTE(review): the '#ID#' check is a substring test, so it assumes that
    # every flattened value is a string -- confirm against json_flatten.
    attach_fk = fk_col is not None and fk_col_id is not None
    flattened_records = [flatten(record) for record in list_data]

    rows_out = []
    for flat in flattened_records:
        row = {}
        for path, value in flat.items():
            if '#ID#' in value:
                row[pk_col] = value
            else:
                leaf_name = path[path.rfind('.') + 1:]
                row[leaf_name] = value
        if attach_fk:
            row[fk_col] = fk_col_id
        rows_out.append(row)
    return rows_out

In [5]:
# Helper method to create internal ID column for join operation which will be dropped finally
def get_col_name(elm_name):
    """Return the internal join-column name for ``elm_name``.

    The result is the upper-cased element name followed by ``_ID_DROP``.
    """
    return '{}_ID_DROP'.format(elm_name.upper())

In [6]:
# Helper method to return filtered json object by jmes path
def get_filter_json(json,instruc_set):
    """Apply the JMESPath expression ``instruc_set`` to ``json``.

    Returns element ``[0]`` of the search result, so the expression is
    expected to produce a non-empty indexable result; otherwise the indexing
    raises.

    Note: the ``json`` parameter shadows the ``json`` module inside this
    function.
    """
    compiled = jmespath.compile(instruc_set)
    matches = compiled.search(json)
    return matches[0]

In [7]:
# Helper method to merge multiple lists that share the same PARENT.CHILD element name into one group
def process_child_data_list(list_child_data_dict,dict_final_child_data):
    """Group the extracted child lists by their ``PARENT.CHILD`` name.

    Each entry of ``list_child_data_dict`` is converted to rows with
    ``process_list``: the child's column name is the PK column, the parent's
    column name is the FK column and the entry's ``ID`` is the FK value.

    The rows are stored in ``dict_final_child_data`` (mutated in place) under
    the key ``PARENT + '.' + CHILD``. The first entry for a key creates
    ``{'DATA': rows, 'FK_COL': <parent column name>}``; later entries extend
    the existing ``DATA`` list.

    Returns ``None``.
    """
    for entry in list_child_data_dict:
        group_key = entry['PARENT'] + '.' + entry['CHILD']
        rows = process_list(
            entry['DATA'],
            get_col_name(entry['CHILD']),
            get_col_name(entry['PARENT']),
            entry['ID'],
        )
        if group_key not in dict_final_child_data:
            dict_final_child_data[group_key] = {
                'DATA': rows,
                'FK_COL': get_col_name(entry['PARENT']),
            }
        else:
            dict_final_child_data[group_key]['DATA'].extend(rows)


In [8]:
# !! Unused methods
def get_filter_set(dict_group,filter_list):
    """Return the set of names reachable from ``filter_list`` via ``dict_group``.

    For every name in ``filter_list`` the chain collected by
    ``add_hierarchy`` (the name itself followed by its ancestors) is added to
    the result set.
    """
    collected = set()
    for df_name in filter_list:
        chain = []
        add_hierarchy(dict_group, chain, df_name)
        collected.update(chain)
    return collected

def add_hierarchy(dict_group,list_parent,child):
    """Append ``child`` and its chain of ancestors to ``list_parent``.

    ``dict_group`` maps a name to its parent name. Starting at ``child``, each
    name found in ``dict_group`` is appended to ``list_parent`` (mutated in
    place). The walk stops at a name that is not a key of ``dict_group`` or
    whose parent is ``'Root'``. Returns ``None``.
    """
    if child not in dict_group:
        return
    list_parent.append(child)
    parent_name = dict_group[child]
    if parent_name == 'Root':
        return
    add_hierarchy(dict_group, list_parent, parent_name)

In [9]:
# Main method: returns the merged DataFrame (and a dict of all component DataFrames) for the given json_data object
def json_to_dataframe(json_data, instruction_set=None, merge=True, drop_id_col=True, debug=False, filter_path=None):
    """Convert nested JSON into a flat DataFrame plus its component frames.

    Every list found inside the records is pulled out into its own child
    table, linked to its parent through generated ``*_ID_DROP`` columns, and
    optionally left-merged back onto the root table.

    Parameters
    ----------
    json_data : dict or list
        Parsed JSON. A dict whose only value is a list is treated as that
        list of records; any other dict is treated as a single record; a list
        is used as the records directly. The caller's object is not modified:
        processing happens on a deep copy.
    instruction_set : str, optional
        JMESPath expression applied to ``json_data`` before processing. When
        given, the filtered result is also written to ``debug.json`` in the
        current working directory.
    merge : bool
        If true, child tables are left-merged onto the root table on their
        foreign-key column.
    drop_id_col : bool
        If true, columns whose name contains ``_ID_DROP`` are dropped from
        the returned merged frame. The frames in the returned dict keep them.
    debug : bool
        If true, the returned dict also contains ``'meta-debug'`` (one row
        per extracted list) and ``'meta-group'`` (distinct PARENT/CHILD pairs).
    filter_path : str, optional
        When given, only child tables whose ``PARENT.CHILD`` key is a
        substring of ``'Root.' + filter_path`` are merged.

    Returns
    -------
    tuple
        ``(df_final, dict_all)``: the merged frame, and a dict mapping
        ``'root'`` and every ``PARENT.CHILD`` key to its own DataFrame.

    Raises
    ------
    TypeError
        If the data to process (after the optional JMESPath filter) is
        neither a dict nor a list.
    """
    list_child_data_dict = []
    dict_final_child_data = {}

    if instruction_set is not None:
        json_data = jmespath.compile(instruction_set).search(json_data)
        # NOTE(review): this dump runs on every filtered call, not only when
        # ``debug`` is set -- confirm that it is still wanted.
        with open('debug.json', 'w') as fp:
            json.dump(json_data, fp)

    if not isinstance(json_data, (dict, list)):
        # Previously this case only printed 'Exception' and then failed on the
        # unbound ``list_root``; fail with a clear error instead.
        raise TypeError(
            'json_data must be a dict or a list (got {}); check the input '
            'and the instruction_set result'.format(type(json_data).__name__)
        )

    # Step 1 rewrites the records in place (lists are replaced by IDs), so
    # work on a private copy to keep the caller's data reusable.
    json_data = deepcopy(json_data)

    # Step 0 : Prepare the root list from input json_data
    if isinstance(json_data, dict):
        values = list(json_data.values())
        if len(values) == 1 and isinstance(values[0], list):
            # Wrapper object such as {"items": [...]}: unwrap the list.
            list_root = values[0]
        else:
            list_root = [json_data]
    else:
        list_root = json_data

    # Step 1 : Remove list data element from dict and inject primary key id
    for dict_root in list_root:
        remove_list_and_inject_pk(dict_root, 'Root', '#ID#' + str(uuid.uuid1()), list_child_data_dict)

    # Step 2 : process root list
    data_root_out = process_list(list_root, get_col_name('Root'))

    # Step 3 : process child list
    process_child_data_list(list_child_data_dict, dict_final_child_data)

    # Step 4 : Dataframe Creation
    df_root = pd.DataFrame(data_root_out)
    df_final = df_root
    dict_all = {'root': df_root}

    # Only build the prefixed path when there is something to filter.
    full_filter_path = None
    if filter_path is not None and dict_final_child_data:
        full_filter_path = 'Root.' + filter_path

    # Merging of datagroup
    for key, attrs in dict_final_child_data.items():
        df_child = pd.DataFrame(attrs['DATA'])
        dict_all[key] = df_child
        # ``key in full_filter_path`` is a substring test on the dotted path.
        if merge and (full_filter_path is None or key in full_filter_path):
            df_final = pd.merge(df_final, df_child, how='left', left_on=attrs['FK_COL'],
                                right_on=attrs['FK_COL'])

    # Add meta dataframe for debug purpose
    if debug:
        # Explicit columns keep the PARENT/CHILD selection valid even when no
        # list was extracted (empty frame).
        df_debug = pd.DataFrame(list_child_data_dict, columns=['ID', 'PARENT', 'CHILD', 'DATA'])
        dict_all['meta-debug'] = df_debug
        dict_all['meta-group'] = df_debug[['PARENT', 'CHILD']].drop_duplicates()

    if drop_id_col:
        cols_to_drop = [c for c in df_final.columns if '_ID_DROP' in c]
        # Non-mutating drop: when nothing was merged, df_final is the same
        # object as dict_all['root'], which must keep its join columns.
        df_final = df_final.drop(columns=cols_to_drop)

    return df_final, dict_all

In [10]:
from copy import deepcopy

# Price Example
# Load the sample document from the workspace directory.
with open(os.path.join(dir_workspace,'data_part.json')) as f:
    json_data = json.load(f)

#instruc_set= 'Entities[].QuantLibCurveSurface.IssueCurveAndSurfaceCharacteristics..{Output : IssueCurveAndSurfaceCharacteristics}'

# Convert with the default options: no JMESPath filter, child tables merged,
# and the internal *_ID_DROP join columns dropped from the merged frame.
df_out,dict_all = json_to_dataframe(json_data)
# Bare last expression so the notebook renders the merged frame.
df_out

Unnamed: 0,PreferredInstrumentName,rel,link
0,JP3814000000,self,http://invm48.gsc.zz:8180/GSOService/gso?gsoNa...
1,JP3814000000,next,http://invm48.gsc.zz:8180/GSOService/gso?gsoNa...
2,JP3814000000,first,http://invm48.gsc.zz:8180/GSOService/gso?gsoNa...
3,JP3542400001,self,http://invm48.gsc.zz:8180/GSOService/gso?gsoNa...
4,JP3542400001,next,http://invm48.gsc.zz:8180/GSOService/gso?gsoNa...
...,...,...,...
295,JP3216200000,next,http://invm48.gsc.zz:8180/GSOService/gso?gsoNa...
296,JP3216200000,first,http://invm48.gsc.zz:8180/GSOService/gso?gsoNa...
297,JP3215800008,self,http://invm48.gsc.zz:8180/GSOService/gso?gsoNa...
298,JP3215800008,next,http://invm48.gsc.zz:8180/GSOService/gso?gsoNa...
