This notebook shows how to parse JSON data using pandas.
We will work through two examples:

<ol>
    <li>
        <b>Level_1 JSON</b><p>This is a basic JSON file that doesn't contain any nested lists/dictionaries and can be parsed directly with the pandas <b>read_json</b> function.</p>
    </li>
    <br />
    <li>
        <b>Multi-level JSON</b>
        <p>This file contains information in nested lists/dictionaries; the pandas <b>json_normalize</b> function can be used to normalize (flatten) this data.</p>
    </li>
</ol>

In [1]:
# Import required libraries
import json                       # for reading json
import pandas as pd               # for normalizing jsons                     

In [2]:
# Load the single-level JSON file straight into a DataFrame.
level1_json_data = pd.read_json(path_or_buf="level_1.json")

In [3]:
# The flat JSON needed no extra work: read_json already returned a regular table
# (one column per key), which the notebook renders below.
level1_json_data

Unnamed: 0,Scaler,family_min_samples_percentage,original_number_of_clusters,eps_value,min_samples,number_of_clusters,number_of_noise_samples,adjusted_rand_index,adjusted_mutual_info_score,homogeneity_score,completeness_score,v_measure_score,fowlkes_mallows_score,silhouette_coefficient,calinski_harabasz_score,davies_bouldin_score
0,Standard,5,4,0.1,5,9,72,0.001295,0.008963,0.330713,1.0,0.497046,0.028278,0.65281,10.808223,1.704494
1,Standard,5,4,0.1,10,6,89,0.001085,0.008123,0.294885,1.0,0.455461,0.02636,0.558529,12.528465,1.760859


In [4]:
# Persist the table as CSV (other tabular formats, such as Excel, work as well).
# index=False keeps the DataFrame's row index out of the written file.
level1_json_data.to_csv(
    "level1_normalized_data.csv",
    index=False,
)

In [5]:
# Try the same direct approach on the nested file to see what read_json gives us.
# The nested records are NOT flattened: each row's whole list ends up inside the
# single 'Results' column.
pd.read_json(path_or_buf="multiple_levels.json")

Unnamed: 0,Scaler,family_min_samples_percentage,original_number_of_clusters,Results
0,Standard,5,4,"[{'eps_value': 0.1, 'min_samples': 5, 'number_..."
1,MinMax,5,4,"[{'eps_value': 0.1, 'min_samples': 5, 'number_..."
2,Robust,5,4,"[{'eps_value': 0.1, 'min_samples': 5, 'number_..."
3,Standard,10,2,"[{'eps_value': 0.1, 'min_samples': 5, 'number_..."
4,MinMax,10,2,"[{'eps_value': 0.1, 'min_samples': 5, 'number_..."
5,Robust,10,2,"[{'eps_value': 0.1, 'min_samples': 5, 'number_..."


In [6]:
# As shown above, read_json alone does not flatten the nested file, so first parse it
# into plain Python objects (lists/dicts) with the json module.
# JSON text is UTF-8, so state the encoding explicitly instead of relying on the
# platform default, and let json.load read from the file handle directly.
with open('multiple_levels.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [7]:
# Inspect the parsed object: a list of records, where each record carries a nested
# 'Results' list of dicts (and each result has its own nested 'scores' dict).
data

[{'Scaler': 'Standard',
  'family_min_samples_percentage': 5,
  'original_number_of_clusters': 4,
  'Results': [{'eps_value': 0.1,
    'min_samples': 5,
    'number_of_clusters': 9,
    'number_of_noise_samples': 72,
    'scores': {'adjusted_rand_index': 0.0012946494377947854,
     'adjusted_mutual_info_score': 0.008962599716725805,
     'homogeneity_score': 0.3307127972087838,
     'completeness_score': 0.9999999999999999,
     'v_measure_score': 0.4970460912414239,
     'fowlkes_mallows_score': 0.028277808486895997,
     'silhouette_coefficient': 0.6528095225386412,
     'calinski_harabasz_score': 10.808222558523699,
     'davies_bouldin_score': 1.7044935031867752}},
   {'eps_value': 0.1,
    'min_samples': 10,
    'number_of_clusters': 6,
    'number_of_noise_samples': 89,
    'scores': {'adjusted_rand_index': 0.0010853141067694784,
     'adjusted_mutual_info_score': 0.008123099381390967,
     'homogeneity_score': 0.29488451142983285,
     'completeness_score': 0.9999999999999996,
 

In [8]:
# record_path is the path to the nested array we want to flatten.
# Passing it alone is already enough to expand every entry of 'Results' into its own
# row (nested dict keys such as 'scores' become dotted column names).
pd.json_normalize(
    data,
    record_path=["Results"],
)

Unnamed: 0,eps_value,min_samples,number_of_clusters,number_of_noise_samples,scores.adjusted_rand_index,scores.adjusted_mutual_info_score,scores.homogeneity_score,scores.completeness_score,scores.v_measure_score,scores.fowlkes_mallows_score,scores.silhouette_coefficient,scores.calinski_harabasz_score,scores.davies_bouldin_score
0,0.1,5,9,72,0.001295,0.0089626,0.330713,1.0,0.497046,0.028278,0.65281,10.808223,1.704494
1,0.1,10,6,89,0.001085,0.008123099,0.294885,1.0,0.455461,0.02636,0.558529,12.528465,1.760859
2,0.3,5,6,68,0.00081,0.007186917,0.267258,1.0,0.421789,0.023607,0.636762,17.124353,1.630219
3,0.3,10,5,73,0.000781,0.006988971,0.256727,1.0,0.408564,0.023294,0.613624,18.087761,1.68819
4,0.1,5,6,62,0.00083,0.007275885,0.270214,1.0,0.425462,0.023811,0.69512,51.723076,1.255363
5,0.1,10,5,68,0.000799,0.00707338,0.259552,1.0,0.412134,0.023487,0.665411,52.010288,1.313237
6,0.3,5,6,59,0.000842,0.007338106,0.272775,1.0,0.42863,0.023935,0.706925,56.322686,1.2472
7,0.3,10,5,65,0.000809,0.007121272,0.261215,1.0,0.414228,0.023596,0.676267,55.338439,1.306437
8,0.1,5,9,86,0.00116,0.008582246,0.319768,1.0,0.484583,0.027061,0.485934,19.080218,1.14782
9,0.1,10,6,102,0.000966,0.007744081,0.284051,1.0,0.44243,0.025203,0.528727,19.547596,1.209146


In [9]:
# The list is flattened, but each row has lost its parent-level information, e.g.
# which Scaler and original_number_of_clusters the result belongs to.
# The meta parameter fixes that: it names the parent fields to copy onto every
# record produced from record_path.
# Scroll to the right in the output to see the added fields.
pd.json_normalize(
    data,
    record_path=["Results"],
    meta=["original_number_of_clusters", "Scaler", "family_min_samples_percentage"],
)

Unnamed: 0,eps_value,min_samples,number_of_clusters,number_of_noise_samples,scores.adjusted_rand_index,scores.adjusted_mutual_info_score,scores.homogeneity_score,scores.completeness_score,scores.v_measure_score,scores.fowlkes_mallows_score,scores.silhouette_coefficient,scores.calinski_harabasz_score,scores.davies_bouldin_score,original_number_of_clusters,Scaler,family_min_samples_percentage
0,0.1,5,9,72,0.001295,0.0089626,0.330713,1.0,0.497046,0.028278,0.65281,10.808223,1.704494,4,Standard,5
1,0.1,10,6,89,0.001085,0.008123099,0.294885,1.0,0.455461,0.02636,0.558529,12.528465,1.760859,4,Standard,5
2,0.3,5,6,68,0.00081,0.007186917,0.267258,1.0,0.421789,0.023607,0.636762,17.124353,1.630219,4,Standard,5
3,0.3,10,5,73,0.000781,0.006988971,0.256727,1.0,0.408564,0.023294,0.613624,18.087761,1.68819,4,Standard,5
4,0.1,5,6,62,0.00083,0.007275885,0.270214,1.0,0.425462,0.023811,0.69512,51.723076,1.255363,4,MinMax,5
5,0.1,10,5,68,0.000799,0.00707338,0.259552,1.0,0.412134,0.023487,0.665411,52.010288,1.313237,4,MinMax,5
6,0.3,5,6,59,0.000842,0.007338106,0.272775,1.0,0.42863,0.023935,0.706925,56.322686,1.2472,4,MinMax,5
7,0.3,10,5,65,0.000809,0.007121272,0.261215,1.0,0.414228,0.023596,0.676267,55.338439,1.306437,4,MinMax,5
8,0.1,5,9,86,0.00116,0.008582246,0.319768,1.0,0.484583,0.027061,0.485934,19.080218,1.14782,4,Robust,5
9,0.1,10,6,102,0.000966,0.007744081,0.284051,1.0,0.44243,0.025203,0.528727,19.547596,1.209146,4,Robust,5


In [10]:
# Often we want a label in front of each flattened field name.
# For nested dictionaries json_normalize already prepends the parent key;
# for lists we have to ask for it via the *_prefix arguments:
#   - the meta fields are the config parameters of the problem -> meta_prefix
#   - the remaining fields relate to the DBSCAN algorithm      -> record_prefix
multiple_level_data = pd.json_normalize(
    data,
    record_path=["Results"],
    meta=["original_number_of_clusters", "Scaler", "family_min_samples_percentage"],
    record_prefix="dbscan_",
    meta_prefix="config_params_",
)

In [11]:
# Show the flattened frame; column names now carry the dbscan_ / config_params_ prefixes.
multiple_level_data

Unnamed: 0,dbscan_eps_value,dbscan_min_samples,dbscan_number_of_clusters,dbscan_number_of_noise_samples,dbscan_scores.adjusted_rand_index,dbscan_scores.adjusted_mutual_info_score,dbscan_scores.homogeneity_score,dbscan_scores.completeness_score,dbscan_scores.v_measure_score,dbscan_scores.fowlkes_mallows_score,dbscan_scores.silhouette_coefficient,dbscan_scores.calinski_harabasz_score,dbscan_scores.davies_bouldin_score,config_params_original_number_of_clusters,config_params_Scaler,config_params_family_min_samples_percentage
0,0.1,5,9,72,0.001295,0.0089626,0.330713,1.0,0.497046,0.028278,0.65281,10.808223,1.704494,4,Standard,5
1,0.1,10,6,89,0.001085,0.008123099,0.294885,1.0,0.455461,0.02636,0.558529,12.528465,1.760859,4,Standard,5
2,0.3,5,6,68,0.00081,0.007186917,0.267258,1.0,0.421789,0.023607,0.636762,17.124353,1.630219,4,Standard,5
3,0.3,10,5,73,0.000781,0.006988971,0.256727,1.0,0.408564,0.023294,0.613624,18.087761,1.68819,4,Standard,5
4,0.1,5,6,62,0.00083,0.007275885,0.270214,1.0,0.425462,0.023811,0.69512,51.723076,1.255363,4,MinMax,5
5,0.1,10,5,68,0.000799,0.00707338,0.259552,1.0,0.412134,0.023487,0.665411,52.010288,1.313237,4,MinMax,5
6,0.3,5,6,59,0.000842,0.007338106,0.272775,1.0,0.42863,0.023935,0.706925,56.322686,1.2472,4,MinMax,5
7,0.3,10,5,65,0.000809,0.007121272,0.261215,1.0,0.414228,0.023596,0.676267,55.338439,1.306437,4,MinMax,5
8,0.1,5,9,86,0.00116,0.008582246,0.319768,1.0,0.484583,0.027061,0.485934,19.080218,1.14782,4,Robust,5
9,0.1,10,6,102,0.000966,0.007744081,0.284051,1.0,0.44243,0.025203,0.528727,19.547596,1.209146,4,Robust,5


In [12]:
# Persist the flattened table as CSV (other tabular formats, such as Excel, work as well).
# index=False keeps the DataFrame's row index out of the written file.
multiple_level_data.to_csv(
    "multiplelevel_normalized_data.csv",
    index=False,
)