In [1]:
import pandas as pd
import numpy as np

In [2]:
# Remote location of the simulated example dataset, served as raw JSON from GitHub.
url: str = "https://raw.githubusercontent.com/chrisalbon/simulated_datasets/master/data.json"

### How to load JSON from a URL

In [3]:
# The URL is handed straight to pd.read_json, so fetching and parsing happen in one call.
first_json = pd.read_json(path_or_buf=url)
# Show the first five rows as the cell's output.
first_json.head(n=5)

Unnamed: 0,integer,datetime,category
0,5,2015-01-01 00:00:00,0
1,5,2015-01-01 00:00:01,0
2,9,2015-01-01 00:00:02,0
3,6,2015-01-01 00:00:03,0
4,6,2015-01-01 00:00:04,0


Writing JSON data is as simple as reading it and takes one line of code: instead of read_json(), call to_json() with a filename, and that's all! The `orient` argument controls how the data is laid out in the file.

In [4]:
# Write the same frame twice, once per JSON layout, so the two files can be compared.
# Each pair is (output filename, `orient` value passed to to_json()).
for json_path, json_orient in (
    ("json_columns.json", "columns"),
    ("json_index.json", "index"),
):
    first_json.to_json(json_path, orient=json_orient)

read_json() and to_json() work only with simple (flat) JSON. All arrays inside the document need to have the same length.

In [7]:
# Demonstration of the failure mode: pd.read_json cannot parse the nested
# document in nested.json and raises
# "ValueError: All arrays must be of the same length".
# The expected ValueError is caught and printed so the message is still shown,
# but an uncaught exception no longer stops "Restart & Run All" at this cell.
try:
    df = pd.read_json("nested.json")
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: All arrays must be of the same length

In [8]:
import json  # standard-library JSON parser, used here because pd.read_json fails on nested.json

# Load the nested document into plain Python objects.
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding, which is not UTF-8 everywhere and would
# mis-decode any non-ASCII text in the JSON file.
with open("nested.json", encoding="utf-8") as f:
    nested_json = json.load(f)

print(nested_json)
print(type(nested_json))  # a plain dict, not a DataFrame

{'article': [{'id': '01', 'language': 'JSON', 'edition': 'first', 'author': 'Allen'}, {'id': '02', 'language': 'Python', 'edition': 'second', 'author': 'Aditya Sharma'}], 'blog': [{'name': 'Datacamp', 'URL': 'datacamp.com'}]}
<class 'dict'>


In [24]:
from pandas import json_normalize  # flattens nested dicts/lists into a DataFrame

# Called without record_path, the top-level keys become the columns and the
# lists stored under them are left unexpanded inside single cells.
json_normalize(data=nested_json)

Unnamed: 0,article,blog
0,"[{'id': '01', 'language': 'JSON', 'edition': '...","[{'name': 'Datacamp', 'URL': 'datacamp.com'}]"


In [16]:
from pprint import pprint  # pretty-printer for nested containers

# The same dictionary is shown twice: plain print() first, then pprint() for comparison.
print("---print---", nested_json, sep="\n")
print("----pprint----")
pprint(nested_json)  # nested levels are broken across lines and indented

---print---
{'article': [{'id': '01', 'language': 'JSON', 'edition': 'first', 'author': 'Allen'}, {'id': '02', 'language': 'Python', 'edition': 'second', 'author': 'Aditya Sharma'}], 'blog': [{'name': 'Datacamp', 'URL': 'datacamp.com'}]}
----pprint----
{'article': [{'author': 'Allen',
              'edition': 'first',
              'id': '01',
              'language': 'JSON'},
             {'author': 'Aditya Sharma',
              'edition': 'second',
              'id': '02',
              'language': 'Python'}],
 'blog': [{'URL': 'datacamp.com', 'name': 'Datacamp'}]}


In [25]:
# record_path selects the list to expand: each dict under "blog" becomes one row.
blog = json_normalize(data=nested_json, record_path="blog")
blog.head(n=5)

Unnamed: 0,name,URL
0,Datacamp,datacamp.com


In [26]:
# Same expansion for the "article" list: one row per article dict, one column per key.
article = json_normalize(data=nested_json, record_path="article")
article.head(n=5)

Unnamed: 0,id,language,edition,author
0,1,JSON,first,Allen
1,2,Python,second,Aditya Sharma


In [27]:
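# The three json_normalize() parameters used in this notebook:
# data - the input data (a dict or a list of dicts)
# record_path - path to the nested list whose elements become the rows
# meta - fields to carry over unchanged as extra columns on every row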

In [28]:
# Sample records as plain Python objects (a list of dicts, not a JSON string).
# Each state record carries a nested "info" dict and a "counties" list of dicts.
data = [
    {
        "state": "Florida",
        "shortname": "FL",
        "info": {"governor": "Rick Scott"},
        "counties": [
            {"name": "Dade", "population": 12345},
            {"name": "Broward", "population": 40000},
            {"name": "Palm Beach", "population": 60000},
        ],
    },
    {
        "state": "Ohio",
        "shortname": "OH",
        "info": {"governor": "John Kasich"},
        "counties": [
            {"name": "Summit", "population": 1234},
            {"name": "Cuyahoga", "population": 1337},
        ],
    },
]

### The most important way to flatten nested JSON!

In [29]:
# Expand the nested "counties" lists into one row per county, and repeat the
# parent record's fields (state, shortname, info -> governor) on every row
# through `meta`. A nested meta field is given as a path list, and its column
# is named with the path joined by "." (info.governor).
# This is the only call in the cell: a notebook displays just the last
# expression, so any earlier bare json_normalize(...) result would be discarded.
json_normalize(
    data=data,
    record_path="counties",
    meta=["state", "shortname", ["info", "governor"]],
)

Unnamed: 0,name,population,state,shortname,info.governor
0,Dade,12345,Florida,FL,Rick Scott
1,Broward,40000,Florida,FL,Rick Scott
2,Palm Beach,60000,Florida,FL,Rick Scott
3,Summit,1234,Ohio,OH,John Kasich
4,Cuyahoga,1337,Ohio,OH,John Kasich
