# Read a json file

JSON (JavaScript Object Notation) is "the one with the curly brackets".
Many programs - and especially APIs - output JSON-formatted data.

Good articles on json include https://realpython.com/python-json/

Python libraries for working with JSON, and the relevant functions in each:
* Library `json`: functions `load`, `loads`
* Library `pandas`: functions `read_json`, `json_normalize`

In [1]:
# load in the libraries you need, and point at your json file
import json
import pandas as pd

# Base path of the response file, without extension; the extension is
# appended separately so the same root can name other output files too
jsonroot = 'data/smat_outputs/response_1634667265142'
jsonfile = f"{jsonroot}.json"

In [2]:
# Read the file with the standard-library json module.
# JSON is UTF-8 by specification, and this data contains non-ASCII
# punctuation, so pass the encoding explicitly: without it, open() falls
# back to the platform's locale encoding and the text may decode wrongly
# (or raise) on systems whose default is not UTF-8.
with open(jsonfile, "r", encoding="utf-8") as read_file:
    jsondata = json.load(read_file)

# Last expression in the cell: display the parsed (nested dict) structure
jsondata

{'created_key': 'now',
 'content_key': 'htmlparsedcom',
 'took': 1108,
 'timed_out': False,
 '_shards': {'total': 2, 'successful': 2, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 9, 'relation': 'eq'},
  'max_score': None,
  'hits': [{'_index': 'smat-4chan-data-000002',
    '_type': '_doc',
    '_id': 'news_941273',
    '_score': None,
    '_ignored': ['htmlparsedcom.keyword', 'com.keyword'],
    '_source': {'board': 'news',
     'chan': 1,
     'id': 560000941273,
     'urls': [],
     'v': 1.7,
     'com': 'Investigating the attack — which resulted in injuries to more than 150 police officers — is a key priority for Congressional Democrats.<br><br>Trump’s move could also create a dilemma for the Biden administration. Most presidents are loath to weaken executive privilege, the vaguely defined right for presidents to have conversations with their advisers without worrying those conversations may someday become public.<br><br>The Constitution does not mention the concept, bu

In [3]:
# The records live in the list at ['hits']['hits']; report how many there are
hit_count = len(jsondata['hits']['hits'])
print("{} data entries in here".format(hit_count))

9 data entries in here


In [4]:
# Load the same file directly with pandas.
# Notice how this gets the top-level items, not the data rows
df = pd.read_json(path_or_buf=jsonfile)
df

Unnamed: 0,created_key,content_key,took,timed_out,_shards,hits
total,now,htmlparsedcom,1108,False,2.0,"{'value': 9, 'relation': 'eq'}"
successful,now,htmlparsedcom,1108,False,2.0,
skipped,now,htmlparsedcom,1108,False,0.0,
failed,now,htmlparsedcom,1108,False,0.0,
max_score,now,htmlparsedcom,1108,False,,
hits,now,htmlparsedcom,1108,False,,"[{'_index': 'smat-4chan-data-000002', '_type':..."


In [5]:
# json_normalize to the rescue: take the dict parsed with the json library
# and flatten only the list of records under ['hits']['hits'] into a table
# (nested '_source' fields become dotted column names, e.g. '_source.board')
dfdata = pd.json_normalize(data=jsondata["hits"]["hits"])
dfdata

Unnamed: 0,_index,_type,_id,_score,_ignored,sort,_source.board,_source.chan,_source.id,_source.urls,...,_source.filename,_source.fsize,_source.h,_source.mimg,_source.md5,_source.tim,_source.tnh,_source.tnw,_source.w,_source.trip
0,smat-4chan-data-000002,_doc,news_941273,,"[htmlparsedcom.keyword, com.keyword]",[1633631757000],news,1,560000941273,[],...,,,,,,,,,,
1,smat-4chan-data-000002,_doc,tv_157006540,,,[1633002982000],tv,1,670000157006540,[],...,,,,,,,,,,
2,smat-4chan-data-000001,_doc,sp_113574173,,,[1632556011000],sp,1,630000113574173,[],...,,,,,,,,,,
3,smat-4chan-data-000001,_doc,pol_340727489,,,[1632512927000],pol,1,agF8ZtES,[https://i.4cdn.org/pol/1632527327341.jpg],...,1632527319122,113267.0,1280.0,1.0,iiW4jL9s7LJhBJmwjYgttQ==,1632527000000.0,125.0,57.0,591.0,
4,smat-4chan-data-000001,_doc,pol_339658457,,"[htmlparsedcom.keyword, com.keyword]",[1631938886000],pol,1,xs/Q8Mvw,[],...,,,,,,,,,,
5,smat-4chan-data-000001,_doc,b_863119780,,,[1631677847000],b,1,20000863119780,[https://i.4cdn.org/b/1631692247962.jpg],...,jenner-intro-1526047822,50148.0,439.0,,AuOIU+6pavTKhKPXbej+8g==,1631692000000.0,70.0,125.0,780.0,
6,smat-4chan-data-000001,_doc,pol_339135260,,"[htmlparsedcom.keyword, com.keyword]",[1631641278000],pol,1,YNF6pBIC,[],...,,,,,,,,,,!!T2UdrWkLSWB
7,smat-4chan-data-000001,_doc,int_151673243,,,[1631397058000],int,1,500000151673243,[],...,,,,,,,,,,
8,smat-4chan-data-000001,_doc,news_923880,,[com.keyword],[1630876042000],news,1,560000923880,[],...,,,,,,,,,,


In [6]:
# Save the flattened table next to the source JSON (same base name, .csv
# extension) so it can be used elsewhere; index=False leaves out the row index
dfdata.to_csv(path_or_buf=jsonroot + '.csv', index=False)