### <font color="brown">Working with CSV Datasets</font>

In [None]:
import csv

#### Example 3: Processing auto-mpg CSV file Using DictReader

In [None]:
# using DictReader on csv: every row is returned as a dict keyed by the header names
# Open the file in a `with` block so the handle is closed once the preview is done;
# newline='' is the mode the csv module recommends for files handed to a reader.
with open('auto_mpg_original.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for index,row in enumerate(reader):
        print(row)
        if index > 3:   # stop after the rows with index 0..4 have been printed
            break

**Print header column names, and all lines that have an NA for any of the fields**

In [None]:
# using fieldnames and values methods
# The `with` block closes the input file when the scan is finished.
with open('auto_mpg_original.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    print(reader.fieldnames)             # header as a Python list
    print(','.join(reader.fieldnames))   # header as one comma-separated line
    for row in reader:
        values = list(row.values())  # need a list: a dict view cannot be indexed
        if 'NA' in values:
            values[-1] = '"' + values[-1] + '"'   # wrap the last field in double quotes
            print(','.join(values))

**Write out a cleaned up version into a CSV file**

In [None]:
# input has a bunch of NAs for values; output drops every line with NA for any value.
# Both files are managed by the same `with`, so both are closed even if a row fails.
with open('auto_mpg_original.csv', newline='') as csvin, open('auto_mpg.csv','w') as csvfile:
    reader = csv.DictReader(csvin)
    csvfile.write(','.join(reader.fieldnames)+'\n')     # header line with field names
    for row in reader:
        values = list(row.values())
        if 'NA' in values:
            continue
        # NOTE(review): a plain join does not re-quote fields; if any value can
        # contain a comma the written line will have too many columns - confirm
        # against the data, or use csv.writer / csv.DictWriter.
        csvfile.write(','.join(values)+'\n')


**Alternatively, you can use a CSV DictWriter writer to write out**

In [None]:
# Copy every row without an 'NA' value from the original file to auto_mpg.csv.
# newline='' on the input is the mode the csv module recommends for reader files
# (the output was already opened that way).
with open('auto_mpg_original.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)

    with open('auto_mpg.csv','w',newline='') as csvout:
        # fieldnames is a required parameter for DictWriter
        # NOTE(review): delimiter='\t' writes tab-separated text into a file named
        # .csv - confirm this is intended, since readers must pass the same delimiter.
        writer = csv.DictWriter(csvout,fieldnames=reader.fieldnames, delimiter='\t')
        writer.writeheader()
        for row in reader:
            if 'NA' in row.values():
                continue
            writer.writerow(row)

---

### <font color="brown">Working with JSON (JavaScript Object Notation) Datasets</font>

In [None]:
import json

---

#### <font color="brown">Loading a JSON-formatted string into a JSON object</font>

In [None]:
# JSON text describing an object with two string-valued members
json1 = '{"hill center":"Busch", "AB":"College Ave"}'
# parse the text into a Python dict
dict1 = json.loads(json1)
# show the dict itself, then its keys view, then its values view
for view in (dict1, dict1.keys(), dict1.values()):
    print(view)

In [None]:
json1 = {"hill center":"Busch", "AB":"College Ave"}   # a Python dict, NOT a JSON string
# json.loads accepts only str, bytes or bytearray, so handing it a dict raises TypeError.
# The error is the point of this cell: catch it and print the message so that
# the rest of the notebook still runs from top to bottom.
try:
    dict1 = json.loads(json1)
    print(dict1)
except TypeError as err:
    print('TypeError:', err)

**Above doesn't work - the input must be a string, so need quotes around the whole thing**

In [None]:
json1 = '{2:"Busch", 1:"College Ave"}'   # keys 2 and 1 are bare numbers, not strings
# JSON object keys must be double-quoted strings, so the parser rejects this text.
# The error is the demonstration: catch it and print it instead of stopping the notebook.
try:
    dict1 = json.loads(json1)
    print(dict1)
except json.JSONDecodeError as err:
    print('JSONDecodeError:', err)

**Keys are required to be strings, so the numbers 2 and 1 as keys are rejected**

**But values are not required to be strings**

In [None]:
# Keys are double-quoted strings, while the values here are plain JSON numbers.
json1 = '{"John":12, "Jane":25}'   # but values need not be strings
# load this into Python: the JSON object becomes a dict
dict1 = json.loads(json1)
print(dict1)

In [None]:
# JSON object mixing string and numeric values
x =  '{ "name":"John", "age":30, "city":"New York"}'
y = json.loads(x)
# print the whole dict and then the single 'age' entry, one per line
print(y, y["age"], sep='\n')

**Key strings are required to be double-quoted**

In [None]:
x =  "{ 'name':'John', 'age':30, 'city':'New York'}"   # single quotes inside: not valid JSON
# JSON strings must use double quotes, so parsing fails with JSONDecodeError.
# Catch and print the error so the demonstration does not halt the notebook.
try:
    y = json.loads(x)
except json.JSONDecodeError as err:
    print('JSONDecodeError:', err)

**Above doesn't work, because JSON strings (keys and string values alike) are required to be double-quoted**

---

#### <font color="brown">Dumping a dictionary to JSON-formatted string</font>

In [None]:
# build the dictionary with keyword arguments, then serialize it to JSON text
dat_dict = dict(name='Jane', age=25, city='Chicago')
dat_str = json.dumps(dat_dict)
print(dat_str)

In [None]:
# a dictionary with integers for keys
# (valid for a Python dict; dict2 is serialized with json.dumps in the next cell)
dict2 = {2: 'busch', 1: 'college ave'}
print(dict2)

In [None]:
# dump to string: serialize dict2 into JSON text and show the result
dict2_str = json.dumps(dict2)
print(dict2_str)  

**<font color="red">1. When dumping, integer keys converted to strings, single-quoted strings are double-quoted</font>**

In [None]:
# parse the JSON text produced by json.dumps back into a Python dict
dict2_new = json.loads(dict2_str)
print(dict2_new)   

**<font color="red">2. So when loading back, dict keys change to strings, so dict2 is NOT the same as dict2_new</font>**

---

#### <font color="brown">Using arrays as values</font>

In [None]:
# a JSON array of integers becomes a Python list
json3 = '{"name": "Anika", "quiz_scores":[38,40,36,40,32]}'
dict3 = json.loads(json3)
scores_list = dict3['quiz_scores']
print(scores_list[2])   # third score in the list

In [None]:
# a JSON array whose items are objects -> a Python list of dicts
# (the literal is split across lines only for readability; the text is unchanged)
json4 = (
    '{"quiz_scores" : ['
    '{"name": "Anika", "scores": [38,40,36,40,32]}, '
    '{"name": "Amir", "scores":[36,38,40,30,34]}'
    ']}'
)
dict4 = json.loads(json4)
print(dict4)

In [None]:
# drill into the nested structure via the list stored under 'quiz_scores'
quiz_entries = dict4['quiz_scores']
print(quiz_entries[1]['name'])        # name of the second entry
print(quiz_entries[0]['scores'][3])   # 4th score of the first entry

---

#### <font color="brown">Storing JSON to file</font>

In [None]:
# serialize dict4 to JSON text and write that text to quiz_scores.json
with open ("quiz_scores.json","w") as qsfile:
    qsfile.write(json.dumps(dict4))

In [None]:
# read the whole file as text, then parse that text as JSON
with open("quiz_scores.json") as qsfile:
    qs_scores = json.loads(qsfile.read())

In [None]:
# show what was read back from quiz_scores.json
print(qs_scores)

---

#### <font color="brown">JSON with just a string (no dictionary)</font>

In [None]:
# string must be double-quoted
# (the outer single quotes delimit the Python literal; the inner double quotes
#  are part of the JSON text itself)
jsonstr = json.loads('"JSON - JavaScript Object Notation"')
jsonstr

---

#### <font color="brown">JSON with just an array</font>

In [None]:
# a top-level JSON array parses straight into a Python list
jsonarr = json.loads('[1,2,2,4]')
# show the list and its length on separate lines
print(jsonarr, len(jsonarr), sep='\n')

---

#### <font color="brown">JSON with just a number</font>

In [None]:
# a bare JSON number: without a fraction it parses to int, with one to float
jsonint = json.loads('25')
jsonreal = json.loads('25.3')
for parsed in (jsonint, jsonreal):
    print(type(parsed))

In [None]:
# '12.x' is neither a valid JSON number nor a quoted string, so parsing fails.
# Catch the error and print it so the demonstration does not stop the notebook.
try:
    json.loads('12.x')
except json.JSONDecodeError as err:
    print('JSONDecodeError:', err)

In [None]:
# wrapped in double quotes, 12.x is a JSON string and parses to a Python str
json.loads('"12.x"')   # this is a string

---

#### <font color="brown">JSON with just a boolean</font>

In [None]:
# JSON booleans are spelled in lowercase (true / false)
jsonbool = json.loads('true')
# show the parsed value and its Python type on separate lines
print(jsonbool, type(jsonbool), sep='\n')

---

#### <font color="brown">JSON with a null</font>

In [None]:
# JSON null parses to Python's None
jsonnull = json.loads('null')
# show the parsed value and its Python type on separate lines
print(jsonnull, type(jsonnull), sep='\n')

---

#### Exercise: ad hoc format converted storage to JSON

Suppose scores were in a file *qs_scores.txt*, like this:
    
Anika Sorenson|38,40,36,40,32<br>
Amir Sharif|36,38,40,30,34

We want to store this in JSON form so that it is standardized

In [None]:
# Read the input text file qs_scores.txt, one record per line in the form
#   name|score,score,...
# and build a dict mapping each name to a list of integer scores.
qs_dict = {}
with open('qs_scores.txt') as infile:   # `with` closes the file when the loop ends
    for line in infile:
        if not line.strip():            # skip blank lines (e.g. a trailing empty line)
            continue
        flds = line.split('|')
        scores = flds[1].strip().split(',')
        qs_scores = [int(qs) for qs in scores]
        qs_dict[flds[0].strip()] = qs_scores
print(qs_dict)

In [None]:
# serialize qs_dict to JSON text and store it in qs_scores.json
with open('qs_scores.json','w') as qsfile:
    qsfile.write(json.dumps(qs_dict))

# double-click the output file, will open in json interpretation mode
# right-click -> open with editor, can see plain text

---

#### <font color="brown">Getting JSON data from a Web page</font>

In [None]:
import requests

#### Example of reading public JSON dataset

Nobel Prizes - http://api.nobelprize.org/v1/prize.json

In [None]:
# Download the Nobel Prize dataset and parse the JSON response body.
nobel_url = 'http://api.nobelprize.org/v1/prize.json'   # stray leading space removed
resp = requests.get(nobel_url, timeout=30)   # timeout so a stalled connection cannot hang the notebook
resp.raise_for_status()                      # fail here on an HTTP error instead of parsing an error page
nobels = json.loads(resp.text)

**nobels is a dictionary with a single key, 'prizes'**

In [None]:
# list the top-level keys of the parsed response
print(nobels.keys())

**the value for 'prizes' is a list**

In [None]:
# number of entries stored under the 'prizes' key
len(nobels['prizes'])

**list is of length 658, one item per prize**

In [None]:
# inspect the first entry to see what a single prize record looks like
print(nobels['prizes'][0])

**each list item is a dictionary**

**<font color="brown">Get all prizes awarded in the year 2021</font>**

In [None]:
# collect every prize record whose 'year' field is the string '2021'
nobels_2021 = []
for prize in nobels['prizes']:
    if prize['year'] == '2021':
        nobels_2021.append(prize)
print(nobels_2021)

**This is TMI and quite hard to read, we want to write in a user-friendly format**

**We want:**<br>
    Chemistry: name1, name2 ...<br>
    Economics: name1, name2 ...


In [None]:
# one output line per prize: "Category: first last, first last, ..."
for prize in nobels_2021:
    label = prize['category'].capitalize()
    print(label + ': ',end='')
    names = []
    for winner in prize['laureates']:
        names.append(winner['firstname']+' '+winner['surname'])
    print(', '.join(names))

**<font color="brown">Get all prizes awarded in the year 2020</font>**

In [None]:
# Select the 2020 prizes and print "Category: first last, ..." for each one.
nobels_2020 = [prize for prize in nobels['prizes'] if prize['year'] == '2020']
# A laureate record may lack the 'surname' key, in which case winner['surname']
# raises KeyError. That failure is the point of this cell, so it is caught and
# reported instead of stopping the notebook.
try:
    for prize in nobels_2020:
        print(prize['category'].capitalize() + ': ',end='')
        winners = [winner['firstname']+' '+winner['surname'] for winner in prize['laureates']]
        print(', '.join(winners))
except KeyError as err:
    print()   # finish the partially printed "Category: " line
    print('KeyError: laureate record has no key', err)

**Surname missing in Peace prize, could be missing in other years as well<br>
Use dict get method with default return of empty string if key not found**

In [None]:
# Print "Category: name, name, ..." for each 2020 prize, tolerating laureate
# records that lack a 'firstname' or 'surname' key.
for prize in nobels_2020:
    print(prize['category'].capitalize() + ': ',end='')
    winners = []
    for winner in prize['laureates']:
        # keep only the name parts that are present, so a missing surname
        # does not leave a trailing space after the first name
        parts = (winner.get('firstname',''), winner.get('surname',''))
        winners.append(' '.join(part for part in parts if part))
    print(', '.join(winners))

---

#### <font color="brown">Basic JSON structure: https://www.json.org/json-en.html</font>
As the description says at the top, JSON is built on two structures (using the corresponding Python terminology): dictionaries (key-value pairs) and lists (arrays).