# Chapter 6 - Data Loading, Storage, and File Formats

## 6.1 Reading and Writing Data in Text Format

In [1]:
import sys
import json

import pandas as pd
import numpy as np
import xlrd

### `pd.read_csv()`

1a. Vanilla `pd.read_csv()` with no additional specified parameters

1b. `read_csv()` with a specified index column, using `index_col`

1c. `read_csv()` with self-defined headers, using `names`

1d. `read_csv()` with different delimiters, using `sep`


In [2]:
# Vanilla pd.read_csv(): only the file path is given, so every other
# parameter keeps its default value.
df = pd.read_csv('dataset-D-wines.csv')
# Preview the first 3 rows, then print (n_rows, n_columns).
display(df.head(3))
print(df.shape)

Unnamed: 0,id,variety,points,price
0,99759,Chardonnay,88,36.0
1,123471,Pinot Noir,85,50.0
2,81339,Ugni Blanc-Colombard,85,10.0


(20, 4)


In [3]:
# read_csv() with one column used as the index: index_col names the
# column ('year') that becomes the row index of the resulting df.
df2 = pd.read_csv('dataset-B-membership.csv', index_col=['year'])
display(df2.head(3))

Unnamed: 0_level_0,membership
year,Unnamed: 1_level_1
2009,526089
2010,549878
2011,588014


In [4]:
# read_csv() with specified column names. Note that the dataset
# does not have headers, so `names` supplies the column labels and the
# first line of the file is read as data.
df3 = pd.read_csv('dataset-A2-loans.csv', 
                  names = ['id', 'loan_amnt', 'int_rate', 'term', 'grade'])
display(df3.head(3))

Unnamed: 0,id,loan_amnt,int_rate,term,grade
0,48304290,30000.0,8.18,36 months,B
1,49904421,14225.0,13.33,60 months,C
2,49904421,12000.0,20.2,60 months,E


In [5]:
# read_csv() with a different separator: sep='\t' reads a file whose
# fields are delimited by tabs instead of commas.
df4 = pd.read_csv('dataset-D2-wines.csv', sep='\t')
display(df4.head(3))

Unnamed: 0,id,variety,points,price
0,99759,Chardonnay,88,36.0
1,123471,Pinot Noir,85,50.0
2,81339,Ugni Blanc-Colombard,85,10.0


### `df.to_csv()`

In [6]:
# Use df.to_csv() to write data to a CSV file. Passing sys.stdout instead of a
# file path prints the CSV text to the console for demonstration purposes.
# index=False leaves the row index out of the output, so only the columns are written.
df[:5].to_csv(sys.stdout, index=False)
# To write to a file instead:
# df[:5].to_csv('file.csv', index=False)

id,variety,points,price
99759,Chardonnay,88,36.0
123471,Pinot Noir,85,50.0
81339,Ugni Blanc-Colombard,85,10.0
72999,Sauvignon Blanc,85,14.0
99586,Cabernet Sauvignon,92,65.0


<hr>

### JSON Data

1. Read from a `.json` file to a Python `dict()`

2. Convert the labels of interest to a `df`

In [7]:
# Use json.load(file) to read from a JSON file to a dict.
# The encoding is stated explicitly: JSON text is UTF-8, whereas open()'s
# default encoding depends on the platform locale.
with open('dataset-E-PSI.json', 'r', encoding='utf-8') as f:
    psi_json = json.load(f)

In [8]:
# Consider the following:
# psi_json['region_metadata'] is a list with one dict per region, each holding
# the region name and the lat and long coordinates of its label location.
# Note: region_d refers to the same list object stored inside psi_json (no copy is made).
region_d = psi_json['region_metadata']
display(region_d)

[{'name': 'west', 'label_location': {'latitude': 1.35735, 'longitude': 103.7}},
 {'name': 'national', 'label_location': {'latitude': 0, 'longitude': 0}},
 {'name': 'east',
  'label_location': {'latitude': 1.35735, 'longitude': 103.94}},
 {'name': 'central',
  'label_location': {'latitude': 1.35735, 'longitude': 103.82}},
 {'name': 'south',
  'label_location': {'latitude': 1.29587, 'longitude': 103.82}},
 {'name': 'north',
  'label_location': {'latitude': 1.41803, 'longitude': 103.82}}]

In [9]:
# To convert the above to a df, first lift latitude and longitude out of the
# nested 'label_location' dict so that every region becomes one flat record.
# A new list is built from psi_json rather than editing the parsed JSON in
# place, so this cell can be re-run safely (deleting 'label_location' in place
# would raise a KeyError on the second run).
region_d = []
for region in psi_json['region_metadata']:
    # Keep every top-level field except the nested coordinates dict.
    flat_region = {key: value for key, value in region.items() if key != 'label_location'}
    flat_region['latitude'] = region['label_location']['latitude']
    flat_region['longitude'] = region['label_location']['longitude']
    region_d.append(flat_region)
display(region_d)

[{'name': 'west', 'latitude': 1.35735, 'longitude': 103.7},
 {'name': 'national', 'latitude': 0, 'longitude': 0},
 {'name': 'east', 'latitude': 1.35735, 'longitude': 103.94},
 {'name': 'central', 'latitude': 1.35735, 'longitude': 103.82},
 {'name': 'south', 'latitude': 1.29587, 'longitude': 103.82},
 {'name': 'north', 'latitude': 1.41803, 'longitude': 103.82}]

In [10]:
# Then, simply use pd.DataFrame() to convert the list of flat dicts to a df;
# `columns` selects the keys to keep and fixes their order.
location_df = pd.DataFrame(region_d, columns=['name', 'latitude', 'longitude'])
display(location_df)
# Notice that central, south and north share the same longitude while
# east, central and west share the same latitude. Longitude therefore gives the
# east-west position (the x-axis on a map: west < central < east), while latitude
# gives the north-south position (the y-axis: south < central < north).

Unnamed: 0,name,latitude,longitude
0,west,1.35735,103.7
1,national,0.0,0.0
2,east,1.35735,103.94
3,central,1.35735,103.82
4,south,1.29587,103.82
5,north,1.41803,103.82


In [11]:
# Similarly, consider the readings of the first entry in psi_json['items'].
# It is a nested dict (reading name -> region -> value) that needs to be
# converted into a df.
readings_dict = psi_json['items'][0]['readings']
display(readings_dict)

{'o3_sub_index': {'west': 7,
  'national': 10,
  'east': 8,
  'central': 10,
  'south': 4,
  'north': 7},
 'pm10_twenty_four_hourly': {'west': 22,
  'national': 27,
  'east': 27,
  'central': 18,
  'south': 24,
  'north': 18},
 'pm10_sub_index': {'west': 22,
  'national': 27,
  'east': 27,
  'central': 18,
  'south': 24,
  'north': 18},
 'co_sub_index': {'west': 5,
  'national': 5,
  'east': 4,
  'central': 3,
  'south': 4,
  'north': 4},
 'pm25_twenty_four_hourly': {'west': 10,
  'national': 13,
  'east': 13,
  'central': 8,
  'south': 9,
  'north': 9},
 'so2_sub_index': {'west': 11,
  'national': 11,
  'east': 6,
  'central': 6,
  'south': 8,
  'north': 4},
 'co_eight_hour_max': {'west': 0.45,
  'national': 0.45,
  'east': 0.39,
  'central': 0.32,
  'south': 0.42,
  'north': 0.44},
 'no2_one_hour_max': {'west': 29,
  'national': 29,
  'east': 6,
  'central': 9,
  'south': 14,
  'north': 17},
 'so2_twenty_four_hourly': {'west': 17,
  'national': 17,
  'east': 9,
  'central': 9,
  'sou

In [12]:
# First, flatten the nested readings into a list of records: one dict per
# reading type, holding the reading's name plus its value for every region.
readings_list = [
    {'reading_name': reading_name, **regional_values}
    for reading_name, regional_values in readings_dict.items()
]

In [13]:
# Build the df from the list of records; `columns` fixes the column order
# (reading name first, then one column per region).
# NOTE(review): the values display as floats, presumably because every region
# column contains the fractional co_eight_hour_max reading - confirm if integer dtypes matter.
readings_df = pd.DataFrame(readings_list, 
                           columns=['reading_name', 'south','north', 'central', 'west', 'east', 'national'])
display(readings_df)

Unnamed: 0,reading_name,south,north,central,west,east,national
0,o3_sub_index,4.0,7.0,10.0,7.0,8.0,10.0
1,pm10_twenty_four_hourly,24.0,18.0,18.0,22.0,27.0,27.0
2,pm10_sub_index,24.0,18.0,18.0,22.0,27.0,27.0
3,co_sub_index,4.0,4.0,3.0,5.0,4.0,5.0
4,pm25_twenty_four_hourly,9.0,9.0,8.0,10.0,13.0,13.0
5,so2_sub_index,8.0,4.0,6.0,11.0,6.0,11.0
6,co_eight_hour_max,0.42,0.44,0.32,0.45,0.39,0.45
7,no2_one_hour_max,14.0,17.0,9.0,29.0,6.0,29.0
8,so2_twenty_four_hourly,13.0,7.0,9.0,17.0,9.0,17.0
9,pm25_sub_index,39.0,37.0,34.0,41.0,52.0,52.0


<hr>

## 6.2 Binary Data Formats

### Reading Microsoft Excel Files

In [14]:
# To read from MS Excel files, use pd.ExcelFile(). This is very useful
# if there are multiple sheets to read from the same workbook.
# The with-block closes the workbook once the sheet has been read.
with pd.ExcelFile('dataset-F-income.xlsx') as xlsx:
    # Here, state the sheet title to read from the Excel file to a df.
    # skiprows and skipfooter are added due to the nature of the raw Excel file:
    # skip the first 5 rows and drop the last 20 rows of the sheet.
    # (skipfooter replaces the deprecated skip_footer spelling, which newer pandas rejects.)
    df_y = pd.read_excel(xlsx, sheet_name='Title', skiprows=5, skipfooter=20)
df_y.tail(3)

Unnamed: 0,Year,HDB 1- & 2- Room Flats 1/,HDB 3-Room Flats,HDB 4-Room Flats,HDB 5-Room & Executive Flats,Condominiums & Other Apartments,Landed Properties
16,2016,2418,5655,7971,10947,18797,24231
17,2017,2435,5730,8209,11226,19072,25327
18,2018,2460,5767,8248,11392,19116,25724


In [15]:
# Alternatively, if only one sheet is needed, just use pd.read_excel(filename, sheet_name) directly.
# skiprows=5 skips the first 5 rows and skipfooter=20 drops the last 20 rows of the sheet
# (skipfooter replaces the deprecated skip_footer spelling, which newer pandas rejects).
df_y2 = pd.read_excel('dataset-F-income.xlsx', sheet_name='Title', skiprows=5, skipfooter=20)
display(df_y2.head(2))
display(df_y2.tail(2))

Unnamed: 0,Year,HDB 1- & 2- Room Flats 1/,HDB 3-Room Flats,HDB 4-Room Flats,HDB 5-Room & Executive Flats,Condominiums & Other Apartments,Landed Properties
0,2000,1762,3369,4252,6279,11431,14398
1,2001,1521,3432,4359,6493,12280,14113


Unnamed: 0,Year,HDB 1- & 2- Room Flats 1/,HDB 3-Room Flats,HDB 4-Room Flats,HDB 5-Room & Executive Flats,Condominiums & Other Apartments,Landed Properties
17,2017,2435,5730,8209,11226,19072,25327
18,2018,2460,5767,8248,11392,19116,25724


<hr>

## 6.3 Interacting with Web APIs

1. Given some endpoint called `address`, get the response from the API endpoint using `requests.get(address)` 

2. Given `resp = requests.get(address)`, use `resp.text` (an attribute, not a method) to get the text representation of the response.

In [16]:
# Code snippet to call PSI API from data.gov.sg
import requests

# The timeout stops the cell from hanging indefinitely if the API is unreachable,
# and raise_for_status() turns an HTTP error response (4xx/5xx) into an exception
# instead of attempting to parse an error page as JSON.
psi = requests.get("https://api.data.gov.sg/v1/environment/psi", timeout=30)
psi.raise_for_status()
j = psi.text             # response body as a string
j_dict = json.loads(j)   # parse the JSON text into a dict
print(j_dict)

# Use the lines below to write to a JSON file
# with open('dataset-X.json', 'w') as f:
#     json.dump(j_dict, f)

{'region_metadata': [{'name': 'west', 'label_location': {'latitude': 1.35735, 'longitude': 103.7}}, {'name': 'national', 'label_location': {'latitude': 0, 'longitude': 0}}, {'name': 'east', 'label_location': {'latitude': 1.35735, 'longitude': 103.94}}, {'name': 'central', 'label_location': {'latitude': 1.35735, 'longitude': 103.82}}, {'name': 'south', 'label_location': {'latitude': 1.29587, 'longitude': 103.82}}, {'name': 'north', 'label_location': {'latitude': 1.41803, 'longitude': 103.82}}], 'items': [{'timestamp': '2019-07-06T21:00:00+08:00', 'update_timestamp': '2019-07-06T21:03:52+08:00', 'readings': {'o3_sub_index': {'west': 9, 'national': 12, 'east': 8, 'central': 12, 'south': 4, 'north': 10}, 'pm10_twenty_four_hourly': {'west': 20, 'national': 27, 'east': 27, 'central': 17, 'south': 23, 'north': 19}, 'pm10_sub_index': {'west': 20, 'national': 27, 'east': 27, 'central': 17, 'south': 23, 'north': 19}, 'co_sub_index': {'west': 5, 'national': 5, 'east': 5, 'central': 4, 'south': 5,

**References:**

Python for Data Analysis, 2nd Edition, McKinney (2017)