### Chapter 18: Data I/O
* data classes:
    * structured vs unstructured
    * categorical (finite set) vs ordinal (ordered) vs numerical (continuous/discrete)
* should consider: [Blaze](http://blaze.pydata.org/en/latest) for high-level, multi-format API for data I/O

## Imports

In [1]:
from __future__ import print_function

In [2]:
import numpy as np
np.random.seed(0)

In [3]:
import pandas as pd

In [4]:
import csv
import json
import h5py
import tables
import pickle

# python3: import _pickle as cPickle
import _pickle as cPickle

# conda install msgpack-python
import msgpack

# CSV

In [5]:
%%writefile ch18-playerstats-2013-2014.csv
# 2013-2014 / Regular Season / All Skaters / Summary / Points
Rank,Player,Team,Pos,GP,G,A,P,+/-,PIM,PPG,PPP,SHG,SHP,GW,OT,S,S%,TOI/GP,Shift/GP,FO%
1,Sidney Crosby,PIT,C,80,36,68,104,+18,46,11,38,0,0,5,1,259,13.9,21:58,24.0,52.5
2,Ryan Getzlaf,ANA,C,77,31,56,87,+28,31,5,23,0,0,7,1,204,15.2,21:17,25.2,49.0
3,Claude Giroux,PHI,C,82,28,58,86,+7,46,7,37,0,0,7,1,223,12.6,20:26,25.1,52.9
4,Tyler Seguin,DAL,C,80,37,47,84,+16,18,11,25,0,0,8,0,294,12.6,19:20,23.4,41.5
5,Corey Perry,ANA,R,81,43,39,82,+32,65,8,18,0,0,9,1,280,15.4,19:28,23.2,36.0

Overwriting ch18-playerstats-2013-2014.csv


In [6]:
%%writefile ch18-playerstats-2013-2014-top30.csv
# 2013-2014 / Regular Season / All Skaters / Summary / Points
Rank,Player,Team,Pos,GP,G,A,P,+/-,PIM,PPG,PPP,SHG,SHP,GW,OT,S,S%,TOI/GP,Shift/GP,FO%
1,Sidney Crosby,PIT,C,80,36,68,104,+18,46,11,38,0,0,5,1,259,13.9,21:58,24.0,52.5
2,Ryan Getzlaf,ANA,C,77,31,56,87,+28,31,5,23,0,0,7,1,204,15.2,21:17,25.2,49.0
3,Claude Giroux,PHI,C,82,28,58,86,+7,46,7,37,0,0,7,1,223,12.6,20:26,25.1,52.9
4,Tyler Seguin,DAL,C,80,37,47,84,+16,18,11,25,0,0,8,0,294,12.6,19:20,23.4,41.5
5,Corey Perry,ANA,R,81,43,39,82,+32,65,8,18,0,0,9,1,280,15.4,19:28,23.2,36.0
6,Phil Kessel,TOR,R,82,37,43,80,-5,27,8,20,0,0,6,0,305,12.1,20:39,24.5,14.3
7,Taylor Hall,EDM,L,75,27,53,80,-15,44,7,17,0,1,1,1,250,10.8,20:00,25.4,45.7
8,Alex Ovechkin,WSH,L,78,51,28,79,-35,48,24,39,0,1,10,3,386,13.2,20:32,21.8,66.7
9,Joe Pavelski,SJS,C,82,41,38,79,+23,32,16,31,1,2,3,0,225,18.2,19:51,27.1,56.0
10,Jamie Benn,DAL,L,81,34,45,79,+21,64,5,19,1,3,3,1,279,12.2,19:09,25.0,52.8
11,Nicklas Backstrom,WSH,C,82,18,61,79,-20,54,6,44,1,1,1,0,196,9.2,19:48,23.3,50.4
12,Patrick Sharp,CHI,L,82,34,44,78,+13,40,10,25,0,0,3,1,313,10.9,18:53,22.7,54.6
13,Joe Thornton,SJS,C,82,11,65,76,+20,32,2,19,0,1,3,1,122,9.0,18:55,26.3,56.1
14,Erik Karlsson,OTT,D,82,20,54,74,-15,36,5,31,0,0,1,0,257,7.8,27:04,28.6,0.0
15,Evgeni Malkin,PIT,C,60,23,49,72,+10,62,7,30,0,0,3,0,191,12.0,20:03,21.4,48.8
16,Patrick Marleau,SJS,L,82,33,37,70,+0,18,11,23,2,2,4,0,285,11.6,20:31,27.3,52.9
17,Anze Kopitar,LAK,C,82,29,41,70,+34,24,10,23,0,0,9,2,200,14.5,20:53,25.4,53.3
18,Matt Duchene,COL,C,71,23,47,70,+8,19,5,17,0,0,6,1,217,10.6,18:29,22.0,50.3
19,Martin St. Louis,"TBL, NYR",R,81,30,39,69,+13,10,9,21,1,2,5,1,204,14.7,20:56,25.7,40.7
20,Patrick Kane,CHI,R,69,29,40,69,+7,22,10,25,0,0,6,0,227,12.8,19:36,22.9,50.0
21,Blake Wheeler,WPG,R,82,28,41,69,+4,63,8,19,0,0,4,2,225,12.4,18:41,24.0,37.5
22,Kyle Okposo,NYI,R,71,27,42,69,-9,51,5,15,0,0,4,1,195,13.8,20:26,22.2,47.5
23,David Krejci,BOS,C,80,19,50,69,+39,28,3,19,0,0,6,1,169,11.2,19:07,21.3,51.2
24,Chris Kunitz,PIT,L,78,35,33,68,+25,66,13,22,0,0,8,0,218,16.1,19:09,22.2,75.0
25,Jonathan Toews,CHI,C,76,28,40,68,+26,34,5,15,3,5,5,0,193,14.5,20:28,25.9,57.2
26,Thomas Vanek,"BUF, NYI, MTL",L,78,27,41,68,+7,46,8,18,0,0,4,0,248,10.9,19:21,21.6,43.5
27,Jaromir Jagr,NJD,R,82,24,43,67,+16,46,5,17,0,0,6,1,231,10.4,19:09,22.8,0.0
28,John Tavares,NYI,C,59,24,42,66,-6,40,8,25,0,0,4,0,188,12.8,21:14,22.3,49.1
29,Jason Spezza,OTT,C,75,23,43,66,-26,46,9,22,0,0,5,0,223,10.3,18:12,23.8,54.0
30,Jordan Eberle,EDM,R,80,28,37,65,-11,18,7,20,1,1,4,1,200,14.0,19:32,25.4,38.1

Overwriting ch18-playerstats-2013-2014-top30.csv


In [7]:
!head -n 5 ch18-playerstats-2013-2014-top30.csv

# 2013-2014 / Regular Season / All Skaters / Summary / Points
Rank,Player,Team,Pos,GP,G,A,P,+/-,PIM,PPG,PPP,SHG,SHP,GW,OT,S,S%,TOI/GP,Shift/GP,FO%
1,Sidney Crosby,PIT,C,80,36,68,104,+18,46,11,38,0,0,5,1,259,13.9,21:58,24.0,52.5
2,Ryan Getzlaf,ANA,C,77,31,56,87,+28,31,5,23,0,0,7,1,204,15.2,21:17,25.2,49.0
3,Claude Giroux,PHI,C,82,28,58,86,+7,46,7,37,0,0,7,1,223,12.6,20:26,25.1,52.9


In [8]:
rows = []

In [9]:
# csv.reader

# Read every line of the CSV file (including the leading title/comment line
# and the header line) into `rows` as lists of string fields.
# newline="" is what the csv module documentation asks for when opening a
# file for csv.reader, so that newlines embedded in quoted fields are
# interpreted correctly.
with open("ch18-playerstats-2013-2014.csv", newline="") as f:
    csvreader = csv.reader(f)
    print(type(csvreader))
    # the reader is an iterator of field lists; collect them all
    rows.extend(csvreader)

<class '_csv.reader'>


In [10]:
rows[1][1:6]

['Player', 'Team', 'Pos', 'GP', 'G']

In [11]:
rows[2][1:6]

['Sidney Crosby', 'PIT', 'C', '80', '36']

In [12]:
data = np.random.randn(100, 3)

In [13]:
np.savetxt("ch18-data.csv", data, delimiter=",", header="x, y, z", comments="# Random x, y, z coordinates\n")

In [14]:
!head -n 5 ch18-data.csv

# Random x, y, z coordinates
x, y, z
1.764052345967664026e+00,4.001572083672232938e-01,9.787379841057392005e-01
2.240893199201457797e+00,1.867557990149967484e+00,-9.772778798764110153e-01
9.500884175255893682e-01,-1.513572082976978872e-01,-1.032188517935578448e-01


In [15]:
# use np.loadtxt to read data back into NumPy arrays
data_load = np.loadtxt("ch18-data.csv", skiprows=2, delimiter=",")

In [16]:
data_load[1,:]

array([ 2.2408932 ,  1.86755799, -0.97727788])

In [17]:
data_load.dtype

dtype('float64')

In [18]:
(data == data_load).all()

True

In [19]:
# this is what happens when you don't set a dtype
# reading CSV files with non-numerical values will barf.
#np.loadtxt("ch18-playerstats-2013-2014.csv", skiprows=2, delimiter=",")

In [20]:
data = np.loadtxt("ch18-playerstats-2013-2014.csv", skiprows=2, delimiter=",", dtype=bytes)

In [21]:
data[0][1:6]

array([b'Sidney Crosby', b'PIT', b'C', b'80', b'36'], 
      dtype='|S13')

In [22]:
# read in only selected columns with usecols=<arg>
np.loadtxt("ch18-playerstats-2013-2014.csv", skiprows=2, delimiter=",", usecols=[6,7,8])

array([[  68.,  104.,   18.],
       [  56.,   87.,   28.],
       [  58.,   86.,    7.],
       [  47.,   84.,   16.],
       [  39.,   82.,   32.]])

In [23]:
# another method: Pandas read_csv()
df = pd.read_csv("ch18-playerstats-2013-2014.csv", skiprows=1)

In [24]:
df = df.set_index("Rank")

In [25]:
df[["Player", "GP", "G", "A", "P"]]

Unnamed: 0_level_0,Player,GP,G,A,P
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Sidney Crosby,80,36,68,104
2,Ryan Getzlaf,77,31,56,87
3,Claude Giroux,82,28,58,86
4,Tyler Seguin,80,37,47,84
5,Corey Perry,81,43,39,82


In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5 entries, 1 to 5
Data columns (total 20 columns):
Player      5 non-null object
Team        5 non-null object
Pos         5 non-null object
GP          5 non-null int64
G           5 non-null int64
A           5 non-null int64
P           5 non-null int64
+/-         5 non-null int64
PIM         5 non-null int64
PPG         5 non-null int64
PPP         5 non-null int64
SHG         5 non-null int64
SHP         5 non-null int64
GW          5 non-null int64
OT          5 non-null int64
S           5 non-null int64
S%          5 non-null float64
TOI/GP      5 non-null object
Shift/GP    5 non-null float64
FO%         5 non-null float64
dtypes: float64(3), int64(13), object(4)
memory usage: 840.0+ bytes


In [27]:
df[["Player", "GP", "G", "A", "P"]].to_csv("ch18-playerstats-2013-2014-subset.csv")

In [28]:
!head -n 5 ch18-playerstats-2013-2014-subset.csv

Rank,Player,GP,G,A,P
1,Sidney Crosby,80,36,68,104
2,Ryan Getzlaf,77,31,56,87
3,Claude Giroux,82,28,58,86
4,Tyler Seguin,80,37,47,84


### h5py
* used for numerical data store
* hierarchical format - organizes the data within a file into "groups" and "datasets"
* groups & datasets can contain "attributes" (metadata)
* Python libraries: h5py & PyTables

In [29]:
import h5py

In [30]:
# mode = "w", "r", "w-", "r+", "a"

In [31]:
f = h5py.File("ch18-data.h5", "w")
f.mode

'r+'

In [32]:
f.flush()
f.close()

In [33]:
# File object creates both file handle and a "root group" object.
# group name accessible via 'name'. root is '/'

f = h5py.File("ch18-data.h5", "w")
f.name

'/'

In [34]:
# create hierarchical subgroups
grp1      = f.create_group("experiment1")
grp2_meas = f.create_group("experiment2/measurement")
grp2_sim  = f.create_group("experiment2/simulation")

grp1.name, grp2_meas.name, grp2_sim.name

('/experiment1', '/experiment2/measurement', '/experiment2/simulation')

In [35]:
# group access
f["/experiment1"]

<HDF5 group "/experiment1" (0 members)>

In [36]:
f["/experiment2/simulation"]

<HDF5 group "/experiment2/simulation" (0 members)>

In [37]:
grp_expr2 = f["/experiment2"]

In [38]:
grp_expr2['simulation']

<HDF5 group "/experiment2/simulation" (0 members)>

In [39]:
# keys = names of subgroups & datasets within a group
list(f.keys())

['experiment1', 'experiment2']

In [40]:
# items = tuples of (name, value) for each entity in each group
list(f.items())

[('experiment1', <HDF5 group "/experiment1" (0 members)>),
 ('experiment2', <HDF5 group "/experiment2" (2 members)>)]

In [41]:
# traverse group hierarchy
f.visit(lambda x: print(x))

experiment1
experiment2
experiment2/measurement
experiment2/simulation


In [42]:
# traverse group hierarchy with item & item name accessible in arg
# the callback receives each member's path (relative to f) and the object itself
f.visititems(lambda name, value: print(name, value))

experiment1 <HDF5 group "/experiment1" (0 members)>
experiment2 <HDF5 group "/experiment2" (2 members)>
experiment2/measurement <HDF5 group "/experiment2/measurement" (0 members)>
experiment2/simulation <HDF5 group "/experiment2/simulation" (0 members)>


In [43]:
# membership testing
"experiment1" in f

True

In [44]:
"simulation" in f["experiment2"]

True

In [45]:
"experiment3" in f

False

In [46]:
f.flush()

In [47]:
# h5ls = command-line tool for viewing HDF5 contents
!h5ls -r ch18-data.h5

/                        Group
/experiment1             Group
/experiment2             Group
/experiment2/measurement Group
/experiment2/simulation  Group


### HDF5 datasets

In [48]:
data1 = np.arange(10)
data2 = np.random.randn(100, 100)

In [49]:
f["array1"]                         = data1
f["/experiment2/measurement/meas1"] = data2

In [50]:
f.visititems(lambda name, value: print(name, value))

array1 <HDF5 dataset "array1": shape (10,), type "<i8">
experiment1 <HDF5 group "/experiment1" (0 members)>
experiment2 <HDF5 group "/experiment2" (2 members)>
experiment2/measurement <HDF5 group "/experiment2/measurement" (1 members)>
experiment2/measurement/meas1 <HDF5 dataset "meas1": shape (100, 100), type "<f8">
experiment2/simulation <HDF5 group "/experiment2/simulation" (0 members)>


In [51]:
# to retrieve array1 dataset (in root group)
ds = f["array1"]
ds # is a Dataset object, not a NumPy array

<HDF5 dataset "array1": shape (10,), type "<i8">

In [52]:
ds.name, ds.dtype, ds.shape, ds.len()

('/array1', dtype('int64'), (10,), 10)

In [53]:
# read the whole dataset into a NumPy array; the Dataset.value attribute
# was deprecated and then removed in h5py 3.0, ds[()] is its replacement
ds[()]

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [54]:
# go deeper into hierarchy
ds = f["/experiment2/measurement/meas1"]
ds

<HDF5 dataset "meas1": shape (100, 100), type "<f8">

In [55]:
ds.dtype, ds.shape

(dtype('float64'), (100, 100))

In [56]:
data_full = ds[...] # Ellipsis indexing reads the entire dataset into a NumPy array
data_full           # display the array that was read

array([[-1.30652685,  1.65813068, -0.11816405, ...,  1.14110187,
         1.46657872,  0.85255194],
       [-0.59865394, -1.11589699,  0.76666318, ..., -0.51423397,
        -1.01804188, -0.07785476],
       [ 0.38273243, -0.03424228,  1.09634685, ..., -0.21673147,
        -0.9301565 , -0.17858909],
       ..., 
       [-0.20211703, -0.833231  ,  1.73360025, ...,  0.77025427,
        -0.08612658, -0.85766795],
       [ 0.6391736 , -0.24720034,  0.23337957, ...,  0.17974832,
         0.26792302,  0.7701867 ],
       [ 1.31951239, -0.42585313,  0.09323029, ..., -0.51270866,
        -0.44602375,  1.89001412]])

In [57]:
type(data_full), data_full.shape

(numpy.ndarray, (100, 100))

In [58]:
# retrieve only first column from ds 
data_col = ds[:, 0]
data_col.shape

(100,)

In [59]:
# strided index support
ds[10:20:3, 10:20:3]

array([[ 0.60270766, -0.34804638, -0.813596  , -1.29737966],
       [ 0.91320192, -1.06343294,  0.22734595,  0.52759738],
       [ 1.25774422, -0.32775492,  1.4849256 ,  0.28005786],
       [-0.84907287, -0.30000358,  1.79691852, -0.19871506]])

In [60]:
ds[[1,2,3], :].shape

(3, 100)

In [61]:
# boolean masking support
mask = ds[:, 0] > 2.0

In [62]:
mask.shape, mask.dtype

((100,), dtype('bool'))

In [63]:
ds[mask, 0]

array([ 2.04253623,  2.1041854 ,  2.05689385])

In [64]:
ds[mask, :5]

array([[ 2.04253623, -0.91946118,  0.11467003, -0.1374237 ,  1.36552692],
       [ 2.1041854 ,  0.22725706, -1.1291663 , -0.28133197, -0.7394167 ],
       [ 2.05689385,  0.18041971, -0.06670925, -0.02835398,  0.48480475]])

In [65]:
# create empty data sets, assign and update datasets

In [66]:
ds = f.create_dataset(
    "array2", 
    data=np.random.randint(10, size=10))

In [67]:
ds

<HDF5 dataset "array2": shape (10,), type "<i8">

In [68]:
# read the whole dataset into a NumPy array; the Dataset.value attribute
# was deprecated and then removed in h5py 3.0, ds[()] is its replacement
ds[()]

array([0, 2, 2, 4, 7, 3, 7, 2, 4, 1])

In [69]:
ds = f.create_dataset(
    "/experiment2/simulation/data1", 
    shape=(5, 5), 
    fillvalue=-1)
ds

<HDF5 dataset "data1": shape (5, 5), type "<f4">

In [70]:
# read the whole dataset into a NumPy array; the Dataset.value attribute
# was deprecated and then removed in h5py 3.0, ds[()] is its replacement
ds[()]

array([[-1., -1., -1., -1., -1.],
       [-1., -1., -1., -1., -1.],
       [-1., -1., -1., -1., -1.],
       [-1., -1., -1., -1., -1.],
       [-1., -1., -1., -1., -1.]], dtype=float32)

In [71]:
ds = f.create_dataset(
    "/experiment1/simulation/data1", 
    shape=(5000, 5000, 5000),
    fillvalue=0, 
    compression='gzip') # HDF5 = smart compression
ds

<HDF5 dataset "data1": shape (5000, 5000, 5000), type "<f4">

In [72]:
ds[:, 0, 0]  = np.random.rand(5000)
ds[1, :, 0] += np.random.rand(5000)

In [73]:
ds[:2, :5, 0]

array([[ 0.69393438,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 1.4819994 ,  0.01639538,  0.54387355,  0.11130908,  0.99287713]], dtype=float32)

In [74]:
ds.fillvalue

0.0

In [75]:
# list everything stored below the experiment1 group (names are relative to it)
f["experiment1"].visititems(lambda name, value: print(name, value))

simulation <HDF5 group "/experiment1/simulation" (1 members)>
simulation/data1 <HDF5 dataset "data1": shape (5000, 5000, 5000), type "<f4">


In [76]:
# Uncompressed size of the dataset in GiB (bytes / 1024**3).
# ds.size is the total element count and ds.dtype.itemsize the bytes per
# element, so no element has to be read from the file and the product is not
# computed with NumPy's default integer type, which may be 32-bit.
float(int(ds.size) * ds.dtype.itemsize) / (1024**3)

465.66128730773926

In [77]:
f.flush()
f.filename

'ch18-data.h5'

In [78]:
!ls -lh ch18-data.h5

-rw-rw-r-- 1 bjpcjp bjpcjp 357K May 11 20:50 ch18-data.h5


In [79]:
del f["/experiment1/simulation/data1"]

In [80]:
# data1 should now be gone
f["experiment1"].visititems(lambda name, value: print(name, value))

simulation <HDF5 group "/experiment1/simulation" (0 members)>


In [81]:
f.close()

### HDF5 Attributes

In [82]:
# Open explicitly in append mode ("a": read/write, create if missing).
# The attribute assignments in the following cells need a writable file,
# and h5py >= 3.0 opens read-only ("r") when no mode is given.
f = h5py.File("ch18-data.h5", "a")
f.attrs

<Attributes of HDF5 object at 139725595074920>

In [83]:
f.attrs["desc"] = "Result sets from experiments and simulations"

In [84]:
f["experiment1"].attrs["date"] = "2015-1-1"
f["experiment2"].attrs["date"] = "2015-1-2"

f["experiment2/simulation/data1"].attrs["k"] = 1.5
f["experiment2/simulation/data1"].attrs["T"] = 1000

In [85]:
list(f["experiment1"].attrs.keys())

['date']

In [86]:
list(f["experiment2/simulation/data1"].attrs.items())

[('k', 1.5), ('T', 1000)]

In [87]:
"T" in f["experiment2/simulation/data1"].attrs

True

In [88]:
del f["experiment2/simulation/data1"].attrs["T"]

In [89]:
"T" in f["experiment2/simulation/data1"].attrs

False

In [90]:
f["experiment2/simulation/data1"].attrs["t"] = np.array([1, 2, 3])

In [91]:
f["experiment2/simulation/data1"].attrs["t"]

array([1, 2, 3])

In [92]:
f.close()

## pytables

In [93]:
# skiprows=1 drops the title/comment line so the column names are read from
# the second line; "Rank" then becomes the index
df = (
    pd.read_csv("ch18-playerstats-2013-2014-top30.csv", skiprows=1)
    .set_index("Rank")
)

In [94]:
df[["Player", "Pos", "GP", "P", "G", "A", "S%", "Shift/GP"]].head(5)

Unnamed: 0_level_0,Player,Pos,GP,P,G,A,S%,Shift/GP
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Sidney Crosby,C,80,104,36,68,13.9,24.0
2,Ryan Getzlaf,C,77,87,31,56,15.2,25.2
3,Claude Giroux,C,82,86,28,58,12.6,25.1
4,Tyler Seguin,C,80,84,37,47,12.6,23.4
5,Corey Perry,R,81,82,43,39,15.4,23.2


In [95]:
f = tables.open_file(
    "ch18-playerstats-2013-2014.h5", mode="w")

In [96]:
grp = f.create_group(
    "/", 
    "season_2013_2014", 
    title="NHL player statistics for the 2013/2014 season")

grp

/season_2013_2014 (Group) 'NHL player statistics for the 2013/2014 season'
  children := []

In [97]:
grp

/season_2013_2014 (Group) 'NHL player statistics for the 2013/2014 season'
  children := []

In [98]:
f.root

/ (RootGroup) ''
  children := ['season_2013_2014' (Group)]

In [99]:
# PyTables row description for the player-statistics table: each class
# attribute declares one table column with its type and default value.
class PlayerStat(tables.IsDescription):
    # player name, fixed-size string of 20 bytes, empty by default
    player = tables.StringCol(20, dflt="")
    # position code, a single byte (e.g. C, R, L, D), defaults to "C"
    position = tables.StringCol(1, dflt="C")
    # games played, unsigned 8-bit integer
    games_played = tables.UInt8Col(dflt=0)
    # points, goals and assists, unsigned 16-bit integers
    points = tables.UInt16Col(dflt=0)
    goals = tables.UInt16Col(dflt=0)
    assists = tables.UInt16Col(dflt=0)
    # shooting percentage and shifts per game played, 64-bit floats
    shooting_percentage = tables.Float64Col(dflt=0.0)
    shifts_per_game_played = tables.Float64Col(dflt=0.0) 

In [100]:
top30_table = f.create_table(
    grp, 'top30', PlayerStat, "Top 30 point leaders")

In [101]:
playerstat = top30_table.row

In [102]:
type(playerstat)

tables.tableextension.Row

In [103]:
# (table column, DataFrame column) pairs, in the order they are filled in
column_sources = (
    ("player", "Player"),
    ("position", "Pos"),
    ("games_played", "GP"),
    ("points", "P"),
    ("goals", "G"),
    ("assists", "A"),
    ("shooting_percentage", "S%"),
    ("shifts_per_game_played", "Shift/GP"),
)

for _, record in df.iterrows():
    # copy one DataFrame row into the table's row object, field by field
    for table_col, frame_col in column_sources:
        playerstat[table_col] = record[frame_col]
    # add the filled-in row to the table
    playerstat.append()

In [104]:
top30_table.flush()

In [105]:
top30_table.cols.player[:5]

array([b'Sidney Crosby', b'Ryan Getzlaf', b'Claude Giroux',
       b'Tyler Seguin', b'Corey Perry'], 
      dtype='|S20')

In [106]:
top30_table.cols.points[:5]

array([104,  87,  86,  84,  82], dtype=uint16)

In [107]:
def print_playerstat(row):
    """Print one player row on a single line.

    The player name (stored as bytes, decoded as UTF-8) is right-aligned in a
    20-character field, followed by points, goals and assists separated by tabs.
    """
    name = row["player"].decode('UTF-8')
    fields = (name,) + tuple(row[key] for key in ("points", "goals", "assists"))
    line = "%20s\t%s\t%s\t%s" % fields
    print(line)

In [108]:
for row in top30_table.iterrows():
    print_playerstat(row)

       Sidney Crosby	104	36	68
        Ryan Getzlaf	87	31	56
       Claude Giroux	86	28	58
        Tyler Seguin	84	37	47
         Corey Perry	82	43	39
         Phil Kessel	80	37	43
         Taylor Hall	80	27	53
       Alex Ovechkin	79	51	28
        Joe Pavelski	79	41	38
          Jamie Benn	79	34	45
   Nicklas Backstrom	79	18	61
       Patrick Sharp	78	34	44
        Joe Thornton	76	11	65
       Erik Karlsson	74	20	54
       Evgeni Malkin	72	23	49
     Patrick Marleau	70	33	37
        Anze Kopitar	70	29	41
        Matt Duchene	70	23	47
    Martin St. Louis	69	30	39
        Patrick Kane	69	29	40
       Blake Wheeler	69	28	41
         Kyle Okposo	69	27	42
        David Krejci	69	19	50
        Chris Kunitz	68	35	33
      Jonathan Toews	68	28	40
        Thomas Vanek	68	27	41
        Jaromir Jagr	67	24	43
        John Tavares	66	24	42
        Jason Spezza	66	23	43
       Jordan Eberle	65	28	37


In [109]:
for row in top30_table.where("(points > 75) & (points <= 80)"):
    print_playerstat(row)

         Phil Kessel	80	37	43
         Taylor Hall	80	27	53
       Alex Ovechkin	79	51	28
        Joe Pavelski	79	41	38
          Jamie Benn	79	34	45
   Nicklas Backstrom	79	18	61
       Patrick Sharp	78	34	44
        Joe Thornton	76	11	65


In [110]:
for row in top30_table.where("(goals > 40) & (points < 80)"):
    print_playerstat(row)

       Alex Ovechkin	79	51	28
        Joe Pavelski	79	41	38


In [111]:
f

File(filename=ch18-playerstats-2013-2014.h5, title='', mode='w', root_uep='/', filters=Filters(complevel=0, shuffle=False, bitshuffle=False, fletcher32=False, least_significant_digit=None))
/ (RootGroup) ''
/season_2013_2014 (Group) 'NHL player statistics for the 2013/2014 season'
/season_2013_2014/top30 (Table(30,)) 'Top 30 point leaders'
  description := {
  "assists": UInt16Col(shape=(), dflt=0, pos=0),
  "games_played": UInt8Col(shape=(), dflt=0, pos=1),
  "goals": UInt16Col(shape=(), dflt=0, pos=2),
  "player": StringCol(itemsize=20, shape=(), dflt=b'', pos=3),
  "points": UInt16Col(shape=(), dflt=0, pos=4),
  "position": StringCol(itemsize=1, shape=(), dflt=b'C', pos=5),
  "shifts_per_game_played": Float64Col(shape=(), dflt=0.0, pos=6),
  "shooting_percentage": Float64Col(shape=(), dflt=0.0, pos=7)}
  byteorder := 'little'
  chunkshape := (1489,)

In [112]:
f.flush()

In [113]:
f.close()

In [114]:
!h5ls -rv playerstats-2013-2014.h5

playerstats-2013-2014.h5: unable to open file


## Pandas hdfstore

In [115]:
import pandas as pd

In [116]:
store = pd.HDFStore('store.h5')

In [117]:
df = pd.DataFrame(np.random.rand(5,5))

In [118]:
store["df1"] = df

In [119]:
# read the top-30 CSV written earlier in this notebook by %%writefile
# (its name carries the "ch18-" prefix); skiprows=1 skips the title line
df = pd.read_csv("ch18-playerstats-2013-2014-top30.csv", skiprows=1)

In [120]:
store["df2"] = df

In [121]:
store.keys()

['/df1', '/df2']

In [122]:
'df2' in store

True

In [123]:
df = store["df1"]

In [124]:
store.root

/ (RootGroup) ''
  children := ['df2' (Group), 'df1' (Group)]

In [125]:
store.close()

In [126]:
# the store is only inspected from here on, so open it explicitly read-only
# instead of relying on h5py's version-dependent default mode
f = h5py.File("store.h5", "r")

In [127]:
f.visititems(lambda x, y: print(x, "\t" * int(3 - len(str(x))//8), y))

df1 			 <HDF5 group "/df1" (4 members)>
df1/axis0 		 <HDF5 dataset "axis0": shape (5,), type "<i8">
df1/axis1 		 <HDF5 dataset "axis1": shape (5,), type "<i8">
df1/block0_items 	 <HDF5 dataset "block0_items": shape (5,), type "<i8">
df1/block0_values 	 <HDF5 dataset "block0_values": shape (5, 5), type "<f8">
df2 			 <HDF5 group "/df2" (8 members)>
df2/axis0 		 <HDF5 dataset "axis0": shape (21,), type "|S8">
df2/axis1 		 <HDF5 dataset "axis1": shape (30,), type "<i8">
df2/block0_items 	 <HDF5 dataset "block0_items": shape (3,), type "|S8">
df2/block0_values 	 <HDF5 dataset "block0_values": shape (30, 3), type "<f8">
df2/block1_items 	 <HDF5 dataset "block1_items": shape (14,), type "|S4">
df2/block1_values 	 <HDF5 dataset "block1_values": shape (30, 14), type "<i8">
df2/block2_items 	 <HDF5 dataset "block2_items": shape (4,), type "|S6">
df2/block2_values 	 <HDF5 dataset "block2_values": shape (1,), type "|O">


In [128]:
# [()] reads the whole dataset; Dataset.value was removed in h5py 3.0
f["/df2/block0_items"][()]

array([b'S%', b'Shift/GP', b'FO%'], 
      dtype='|S8')

In [129]:
f["/df2/block0_values"][:3]

array([[ 13.9,  24. ,  52.5],
       [ 15.2,  25.2,  49. ],
       [ 12.6,  25.1,  52.9]])

In [130]:
# [()] reads the whole dataset; Dataset.value was removed in h5py 3.0
f["/df2/block1_items"][()]

array([b'Rank', b'GP', b'G', b'A', b'P', b'+/-', b'PIM', b'PPG', b'PPP',
       b'SHG', b'SHP', b'GW', b'OT', b'S'], 
      dtype='|S4')

In [131]:
f["/df2/block1_values"][:3, :5]

array([[  1,  80,  36,  68, 104],
       [  2,  77,  31,  56,  87],
       [  3,  82,  28,  58,  86]])

# JSON

In [132]:
data = ["string", 1.0, 2, None]

In [133]:
data_json = json.dumps(data)

In [134]:
data_json

'["string", 1.0, 2, null]'

In [135]:
data2 = json.loads(data_json)

In [136]:
# show the object decoded from JSON (data2), not the original list
data2

['string', 1.0, 2, None]

In [137]:
# first element of the list decoded from JSON
data2[0]

'string'

In [138]:
data = {"one": 1, "two": 2.0, "three": "three"}

In [139]:
data_json = json.dumps(data)

In [140]:
print(data_json)

{"two": 2.0, "one": 1, "three": "three"}


In [141]:
data = json.loads(data_json)

In [142]:
data["two"]

2.0

In [143]:
data["three"]

'three'

In [144]:
data = {"one": [1], 
        "two": [1, 2], 
        "three": [1, 2, 3]}

In [145]:
data_json = json.dumps(data, indent=True)

In [146]:
print(data_json)

{
 "two": [
  1,
  2
 ],
 "one": [
  1
 ],
 "three": [
  1,
  2,
  3
 ]
}


In [147]:
data = {"one": [1], 
        "two": {"one": 1, "two": 2}, 
        "three": [(1,), (1, 2), (1, 2, 3)],
        "four": "a text string"}

In [148]:
with open("data.json", "w") as f:
    json.dump(data, f)

In [149]:
!cat data.json

{"two": {"two": 2, "one": 1}, "four": "a text string", "one": [1], "three": [[1], [1, 2], [1, 2, 3]]}

In [150]:
with open("data.json", "r") as f:
    data_from_file = json.load(f)

In [151]:
data_from_file["two"]

{'one': 1, 'two': 2}

In [152]:
data_from_file["three"]

[[1], [1, 2], [1, 2, 3]]

In [153]:
!head -n 20 tokyo-metro.json

{
    "C": {
        "color": "#149848", 
        "transfers": [
            [
                "C3", 
                "F15"
            ], 
            [
                "C4", 
                "Z2"
            ], 
            [
                "C4", 
                "G2"
            ], 
            [
                "C7", 
                "M14"
            ], 


In [154]:
!wc tokyo-metro.json

 1471  1508 27638 tokyo-metro.json


In [155]:
with open("tokyo-metro.json", "r") as f:
    data = json.load(f)

In [156]:
data.keys()

dict_keys(['H', 'C', 'T', 'G', 'M', 'F', 'N', 'Y', 'Z'])

In [157]:
data["C"].keys()

dict_keys(['travel_times', 'transfers', 'color'])

In [158]:
data["C"]["color"]

'#149848'

In [159]:
data["C"]["transfers"]

[['C3', 'F15'],
 ['C4', 'Z2'],
 ['C4', 'G2'],
 ['C7', 'M14'],
 ['C7', 'N6'],
 ['C7', 'G6'],
 ['C8', 'M15'],
 ['C8', 'H6'],
 ['C9', 'H7'],
 ['C9', 'Y18'],
 ['C11', 'T9'],
 ['C11', 'M18'],
 ['C11', 'Z8'],
 ['C12', 'M19'],
 ['C18', 'H21']]

In [160]:
[(s, e, tt) for s, e, tt in data["C"]["travel_times"] if tt == 1]

[('C3', 'C4', 1), ('C7', 'C8', 1), ('C9', 'C10', 1)]

In [161]:
data

{'C': {'color': '#149848',
  'transfers': [['C3', 'F15'],
   ['C4', 'Z2'],
   ['C4', 'G2'],
   ['C7', 'M14'],
   ['C7', 'N6'],
   ['C7', 'G6'],
   ['C8', 'M15'],
   ['C8', 'H6'],
   ['C9', 'H7'],
   ['C9', 'Y18'],
   ['C11', 'T9'],
   ['C11', 'M18'],
   ['C11', 'Z8'],
   ['C12', 'M19'],
   ['C18', 'H21']],
  'travel_times': [['C1', 'C2', 2],
   ['C2', 'C3', 2],
   ['C3', 'C4', 1],
   ['C4', 'C5', 2],
   ['C5', 'C6', 2],
   ['C6', 'C7', 2],
   ['C7', 'C8', 1],
   ['C8', 'C9', 3],
   ['C9', 'C10', 1],
   ['C10', 'C11', 2],
   ['C11', 'C12', 2],
   ['C12', 'C13', 2],
   ['C13', 'C14', 2],
   ['C14', 'C15', 2],
   ['C15', 'C16', 2],
   ['C16', 'C17', 3],
   ['C17', 'C18', 3],
   ['C18', 'C19', 3]]},
 'F': {'color': '#b96528',
  'transfers': [['F1', 'Y1'],
   ['F2', 'Y2'],
   ['F3', 'Y3'],
   ['F4', 'Y4'],
   ['F5', 'Y5'],
   ['F6', 'Y6'],
   ['F7', 'Y7'],
   ['F8', 'Y8'],
   ['F9', 'Y9'],
   ['F9', 'M25'],
   ['F13', 'M9'],
   ['F15', 'C3'],
   ['F16', 'Z1'],
   ['F16', 'G1']],
  'travel_t

In [162]:
!ls -lh tokyo-metro.json

-rw-rw-r-- 1 bjpcjp bjpcjp 27K Aug 28  2016 tokyo-metro.json


In [163]:
data_pack = msgpack.packb(data)

In [164]:
del data

In [165]:
type(data_pack)

bytes

In [166]:
len(data_pack)

3021

In [167]:
with open("tokyo-metro.msgpack", "wb") as f:
    f.write(data_pack)

In [168]:
!ls -lh tokyo-metro.msgpack

-rw-rw-r-- 1 bjpcjp bjpcjp 3.0K May 11 20:50 tokyo-metro.msgpack


In [169]:
with open("tokyo-metro.msgpack", "rb") as f:
    data_msgpack = f.read()
    data = msgpack.unpackb(data_msgpack)

In [170]:
list(data.keys())

[b'F', b'C', b'T', b'G', b'M', b'H', b'N', b'Y', b'Z']

In [171]:
with open("tokyo-metro.pickle", "wb") as f:
    cPickle.dump(data, f)

In [172]:
del data

In [173]:
!ls -lh tokyo-metro.pickle

-rw-rw-r-- 1 bjpcjp bjpcjp 8.6K May 11 20:51 tokyo-metro.pickle


In [174]:
with open("tokyo-metro.pickle", "rb") as f:
    data = pickle.load(f)

In [175]:
data.keys()

dict_keys([b'H', b'C', b'T', b'G', b'M', b'F', b'N', b'Y', b'Z'])