# 6.2 Binary Data Formats

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# load the small sample CSV that is round-tripped through pickle below
frame = pd.read_csv('ex1.csv')

In [3]:
frame

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [4]:
# serialize the frame to disk in Python's pickle format
frame.to_pickle('frame_pickle')

In [5]:
# load the pickled frame back; only unpickle files from sources you trust,
# since loading a pickle can execute arbitrary code
pd.read_pickle('frame_pickle')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


Some other storage formats for pandas and NumPy data include:
- bcolz
- Feather

---

## Using HDF5 Format

HDF5 is a well-regarded file format intended for storing large quantities of scientific
array data. "HDF" stands for Hierarchical Data Format. It is a good choice for working with very large datasets.

In [6]:
# 100 rows of standard-normal draws in a single column named 'a'
frame = pd.DataFrame(np.random.randn(100), columns=['a'])

In [8]:
# open an HDF5-backed store; objects are written to and read from it by key
store = pd.HDFStore('mydata.h5')

In [9]:
type(store)

pandas.io.pytables.HDFStore

In [10]:
# save the DataFrame in the store under the key 'obj1'
store['obj1'] = frame

In [11]:
type(store)

pandas.io.pytables.HDFStore

In [12]:
# a single column of the frame can be stored under its own key as well
store['obj1_col'] = frame['a']

In [14]:
store

<class 'pandas.io.pytables.HDFStore'>
File path: mydata.h5

In [17]:
# stored objects are retrieved with dict-style key access
store['obj1'].head()

Unnamed: 0,a
0,1.213184
1,-1.392793
2,-0.728247
3,-0.638656
4,0.292961


HDFStore supports two storage schemas, 'fixed' and 'table'. The latter is generally
slower, but it supports query operations using a special syntax:

In [18]:
# store with the 'table' schema so the object can be queried with `where`
store.put('obj2', frame, format='table')

In [19]:
store.select('obj2', where=['index >= 10 and index <= 15'])

Unnamed: 0,a
10,-0.65574
11,0.287938
12,-0.962428
13,-0.286368
14,0.55892
15,0.988492


In [20]:
# Convenience shortcut for store.put('obj3', frame, format='table').
# `key` is passed by keyword because recent pandas releases deprecate
# positional arguments after the path.
# Note: store['obj3'] = frame would write the default 'fixed' format instead,
# which does not support `where` queries.
frame.to_hdf('mydata.h5', key='obj3', format='table')

In [21]:
# close the store now that all writes through it are done
store.close()

In [22]:
# top-level counterpart of store.select: read the rows stored under 'obj3'
# that satisfy the `where` condition
pd.read_hdf('mydata.h5', key='obj3', where=['index < 5'])

Unnamed: 0,a
0,1.213184
1,-1.392793
2,-0.728247
3,-0.638656
4,0.292961


___

## Reading Microsoft Excel Files

In [25]:
# pd.ExcelFile returns an ExcelFile object
# pd.read_excel returns a DataFrame

In [28]:
# reload the sample CSV: `frame` was reassigned to random data in the HDF5 section
frame = pd.read_csv('ex1.csv')

In [29]:
frame

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [30]:
# writer object that DataFrame.to_excel can target for 'ex2.xlsx'
write = pd.ExcelWriter('ex2.xlsx')

In [31]:
# pass the sheet name by keyword: positional arguments after the writer are
# deprecated in recent pandas releases
frame.to_excel(write, sheet_name='Sheet1')

In [32]:
# ExcelWriter.save() was deprecated in pandas 1.5 and removed in 2.0;
# close() writes the workbook to disk and releases the file handle
write.close()

In [33]:
# Alternative without an explicit writer object:
# frame.to_excel('ex2.xlsx', sheet_name='Sheet1')