# Input/Output

## Pandas, HDF5

-  Pandas (high-level data structures) https://pandas.pydata.org/pandas-docs/stable/tutorials.html
-  HDF5 (data model, library, and file format for storing and managing data) https://support.hdfgroup.org/HDF5/

## Pandas

In [1]:
# % reset
from pandas import DataFrame, read_csv
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np
%matplotlib inline

In [2]:
# Load the high-score table, then show its column labels followed by its rows.
file = 'data/highscore.csv'
df = pd.read_csv(file)
# Single-argument print(...) emits the same text on Python 2 and Python 3,
# whereas the bare print statement is a syntax error on Python 3.
print(df.keys())
print(df)

Index([u'Names', u'Highscore'], dtype='object')
   Names  Highscore
0    Mel          8
1   Jack          5
2  David          3
3  Peter          6
4  Maria          5
5   Ryan          9


In [3]:
# Report the largest and smallest value of the 'Highscore' column.
# Label and value are folded into one string so the printed line is identical
# whether print is a statement (Python 2) or a function (Python 3).
print('Max {}'.format(df['Highscore'].max()))
print('Min {}'.format(df['Highscore'].min()))

Max 9
Min 3


In [4]:
# Build a four-row frame whose columns each use a different dtype. The scalar
# entries (A, B, F) are repeated on every row, as the printed frame shows.
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(df2)
# Summary statistics of the frame's columns.
print(df2.describe())


   A          B  C  D      E    F
0  1 2013-01-02  1  3   test  foo
1  1 2013-01-02  1  3  train  foo
2  1 2013-01-02  1  3   test  foo
3  1 2013-01-02  1  3  train  foo
       A  C  D
count  4  4  4
mean   1  1  3
std    0  0  0
min    1  1  3
25%    1  1  3
50%    1  1  3
75%    1  1  3
max    1  1  3


In [5]:
# Write df2 out as a CSV file under data/.
df2.to_csv(path_or_buf='data/foo.csv')

## HDF5

In [6]:
import h5py
from timeit import timeit
from time import time, clock # time for real time, clock for cpu time
# https://www.youtube.com/watch?v=OqoLv1TCR9w&list=PLea0WJq13cnB_ORdGzEkPlZEN20TSt6Lx&index=5
# http://docs.h5py.org/en/latest/quick.html

In [7]:
matrix1 = np.random.random(size = (1000,1000))

start = time()

hdf = h5py.File('data/hdf5_data.h5', 'w')
hdf.create_dataset('dataset1', data=matrix1)
hdf.close()

t = (time()-start)* 1000
print "elaped time hdf5 : %g ms" % t

% timeit np.savetxt('data/data.txt', matrix1, fmt='%20.16f')
% timeit np.savez('data/data', d=matrix1) 


elaped time hdf5 : 9.95398 ms
1 loop, best of 3: 536 ms per loop
The slowest run took 5.61 times longer than the fastest. This could mean that an intermediate result is being cached.
1 loop, best of 3: 61.2 ms per loop


## Reading the data

In [8]:
# Time how long it takes to open the HDF5 file, list its contents and copy
# 'dataset1' into a NumPy array.
start = time()
# NOTE(review): this handle is not closed in this cell. `data` below is tied to
# the open file and is still referenced by the following cell, so close `hdf`
# only once `data` is no longer needed.
hdf = h5py.File('data/hdf5_data.h5', 'r')
ls = list(hdf.keys())
# The label ends in a newline, so the list starts at column 0 of the next line.
print("List of datasets in this file: \n{}".format(ls))
data = hdf.get('dataset1')
dataset1 = np.array(data)
print('Shape of dataset1 {}'.format(dataset1.shape))
t = (time() - start) * 1000
print('time: %g ms' % t)

List of datasets in this file: 
[u'dataset1']
Shape of dataset1 (1000, 1000)
time: 7.99298 ms


In [9]:
%timeit data = np.genfromtxt('data/data.txt')
print data.shape

IOError: data/data.txt not found.

## Creating groups in HDF5

In [None]:
import numpy as np
import h5py
# Four 1000 x 1000 random matrices, drawn in the order matrix1 .. matrix4.
matrix1, matrix2, matrix3, matrix4 = (
    np.random.random(size=(1000, 1000)) for _ in range(4)
)

# Store the matrices in a small hierarchy:
#   /Group1            -> dataset1, dataset4
#   /Group2/SubGroup1  -> dataset3
#   /Group2/SubGroup2  -> dataset2
with h5py.File('data/hdf5_groups.h5', 'w') as hdf:
    G1 = hdf.create_group('Group1')
    G1.create_dataset(name='dataset1', data=matrix1)
    G1.create_dataset(name='dataset4', data=matrix4)

    G21 = hdf.create_group('Group2/SubGroup1')
    G21.create_dataset(name='dataset3', data=matrix3)

    G22 = hdf.create_group('Group2/SubGroup2')
    G22.create_dataset(name='dataset2', data=matrix2)

## Reading Groups in HDF5

In [None]:
# Walk the hierarchy stored in hdf5_groups.h5 and load one dataset from it.
with h5py.File('data/hdf5_groups.h5', 'r') as hdf:
    base_items = list(hdf.items())
    # Label and value are formatted into one string so the output is the same
    # whether print is a statement (Python 2) or a function (Python 3).
    print('Items in the base directory: {}'.format(base_items))
    G2 = hdf.get('Group2')
    G2_items = list(G2.items())
    print('Items in Group2: {}'.format(G2_items))
    G21 = G2.get('/Group2/SubGroup1')
    G21_items = list(G21.items())
    print('Items in Group21: {}'.format(G21_items))
    # SubGroup1 was written with 'dataset3'; 'dataset4' belongs to Group1, so
    # asking SubGroup1 for it would not return the matrix.
    dataset3 = np.array(G21.get('dataset3'))
    print(dataset3.shape)


## Compressing data in HDF5

In [None]:
import numpy as np
import h5py
# Four 1000 x 1000 random matrices, drawn in the order matrix1 .. matrix4.
matrix1, matrix2, matrix3, matrix4 = (
    np.random.random(size=(1000, 1000)) for _ in range(4)
)

# Every dataset in this file is written with the gzip filter, compression_opts=9.
gzip_level9 = dict(compression="gzip", compression_opts=9)

# Hierarchy written to the file:
#   /Group1            -> dataset1, dataset4
#   /Group2/SubGroup1  -> dataset3
#   /Group2/SubGroup2  -> dataset2
with h5py.File('data/hdf5_groups_compressed.h5', 'w') as hdf:
    G1 = hdf.create_group('Group1')
    G1.create_dataset('dataset1', data=matrix1, **gzip_level9)
    G1.create_dataset('dataset4', data=matrix4, **gzip_level9)

    G21 = hdf.create_group('Group2/SubGroup1')
    G21.create_dataset('dataset3', data=matrix3, **gzip_level9)

    G22 = hdf.create_group('Group2/SubGroup2')
    G22.create_dataset('dataset2', data=matrix2, **gzip_level9)

## Attributes

In [None]:
import numpy as np
import h5py
matrix1 = np.random.random(size=(1000, 1000))
matrix2 = np.random.random(size=(10000, 100))

# Create the HDF5 file; the `with` block closes it even if a step below fails.
with h5py.File('test.h5', 'w') as hdf:
    # Create the datasets
    dataset1 = hdf.create_dataset('dataset1', data=matrix1)
    dataset2 = hdf.create_dataset('dataset2', data=matrix2)

    # Set attributes on the first dataset
    dataset1.attrs['CLASS'] = 'DATA_MATRIX'
    dataset1.attrs['VERSION'] = '1.1'

# Read the HDF5 file back
with h5py.File('test.h5', 'r') as hdf:
    ls = list(hdf.keys())
    # Label and value are formatted into one string (the separating space is
    # kept) so the output is the same on Python 2 and Python 3.
    print('List of datasets in this file:  {}'.format(ls))
    data = hdf.get('dataset1')
    dataset1 = np.asarray(data)
    print('Shape of dataset1:  {}'.format(dataset1.shape))
    # Read the attributes
    k = list(data.attrs.keys())
    v = list(data.attrs.values())
    print('{} {}'.format(k[0], k[1]))
    print('{} {}'.format(v[0], v[1]))
    # Index the list built above: the object returned by keys() cannot be
    # subscripted when it is a view rather than a list.
    print(k[0])

## Create HDF5 with pandas

In [None]:
import pandas as pd
data = {
    "city": ["Tripoli", "Sydney", "Tripoli", "Rome", "Rome", "Tripoli", "Rome", "Sydney", "Sydney"],
    "rank": ["1st", "2nd", "1st", "2nd", "1st", "2nd", "1st", "2nd", "1st"],
    "score1": [44, 48, 39, 41, 38, 44, 34, 54, 61],
    "score2": [67, 63, 55, 70, 64, 77, 45, 66, 72],
}

# Fix the column order explicitly rather than relying on dict ordering.
df2 = pd.DataFrame(data, columns=['city', 'rank', 'score1', 'score2'])

# creates (or opens in append mode) an hdf5 file; the `with` block closes the
# store even if put() raises, so the file is never left open
with pd.HDFStore('data/hdf5_pandas.h5') as hdf:
    hdf.put('DF2Key', df2, format='table', data_columns=True)


In [None]:
# Bare expression as the last line of the cell, so the notebook displays df2.
df2