In [1]:
import h5py
from faker import Faker
import random
import numpy as np
import pandas as pd
import sys

#### Inputs

CSV and H5 file names

In [2]:
csv_filename = 'build/hetero1.csv'
h5_filename = 'build/hetero1.h5'

Definition of the compound datatype

In [4]:
dt = np.dtype([('name','S20'),
                   ('city','S20'),
                   ('x','f8'),
                   ('y','f8'),
                   ('z','f8')])

### Task 0: Create a random CSV file

Fake data generator

In [4]:
def generate_fake_data(num_records):

    
    fake = Faker()

    data = np.zeros((num_records,), dtype=dt)
    for i in range(num_records):
        name = fake.name()
        city = fake.city()
        #date_time = fake.date_time_between(start_date="-30d", end_date="now", tzinfo=None)
        x = random.uniform(0.0, 100.0)
        y = random.uniform(0.0, 100.0)
        z = random.uniform(0.0, 100.0)
        data[i] = (name, city, x, y, z)

    df = pd.DataFrame(data, columns=['name', 'city', 'x', 'y', 'z'])
    return df

In [16]:
df_ = generate_fake_data(100000)

Save to CSV

In [17]:

df_.to_csv(csv_filename, index=False)

### Task 1:  Convert CSV to HDF5 using NumPy Structured Array

1. Read the CSV file and store the contents into a numpy structured array

In [4]:
np_data = np.genfromtxt(csv_filename,delimiter=',',dtype=dt,names=True)

2) Open a new h5 file

In [5]:
h5f = h5py.File(h5_filename,'w')

3) Create a dataset

In [6]:
shape = np_data.shape
dset = h5f.create_dataset('table',shape,dtype=dt)

3) Store the numpy data into the dataset and close the h5 file

In [7]:
dset[:] = np_data


In [8]:
h5f.close()

4. Compare the sizes of the csv and h5 file

In [9]:
!ls -lh build/hetero1*

-rw-r--r-- 1 vscode vscode 8.4M Jan  4 09:53 build/hetero1.csv
-rw-r--r-- 1 vscode vscode 6.2M Jan  4 10:05 build/hetero1.h5


### Task 2: Retrieve and explore the contents of the HDF5 file

1. Open the H5 file

In [13]:
f = h5py.File(h5_filename)

2. Explore the top level items in the H5 file

In [10]:
list(f.items())

[('table', <HDF5 dataset "table": shape (100000,), type "|V64">)]

3. Verity the data type of the table dataset

In [11]:
f['table'].dtype

dtype([('name', 'S20'), ('city', 'S20'), ('x', '<f8'), ('y', '<f8'), ('z', '<f8')])

4. Retrieve the contents of the table dataset as a numpy array

In [12]:
f['table'][:]

array([(b"b'Joseph Potts'", b"b'North Jasonborough", 56.23321216, 39.60155857, 36.03870382),
       (b"b'Cassandra Jones'", b"b'Kingborough'", 89.79316277, 61.99607167, 46.27598127),
       (b"b'Debra Ellis'", b"b'New Kathrynborough", 26.24299829, 88.76285726, 75.47795578),
       ...,
       (b"b'Robert Simmons'", b"b'North Chelsey'", 20.93658339, 99.45892975, 98.33742394),
       (b"b'Sarah Daniel'", b"b'Staceyburgh'", 10.49540788,  2.69513113, 75.92907275),
       (b"b'Jessica Gonzalez'", b"b'Franklintown'", 93.01591599, 68.14022661, 82.09163187)],
      dtype=[('name', 'S20'), ('city', 'S20'), ('x', '<f8'), ('y', '<f8'), ('z', '<f8')])

In [18]:
f.close()

### Task 3: Time CSV and HDF5 data retrieval

**CSV**

Time to read CSV and load the data into a numpy array

In [29]:
time np_data = np.genfromtxt(csv_filename,delimiter=',',dtype=dt,names=True)


CPU times: user 268 ms, sys: 60.9 ms, total: 329 ms
Wall time: 325 ms


Time to access the slice

In [30]:
time x = np_data[1000:5000]


CPU times: user 108 µs, sys: 83 µs, total: 191 µs
Wall time: 195 µs


Memory consumption: Size of the numpy array in memory

In [36]:
print("size of numpy array (np_array): {} Bytes".format(np_data.nbytes))
#print("System size of numpy array (np_array): {} Bytes".format(sys.getsizeof(np_data)))
#print("size of sub array (x): {} Bytes".format(x.nbytes))
#print("System size of sub array (x): {} Bytes".format(sys.getsizeof(x)))

size of numpy array (np_array): 6400000 Bytes


**HDf5**

Time to open the h5 file

In [37]:
time f = h5py.File(h5_filename)

CPU times: user 233 µs, sys: 171 µs, total: 404 µs
Wall time: 404 µs


Time to access the slice

In [38]:
time x1 = f['table'][1000:5000]

CPU times: user 1.69 ms, sys: 1.24 ms, total: 2.93 ms
Wall time: 2.63 ms


Memory consumption: Size of the numpy array in memory

In [41]:
print("size of sub array (x): {} Bytes".format(x1.nbytes))
#print("System size of sub array (x): {} Bytes".format(sys.getsizeof(x1)))

size of sub array (x): 256000 Bytes


In [40]:
f.close()