### Task 0: Find the disk sizes of the csv and the hdf5 files

In [7]:
!ls -lh build/hetero1*

-rw-r--r-- 1 vscode vscode 8.4M Jan  4 09:53 build/hetero1.csv
-rw-r--r-- 1 vscode vscode 6.2M Jan  4 10:05 build/hetero1.h5


### Task 1: Profile the memory consumption of loading a slice of data from the CSV file into a numpy array

In [1]:
%%writefile src/01A_slice_from_csv.py

from memory_profiler import profile
import numpy as np
import h5py
import sys

@profile
def slice_from_csv(csv_filename):
    
    dt = np.dtype([('name','S20'),
                   ('city','S20'),
                   ('x','f8'),
                   ('y','f8'),
                   ('z','f8')])

    np_data = np.genfromtxt(csv_filename,delimiter=',',dtype=dt,names=True)
    x = np_data[1000:int(sys.argv[1])]
    return x
   

if __name__ == "__main__":

    csv_filename = 'build/hetero1.csv'
    x_csv = slice_from_csv(csv_filename)

Overwriting src/01A_slice_from_csv.py


**TODO: Find out why there is a significant difference in execution time (~30s) with and without memory profiling for the csv task**

In [4]:
%%bash
python src/01A_slice_from_csv.py 5000

Filename: /workspaces/hdf5-tutorial/python/src/01A_slice_from_csv.py

Line #    Mem usage    Increment  Occurrences   Line Contents
     7     85.3 MiB     85.3 MiB           1   @profile
     8                                         def slice_from_csv(csv_filename):
     9                                             
    10     85.3 MiB      0.0 MiB           1       dt = np.dtype([('name','S20'),
    11                                                            ('city','S20'),
    12                                                            ('x','f8'),
    13                                                            ('y','f8'),
    14                                                            ('z','f8')])
    15                                         
    16    101.5 MiB     16.2 MiB           1       np_data = np.genfromtxt(csv_filename,delimiter=',',dtype=dt,names=True)
    17    101.5 MiB      0.0 MiB           1       x = np_data[1000:int(sys.argv[1])]
    18    101.5 MiB    

### Task 2: Profile the memory consumption of loading a slice of data from the HDF5 file into a numpy array

In [5]:
%%writefile src/01B_slice_from_h5.py

from memory_profiler import profile
import h5py
import numpy as np
import sys


@profile
def slice_from_h5(h5_filename):
    f = h5py.File(h5_filename)
    x = f['table'][1000:int(sys.argv[1])]
    f.close()
    return x
   

if __name__ == "__main__":

    h5_filename = 'build/hetero1.h5'
    x_h5 = slice_from_h5(h5_filename)

Overwriting src/01B_slice_from_h5.py


In [6]:
%%bash
time python src/01B_slice_from_h5.py 5000

Filename: /workspaces/hdf5-tutorial/python/src/01B_slice_from_h5.py

Line #    Mem usage    Increment  Occurrences   Line Contents
     8     83.1 MiB     83.1 MiB           1   @profile
     9                                         def slice_from_h5(h5_filename):
    10     83.7 MiB      0.6 MiB           1       f = h5py.File(h5_filename)
    11     84.5 MiB      0.8 MiB           1       x = f['table'][1000:int(sys.argv[1])]
    12     84.3 MiB     -0.2 MiB           1       f.close()
    13     84.3 MiB      0.0 MiB           1       return x





real	0m0.477s
user	0m0.676s
sys	0m1.409s


### Task 3: Repeat Task 2 by changing the size of the slices and notice the increment in loading the data

Ans: Blocked memory accesses

In [42]:
%%bash
start_value=2000
end_value=15000
step=1000

# Loop through the values and execute the Python command
for ((arg=$start_value; arg<=$end_value; arg+=step)); do
    output=$(python -m memory_profiler src/01B_slice_from_h5.py $arg | grep 11 | awk '{print $4 $5}')
    echo $arg $output
done

2000 0.7MiB
3000 0.7MiB
4000 0.7MiB
5000 0.7MiB
6000 1.0MiB
7000 1.0MiB
8000 1.0MiB
9000 0.9MiB
10000 1.2MiB
11000 1.2MiB
12000 1.2MiB
13000 1.3MiB
14000 1.3MiB
15000 1.6MiB
