In [None]:
# default_exp tools

# Tools and utilities

> Measuring performance, storage and memory.

Useful metrics:

- on disk size
- runtime
- CPU utilization, average and peak
- RAM utilization, average and peak

In [None]:
# export
import os
import subprocess
from glob import glob

# File size

How much space files occupy on disk.

In [None]:
# export
def human_readable_size(byte_size):
    """Return human-readable size string, using base-10 prefixes.

    Sizes below 1000 are shown verbatim with a "B" suffix. Larger sizes are
    scaled to kB, MB or GB and shown with one decimal place. GB is the
    largest unit, so anything bigger is still reported in GB.

    The unit is chosen from the rounded value, so a size that would print as
    1000.0 of one unit is reported as 1.0 of the next one
    (999_999 bytes gives "1.0MB" rather than "1000.0kB").
    """
    if byte_size < 10**3:
        return f'{byte_size}B'
    for exponent, unit in ((3, 'kB'), (6, 'MB')):
        scaled = round(byte_size / 10**exponent, 1)
        # Compare after rounding so the printed number never reaches 1000.0.
        if scaled < 10**3:
            return f'{scaled:.1f}{unit}'
    return f'{byte_size / 10**9:.1f}GB'

def size_on_disk(*glob_paths):
    """Return total and individual file sizes.

    Args:
        *glob_paths: glob patterns selecting the files to measure.

    Returns:
        dict with two keys:
        'total' - tuple (files_count, total_byte_size, total_human_size);
        'files' - list of (path, byte_size, human_size) tuples sorted by path.

    Only regular files are measured, and a file matched by more than one
    pattern is counted once.
    """
    files = []
    seen = set()
    for glob_path in glob_paths:
        for path in glob(glob_path):
            # Patterns can also match directories and broken symlinks,
            # which have no meaningful file size.
            if not os.path.isfile(path):
                continue
            # Overlapping patterns may yield the same file more than once.
            identity = os.path.abspath(path)
            if identity in seen:
                continue
            seen.add(identity)
            byte_size = os.path.getsize(path)
            files.append((path, byte_size, human_readable_size(byte_size)))
    files.sort(key=lambda entry: entry[0])
    total_byte_size = sum(byte_size for _, byte_size, _ in files)
    total = (len(files), total_byte_size, human_readable_size(total_byte_size))
    return {'total': total, 'files': files}

In [None]:
# notest
# Example: sizes of the CSV files whose names start with "1" under ./data/csv
# together with every JSON file under ./docs.
size_on_disk('./data/csv/1*.csv', './docs/*.json')

{'total': (5, 9401273617, '9.4GB'),
 'files': [('./data/csv/1997.csv', 2755917932, '2.8GB'),
  ('./data/csv/1998.csv', 3127243930, '3.1GB'),
  ('./data/csv/1999.csv', 3518111298, '3.5GB'),
  ('./docs/sidebar.json', 184, '184B'),
  ('./docs/tooltips.json', 273, '273B')]}

## Number of lines
To report the number of lines in a file, is it faster to iterate through the file in Python or to use the system `wc` utility?

In [None]:
# export
def wc_py(fpath):
    """Return number of lines in a text file, using Python I/O.

    The file is read as bytes, so the count does not depend on the text
    encoding and cannot fail with a decoding error. A line ends at each
    newline (LF) byte; a final line without a trailing newline is counted too.
    """
    with open(fpath, 'rb') as f:
        return sum(1 for _ in f)

def wc_sys(fpath):
    """Return number of lines in a text file, using system 'wc' utility.

    `wc -l` counts newline characters, so a final line that lacks a trailing
    newline is not included in the result.

    Raises:
        subprocess.CalledProcessError: if `wc` exits with a non-zero status,
            for example when the file does not exist.
    """
    # '--' ends option parsing, so a path starting with '-' is read as a file name.
    # check=True reports a failed `wc` directly instead of failing later on empty output.
    p = subprocess.run(['wc', '-l', '--', fpath], capture_output=True, text=True, check=True)
    return int(p.stdout.split()[0])

In [None]:
# hide
# Sanity check: both line counters must report the same number for the README.
fpath = './README.md'
assert wc_py(fpath) == wc_sys(fpath)

In [None]:
# notest
# Time both implementations on the same file so their costs can be compared.
fpath = './out/valid/2000.csv'
%time wc_py(fpath)
%time wc_sys(fpath)

CPU times: user 9 s, sys: 1.47 s, total: 10.5 s
Wall time: 10.4 s
CPU times: user 6.76 ms, sys: 0 ns, total: 6.76 ms
Wall time: 1.3 s


11169277

So it is faster to use the system `wc` utility: about 1.3 s of wall time versus 10.4 s for the Python loop on this file.

## List files in a folder

In [None]:
# export
def lsdir(fdir):
    """Describe every regular file located directly inside `fdir`.

    Returns a list of tab-separated strings. The first item is the header
    (Name, Lines, Size); each following item holds a file's path, its line
    count and its human-readable size. Files are ordered by path, and
    sub-directories are skipped.
    """
    candidates = (os.path.join(fdir, entry) for entry in os.listdir(fdir))
    file_paths = sorted(p for p in candidates if os.path.isfile(p))

    rows = ['Name\tLines\tSize']
    for file_path in file_paths:
        line_count = wc_sys(file_path)
        size_label = human_readable_size(os.path.getsize(file_path))
        rows.append(f'{file_path}\t{line_count}\t{size_label}')
    return rows


In [None]:
# notest
# Print the listing for the current directory, one row per line (header first).
print(*lsdir('.'), sep='\n')

Name	Lines	Size
./.gitattributes	2	50B
./.gitconfig	18	492B
./.gitignore	146	1.6kB
./00_core.ipynb	184	4.8kB
./CONTRIBUTING.md	33	2.3kB
./LICENSE	201	11.4kB
./Makefile	30	416B
./README.md	122	3.1kB
./analysis.ipynb	411	12.1kB
./index.ipynb	249	6.4kB
./settings.ini	41	1.5kB
./setup.py	46	1.9kB
./storage.ipynb	449	15.2kB
./tools.ipynb	249	6.1kB


# Resource usage monitoring

[Memory usage](https://medium.com/survata-engineering-blog/monitoring-memory-usage-of-a-running-python-program-49f027e3d1ba)