In [None]:
# default_exp tools

# Tools and utilities

> Measuring performance, storage and memory.

## File size

To report the number of lines in a file, is it faster to iterate through the file in Python or to use the system `wc` utility?

In [None]:
# export
import os
import subprocess

def wc_py(fpath):
    """Return number of lines in a text file, using Python I/O.

    The file is opened in text mode with the default encoding and iterated
    line by line. A final line without a trailing newline is still counted,
    and an empty file yields 0.

    Bytes that cannot be decoded are replaced rather than raising
    `UnicodeDecodeError`: only line boundaries matter here, not the text.
    """
    # errors='replace' keeps the count usable for files that are not valid
    # in the default encoding (e.g. Latin-1 data read as UTF-8).
    with open(fpath, errors='replace') as f:
        return sum(1 for _ in f)

def wc_sys(fpath):
    """Return number of lines in a text file, using system 'wc' utility.

    The file is opened in Python and handed to `wc -l` on standard input, so
    a missing or unreadable path raises the usual `OSError` subclass (e.g.
    `FileNotFoundError`) and a path starting with '-' cannot be mistaken for
    a command-line option.

    Raises `subprocess.CalledProcessError` if `wc` exits with a non-zero
    status.
    """
    with open(fpath, 'rb') as f:
        p = subprocess.run(['wc', '-l'], stdin=f, capture_output=True,
                           text=True, check=True)
    # The count is the first whitespace-separated token of the output;
    # split() also discards any padding around it.
    return int(p.stdout.split()[0])

In [None]:
# hide
# Sanity check: both implementations must report the same count for the same file.
fpath = './README.md'
assert wc_py(fpath) == wc_sys(fpath)

In [None]:
# notest
# Time both implementations on the same file to see which one is faster.
fpath = './out/valid/2000.csv'
%time wc_py(fpath)
%time wc_sys(fpath)

CPU times: user 9 s, sys: 1.47 s, total: 10.5 s
Wall time: 10.4 s
CPU times: user 6.76 ms, sys: 0 ns, total: 6.76 ms
Wall time: 1.3 s


11169277

So it is faster to use the system `wc` utility.

## List files in a folder

In [None]:
# export
from hurry.filesize import size

def lsdir(fdir):
    """Return a list of tab-separated strings "file_path number_of_lines file_size" for all files in `fdir`.

    The first item is a header row (Name, Lines, Size). The remaining items
    describe the regular files directly inside `fdir`, sorted by path.
    Entries that are not files are skipped. The name column holds `fdir`
    joined with the file name, the line count comes from `wc_sys`, and the
    size is the byte count formatted by `hurry.filesize.size`.
    """
    # Keep only plain files; anything else (e.g. sub-directories) is ignored.
    fpaths = sorted(
        fpath
        for fpath in (os.path.join(fdir, fname) for fname in os.listdir(fdir))
        if os.path.isfile(fpath)
    )

    info = ['Name\tLines\tSize']
    for fpath in fpaths:
        n_lines = wc_sys(fpath)
        sz = size(os.path.getsize(fpath))
        info.append(f'{fpath}\t{n_lines}\t{sz}')
    return info
