In [None]:
# default_exp tools

# Tools and utilities

> Measuring performance, storage and memory.

Useful metrics:

- on disk size
- runtime
- CPU utilization, average and peak
- RAM utilization, average and peak

In [None]:
# export
import os
import subprocess
from glob import glob

# File size

How much space files occupy on disk.

In [None]:
# export
def human_readable_size(byte_size):
    """Format a byte count as a short string with decimal (base-10) units.

    Values below 1000 are shown as-is with a ``B`` suffix. Larger values are
    scaled to kB, MB or GB and printed with one decimal place. GB is the
    largest unit, so everything from 10**9 upwards is expressed in GB.
    """
    if byte_size < 10**3:
        return f'{byte_size}B'

    # (exclusive upper bound, divisor, suffix), smallest scaled unit first
    scaled_units = (
        (10**6, 10**3, 'kB'),
        (10**9, 10**6, 'MB'),
    )
    for upper_bound, divisor, suffix in scaled_units:
        if byte_size < upper_bound:
            return f'{byte_size / divisor:.1f}{suffix}'

    return f'{byte_size / 10**9:.1f}GB'

def size_on_disk(*glob_paths):
    """Return total and individual sizes of the files matched by glob patterns.

    Parameters
    ----------
    *glob_paths : str
        Any number of patterns understood by ``glob.glob``.

    Returns
    -------
    dict
        ``'total'`` is a ``(files_count, total_byte_size, total_human_size)``
        tuple; ``'files'`` is a list of ``(path, byte_size, human_size)``
        tuples sorted by path.

    A path string matched by more than one pattern is listed and counted
    only once, so overlapping patterns do not inflate the total.
    """
    # A set drops paths that several patterns have in common.
    unique_paths = {path for glob_path in glob_paths for path in glob(glob_path)}

    files = []
    for path in sorted(unique_paths):
        byte_size = os.path.getsize(path)
        files.append((path, byte_size, human_readable_size(byte_size)))

    total_byte_size = sum(byte_size for _, byte_size, _ in files)
    total = (len(files), total_byte_size, human_readable_size(total_byte_size))
    return {'total': total, 'files': files}

In [None]:
# notest
# Example call: CSV files under ./data/csv whose names start with "1", plus all JSON files under ./docs.
size_on_disk('./data/csv/1*.csv', './docs/*.json')

## Number of lines
To report the number of lines, is it faster to iterate through the file in Python or to use the system `wc` utility?

In [None]:
# export
def wc_py(fpath):
    """Return number of lines in a file, using Python I/O.

    The file is read in binary mode, so no text decoding takes place: the
    count does not depend on the locale encoding and cannot fail on bytes
    that are not valid text. Lines are delimited by LF bytes only, which
    matches what ``wc -l`` counts, except that a final line without a
    trailing newline is still counted here.
    """
    with open(fpath, 'rb') as f:
        return sum(1 for _ in f)

def wc_sys(fpath):
    """Return number of lines in a file, using system 'wc' utility.

    The file is opened here and handed to ``wc -l`` on its standard input
    instead of being passed as a command-line argument. As a result:

    - a missing or unreadable file raises the usual ``OSError`` subclass
      (e.g. ``FileNotFoundError``) rather than an ``IndexError`` caused by
      parsing empty output;
    - a path that starts with ``-`` cannot be mistaken for an option.

    Raises ``subprocess.CalledProcessError`` if ``wc`` exits with a
    non-zero status.
    """
    with open(fpath, 'rb') as f:
        p = subprocess.run(['wc', '-l'], stdin=f, capture_output=True,
                           text=True, check=True)
    # Some implementations pad the count with leading blanks; split() copes with that.
    return int(p.stdout.split()[0])

In [None]:
# hide
# Cross-check: both line counters must report the same number for the project README.
fpath = './README.md'
assert wc_py(fpath) == wc_sys(fpath)

In [None]:
# notest
# Time both line counters on the same file, using the IPython `%time` magic.
fpath = './out/valid/2000.csv'
%time wc_py(fpath)
%time wc_sys(fpath)

So it is faster to use the system `wc` utility.

## List files in a folder

In [None]:
# export
def lsdir(fdir):
    """Return a tab-separated listing of the regular files directly inside :fdir:.

    The result is a list of strings. The first one is a header row with the
    columns Name, Lines and Size; each following string holds a file path,
    its line count (from `wc_sys`) and its human-readable size, in that
    order. Rows are sorted by path; sub-directories and other non-file
    entries are skipped.
    """
    candidates = (os.path.join(fdir, entry) for entry in os.listdir(fdir))
    file_paths = sorted(path for path in candidates if os.path.isfile(path))

    rows = ['Name\tLines\tSize']
    rows.extend(
        f'{path}\t{wc_sys(path)}\t{human_readable_size(os.path.getsize(path))}'
        for path in file_paths
    )
    return rows


In [None]:
# notest
# Show the listing for the current directory, one row per output line.
print(*lsdir('.'), sep='\n')

# Resource usage monitoring

Resource usage can be measured from outside the process, which captures the process as a whole, or from inside it. Measuring from outside is easier, but less precise.

[Memory usage](https://medium.com/survata-engineering-blog/monitoring-memory-usage-of-a-running-python-program-49f027e3d1ba) - medium article.


Here is a resource monitor class that watches a given process from a separate subprocess. It uses the cross-platform `psutil` package to read process information. I/O stats are not available on macOS.


To test disk I/O speed on Linux:
- write: `sync; dd if=/dev/zero of=tempfile bs=1M count=1024; sync`
- read: `dd if=tempfile of=/dev/null bs=1M count=1024`

In [None]:
#export
import sys
import os
import time
import json
import subprocess
import inspect
import warnings

import psutil
from psutil._common import bytes2human

In [None]:
#export

def usage_log(pid, interval=1):
    """Regularly write resource usage of process `pid` to stdout as CSV.

    A header line is printed first, then one row every `interval` seconds:

    - ``time``: ``time.time()`` when the row was produced
    - ``cpu``: ``cpu_percent()`` of the process since the previous row
    - ``memory``: resident set size (``memory_info().rss``)
    - ``read_bytes``, ``read_chars``, ``write_bytes``, ``write_chars``:
      change of the matching I/O counter since the previous row, divided
      by `interval`

    The loop never ends on its own; the function runs until it is
    interrupted or until a psutil call raises (for example because the
    watched process is gone).
    """
    # Local imports make the function self-sufficient: its source is copied
    # into a fresh interpreter, where module-level imports of this file do
    # not exist. `warnings` has to be listed here as well, otherwise the
    # macOS branch below fails with NameError in that interpreter.
    import time, warnings, psutil

    if psutil.MACOS:
        warnings.warn('Disk I/O stats are not available on MacOS.')

    p = psutil.Process(pid)

    def get_io():
        if psutil.MACOS:
            # io_counters() not available on MacOS
            return (0, 0, 0, 0)
        x = p.io_counters()
        # Per the psutil docs read_chars/write_chars are Linux-only fields;
        # report 0 for them on platforms that lack them.
        return (x.read_bytes, getattr(x, 'read_chars', 0),
                x.write_bytes, getattr(x, 'write_chars', 0))

    print('time,cpu,memory,read_bytes,read_chars,write_bytes,write_chars')
    # Priming call: cpu_percent() measures relative to its previous call,
    # so this one only sets the reference point and its value is discarded.
    p.cpu_percent()
    io_before = get_io()
    while True:
        time.sleep(interval)
        io_after = get_io()
        # Counters are cumulative; turn them into per-second rates.
        io_rate = tuple((x1 - x0) / interval for x0, x1 in zip(io_before, io_after))
        io_before = io_after
        line = (time.time(), p.cpu_percent(), p.memory_info().rss) + io_rate
        print(','.join(str(x) for x in line))
        
    
class ResourceMonitor:
    """Sample CPU, memory and disk I/O of a process from a helper subprocess.

    Typical use: ``start()``, run the workload (calling ``tag()`` at
    interesting moments), ``stop()``, then ``plot()`` or ``dump()``.

    Attributes
    ----------
    pid : int
        Process being watched (the current process by default).
    interval : number
        Seconds between samples.
    tags : list
        ``(time.time(), label)`` pairs recorded by ``tag()``.
    process : subprocess.Popen
        The sampling subprocess; set by ``start()``.
    df : pandas.DataFrame
        Samples indexed by ``elapsed`` seconds; set by ``stop()`` or ``load()``.
    """

    def __init__(self, pid=None, interval=1):
        self.pid = os.getpid() if pid is None else pid
        self.interval = interval
        self.tags = []

    def start(self):
        """Launch the sampling subprocess."""
        import tempfile
        # The sampler runs `usage_log` from a copy of its source code.
        code = inspect.getsource(usage_log) + f'\nusage_log({self.pid}, {self.interval})'
        # Samples go to an anonymous temp file instead of a pipe: nothing
        # reads them before stop(), and an undrained pipe would block the
        # sampler as soon as the OS pipe buffer is full.
        self._log = tempfile.TemporaryFile(mode='w+')
        self.process = subprocess.Popen([sys.executable, '-c', code], text=True,
                                        stdout=self._log, stderr=subprocess.PIPE)

    def stop(self):
        """Interrupt the sampler and load its output into ``self.df``.

        Raises ``subprocess.TimeoutExpired`` if the sampler has not exited
        within 3 seconds.
        """
        import signal
        import pandas as pd
        self.process.send_signal(signal.SIGINT)
        # communicate() waits for the exit and drains/closes the stderr pipe.
        self.process.communicate(timeout=3)
        with self._log as log:
            log.seek(0)
            df = pd.read_csv(log)
        # With only the header line present there is no first row to subtract.
        t0 = df['time'].iloc[0] if len(df) else 0.0
        df['elapsed'] = df['time'] - t0
        self.df = df.set_index('elapsed')

    def tag(self, label):
        """Record `label` together with the current time."""
        self.tags.append((time.time(), label))

    def plot(self):
        """Draw a 2x2 grid (cpu, read, memory, write) with tags as vertical text."""
        import matplotlib.pyplot as plt
        from matplotlib.ticker import FuncFormatter

        def use_byte_labels(ax):
            # A formatter keeps labels in sync with wherever the ticks end
            # up; fixed label strings on auto-located ticks do not.
            ax.yaxis.set_major_formatter(
                FuncFormatter(lambda value, _pos: bytes2human(value)))

        fig, axes = plt.subplots(2, 2, figsize=(12, 8))

        ax = axes[0][0]
        ax.plot(self.df['cpu'])
        ax.set_title('cpu')

        ax = axes[1][0]
        ax.plot(self.df['memory'])
        ax.set_title('memory')
        use_byte_labels(ax)

        ax = axes[0][1]
        ax.plot(self.df['read_bytes'], label='bytes')
        ax.plot(self.df['read_chars'], label='chars')
        ax.set_title('read')
        ax.legend()
        use_byte_labels(ax)

        ax = axes[1][1]
        ax.plot(self.df['write_bytes'], label='bytes')
        ax.plot(self.df['write_chars'], label='chars')
        ax.set_title('write')
        ax.legend()
        use_byte_labels(ax)

        # Tags hold absolute times; shift them onto the 'elapsed' axis.
        t0 = self.df['time'].iloc[0]
        for ax in axes.flatten():
            # Anchor tag text at the lowest plotted value of this panel.
            y = min(l.get_data()[1].min() for l in ax.lines)
            for tag in self.tags:
                ax.text(tag[0] - t0, y, tag[1], rotation='vertical')

    def dump(self, filepath):
        """Write tags and samples (as CSV text) to `filepath` as JSON."""
        d = {'tags': self.tags,
             'data': self.df.to_csv()}
        with open(filepath, 'w') as f:
            json.dump(d, f)

    @classmethod
    def load(cls, filepath):
        """Return a monitor whose ``tags`` and ``df`` come from a ``dump()`` file.

        Tags come back as two-item lists, because JSON has no tuple type.
        """
        import io
        import pandas as pd
        with open(filepath) as f:
            d = json.load(f)
        m = cls()
        m.tags = d['tags']
        m.df = pd.read_csv(io.StringIO(d['data'])).set_index('elapsed')
        return m

In [None]:
# TEST
from tempfile import TemporaryFile, NamedTemporaryFile

def use_cpu(t):
    """Spin in a tight loop for about `t` seconds of wall-clock time."""
    started = time.time()
    while True:
        if not (time.time() - started < t):
            break
        x = 1  # throwaway work, only there to keep the loop busy

def use_mem(s, n):
    """Grow a list in `n` steps, adding `s` million items and then sleeping 1 s per step.

    The list is local and is not returned.
    """
    hoard = []
    for _step in range(n):
        hoard.extend([1] * s * 1_000_000)
        time.sleep(1)

def write(f, size_mb):
    """Write b'a' bytes to `f` from offset 0 until at least `size_mb` MiB are written.

    Data goes out in 8 KiB blocks and `f` is flushed after every block.
    """
    target_bytes = size_mb * 2**20
    chunk = b'a' * (8 * 2**10)
    f.seek(0)
    written = 0
    while written < target_bytes:
        written += f.write(chunk)
        f.flush()

def read(f):
    """Read `f` from the start until `peek()` returns nothing, discarding the data.

    Each request asks for 8 KiB. `f` must offer `peek()`, as buffered binary
    readers do.
    """
    chunk_size = 8 * 2**10
    f.seek(0)
    while True:
        if not f.peek():
            break
        f.read(chunk_size)

# Scripted workload for the monitor. Each phase is bracketed by a pair of tags
# ('v' where it begins, '^' where it ends) so it can be located on the plots.
mon = ResourceMonitor(interval=0.1)  # watches this process, one sample per 0.1 s
mon.start()
time.sleep(2)  # idle stretch before the first phase
mon.tag('cpu v')
use_cpu(2)  # busy loop for 2 s
mon.tag('cpu ^')
time.sleep(1)
mon.tag('mem1 v')
use_mem(30, 2)  # two steps of 30 million list items each
mon.tag('mem1 ^')
time.sleep(1)
mon.tag('mem2 v')
use_mem(10, 2)  # two steps of 10 million list items each
mon.tag('mem2 ^')
time.sleep(1)
with TemporaryFile() as tf:
    mon.tag('write v')
    write(tf, 1000)  # 1000 MiB in 8 KiB blocks
    mon.tag('write ^')
    time.sleep(1)
    mon.tag('read v')
    read(tf)  # read the same file back from the start
    mon.tag('read ^')
time.sleep(1)  # idle tail so the last phase is followed by a few samples
mon.stop()
mon.plot()

In [None]:
# TEST serialization
# Round trip: record a short run, dump it to a file, load it back and plot the loaded copy.
m1 = ResourceMonitor(interval=0.2)
m1.start()
time.sleep(1)
m1.tag('start')
use_cpu(2)
m1.tag('stop')
time.sleep(1)
m1.stop()

with NamedTemporaryFile() as tf:
    # Only the temp file's name is used: dump() and load() open the path themselves.
    # NOTE(review): reopening a NamedTemporaryFile by name while it is still open is
    # platform-dependent according to the tempfile docs - confirm if Windows matters.
    m1.dump(tf.name)
    m2 = ResourceMonitor.load(tf.name)
    m2.plot()