# Plot filesystem workload tests

In [None]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import pandas as pd
from collections import defaultdict
import sys
import yaml
import math
import glob
import os
import json
from tqdm.notebook import tqdm
from IPython.display import display,Image
from datetime import date
from ncar_branding import *

# Decimal (power-of-1000) scale factors. Judging by the names these convert
# a kilobyte count into MB / GB / TB when used as a divisor -- TODO confirm,
# nothing in the visible cells uses them.
kb2mb = 10**3
kb2gb = kb2mb * 1000
kb2tb = kb2gb * 1000

In [None]:
# Shared notebook state: filled in by the loader functions, read by the plots.
times = defaultdict(list)    # path -> elapsed values gathered from every file for that path
paths, hosts = set(), set()  # every path / host named in a CSV header line
fname2path = dict()          # CSV file name -> path named in that file's header
dfs = dict()                 # CSV file name -> DataFrame indexed by 'Date'

In [None]:
def load_metadata():
    """Scan the git-clone CSV logs and return the overall date range.

    Each file matching ``logs/*-GIT_CLONE.csv`` is opened twice: once to read
    the host and path from its header (the first line containing ``'# '``,
    split on whitespace; token 1 is the host, token 3 the path), and once with
    pandas to collect the values of its ``Date`` column.

    Side effects: the module-level ``times`` and ``dfs`` are cleared, and
    ``hosts``, ``paths`` and ``fname2path`` are updated with what was found.

    Returns:
        tuple: ``(earliest, latest)`` date seen across all files.

    Raises:
        ValueError: if a file has no usable header line, or if no dates were
            found at all (no matching files, or only empty ones).
    """
    alldates = set()
    times.clear()
    dfs.clear()

    dtype = {'Elapsed(s)': 'Int64'}

    for filename in tqdm(glob.glob('logs/*-GIT_CLONE.csv'),
                         desc='Processing CSVs',
                         unit='file'):

        host = None
        path = None
        with open(filename, 'r') as f:
            for line in f:
                if '# ' in line:
                    fields = line.strip().split()
                    # Only trust the header when both tokens are present.
                    if len(fields) > 3:
                        host = fields[1]
                        path = fields[3]
                    break

        # Explicit check rather than `assert`, which is stripped under -O.
        if not host or not path:
            raise ValueError(
                'no usable "# <host> ... <path>" header line in ' + filename)

        hosts.add(host)
        paths.add(path)
        fname2path[filename] = path

        # Parse dates exactly as load_data() does so both readers agree on
        # how the same files are interpreted.
        df = pd.read_csv(filename,
                         skipinitialspace=True,
                         dtype=dtype,
                         comment='#',
                         parse_dates=['Date'],
                         date_format='ISO8601',
                         )
        df.set_index('Date', inplace=True)
        # Comparisons against NaT are always False, which would make the
        # min()/max() below depend on iteration order; leave NaT out.
        alldates.update(df.index.dropna())

    if not alldates:
        raise ValueError("no dates found in files matching 'logs/*-GIT_CLONE.csv'")

    return (min(alldates), max(alldates))

In [None]:
def load_data(pattern):
    """Load every CSV matching ``pattern`` into the shared notebook state.

    For each file, the host and path are read from its header (the first line
    containing ``'# '``, split on whitespace; token 1 is the host, token 3 the
    path). The file is then parsed with pandas, indexed by its ``Date``
    column, and stored in ``dfs[filename]``. The non-missing ``Elapsed(s)``
    values are appended to ``times[path]``.

    Side effects: ``times`` and ``dfs`` are cleared first; ``hosts``,
    ``paths`` and ``fname2path`` are updated.

    Args:
        pattern: glob pattern selecting the CSV files to read.

    Returns:
        tuple: ``(smallest, largest)`` elapsed value seen across all files.

    Raises:
        ValueError: if a file has no usable header line, or if no elapsed
            values were found (no matching files, or only empty/missing data).
    """
    alltimes = set()
    times.clear()
    dfs.clear()

    dtype = {'Elapsed(s)': 'Int64'}

    for filename in tqdm(glob.glob(pattern),
                         desc='Processing CSVs',
                         unit='file'):

        host = None
        path = None
        with open(filename, 'r') as f:
            for line in f:
                if '# ' in line:
                    fields = line.strip().split()
                    # Only trust the header when both tokens are present.
                    if len(fields) > 3:
                        host = fields[1]
                        path = fields[3]
                    break

        # Explicit check rather than `assert`, which is stripped under -O.
        if not host or not path:
            raise ValueError(
                'no usable "# <host> ... <path>" header line in ' + filename)

        hosts.add(host)
        paths.add(path)
        fname2path[filename] = path

        df = pd.read_csv(filename,
                         skipinitialspace=True,
                         dtype=dtype,
                         comment='#',
                         parse_dates=['Date'],
                         date_format='ISO8601',
                         )
        df.set_index('Date', inplace=True)

        # The full frame (including rows with a missing elapsed time) is kept
        # for the history plots.
        dfs[filename] = df

        # 'Int64' is nullable: pd.NA cannot be ordered by min()/max() and
        # cannot be binned, so aggregate only the values that are present.
        elapsed = df['Elapsed(s)'].dropna().to_list()
        alltimes.update(elapsed)
        times[path].extend(elapsed)

    if not alltimes:
        raise ValueError('no elapsed times found in files matching ' + repr(pattern))

    return (min(alltimes), max(alltimes))

In [None]:
def histogram(items,xlabel,plot_prefix=''):
    """Draw and save a stacked histogram of elapsed times, one series per path.

    Reads the module-level ``times`` (path -> elapsed values), ``colors``
    (shortened path -> color), and ``fastest`` / ``slowest`` (bin range).
    The figure is shown, then written to ``<plot_prefix>histogram.png``.

    Args:
        items: paths to include; each must already be a key of ``times``.
        xlabel: label for the x axis.
        plot_prefix: prefix prepended to the output file name.
    """
    fig, ax = plt.subplots(1, 1, figsize=(12,6))
    format_ax(ax)

    # 21 edges -> 20 equal-width bins between the fastest and slowest sample.
    edges = np.linspace(fastest, slowest, 21)

    samples = []
    labels = []
    palette = []
    for path in sorted(items):
        # `times` is a defaultdict, so guard against silently creating a key.
        assert path in times
        samples.append(times[path])

        # Legend labels and color keys use the shortened path.
        short = path.replace('/benkirk/fs_tests','').replace('/cisl/csg','')
        labels.append(short)
        palette.append(colors[short])

    ax.hist(samples,
            bins=edges,
            zorder=10,
            alpha=1,
            stacked=True,
            label=labels,
            color=palette)

    ax.set_ylim(-1,None)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Count (#)')
    ax.legend(fancybox=True, loc=1)

    plt.show()
    fig.savefig(plot_prefix+'histogram.png', bbox_inches='tight', dpi=150, transparent=False)
    plt.close()

In [None]:
def history(items,title,ylabel,plot_prefix=''):
    """Plot each measurement as a horizontal segment over time and save it.

    For every loaded file (``dfs``) whose path is in ``items``, each row is
    drawn at height ``Elapsed(s)`` from its timestamp to timestamp + elapsed,
    and annotated with ``'c'`` when the file name contains ``'casper'``,
    otherwise ``'d'``. Only the first segment drawn for a path carries a
    legend label and a visible (width 6) line; later segments have zero line
    width and are visible only through their letter annotation.

    Reads the module-level ``dfs``, ``fname2path``, ``colors`` and
    ``time_bounds``. The figure is shown, then written to
    ``<plot_prefix>history.png``.

    Args:
        items: paths to include.
        title: plot title; a title containing ``'CESM git clone'`` forces the
            y range to 0..1250.
        ylabel: label for the y axis.
        plot_prefix: prefix prepended to the output file name.
    """
    # Materialize once: constant-time lookups, and one-shot iterables work too.
    selected = set(items)

    fig, ax = plt.subplots(1, 1,
                           figsize=(12,6))
    format_ax(ax)

    plotted_already = set()

    for k in sorted(dfs):
        full_path = fname2path[k]
        if full_path not in selected:
            continue
        p = full_path.replace('/benkirk/fs_tests','').replace('/cisl/csg','')
        letter = 'c' if 'casper' in k else 'd'

        # Iterate (timestamp, value) pairs rather than df.loc[t, ...]: with a
        # repeated timestamp .loc returns a Series instead of a scalar.
        for start, duration in dfs[k]['Elapsed(s)'].items():
            if pd.isna(duration):
                continue  # no elapsed time recorded for this row; nothing to draw
            end = start + pd.to_timedelta(duration, unit='seconds')

            first = p not in plotted_already
            ax.plot([start, end],
                    [duration, duration],
                    color=colors[p],
                    label=p if first else None,
                    linewidth=6 if first else 0)
            plotted_already.add(p)

            ax.annotate(letter,
                        (start,duration),
                        ha='center',
                        va='center',
                        color=colors[p],
                        weight='bold',
                        size=16)

    ax.set_xlim(time_bounds[0].value, time_bounds[1].value)
    ax.set_ylim(-1,None)
    if 'CESM git clone' in title:
        ax.set_ylim(0,1250)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.set_xlabel('Date & Time')
    ax.legend(fancybox=True, loc=2)
    ax.tick_params(axis='x', labelrotation=60)

    plt.show()
    fig.savefig(plot_prefix+'history.png', bbox_inches='tight', dpi=150, transparent=False)
    plt.close(fig)
    return

In [None]:
def fullhistory(items,title,ylabel,plot_prefix=''):
    """Plot the measurement history on a broken y axis and save it.

    Same drawing scheme as a single-axes history plot (one horizontal segment
    per row, annotated ``'c'`` when the file name contains ``'casper'``, else
    ``'d'``), but drawn on two stacked axes that share the x axis: the top
    one (``ax1``) spans ``mean + std`` .. ``max`` for the outliers, the bottom
    one (``ax2``) spans ``0`` .. ``2 * mean`` for the bulk of the data.

    Reads the module-level ``dfs``, ``fname2path``, ``colors`` and
    ``time_bounds``. The figure is shown, then written to
    ``<plot_prefix>fullhistory.png``.

    ref: https://matplotlib.org/stable/gallery/subplots_axes_and_figures/broken_axis.html

    Args:
        items: paths to include.
        title: title placed on the top axes.
        ylabel: y label placed on the bottom axes.
        plot_prefix: prefix prepended to the output file name.
    """
    # Materialize once: constant-time lookups, and one-shot iterables work too.
    selected = set(items)

    # If we were to simply plot pts, we'd lose most of the interesting
    # details due to the outliers. So let's 'break' or 'cut-out' the y-axis
    # into two portions - use the top (ax1) for the outliers, and the bottom
    # (ax2) for the details of the majority of our data
    fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True,
                                  figsize=(12,6))
    fig.subplots_adjust(hspace=0.05)  # adjust space between Axes

    format_ax(ax1)
    format_ax(ax2)

    plotted_already = set()
    yvals = []

    for k in sorted(dfs):
        full_path = fname2path[k]
        if full_path not in selected:
            continue
        p = full_path.replace('/benkirk/fs_tests','').replace('/cisl/csg','')
        letter = 'c' if 'casper' in k else 'd'

        # Iterate (timestamp, value) pairs rather than df.loc[t, ...]: with a
        # repeated timestamp .loc returns a Series instead of a scalar.
        for start, duration in dfs[k]['Elapsed(s)'].items():
            if pd.isna(duration):
                continue  # no elapsed time recorded for this row; nothing to draw
            end = start + pd.to_timedelta(duration, unit='seconds')

            yvals.append(duration)

            # Decide once per sample, before touching either axes, so both
            # halves of the broken axis draw the sample identically.
            first = p not in plotted_already
            for ax in ax1, ax2:
                ax.plot([start, end],
                        [duration, duration],
                        color=colors[p],
                        label=p if first else None,
                        linewidth=6 if first else 0)

                ax.annotate(letter,
                            (start,duration),
                            ha='center',
                            va='center',
                            color=colors[p],
                            weight='bold',
                            size=16)
            plotted_already.add(p)

    ax1.set_xlim(time_bounds[0].value, time_bounds[1].value)
    ax2.set_xlim(time_bounds[0].value, time_bounds[1].value)

    # With nothing selected there are no statistics to derive limits from;
    # leave matplotlib's default y ranges in place instead of failing.
    if yvals:
        mean = np.mean(yvals)
        peak = np.max(yvals)
        std = np.std(yvals)
        ax1.set_ylim(mean+std, peak)
        ax2.set_ylim(0, 2*mean)

    ax2.set_ylabel(ylabel)
    ax1.set_title(title)
    ax2.set_xlabel('Date & Time')
    ax1.legend(fancybox=True, loc=2)
    ax2.tick_params(axis='x', labelrotation=60)

    # hide the spines between ax1 and ax2
    ax1.spines.bottom.set_visible(False)
    ax2.spines.top.set_visible(False)
    ax1.xaxis.tick_top()
    ax1.tick_params(labeltop=False)  # don't put tick labels at the top
    ax2.xaxis.tick_bottom()

    # Now, let's turn towards the cut-out slanted lines.
    # We create line objects in axes coordinates, in which (0,0), (0,1),
    # (1,0), and (1,1) are the four corners of the Axes.
    # The slanted lines themselves are markers at those locations, such that the
    # lines keep their angle and position, independent of the Axes size or scale
    # Finally, we need to disable clipping.
    d = .5  # proportion of vertical to horizontal extent of the slanted line
    kwargs = dict(marker=[(-1, -d), (1, d)], markersize=12,
                  linestyle="none", color='k', mec='k', mew=1, clip_on=False)
    ax1.plot([0, 1], [0, 0], transform=ax1.transAxes, **kwargs)
    ax2.plot([0, 1], [1, 1], transform=ax2.transAxes, **kwargs)

    plt.show()
    fig.savefig(plot_prefix+'fullhistory.png', bbox_inches='tight', dpi=150, transparent=False)
    plt.close(fig)
    return

In [None]:
import ipywidgets as widgets
def selection(list=None,selected=None):
    """Display a multi-select widget over ``list`` and return it.

    Args:
        list: the available options; shown sorted, one row per option.
        selected: the options selected initially; when empty or omitted,
            everything in ``list`` starts out selected.

    Returns:
        The displayed ``widgets.SelectMultiple`` instance.
    """
    # NOTE: the parameter name shadows the builtin `list` inside this function.
    options = list
    preselected = selected if selected else options

    picker = widgets.SelectMultiple(
        options=sorted(options),
        value=preselected,
        rows=len(options),
        description='CSVs',
        layout={'width': 'initial'},
        disabled=False,
    )
    display(picker)
    return picker


def pick_date_range(date_bounds):
    """Display begin/end datetime pickers seeded from ``date_bounds``.

    Args:
        date_bounds: indexable pair; element 0 seeds the 'Begin Time' picker
            and element 1 the 'End Time' picker.

    Returns:
        tuple: the ``(begin, end)`` picker widgets.
    """
    def make_picker(caption, position):
        # One picker, initialised from the requested end of the range.
        return widgets.NaiveDatetimePicker(description=caption,
                                           value=date_bounds[position],
                                           disabled=False)

    begin = make_picker('Begin Time', 0)
    end = make_picker('End Time', 1)
    display(begin, end)
    return (begin, end)

In [None]:
# Discover the hosts/paths and the overall date range from the git-clone logs.
date_bounds = load_metadata()
print(date_bounds)

# Give every (shortened) path a matplotlib cycle color: 'C0', 'C1', ...
# `paths` is a set, whose iteration order is not guaranteed to be the same
# from one kernel session to the next; sorting keeps the path -> color
# assignment stable, so plots from different runs stay comparable.
colors = {}
for idx, p in enumerate(sorted(paths)):
    p = p.replace('/benkirk/fs_tests','').replace('/cisl/csg','')
    colors[p] = 'C{}'.format(idx)
print(colors)

In [None]:
# Widgets for choosing the plotted time window and the paths to include.
time_bounds = pick_date_range(date_bounds)

# Every path is offered; those containing 'homefoobar' start out unselected.
sel = selection(sorted(paths),
                selected=[path for path in sorted(paths)
                          if 'homefoobar' not in path])

In [None]:
# Echo the current widget state: begin time, end time and the selected paths.
print(time_bounds[0].value,'\n', 
      time_bounds[1].value,'\n',
      sel.value)

## Test 1 - git clone times

In [None]:
# Load the git-clone logs into the shared state. `fastest` / `slowest` are the
# smallest and largest elapsed values returned by load_data(); histogram()
# reads them as its bin range.
fastest, slowest = load_data('logs/*-GIT_CLONE.csv')

In [None]:
# Test 1 plots for the currently selected paths; output files are prefixed
# with 'git_clone_'.
label = 'CESM git clone + ./manage_externals/checkout_externals'
histogram(sel.value,
          xlabel=label,
          plot_prefix='git_clone_')
history(sel.value,
        title=label,
        ylabel='CESM clone time (s)',
        plot_prefix='git_clone_')
fullhistory(sel.value,
            title=label,
            ylabel='CESM clone time (s)',
            plot_prefix='git_clone_')

## Test 2 - `tar cv`

In [None]:
# Load the tar logs into the shared state. `fastest` / `slowest` are the
# smallest and largest elapsed values returned by load_data(); histogram()
# reads them as its bin range.
fastest, slowest = load_data('logs/*-TAR.csv')

In [None]:
# Test 2 plots for the currently selected paths; output files are prefixed
# with 'tar_cv_'.
label = 'tar cf my_cesm_sandbox.tar my_cesm_sandbox/'
histogram(sel.value,
          xlabel=label,
          plot_prefix='tar_cv_')
history(sel.value,
        title=label,
        ylabel='tar creation time (s)',
        plot_prefix='tar_cv_')

## Test 3 - `dd if=/dev/zero of=0.dat bs=1M count=24000`

In [None]:
# Load the dd logs into the shared state. `fastest` / `slowest` are the
# smallest and largest elapsed values returned by load_data(); histogram()
# reads them as its bin range.
fastest, slowest = load_data('logs/*-DD.csv')

In [None]:
# Test 3 plots for the currently selected paths; output files are prefixed
# with 'dd_'.
label='dd if=/dev/zero of=0.dat bs=1M count=24000'
histogram(sel.value, xlabel=label,plot_prefix='dd_')
# The y axis shows the 'Elapsed(s)' column, so name the quantity and unit,
# as the git-clone and tar plots do.
history(sel.value, title=label, ylabel='dd 25GB write time (s)',plot_prefix='dd_')

## Test 4 - `rsync` (single file system)

In [None]:
# Load the rsync logs into the shared state. `fastest` / `slowest` are the
# smallest and largest elapsed values returned by load_data(); histogram()
# reads them as its bin range.
fastest, slowest = load_data('logs/*-RSYNC.csv')

In [None]:
# Test 4 plots for the currently selected paths; output files are prefixed
# with 'rsync_'.
label='rsync (SRC/DEST on same file system)'
histogram(sel.value, xlabel=label,plot_prefix='rsync_')
# The y axis shows the 'Elapsed(s)' column, so name the quantity and unit,
# as the git-clone and tar plots do.
history(sel.value, title=label, ylabel='rsync time (s)',plot_prefix='rsync_')

## Test 6 - cleanup

In [None]:
# Load the cleanup logs into the shared state. `fastest` / `slowest` are the
# smallest and largest elapsed values returned by load_data(); histogram()
# reads them as its bin range.
fastest, slowest = load_data('logs/*-CLEANUP.csv')

In [None]:
# Cleanup-test plots for the currently selected paths; output files are
# prefixed with 'cleanup_'.
label='cleanup'
histogram(sel.value, xlabel=label,plot_prefix='cleanup_')
# The y axis shows the 'Elapsed(s)' column, so name the quantity and unit,
# as the git-clone and tar plots do.
history(sel.value, title=label, ylabel='cleanup time (s)',plot_prefix='cleanup_')
fullhistory(sel.value, title=label, ylabel='cleanup time (s)',plot_prefix='cleanup_')