In [None]:
import re
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Utils
def get_files(path, files):
    """Return the names of the entries of `path` that match the regex `files`.

    The pattern is applied with `match`, i.e. anchored at the start of each
    entry name. As a side effect the process working directory is switched to
    `path`; the returned names are bare names relative to that directory.
    """
    pattern = re.compile(files)
    # The directory change is intentional: the names returned below are only
    # meaningful relative to `path`.
    os.chdir(path)
    return [name for name in os.listdir('.') if pattern.match(name) is not None]

def get_meta(l, d):
    """Extract run metadata from the text of one log and store it in `d`.

    Parameters
    ----------
    l : str
        Complete text of a log file.
    d : dict
        Dictionary that is updated in place.

    Keys written
    ------------
    'iterations', 'threads', 'batch size' : int
        Only set when the corresponding line is found in `l`.
    'layers' : set of str
        Always set; the names captured after "Creating layer" (empty set
        when no such line exists).

    Returns
    -------
    dict
        The same `d` object that was passed in.
    """
    # Raw strings keep \s and \d as regex escapes instead of (invalid)
    # Python string escapes.
    layer_re = re.compile(r'.+ Creating layer (.+)')
    scalar_patterns = (
        ('iterations', re.compile(r'Testing for\s+(\d+)\s+iterations.')),
        ('threads', re.compile(r'Number of OpenMP threads: (\d+)')),
        ('batch size', re.compile(r'batch_size: (\d+)')),
    )

    for key, pattern in scalar_patterns:
        m = pattern.search(l)
        if m is not None:
            d[key] = int(m.group(1))

    # findall() always returns a list (possibly empty), so no guard is needed.
    d['layers'] = set(layer_re.findall(l))
    return d


def get_data(l, entry):
    """Parse timings and per-layer memory from a log text into `entry`.

    Parameters
    ----------
    l : str
        Complete text of a log file.
    entry : dict
        Updated in place. Must already contain 'layers' (iterable of layer
        names) and, if the log has a "Total Time" line, 'iterations'.

    Keys written (each only when the matching line is found)
    ---------------------------------------------------------
    'avg forward', 'avg backward', 'total time' : float
        Logged millisecond values divided by 1e3.
    'time per iteration' : float
        'total time' divided by entry['iterations'].
    '<layer> forward', '<layer> backward' : float
        Per-layer timings, millisecond values divided by 1e3.
    '<layer> memory' : int
        The first "Memory required for data" value that follows the layer's
        "Creating layer <layer>" line.

    Raises
    ------
    KeyError
        If 'layers' is missing, or 'iterations' is missing while the log
        contains a "Total Time" line.
    """
    # total timing
    fd_re = re.compile(r'.+Average Forward pass:\s+(.+)\s+ms.')
    bd_re = re.compile(r'.+Average Backward pass:\s+(.+)\s+ms.')
    total_re = re.compile(r'.+Total Time:\s+(.+)\s+ms.')

    m = fd_re.search(l)
    if m is not None:
        entry['avg forward'] = float(m.group(1)) / 1e3
    m = bd_re.search(l)
    if m is not None:
        entry['avg backward'] = float(m.group(1)) / 1e3
    m = total_re.search(l)
    if m is not None:
        entry['total time'] = float(m.group(1)) / 1e3
        entry['time per iteration'] = entry['total time'] / entry['iterations']

    # layers timing
    ltime_re = re.compile(r'.+\s+(\w+)\s+(forward|backward):\s*(.+)\s+ms.')
    entry.update({
        lname + ' ' + direction: float(val) / 1e3
        for lname, direction, val in ltime_re.findall(l)
    })

    # layers memory footprint
    for layer in entry['layers']:
        lmem_re = re.compile(
            # re.escape: layer names are literal text, not regex fragments.
            # (?!\S): the name must end here, so 'conv1' does not hit 'conv10'.
            # .*? (non-greedy): stop at the first memory line after the layer
            # instead of running on to the last one in the log.
            r'Creating layer\s+' + re.escape(layer) + r'(?!\S)'
            r'.*?Memory required for data:\s+(\d+)',
            re.DOTALL,
        )
        m = lmem_re.search(l)
        if m is not None:
            entry[layer + ' memory'] = int(m.group(1))

def get_df(flist):
    """Parse the given log files into a dataframe with one row per file.

    Parameters
    ----------
    flist : iterable of str
        Paths of the log files to read (opened as given, so relative names
        are resolved against the current working directory).

    Returns
    -------
    pandas.DataFrame
        One row per file, holding whatever get_meta() and get_data() stored
        for it plus a 'file name' column. Empty when `flist` is empty.
    """
    records = []
    for f in flist:
        with open(f, 'r') as fp:
            txt = fp.read()
        entry = dict()
        get_meta(txt, entry)
        get_data(txt, entry)
        entry['file name'] = f
        records.append(entry)
    # Build the frame once from a list of records: every dict becomes exactly
    # one row, so the 'layers' set is stored as a single cell value rather
    # than being handed to the constructor as column data.
    return pd.DataFrame(records)

def normalize_batches(df):
    """Return a copy of `df` whose timing columns are divided by 'batch size'.

    Timing columns are those whose name contains 'ward' or 'time'; all other
    columns, and `df` itself, are left unchanged.
    """
    per_sample = df.copy()
    for col in df.columns.values:
        is_timing = 'ward' in col or 'time' in col
        if not is_timing:
            continue
        per_sample.loc[:, col] = df.loc[:, col] / df.loc[:, 'batch size']
    return per_sample

def normalize_time(df):
    """Return a copy of `df` with timings as a percentage of the iteration time.

    Columns whose name contains 'ward' or 'time per iteration' are replaced by
    100 * value / 'time per iteration'. The 'total time' column is removed
    from the result; `df` itself is left unchanged.
    """
    pct = df.copy()
    for col in df.columns.values:
        if 'ward' in col or 'time per iteration' in col:
            pct.loc[:, col] = 100.0 * df.loc[:, col] / df.loc[:, 'time per iteration']
    del pct['total time']
    return pct

def group_small_entries(df, threas):
    """Fold insignificant timing columns into a single 'others' column.

    A column whose name contains 'ward' is folded when, after
    normalize_time(), at least one of its rows is not above `threas`
    (a percentage of 'time per iteration'). Folded columns are added to
    'others' and dropped from the result; `df` itself is left unchanged.
    """
    shares = normalize_time(df)
    grouped = df.copy()
    grouped['others'] = 0.0
    for col in [c for c in df.columns.values if 'ward' in c]:
        if (shares[col] > threas).all():
            # Above the threshold in every row: keep as its own column.
            continue
        grouped['others'] = grouped['others'] + grouped[col]
        del grouped[col]
    return grouped

def plot_batch_scaling(df, threas):
    """Plot stacked per-layer time against batch size.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'batch size' column and the timing columns expected by
        normalize_batches() and group_small_entries().
    threas : float
        Threshold forwarded to group_small_entries(); columns that fall
        below it are merged into 'others'.

    Returns
    -------
    The axes object returned by DataFrame.plot, with a base-2 log x axis.
    """
    df_norm_batch = normalize_batches(df)
    df_filt = group_small_entries(df_norm_batch, threas)

    # Per-layer forward/backward columns (the 'avg ...' totals are excluded)
    # plus the aggregated remainder.
    layers_cols = [s for s in df_filt.columns.values if 'ward' in s and 'avg' not in s]
    layers_cols.append('others')
    plt_data = df_filt.set_index('batch size')[layers_cols].sort_index()

    ax = plt_data.plot(marker='o', stacked=True)
    # The keyword for the log base was renamed from `basex` to `base` in
    # matplotlib; try the current spelling first and fall back to the old one.
    try:
        ax.set_xscale('log', base=2)
    except (TypeError, ValueError):
        ax.set_xscale('log', basex=2)
    ax.set_ylabel('Stacked time per batch size per iter. (seconds)')
    ax.set_xlabel('Batch size', size=20)
    return ax

def plot_thread_scaling(df, threas):
    """Plot per-layer time against the number of threads on log-log axes.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'threads' column and the timing columns expected by
        group_small_entries().
    threas : float
        Threshold forwarded to group_small_entries(); columns that fall
        below it are merged into 'others'.

    Returns
    -------
    The axes object returned by DataFrame.plot, with a base-2 log x axis and
    a log y axis.
    """
    df_filt = group_small_entries(df, threas)

    # Per-layer forward/backward columns (the 'avg ...' totals are excluded)
    # plus the aggregated remainder.
    layers_cols = [s for s in df_filt.columns.values if 'ward' in s and 'avg' not in s]
    layers_cols.append('others')
    plt_data = df_filt.set_index('threads')[layers_cols].sort_index()

    ax = plt_data.plot(marker='o')
    # The keyword for the log base was renamed from `basex` to `base` in
    # matplotlib; try the current spelling first and fall back to the old one.
    try:
        ax.set_xscale('log', base=2)
    except (TypeError, ValueError):
        ax.set_xscale('log', basex=2)
    ax.set_yscale('log')
    ax.set_ylabel('Time per iter. (seconds)')
    ax.set_xlabel('Threads #')
    return ax

In [None]:
# KNL batch-scaling figure.
# get_files() also switches the working directory to the log directory, so the
# bare file names it returns can be opened by get_df(), and the PDF saved at
# the end of this cell is written into that same directory.
# NOTE(review): the absolute path is machine-specific. In '.*.out$' the dot
# before "out" is unescaped, so it matches any character, not only '.'.
flist = get_files('/Users/tmalas/Desktop/caffe/atlas_caffe_scripts/batch_scaling_136th_knl/','.*.out$')
df = get_df(flist)
# 3.0 is the `threas` argument: the percentage cut-off below which layer
# columns are merged into 'others'.
ax = plot_batch_scaling(df, 3.0)
ax.set_title('KNL batch scaling (Stacked, sums to total time)')
# Request a y axis that starts at zero.
ax.set_ylim(ymin=0)
plt.savefig('KNL batch scaling.pdf', format='pdf')

In [None]:
# Haswell batch-scaling figure.
# get_files() also switches the working directory to the log directory, so the
# bare file names it returns can be opened by get_df(), and the PDF saved at
# the end of this cell is written into that same directory.
# NOTE(review): the absolute path is machine-specific. In '.*.out$' the dot
# before "out" is unescaped, so it matches any character, not only '.'.
flist = get_files('/Users/tmalas/Desktop/caffe/atlas_caffe_scripts/batch_scaling_16threads_hsw/','.*.out$')
df = get_df(flist)
# 3.0 is the `threas` argument: the percentage cut-off below which layer
# columns are merged into 'others'.
ax = plot_batch_scaling(df, 3.0)
ax.set_title('Haswell batch scaling (Stacked, sums to total time)')
# Request a y axis that starts at zero.
ax.set_ylim(ymin=0)
plt.savefig('Haswell batch scaling.pdf', format='pdf')

In [None]:
# Haswell thread-scaling figure.
# get_files() also switches the working directory to the log directory, so the
# bare file names it returns can be opened by get_df(), and the PDF saved at
# the end of this cell is written into that same directory.
# NOTE(review): the absolute path is machine-specific. In '.*.out$' the dot
# before "out" is unescaped, so it matches any character, not only '.'.
flist = get_files('/Users/tmalas/Desktop/caffe/atlas_caffe_scripts/thread_scaling_1batch_hsw/','.*.out$')
df = get_df(flist)
# 3.0 is the `threas` argument: the percentage cut-off below which layer
# columns are merged into 'others'.
ax = plot_thread_scaling(df, 3.0)
ax.set_title('Haswell thread scaling')
# NOTE(review): plot_thread_scaling() puts the y axis on a log scale, where a
# lower limit of 0 cannot be represented; presumably this call has no visible
# effect -- confirm whether it is still wanted.
ax.set_ylim(ymin=0)
plt.savefig('Haswell thread scaling.pdf', format='pdf')

In [None]:
# KNL thread-scaling figure.
# get_files() also switches the working directory to the log directory, so the
# bare file names it returns can be opened by get_df(), and the PDF saved at
# the end of this cell is written into that same directory.
# NOTE(review): the absolute path is machine-specific. In '.*.out$' the dot
# before "out" is unescaped, so it matches any character, not only '.'.
flist = get_files('/Users/tmalas/Desktop/caffe/atlas_caffe_scripts/thread_scaling_1batch_knl/','.*.out$')
df = get_df(flist)
# 3.0 is the `threas` argument: the percentage cut-off below which layer
# columns are merged into 'others'.
ax = plot_thread_scaling(df, 3.0)
ax.set_title('KNL thread scaling')
# NOTE(review): plot_thread_scaling() puts the y axis on a log scale, where a
# lower limit of 0 cannot be represented; presumably this call has no visible
# effect -- confirm whether it is still wanted.
ax.set_ylim(ymin=0)
plt.savefig('KNL thread scaling.pdf', format='pdf')