In [1]:
# Initial setup

# Run the matplotlib configuration only the first time this cell executes:
# while `plt_inline` is still undefined, evaluating the bare name raises
# NameError and the except branch performs the setup.
try:
    plt_inline
except NameError:
    ## Matplotlib one-time configuration
    %matplotlib inline
    # Sentinel: once defined, re-running this cell skips the configuration.
    plt_inline = True
    
from inc.cache_effect import *

In [2]:
# Show when it happens in profiling and "full" execution.

In [2]:
## Load K-means experiments
# NOTE(review): SparkParser is not defined in this notebook; presumably it is
# provided by the star import of inc.cache_effect - confirm.
parser = SparkParser()
# Materialize the parsed applications in a tuple: later cells iterate over
# `apps` more than once.
apps = tuple(parser.parse_folder('data/hibench/kmeans'))

In [214]:
def input2samples(apps, first_k_samples=32):
    """Build a lookup from stage-0 input size to number of samples (in thousands).

    The distinct ``stages[0].bytes_read`` values found in ``apps`` are sorted
    in ascending order. The smallest one is mapped to ``first_k_samples`` and
    every following size to twice the value of the previous one.

    Args:
        apps: iterable of objects exposing ``stages[0].bytes_read``.
        first_k_samples: value (in thousands of samples) assigned to the
            smallest input size. Defaults to 32, the value that used to be
            hard-coded here.

    Returns:
        A one-argument callable that takes a ``bytes_read`` value and returns
        the matching number of samples in thousands. It raises ``KeyError``
        for a size that was not present in ``apps``.
    """
    sizes = sorted({app.stages[0].bytes_read for app in apps})
    # The rank of a size in the sorted list is the number of doublings to apply.
    k_samples_by_bytes = {size: first_k_samples * 2 ** rank
                          for rank, size in enumerate(sizes)}

    def to_samples(b):
        return k_samples_by_bytes[b]

    return to_samples

def is_target(app):
    """Tell whether the first stage of ``app`` read more than 8 * 10**6 bytes."""
    threshold_bytes = 8 * 10**6
    first_stage = app.stages[0]
    return first_stage.bytes_read > threshold_bytes

def filter_app(app):
    """Return whether ``app`` is kept for the analysis (delegates to ``is_target``)."""
    keep = is_target(app)
    return keep

def get_df_records(apps, to_samples):
    """Flatten applications into rows and column names for a DataFrame.

    Each row holds the application's ``slaves`` value, the result of
    ``to_samples`` applied to its stage-0 ``bytes_read`` and, for every stage
    in order, ``bytes_read / 1024`` followed by ``bytes_written / 1024``.

    Args:
        apps: iterable of objects exposing ``slaves`` and ``stages``; every
            stage exposes ``bytes_read`` and ``bytes_written``. One-shot
            iterables such as generators are accepted.
        to_samples: callable mapping a stage-0 ``bytes_read`` value to the
            value stored in the ``samples`` column.

    Returns:
        A ``(rows, cols)`` tuple: ``rows`` is a list of lists and ``cols`` the
        list of column names
        ``['workers', 'samples', 'stage0read', 'stage0written', ...]``.

    Raises:
        AssertionError: if the applications do not all have the same number
            of stages (this includes an empty ``apps``).
    """
    # `apps` is traversed twice below; without this copy a generator would be
    # exhausted by the first pass and the function would return no rows.
    apps = list(apps)

    stage_counts = {len(app.stages) for app in apps}
    assert len(stage_counts) == 1, (
        'all apps must have the same number of stages, got counts: {}'
        .format(sorted(stage_counts)))
    n_stages = stage_counts.pop()

    # ['workers', 'samples', 'stage0read', 'stage0written', 'stage1read', ...]
    cols = ['workers', 'samples'] + ['stage{:d}{}'.format(stage, typ)
                                     for stage in range(n_stages)
                                     for typ in ('read', 'written')]
    rows = []
    for app in apps:
        row = [app.slaves, to_samples(app.stages[0].bytes_read)]
        for stage in app.stages:
            row.extend([stage.bytes_read / 1024, stage.bytes_written / 1024])
        rows.append(row)
    return rows, cols

In [215]:
# The size -> samples lookup is built from *all* parsed runs, while the
# DataFrame only contains the runs accepted by filter_app.
to_samples = input2samples(apps)
target_apps = list(filter(filter_app, apps))
records, cols = get_df_records(target_apps, to_samples)
df = pd.DataFrame.from_records(records, columns=cols)

In [217]:
# Pick the first 8-worker run, among the target runs, whose stage-0 input is
# the largest one seen across all parsed runs.
uniq_inputs = sorted({a.stages[0].bytes_read for a in apps})
input_size = uniq_inputs[-1]
my_app = next(
    (app for app in target_apps
     if app.slaves == 8 and app.stages[0].bytes_read == input_size),
    None)
if my_app is not None:
    print('slaves: {:d}'.format(my_app.slaves))
    print('samples: {:d}k'.format(to_samples(input_size)))
    print('filename: ' + my_app.filename)
else:
    print('Could not find')

slaves: 8
samples: 16384k
filename: ../data/hibench/kmeans/app-20151121043329-0000


In [219]:
# Rows recorded with 8 workers.
df.loc[df['workers'] == 8]

Unnamed: 0,workers,samples,stage0read,stage0written,stage1read,stage1written,stage2read,stage2written,stage3read,stage3written,stage4read,stage4written,stage5read,stage5written,stage6read,stage6written,stage7read,stage7written,stage8read,stage8written,stage9read,stage9written,stage10read,stage10written,stage11read,stage11written,stage12read,stage12written,stage13read,stage13written,stage14read,stage14written,stage15read,stage15written,stage16read,stage16written,stage17read,stage17written,stage18read,stage18written,stage19read,stage19written,stage20read,stage20written,stage21read,stage21written,stage22read,stage22written,stage23read,stage23written,stage24read,stage24written,stage25read,stage25written
245,8,64,12551.55,0.0,24500.16,0.0,14000.16,0.0,28000.31,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,14000.16,3.07,3.07,0.0,14000.16,5.53,5.53,0.0,14000.16,5.53,5.53,0.0,14000.16,5.53,5.53,0.0,14000.16,5.53,5.53,0.0,14000.16,5.53,5.53,0.0,12250.08,0.0
246,8,64,12551.55,0.0,24500.16,0.0,14000.16,0.0,28000.31,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,14000.16,3.23,3.23,0.0,14000.16,5.51,5.51,0.0,14000.16,5.51,5.51,0.0,14000.16,5.51,5.51,0.0,14000.16,5.51,5.51,0.0,14000.16,5.51,5.51,0.0,12250.08,0.0
247,8,64,12551.55,0.0,24500.16,0.0,14000.16,0.0,28000.31,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,14000.16,3.11,3.11,0.0,14000.16,5.51,5.51,0.0,14000.16,5.51,5.51,0.0,14000.16,5.51,5.51,0.0,14000.16,5.51,5.51,0.0,14000.16,5.51,5.51,0.0,12250.08,0.0
248,8,64,12551.55,0.0,24500.16,0.0,14000.16,0.0,28000.31,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,14000.16,3.17,3.17,0.0,14000.16,5.53,5.53,0.0,14000.16,5.53,5.53,0.0,14000.16,5.53,5.53,0.0,14000.16,5.53,5.53,0.0,14000.16,5.53,5.53,0.0,12250.08,0.0
249,8,64,12551.55,0.0,24500.16,0.0,14000.16,0.0,28000.31,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,15750.23,0.0,14000.16,3.0,3.0,0.0,14000.16,5.52,5.52,0.0,14000.16,5.52,5.52,0.0,14000.16,5.52,5.52,0.0,14000.16,5.53,5.53,0.0,14000.16,5.52,5.52,0.0,12250.08,0.0
250,8,128,25102.72,0.0,49000.16,0.0,28000.16,0.0,56000.31,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,28000.16,3.09,3.09,0.0,28000.16,5.54,5.54,0.0,28000.16,5.54,5.54,0.0,28000.16,5.54,5.54,0.0,28000.16,5.54,5.54,0.0,28000.16,5.54,5.54,0.0,24500.08,0.0
251,8,128,25102.72,0.0,49000.16,0.0,28000.16,0.0,56000.31,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,28000.16,3.1,3.1,0.0,28000.16,5.53,4.78,0.0,28000.16,5.52,5.52,0.0,28000.16,5.52,5.52,0.0,28000.16,5.53,5.53,0.0,28000.16,5.53,5.53,0.0,24500.08,0.0
252,8,128,25102.72,0.0,49000.16,0.0,28000.16,0.0,56000.31,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,28000.16,3.07,3.07,0.0,28000.16,5.54,4.25,0.0,28000.16,5.54,5.54,0.0,28000.16,5.54,5.54,0.0,28000.16,5.54,5.54,0.0,28000.16,5.54,5.54,0.0,24500.08,0.0
253,8,128,25102.72,0.0,49000.16,0.0,28000.16,0.0,56000.31,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,28000.16,3.07,3.07,0.0,28000.16,5.55,5.55,0.0,28000.16,5.54,5.54,0.0,28000.16,5.54,5.54,0.0,28000.16,5.54,5.54,0.0,28000.16,5.54,5.54,0.0,24500.08,0.0
254,8,128,25102.72,0.0,49000.16,0.0,28000.16,0.0,56000.31,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,31500.23,0.0,28000.16,3.04,3.04,0.0,28000.16,5.52,5.52,0.0,28000.16,5.52,5.52,0.0,28000.16,5.52,5.52,0.0,28000.16,5.52,5.52,0.0,28000.16,5.52,5.52,0.0,24500.08,0.0


In [167]:
# Standard deviation of stage1read within each (workers, samples) group,
# keeping the groups in order of first appearance.
df.groupby(['workers', 'samples'], sort=False)['stage1read'].std()

workers  samples
1        64          0.00
         128         0.00
         256         0.00
         512         0.00
         1024        0.00
         2048        0.00
         4096        0.00
2        64          0.00
         128         0.00
         256         0.00
         512         0.00
         1024        0.00
         2048        0.00
         4096        0.00
3        64          0.00
         128         0.00
         256         0.00
         512         0.00
         1024        0.00
         2048        0.00
         4096        0.00
4        64          0.00
         128         0.00
         256         0.00
         512         0.00
         1024        0.00
         2048        0.00
         4096        0.00
5        64          0.00
         128         0.00
         256         0.00
         512         0.00
         1024        0.00
         2048        0.00
         4096        0.00
6        64          0.00
         128         0.00
         256         

In [176]:
# For every (workers, samples) group and every stage column, print the spread
# (max - min) relative to the minimum, as a whole-number percentage, whenever
# the absolute spread exceeds 0.01.
stages = [col for col in df.columns if col.startswith('stage')]
for (workers, samples), group in df.groupby(['workers', 'samples']):
    for stage in stages:
        values = group[stage]
        lowest = values.min()
        diff = values.max() - lowest
        if diff > 0.01:
            print('{:.0f}%'.format(diff / lowest * 100))

8%
13%
41%
18%
14%
34%
24%
30%
21%
24%
35%
17%
14%
26%
26%
24%
32%
12%
0%
2%
2%
7%
7%
10%
17%
16%
20%
16%
19%
31%
3%
3%
3%
3%
3%
7%
14%
26%
7%
10%
9%
11%
11%
7%
13%
10%
6%
10%
9%
13%
13%
14%
12%
0%
8%
6%
8%
2%
7%
8%
9%
10%
14%
9%
12%
11%
14%
4%
5%
4%
5%
4%
