In [1]:
import json
from datetime import date
from datetime import datetime,timedelta
import plotly.figure_factory as ff
from copy import deepcopy
import re
import numpy as np

In [2]:
def create_relative_timestamps(size,num_queues_gpu,num_queues_cpu,num_heads_on_cpu,log=True,folder='paramterised_dag_profiling'):
    """Load one profiling dump and rebase its timestamps onto a shared origin.

    The dump is read from
    ``../profiling/dumps_transformer/<folder>/<size>_GPU<g>_CPU<c>_<heads>.json``.
    It maps a kernel name to a dict holding ``"device"``, ``"cmdq"`` and one
    dict of timestamps per event type (``"write"``, ``"nd_range"``, ``"read"``).

    Parameters
    ----------
    size : number or 'default'
        Used to build the file name. Unless it is the string ``'default'`` it
        is converted to ``int(log2(size))`` when ``log`` is true and to
        ``int(size)`` otherwise.
    num_queues_gpu, num_queues_cpu, num_heads_on_cpu
        Inserted verbatim into the file name.
    log : bool
        Selects the ``size`` conversion described above.
    folder : str
        Sub-directory of ``dumps_transformer`` that holds the dump.

    Returns
    -------
    (total_time, relative_timestamps)
        ``relative_timestamps`` is a deep copy of the dump with every
        timestamp rebased; ``total_time`` is the largest rebased non-host
        timestamp (never below 0).

    Notes
    -----
    Three references are collected over all kernels whose reference
    ``device_start`` (``write`` for ``'gpu'``, ``nd_range`` for any other
    device) is not ``-1``:

    * per device, the smallest such ``device_start``;
    * per device, the smallest ``write.host_queued_end``;
    * globally, the smallest ``write.host_queued_start``.

    Sub-events whose name starts with ``host`` are shifted by the global
    reference; all other sub-events are moved from the device clock onto the
    host clock using the two per-device references, then shifted by the
    global reference.
    """
    if size != 'default':
        size = int(np.log(size)/np.log(2)) if log else int(size)

    dump_path = "../profiling/dumps_transformer/{4}/{2}_GPU{0}_CPU{1}_{3}.json".format(
        num_queues_gpu, num_queues_cpu, size, num_heads_on_cpu, folder)
    with open(dump_path) as f:
        timestamps = json.load(f)

    reference_device = {}
    reference_host = {}
    global_reference = None

    # Pass 1: find the earliest device / host timestamps to use as origins.
    for kernel_timestamps in timestamps.values():
        device = kernel_timestamps["device"]
        if device == 'gpu':
            t = kernel_timestamps["write"]["device_start"]
        else:
            t = kernel_timestamps["nd_range"]["device_start"]
        if t == -1:
            # No usable device timestamp: this kernel contributes no reference.
            continue
        reference_device[device] = min(reference_device.get(device, t), t)

        t = kernel_timestamps["write"]["host_queued_end"]
        reference_host[device] = min(reference_host.get(device, t), t)

        t = kernel_timestamps["write"]["host_queued_start"]
        # Compare against None explicitly: a timestamp of 0 is a valid
        # minimum and must not be mistaken for "not set yet".
        if global_reference is None:
            global_reference = t
        else:
            global_reference = min(global_reference, t)

    # Pass 2: rebase a copy so the loaded data stays untouched.
    relative_timestamps = deepcopy(timestamps)
    total_time = 0

    for kernel_timestamps in relative_timestamps.values():
        device = kernel_timestamps["device"]
        for event_type, event_timestamps in kernel_timestamps.items():
            if event_type in ("device", "cmdq"):
                continue
            for sub_event_type in event_timestamps:
                if sub_event_type[:4] == "host":
                    event_timestamps[sub_event_type] -= global_reference
                    continue
                event_timestamps[sub_event_type] = (
                    event_timestamps[sub_event_type]
                    - reference_device[device]
                    + reference_host[device]
                    - global_reference
                )
                # Only device-side timestamps count towards the total.
                total_time = max(total_time, event_timestamps[sub_event_type])

    return total_time, relative_timestamps

In [10]:
# Sweep the GPU / CPU command-queue counts for a fixed number of heads and
# report the fastest configuration relative to the 1-GPU-queue / 1-CPU-queue run.
size = 2
log = False
min_time = None
min_config = {}
folder = "paramterised_dag_total_heads"
# Single-argument print(...) behaves the same as the print statement.
print("In this block the transformer size is 256 (fixed), and number of heads is {}".format(size))
default,_ = create_relative_timestamps(size,num_queues_gpu=1,num_queues_cpu=1,num_heads_on_cpu=0,log=log,folder=folder)
for num_queues_gpu in range(2,6):
    for num_queues_cpu in range(1,6):
        for num_heads_on_cpu in range(0,1):
            # The former try/except re-raised on the first statement of its
            # handler (the rest was unreachable), so errors simply propagate.
            total_time,_ = create_relative_timestamps(size,num_queues_gpu,num_queues_cpu,num_heads_on_cpu,log=log,folder=folder)
            # "is None" rather than truthiness so a time of 0 is not treated as unset.
            if min_time is None or total_time < min_time:
                min_time = total_time
                min_config['num_heads_on_cpu'] = num_heads_on_cpu
                min_config['num_queues_gpu'] = num_queues_gpu
                min_config['num_queues_cpu'] = num_queues_cpu

            print("Num Queues - GPU {} CPU {} CPU Heads {}/{} Total Time {}".format(num_queues_gpu,num_queues_cpu,num_heads_on_cpu,size,total_time))
print("Best Configuration for number of heads {} is {}, %gain {}".format(size,min_config,100.0*(default-min_time)/default))

In this block the transformer size is 256 (fixed), and number of heads is 2
Num Queues - GPU 2 CPU 1 CPU Heads 0/2 Total Time 0.0262787342072
Num Queues - GPU 2 CPU 2 CPU Heads 0/2 Total Time 0.0263209342957
Num Queues - GPU 2 CPU 3 CPU Heads 0/2 Total Time 0.0255696773529
Num Queues - GPU 2 CPU 4 CPU Heads 0/2 Total Time 0.025509595871
Num Queues - GPU 2 CPU 5 CPU Heads 0/2 Total Time 0.0263116359711
Num Queues - GPU 3 CPU 1 CPU Heads 0/2 Total Time 0.0257225036621
Num Queues - GPU 3 CPU 2 CPU Heads 0/2 Total Time 0.0265057086945
Num Queues - GPU 3 CPU 3 CPU Heads 0/2 Total Time 0.0256674289703
Num Queues - GPU 3 CPU 4 CPU Heads 0/2 Total Time 0.0256571769714
Num Queues - GPU 3 CPU 5 CPU Heads 0/2 Total Time 0.0256013870239
Num Queues - GPU 4 CPU 1 CPU Heads 0/2 Total Time 0.0258896350861
Num Queues - GPU 4 CPU 2 CPU Heads 0/2 Total Time 0.0258727073669
Num Queues - GPU 4 CPU 3 CPU Heads 0/2 Total Time 0.0259118080139
Num Queues - GPU 4 CPU 4 CPU Heads 0/2 Total Time 0.0259444713593
N

In [73]:
# Sweep the number of command queues (same count on GPU and CPU) and the
# number of heads placed on the CPU; report the fastest configuration
# relative to the 1-queue / 0-CPU-heads run.
size = 256
log = True
min_time = None
min_config = {"num_heads_on_cpu" : 0,"num_queues" : 0}
default,_ = create_relative_timestamps(size,num_queues_gpu=1,num_queues_cpu=1,num_heads_on_cpu=0,log=log)
for num_queues in range(1,6):
    for num_heads_on_cpu in range(0,9):
        try:
            # Pass log explicitly so this call uses the same size encoding as
            # the baseline call above.
            total_time,_ = create_relative_timestamps(size,num_queues,num_queues,num_heads_on_cpu,log=log)
            # "is None" rather than truthiness so a time of 0 is not treated as unset.
            if min_time is None or total_time < min_time:
                min_time = total_time
                min_config['num_heads_on_cpu'] = num_heads_on_cpu
                min_config['num_queues'] = num_queues

            print("Num Queues - {} CPU Heads {}/8 Total Time {}".format(num_queues,num_heads_on_cpu,total_time))
        except Exception:
            # Report the best result found so far before giving up. Skip the
            # report when nothing succeeded yet: computing the gain with
            # min_time=None would raise and hide the real error.
            if min_time is not None:
                print("Best Configuration for size {} is {}, %gain {}".format(size,min_config,100.0*(default-min_time)/default))
            # Bare raise keeps the original traceback; KeyboardInterrupt is
            # not an Exception subclass and passes through untouched.
            raise
print("Best Configuration for size {} is {}, %gain {}".format(size,min_config,100.0*(default-min_time)/default))

Num Queues - 1 CPU Heads 0/8 Total Time 0.0941708087921
Num Queues - 1 CPU Heads 1/8 Total Time 0.0830750465393
Num Queues - 1 CPU Heads 2/8 Total Time 0.0847082138062
Num Queues - 1 CPU Heads 3/8 Total Time 0.0888340473175
Num Queues - 1 CPU Heads 4/8 Total Time 0.0949902534485
Num Queues - 1 CPU Heads 5/8 Total Time 0.106530666351
Num Queues - 1 CPU Heads 6/8 Total Time 0.109370470047
Num Queues - 1 CPU Heads 7/8 Total Time 0.114281654358
Num Queues - 1 CPU Heads 8/8 Total Time 0.115064382553
Num Queues - 2 CPU Heads 0/8 Total Time 0.0862436294556
Num Queues - 2 CPU Heads 1/8 Total Time 0.0783286094666
Num Queues - 2 CPU Heads 2/8 Total Time 0.081981420517
Num Queues - 2 CPU Heads 3/8 Total Time 0.086430311203
Num Queues - 2 CPU Heads 4/8 Total Time 0.0908203125
Num Queues - 2 CPU Heads 5/8 Total Time 0.105103254318
Num Queues - 2 CPU Heads 6/8 Total Time 0.109485387802
Num Queues - 2 CPU Heads 7/8 Total Time 0.113208532333
Num Queues - 2 CPU Heads 8/8 Total Time 0.119625329971
Num Q

In [41]:
# Choose which run to plot: the single-queue baseline (DEFAULT = True) or the
# best configuration stored in min_config by the queue / CPU-heads sweep.
# Relies on `size` and `min_config` (keys 'num_queues', 'num_heads_on_cpu')
# already being defined by that sweep cell.
DEFAULT = False
if DEFAULT:
    num_queues = 1
    selected_cpu_heads = 0
else:
    num_queues = min_config['num_queues']
    selected_cpu_heads = min_config['num_heads_on_cpu']

# create_relative_timestamps takes separate GPU and CPU queue counts; the
# sweep uses the same count for both, so do the same here.
total_time,relative_timestamps = create_relative_timestamps(
    size = size,
    num_queues_gpu = num_queues,
    num_queues_cpu = num_queues,
    num_heads_on_cpu = selected_cpu_heads)
    

    

In [42]:
def to_time_delta(timestamp):
    """Convert an offset in seconds into a ``datetime`` on today's date.

    The result is today's midnight (local time, from ``datetime.now()``) plus
    ``timestamp`` seconds, with the fractional part truncated to whole
    microseconds. Despite the name, a ``datetime`` is returned, not a
    ``timedelta``.

    Offsets of 60 seconds or more roll over into minutes and hours instead of
    raising ``ValueError``.
    """
    midnight = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
    whole_seconds = int(timestamp)
    # int() truncates the sub-second part to whole microseconds.
    microseconds = int((timestamp - float(whole_seconds)) * 1e6)
    return midnight + timedelta(seconds=whole_seconds, microseconds=microseconds)
    

In [43]:
def modify_task_labels(df):
    """Rewrite the ``'Task'`` label of each row in place to ``<id>_<kind>``.

    ``df`` is an iterable of dicts with a ``'Task'`` string. ``<id>`` is the
    first run of digits in the label and ``<kind>`` is chosen by the first
    matching rule:

    * starts with ``FFC``      -> ``gemm``
    * starts with ``empty``    -> ``copy``
    * starts with ``softmax``  -> ``softmax``
    * contains ``transpose``   -> ``transpose``

    Labels that match no rule, or that contain no digits, are left unchanged.
    Returns ``None``.
    """
    for item in df:
        label = item['Task']
        id_match = re.search(r'\d+', label)
        if id_match is None:
            # No numeric id to build a new label from; keep the row as it is.
            continue
        kernel_id = int(id_match.group())

        if label.startswith('FFC'):
            kind = 'gemm'
        elif label.startswith('empty'):
            kind = 'copy'
        elif label.startswith('softmax'):
            kind = 'softmax'
        elif 'transpose' in label:
            kind = 'transpose'
        else:
            continue
        item['Task'] = '{}_{}'.format(kernel_id, kind)
        

In [44]:
# Gantt chart with one row per kernel: the write, nd_range and read phases of
# every kernel in relative_timestamps are drawn as coloured spans.
df = []
kernels = {}  # not read or filled in this cell; binding kept unchanged

for kernel,events in relative_timestamps.items():
    dev = events["device"]
    Task = "{}-{}".format(kernel,dev)

    # A write/read span is drawn only when its device_start is positive
    # (presumably a non-positive value marks a transfer that did not run -
    # TODO confirm against the profiler output).
    if events["write"]["device_start"] > 0:
        df.append(dict(Task=Task,
                       Start=to_time_delta(events["write"]["device_start"]),
                       Finish=to_time_delta(events["write"]["device_end"]),
                       Resource='write'))

    # The kernel execution span is always drawn.
    df.append(dict(Task=Task,
                   Start=to_time_delta(events["nd_range"]["device_start"]),
                   Finish=to_time_delta(events["nd_range"]["device_end"]),
                   Resource='nd_range'))

    if events["read"]["device_start"] > 0:
        df.append(dict(Task=Task,
                       Start=to_time_delta(events["read"]["device_start"]),
                       Finish=to_time_delta(events["read"]["device_end"]),
                       Resource='read'))

colors = {'nd_range': 'rgb(220, 0, 0)',
          'read': (1, 0.9, 0.16),
          'write': 'rgb(0, 255, 100)',
          'overhead': 'rgb(0, 0, 255)'}

# Order rows by the first number in the task name, then shorten the labels.
# Raw string: "\d" is not a valid escape in a normal string literal.
df.sort(key = lambda row : int(re.findall(r'\d+',row['Task'])[0]))
modify_task_labels(df)
fig = ff.create_gantt(df, colors=colors, index_col='Resource', show_colorbar=True,
                      group_tasks=True,width=1000,height=750)

fig.layout['xaxis']['tickformat'] = '%S%L'
fig.layout['title'] = 'Transformer 2 heads -  GPU, CQ {0} Total Time {1:.4f}s'.format(num_queues,total_time)
fig.layout['xaxis_title'] = 'Time (milli seconds)'
fig.layout['yaxis_title'] = 'Kernel'
# To zoom into a window, set
# fig.layout['xaxis_range'] = [to_time_delta(t0), to_time_delta(t1)].
fig.layout['font'] = dict(
        family="Courier New, monospace",
        size=18,
        color="#7f7f7f")

fig.show()

In [46]:
# Gantt chart with one row per (device, command queue): every kernel's write,
# execution and read phases are placed on the queue it ran on, and the
# execution span is coloured by kernel kind.
df = []
kernels = {}  # not read or filled in this cell; binding kept unchanged

for kernel,events in relative_timestamps.items():
    dev = events["device"]
    cq = events["cmdq"]
    Task = "{}_{}".format(dev,cq)

    # A write/read span is drawn only when its device_start is positive
    # (presumably a non-positive value marks a transfer that did not run -
    # TODO confirm against the profiler output). The host-side queueing time
    # is no longer converted here because no "overhead" span is drawn.
    if events["write"]["device_start"] > 0:
        df.append(dict(Task=Task,
                       Start=to_time_delta(events["write"]["device_start"]),
                       Finish=to_time_delta(events["write"]["device_end"]),
                       Resource='write'))

    # Classify the kernel by name; anything unrecognised counts as a copy.
    if kernel.startswith('FFC'):
        exec_resource = 'gemm'
    elif "transpose" in kernel:
        exec_resource = 'transpose'
    elif 'softmax' in kernel:
        exec_resource = 'softmax'
    else:
        exec_resource = 'copy'

    # The kernel execution span is always drawn.
    df.append(dict(Task=Task,
                   Start=to_time_delta(events["nd_range"]["device_start"]),
                   Finish=to_time_delta(events["nd_range"]["device_end"]),
                   Resource=exec_resource))

    if events["read"]["device_start"] > 0:
        df.append(dict(Task=Task,
                       Start=to_time_delta(events["read"]["device_start"]),
                       Finish=to_time_delta(events["read"]["device_end"]),
                       Resource='read'))

colors = {'copy': '#ff00ff',
          'read': (1, 0.9, 0.16),
          'write': 'rgb(0, 255, 100)',
          'overhead': 'rgb(0, 0, 255)',
          'gemm': '#dd3069',
          'transpose': '#a9a9d9',
          'softmax': '#3399cc'}

# Rows are ordered by their "<device>_<queue>" label.
df.sort(key = lambda row : row['Task'])
fig = ff.create_gantt(df, colors=colors, index_col='Resource', show_colorbar=True,
                      group_tasks=True,width=1000,height=750)

fig.layout['xaxis']['tickformat'] = '%-L'
fig.layout['title'] = 'Transformer 2 heads -  GPU-GPU, CQ {0} Total Time {1:.4f}s'.format(num_queues,total_time)
fig.layout['xaxis_title'] = 'Time (milli seconds)'
fig.layout['yaxis_title'] = 'Command Queue'
# Show exactly the span from the common origin to the last device event.
fig.layout['xaxis_range']=[to_time_delta(0),to_time_delta(total_time)]
fig.layout['font'] = dict(
        family="Courier New, monospace",
        size=18,
        color="#7f7f7f")

fig.show()