In [6]:
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import pandas as pd
import re
from datetime import datetime, timedelta
from dateutil import parser

# Plotly inline: initialise plotly's offline notebook mode once, before any iplot() call,
# so that figures are rendered inside the notebook's output cells.
# NOTE(review): connected=True is understood to make plotly load plotly.js from its CDN
# instead of embedding it in the notebook - confirm against the installed plotly version.
init_notebook_mode(connected=True)

In [38]:
# Generate graphs
def parse_logs(filename_pairs_arr, hours=1):
    """Plot CellRanger resource-usage logs annotated with task names from CellRanger std out.

    Inputs:
        filename_pairs_arr: Iterable of [monitoring_log, std_out] filename pairs, e.g.
            [["monitoring_log.log", "corresponding_std_out.txt"],
             ["monitoring_log2.log", "corresponding_std_out2.txt"]]
        hours: Width of the initial x-axis window, in hours (default 1).
    Outputs:
        None. An inline plotly figure is rendered through iplot():
          * three line traces per monitoring log (core, memory and disk usage); legend
            entries are grouped by usage type;
          * the title lists the CPU / memory / disk capacities read from the LAST
            monitoring log that was processed;
          * every '(ready)' CellRanger task name is drawn as a text annotation
            (textangle -45) at the time it was logged, at a height cycling 40, 50, 60;
          * the x axis starts at the earliest monitoring timestamp across all logs and
            spans `hours` hours. If no log contains a timestamp the range is left unset.
    Raises:
        ValueError: if filename_pairs_arr contains no pairs.
        IndexError: if a monitoring log has no 'CPU:', 'Total Memory:' or
            'Total Disk space:' line.
    """
    # Plotly traces for every monitoring log
    data = []
    # Earliest sample time of every monitoring log
    earlys = []
    # [datetime, task_name] for every CellRanger task, across all std out files
    events = []
    # Stays None only when the loop body never runs (no pairs supplied)
    title = None

    for filename_pair in filename_pairs_arr:
        # monitoring log is the first file, std out the second
        monitor = filename_pair[0]
        std = filename_pair[1]

        events.extend(_read_ready_events(std))
        caps, samples = _read_monitor_log(monitor)

        # eg "Usage: 64 Cores, 236.0 GB Memory, 394.0 GB Disk Space"
        title = "Usage: {} Cores, {} GB Memory, {} GB Disk Space".format(*caps)

        # a dataframe makes handing the columns to plotly easier
        df = pd.DataFrame(data=samples)

        # NOTE(review): the labels below say "(GB)" while the y axis is titled "% Usage"
        # and the example log lines carry "%" - confirm which unit the monitor logs.
        # One trace per usage type; legendgroup ties together the same usage type
        # across different monitoring logs.
        for column, label in (('cores', 'core usage'),
                              ('memory usage (GB)', 'memory usage (GB)'),
                              ('disk usage (GB)', 'disk usage (GB)')):
            data.append(go.Scatter(
                x=df['time'],
                y=df[column],
                name=monitor + ' ' + label,
                legendgroup=label,
            ))

        # remember when this log started; a log without samples contributes nothing
        if samples['time']:
            earlys.append(min(samples['time']))

    if title is None:
        raise ValueError(
            "filename_pairs_arr must contain at least one [monitoring_log, std_out] pair")

    # TODO figure out a better way to make sure these annotations, which can be long, do not overlap
    # NOTE(review): std out timestamps carry no timezone while the example monitor
    # timestamps are marked UTC - confirm the two clocks line up on the x axis.
    ordered = sorted(events, key=lambda event: event[0])
    annotations = [
        dict(x=when, y=level, xref='x', yref='y', text=name,
             textangle=-45, showarrow=False)
        for (when, name), level in zip(ordered, _annotation_heights(len(ordered)))
    ]

    xaxis = dict(title='Time')
    if earlys:
        start = min(earlys)
        xaxis['range'] = [start, start + timedelta(hours=hours)]

    layout = dict(title=title,
                  xaxis=xaxis,
                  yaxis=dict(title='% Usage', range=[0, 100]),
                  annotations=annotations)
    # plot
    iplot({'data': data, 'layout': layout})


def _strip_to_number(text):
    """Return text with every character except digits, '.' and '$' removed."""
    return re.sub("[^$0-9.]", "", text)


def _read_ready_events(std_path):
    """Return [datetime, task_name] for each '(ready)' runtime line of a std out file."""
    found = []
    with open(std_path) as handle:
        for line in handle:
            # there is a bunch of meta output (copyright info etc.) that we ignore;
            # lines that matter have "[runtime]" in them
            if "[runtime]" not in line:
                continue
            # EXAMPLE LINE WE WANT
            # 2018-07-16 21:13:45 [runtime] (ready)           ID.HJCVJBGX5.MAKE_FASTQS_CS.MAKE_FASTQS.PREPARE_SAMPLESHEET
            fields = line.strip().split()
            # important tasks have '(ready)' right after '[runtime]'; lines too short
            # to carry that field are skipped instead of raising IndexError
            if len(fields) < 4 or fields[3] != '(ready)':
                continue
            when = parser.parse(' '.join(fields[0:2]))
            # the task name is the last dot-separated component of the task id
            task_name = ''.join(fields[4:]).split(".")[-1]
            found.append([when, task_name])
    return found


def _read_monitor_log(monitor_path):
    """Parse a monitoring log into ((cpu_cap, mem_cap, disk_cap), columns-of-samples dict)."""
    with open(monitor_path) as handle:
        lines = handle.readlines()

    # Capacities of the stats we monitor, taken from the first line mentioning each one.
    # A missing line raises IndexError.
    cpu_cap = int(_strip_to_number([line for line in lines if 'CPU:' in line][0]))
    mem_cap = float(_strip_to_number([line for line in lines if 'Total Memory:' in line][0]))
    disk_cap = float(_strip_to_number([line for line in lines if 'Total Disk space:' in line][0]))

    cpu, mem, disk, times = [], [], [], []
    # EXAMPLE MONITORING LINE GROUP
    # [Tue Jul 17 04:14:59 UTC 2018]
    # * CPU usage: 8.1%
    # * Memory usage: 6%
    # * Disk usage: 57%
    for line in lines:
        if 'CPU usage:' in line:
            cpu.append(float(_strip_to_number(line)))
        elif 'Memory usage:' in line:
            mem.append(float(_strip_to_number(line)))
        elif 'Disk usage:' in line:
            disk.append(float(_strip_to_number(line)))
        elif '[' in line:
            # any other line containing '[' is treated as a timestamp line
            times.append(parser.parse(line.replace("[", '').replace("]", '').strip()))

    # A log cut off in the middle of a sample group leaves the columns with unequal
    # lengths, which pandas rejects; keep only as many rows as every column has.
    complete = min(len(cpu), len(mem), len(disk), len(times))
    samples = {
        'cores': cpu[:complete],
        'memory usage (GB)': mem[:complete],
        'disk usage (GB)': disk[:complete],
        'time': times[:complete],
    }
    return (cpu_cap, mem_cap, disk_cap), samples


def _annotation_heights(count, minh=40, maxh=60, by=10):
    """Yield count heights cycling minh, minh + by, ... up to maxh, then starting over.

    Inputs:
        count: number of heights to yield
        minh: min height of annotation
        maxh: max height of annotation (included when reachable in steps of `by`)
        by: amount to increase height every time
    """
    # fall back to a single level when the range is empty (eg maxh < minh)
    levels = list(range(minh, maxh + 1, by)) or [minh]
    for position in range(count):
        yield levels[position % len(levels)]