## Setting up visualization portion of project

In [1]:
# visualization tools
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from bokeh.plotting import figure, output_file, show
from bokeh.models import ColumnDataSource
from bokeh.models.tools import HoverTool

In [2]:
# data tools
import datetime as dt
import numpy as np
from random import sample
from random import choice
from scipy import stats

In [3]:
# access to .csv job files
from os import listdir

In [4]:
# collect file names for all jobs
# (sorted: listdir() returns entries in arbitrary order, and later cells depend
#  on the order of these lists, e.g. when displaying all_jobs[0])
train_files = sorted( '../jobs/train/'+file for file in listdir('../jobs/train/') if file.startswith('comet') )
test_files = sorted( '../jobs/test/'+file for file in listdir('../jobs/test/') if file.startswith('comet') )

# training files first, then test files
all_files = [ *train_files, *test_files ]

In [5]:
len(all_files)

10617

### Parse for trends
**Notes:**

* Certain categories have been pre-selected for further analysis:
    * intel_hsw: Intel Haswell Processor (HSW) (per core)
    * intel_rapl: Running average power limit
    * ib: Infiniband usage
    * ib_sw: InfiniBand usage
    * ib_ext: Infiniband usage
    * llite: Lustre filesystem usage (per mount)
    * lnet: Lustre network usage
    * mdc: Lustre network usage
    * osc: Lustre filesystem usage
    * block: block device statistics (per device)
    * cpu: scheduler accounting (per CPU)
    * mem: memory usage (per socket)
    * net: network device usage (per device)
    * nfs: NFS system usage
    * proc: Process specific data (MaxRSS, executable name etc.)
    * sysv_shm: SysV shared memory segment usage
    * tmpfs: ram-backed filesystem usage (per mount)
    * vm: virtual memory statistics
* Extreme cases will also be evaluated

In [33]:
# every job DataFrame that passes the row-count filter in the loading loop
all_jobs = []
# the value returned by clean() for each of those jobs, appended in the same order
norm_jobs = []

In [42]:
# Short statistic key -> long label. The long labels are what the rest of the
# notebook looks up as DataFrame column names (ymax, plot_n, ...).
# NOTE(review): the "vfs" label uses underscores, while the frame shown further
# down has a column "Dentry/file/inode cache usage" -- confirm which spelling
# the CSV files actually use.
cols = {
    # processor counters
    "amd64_pmc": "AMD Opteron performance counters (per core)",
    "intel_hsw": "Intel Haswell Processor (HSW) (per core)",
    "intel_hsw_ht": "Intel Haswell Processor - Hyper-threaded (per logical core)",
    "intel_nhm": "Intel Nehalem Processor (NHM) (per core)",
    "intel_uncore": "Westmere Uncore (WTM) (per socket)",
    "intel_snb": "Intel Sandy Brige (SNB) or Ivy Bridge (IVB) Processor (per core)",
    "intel_rapl": "Running average power limit",
    "intel_hsw_cbo": "Caching Agent (CBo) for SNB (HSW) (per socket)",
    "intel_hsw_pcu": "Power Control Unit for SNB (HSW) (per socket)",
    "intel_hsw_imc": "Integrated Memory Controller for SNB (HSW) (per socket)",
    "intel_hsw_qpi": "QPI Link Layer for SNB (HSW) (per socket)",
    "intel_hsw_hau": "Home Agent Unit for SNB (HSW) (per socket)",
    "intel_hsw_r2pci": "Ring to PCIe Agent for SNB (HSW) (per socket)",
    # InfiniBand
    "ib": "Infiniband usage (default)",
    "ib_sw": "InfiniBand usage (sw)",
    "ib_ext": "Infiniband usage (ext)",
    # Lustre and MIC
    "llite": "Lustre filesystem usage (per mount)",
    "lnet": "Lustre network usage (lnet)",
    "mdc": "Lustre network usage (mdc)",
    "mic": "MIC scheduler account (per hardware thread)",
    "osc": "Lustre filesystem usage (osc)",
    # node-level statistics
    "block": "Block device statistics (per device)",
    "cpu": "Scheduler accounting (per CPU)",
    "mem": "Memory usage (per socket)",
    "net": "Network device usage (per device)",
    "nfs": "NFS system usage",
    "numa": "NUMA statistics (per socket)",
    "proc": "Process specific data (MaxRSS, executable name etc.)",
    "ps": "Process statistics",
    "sysv_shm": "SysV shared memory segment usage",
    "tmpfs": "Ram-backed filesystem usage (per mount)",
    "vfs": "Dentry_file_inode cache usage",
    "vm": "Virtual memory statistics",
}

In [None]:
def missing_cols ( jobs=None, known=None ):
    """
    List the column names found in the job frames that are not known labels.

    jobs:  iterable of DataFrames to inspect; defaults to the global all_jobs.
    known: iterable of accepted column labels; defaults to cols.values().

    Returns a list of the unknown column names, each reported once, in the
    order they are first encountered.
    """
    if jobs is None:
        jobs = all_jobs
    if known is None:
        known = cols.values()
    known = set( known )

    missing = []
    seen = set()
    for df in jobs:
        for col in df.columns:
            if col not in known and col not in seen:
                seen.add( col )
                missing.append( col )
    return missing

missing_cols()

In [None]:
def unused_cols ( ret=False, jobs=None, labels=None ):
    """
    Find the labels that do not appear as a column in any job frame.

    ret:    when True, return the unused labels as a list; when False (the
            default), print one label per line and return None.
    jobs:   iterable of DataFrames to inspect; defaults to the global all_jobs.
    labels: iterable of labels to look for; defaults to cols.values().
    """
    if jobs is None:
        jobs = all_jobs
    if labels is None:
        labels = cols.values()

    # `label in df` tests membership among the frame's columns
    unused = [ label for label in labels if not any( label in df for df in jobs ) ]

    if ret:
        return unused
    for label in unused:
        print( label )

unused_cols()

In [35]:
# normalize all data values in DataFrame
def clean ( df, columns=None ):
    """
    Return a copy of `df` with each listed column scaled by its largest
    absolute value, so the scaled values lie in [-1, 1].

    df:      job DataFrame; it is not modified.
    columns: labels of the columns to scale; defaults to cols.values().
             Labels that are absent from `df`, non-numeric columns, and
             columns whose largest absolute value is 0 or NaN are left as-is.
    """
    if columns is None:
        columns = cols.values()

    # work on a copy so the caller's raw frame stays usable
    normed = df.copy()

    for title in columns:
        if title not in normed:
            continue
        if not pd.api.types.is_numeric_dtype( normed[title] ):
            continue

        peak = normed[title].abs().max()
        # avoid dividing by zero (all-zero column) or by NaN (empty / all-NaN column)
        if pd.notna( peak ) and peak != 0:
            normed[title] = normed[title] / peak

    return normed

In [36]:
# extreme values
most_cycles = pd.DataFrame()   # job with the most rows seen so far
most_stats = pd.DataFrame()    # job with the most columns seen so far

for path in all_files:
    # `columns=` keyword: pandas 2.x no longer accepts the axis as a positional argument
    df = pd.read_csv( path ).drop(columns="Cycle")
    
    # make sure job ran for at least one hour (kept as more than 6 rows of data)
    if df.shape[0] > 6:
        all_jobs.append( df )
        norm_jobs.append( clean(df) )
    
        # find job with most cycles of collected stats
        if df.shape[0] > most_cycles.shape[0]:
            most_cycles = df
        # find job with most types of stats
        if df.shape[1] > most_stats.shape[1]:
            most_stats = df

In [37]:
len(all_jobs)

6098

In [38]:
most_cycles.shape

(526, 23)

In [39]:
most_stats.shape

(7, 26)

### Minor Data Cleaning

In [17]:
def ordinalize ( x ):
    """
    Format a number as an English ordinal string, e.g. 1 -> "1st", 12 -> "12th".

    x: anything int() accepts; the sign is discarded before formatting.
    """
    x = abs(int(x))
    
    # 11, 12 and 13 (and 111, 112, ...) take "th" despite ending in 1, 2 or 3
    if 11 <= x % 100 <= 13:
        suffix = "th"
    else:
        suffix = { 1: "st", 2: "nd", 3: "rd" }.get(x % 10, "th")
    
    return f"{x}{suffix}"

In [16]:
def cycl_mean_all ( ):
    """Mean row count over every frame in all_jobs, truncated to an int."""
    row_counts = [ job.shape[0] for job in all_jobs ]
    return int( np.mean(row_counts) )
    
def cycl_mean_spec ( col ):
    """Mean row count over the frames in all_jobs that contain column `col`, truncated to an int."""
    row_counts = [ job.shape[0] for job in all_jobs if col in job ]
    return int( np.mean(row_counts) )

In [18]:
# find the most frequent number of cycles for all jobs
def mode_all ( jobs=None ):
    """
    Return the most common row count among the job frames, as an int.

    jobs: iterable of DataFrames; defaults to the global all_jobs.

    When several row counts are equally common the smallest one is returned.
    Raises ValueError when there are no jobs.
    """
    if jobs is None:
        jobs = all_jobs

    lengths = [ df.shape[0] for df in jobs ]
    if not lengths:
        raise ValueError( "mode_all: no jobs to take the mode of" )

    # np.unique returns the values sorted, and argmax picks the first maximum,
    # so ties resolve to the smallest row count
    values, counts = np.unique( lengths, return_counts=True )
    return int( values[counts.argmax()] )

# find the most frequent number of cycles for a specific set
def mode_spec ( col, jobs=None ):
    """
    Return the most common row count among the job frames that contain
    column `col`, as an int.

    col:  column label a frame must have to be counted.
    jobs: iterable of DataFrames; defaults to the global all_jobs.

    When several row counts are equally common the smallest one is returned.
    Raises ValueError when no job contains `col`.
    """
    if jobs is None:
        jobs = all_jobs

    lengths = [ df.shape[0] for df in jobs if col in df ]
    if not lengths:
        raise ValueError( f"mode_spec: no job contains column {col!r}" )

    # np.unique returns the values sorted, and argmax picks the first maximum,
    # so ties resolve to the smallest row count
    values, counts = np.unique( lengths, return_counts=True )
    return int( values[counts.argmax()] )

In [19]:
def ymax ( col, jobs=all_jobs ):
    """
    Largest value of column `col` over the frames in `jobs` that contain it.

    The running maximum starts at 0, so 0 is returned when no frame has the
    column or when no column maximum is greater than 0.
    """
    peak = 0
    
    for frame in jobs:
        if col not in frame:
            continue
        candidate = frame[col].max()
        if candidate > peak:
            peak = candidate
    return peak

### matplotlib with seaborn

In [20]:
# palette names; the plotting loop picks one at random with choice(themes) for each figure
# NOTE(review): 'Blues' is listed twice, which makes it twice as likely to be drawn -- confirm this is intended
themes = ['inferno', 'ocean', 'tab20c', 'winter', 'summer', 'Wistia', 'hot', 'bone', 'pink',
         'BuGn', 'Blues', 'Purples', 'GnBu', 'YlGn', 'plasma', 'magma', 'viridis', 'BuPu', 'Blues']

In [22]:
def _plot_jobs ( col, jobs ):
    """Draw column `col` of each frame in `jobs` on the current figure, then apply the shared styling."""
    for job in jobs:
        # frames without this column are skipped instead of hiding every error
        if col in job:
            plt.plot( job[col] )
    
    plt.grid( True )
    plt.gcf().set_size_inches(15, 11)
    sns.despine()

def plot_all ( col, mode=None ):
    """
    Plot column `col` for every job whose row count equals `mode`.

    col:  column label to plot.
    mode: row count a job must have to be included; defaults to mode_all(),
          evaluated at call time so it reflects the jobs currently loaded.
    """
    if mode is None:
        mode = mode_all()
    
    modeList = [ df for df in all_jobs if df.shape[0] == mode and col in df ]
    _plot_jobs( col, modeList )
    
def plot_n ( col, n, mode=None ):
    """
    Plot column `col` for a random sample of jobs whose row count equals `mode`.

    col:  column label to plot.
    n:    number of jobs to draw; capped at the number of eligible jobs.
    mode: row count a job must have to be included; defaults to mode_all(),
          evaluated at call time so it reflects the jobs currently loaded.
    """
    if mode is None:
        mode = mode_all()
    
    # only jobs that actually have the column are eligible for the sample
    modeList = [ df for df in all_jobs if df.shape[0] == mode and col in df ]
    # random.sample raises ValueError when asked for more items than exist
    s = sample( modeList, min(n, len(modeList)) )
    _plot_jobs( col, s )

In [43]:
all_jobs[0]

Unnamed: 0,Block device statistics (per device),Scheduler accounting (per CPU),Infiniband usage (default),Infiniband usage (ext),InfiniBand usage (sw),Intel Haswell Processor (HSW) (per core),Caching Agent (CBo) for SNB (HSW) (per socket),Home Agent Unit for SNB (HSW) (per socket),Integrated Memory Controller for SNB (HSW) (per socket),Power Control Unit for SNB (HSW) (per socket),...,Memory usage (per socket),Network device usage (per device),NFS system usage,NUMA statistics (per socket),"Process specific data (MaxRSS, executable name etc.)",Process statistics,SysV shared memory segment usage,Ram-backed filesystem usage (per mount),Dentry/file/inode cache usage,Virtual memory statistics
0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,...,8574070000.0,0.0,0.0,0.0,231096.867,141.286,0.0,0.5,7446.667,0.0
1,1115006.636,8570.0,461.765,443951.3,1105668.0,49597210.0,5923230.0,181652100000.0,13253004.4,250490200000.0,...,8574947000.0,4644.696,0.0,14254.333,231096.867,27892.429,0.0,0.5,7859.667,1203765.667
2,1417809.364,17139.714,920.706,877918.5,2186563.0,100262300.0,12353320.0,363787500000.0,26476441.2,500975500000.0,...,8575076000.0,9477.826,0.0,29036.333,231096.867,56492.143,0.0,0.5,7908.0,1562784.81
3,2017781.0,25709.286,1394.176,1322207.5,3293010.0,160646500.0,19040760.0,545599800000.0,39772440.4,751442100000.0,...,8576113000.0,14803.435,0.0,51610.0,231096.867,85169.429,0.0,0.5,7958.333,2291466.81
4,2101977.727,34276.714,1867.471,1757810.5,4377899.0,389072500.0,25209580.0,727252000000.0,52994325.4,1002260000000.0,...,8576126000.0,19182.913,0.0,65591.0,231096.867,112765.0,0.0,0.5,7974.0,2399557.667
5,3300778.909,42846.143,22349.824,117035479.8,292395000.0,3524842000.0,159776400.0,915564800000.0,72911722.0,1248021000000.0,...,8982168000.0,24344.391,0.0,363554.333,231096.867,186725.0,0.0,65961986.5,9551.667,3832135.286
6,3418507.818,51417.286,22802.353,117464934.4,293464700.0,260674400000.0,4347285000.0,1364583000000.0,74422335.2,1251230000000.0,...,8982559000.0,30075.217,0.0,371884.0,231096.867,214513.0,0.0,65961986.5,9795.0,3966438.571


In [41]:
# print the largest recorded value of every statistic next to its label;
# ymax() starts from 0, so a label that no job contains prints 0
mode = mode_all()
# xrng is not used in this cell
xrng = np.arange(1, mode)
for trend in cols.values():
    print( ymax(trend), trend )
    # leftover experiments for deriving y-axis ticks from the maximum
    #print( "Max:", ymax(trend) )
    #print( "Step:", ymax(trend)/mode )
    #print( "Range:", np.arange(0, ymax(trend), ymax(trend)/mode))

0 AMD Opteron performance counters (per core)
1977491715928914.5 Intel Haswell Processor (HSW) (per core)
0 Intel Haswell Processor - Hyper-threaded (per logical core)
0 Intel Nehalem Processor (NHM) (per core)
0 Westmere Uncore (WTM) (per socket)
0 Intel Sandy Brige (SNB) or Ivy Bridge (IVB) Processor (per core)
253409740015.667 Running average power limit
1971806164793004.8 Caching Agent (CBo) for SNB (HSW) (per socket)
300712963864294.8 Power Control Unit for SNB (HSW) (per socket)
1576360837893429.5 Integrated Memory Controller for SNB (HSW) (per socket)
1478162135539331.2 QPI Link Layer for SNB (HSW) (per socket)
1981875537387140.8 Home Agent Unit for SNB (HSW) (per socket)
128158447077131.77 Ring to PCIe Agent for SNB (HSW) (per socket)
2512799741.6470003 Infiniband usage (default)
3711846531857209.0 InfiniBand usage (sw)
35621744981178.6 Infiniband usage (ext)
18014691515.485 Lustre filesystem usage (per mount)
1922091722275.833 Lustre network usage (lnet)
88527365.1 Lustre netw

In [None]:
# choose ranges
n = 25
mode = mode_all()
xrng = np.arange(1, mode)

for trend in cols.values():   # len(cols)
    # generate plot on a fresh figure, so each saved image holds one statistic only
    sns.set("notebook", palette=choice(themes), font_scale=1.5, rc={"lines.linewidth": 2.5})
    plt.figure()
    plot_n( trend, n )
    
    # label and format plot
    plt.suptitle(    trend, fontsize=25, fontweight='bold' )
    plt.title(       f"Sample of {n} jobs with {mode} cycles of data")
    plt.xticks(      xrng, [ ordinalize(i) for i in xrng ], fontsize='large' )
    plt.xlabel(      "Cycle Data Collected", labelpad=15, fontweight='bold', fontsize='x-large' )
    #plt.ylabel(      f"SUs", labelpad=15, fontweight='bold', fontsize='x-large' )
    
    # save (bbox_inches is the savefig keyword that trims surrounding whitespace)
    plt.savefig( f"./graphs/{trend}_sample.png", bbox_inches='tight' )
    
    # display, then release the figure so figures do not pile up in memory
    plt.show()
    plt.close()

In [None]:
# seaborn example: regression of total bill on party size, with the bills
# at each distinct size collapsed to their mean
tips = sns.load_dataset("tips")
ax = sns.regplot(x="size", y="total_bill", data=tips,
                 x_estimator=np.mean)

In [None]:
# seaborn example: second-order polynomial fit to Anscombe dataset "II"
ans = sns.load_dataset("anscombe")
ax = sns.regplot(x="x", y="y", data=ans.loc[ans.dataset == "II"],
                 scatter_kws={"s": 80},
                 order=2, ci=None, truncate=True)

In [None]:
sns.set()

# Load the iris dataset
iris = sns.load_dataset("iris")

# Plot sepal width as a function of sepal length, one regression per species
# (`height` is the current name of the lmplot argument formerly called `size`)
g = sns.lmplot(x="sepal_length", y="sepal_width", hue="species",
               truncate=True, height=5, data=iris)

# Use more informative axis labels than are provided by default
g.set_axis_labels("Sepal length (mm)", "Sepal width (mm)")

In [None]:
sns.set(style="whitegrid")

# Load the example diamonds dataset
diamonds = sns.load_dataset("diamonds")

# Order in which the clarity grades are assigned hues
clarity_ranking = ["I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF"]

# Square figure with the left and bottom spines removed
f, ax = plt.subplots(figsize=(6.5, 6.5))
sns.despine(fig=f, left=True, bottom=True)

# Scatter plot of price against carat: point color encodes clarity,
# point size encodes depth
sns.scatterplot(
    data=diamonds,
    x="carat",
    y="price",
    hue="clarity",
    hue_order=clarity_ranking,
    size="depth",
    sizes=(1, 8),
    palette="ch:r=-.2,d=.3_r",
    linewidth=0,
    ax=ax,
)

# Bokeh Visualizations

In [None]:
output_file('bokeh_example.html')

# Named job_sample rather than `sample`, so the `sample` function imported from
# random (which plot_n calls) is not overwritten when this cell runs.
# The whole frame is used: its columns are string labels, so `most_cycles[1]` is a KeyError.
# NOTE(review): confirm whether a single column or a row subset was intended here.
job_sample = most_cycles #.sample(50)
source = ColumnDataSource(job_sample)
p = figure()
#p.circle(x='TOTAL_TONS', y='AC_ATTACKING',
#         source=source,
#         size=10, color='green')
#p.title.text = 'Attacking Aircraft and Munitions Dropped'
#p.xaxis.axis_label = 'Tons of Munitions Dropped'
#p.yaxis.axis_label = 'Number of Attacking Aircraft'
#hover = HoverTool()
#hover.tooltips=[
#    ('Attack Date', '@MSNDATE'),
#    ('Attacking Aircraft', '@AC_ATTACKING'),
#    ('Tons of Munitions', '@TOTAL_TONS'),
#    ('Type of Aircraft', '@AIRCRAFT_NAME')
#]
#
#p.add_tools(hover)
#
#show(p)