In [1]:
import glob
import re
import os
import os.path
import tarfile

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [8]:
GDRIVE = '/content/drive'
DATA_DIR = '{}/MyDrive/ime/allan-matheus-mestrado/data/'.format(GDRIVE)
TEMP_DIR = '/content/data'

In [9]:
from google.colab import drive
drive.mount(GDRIVE)

Mounted at /content/drive


In [10]:
os.makedirs(TEMP_DIR, exist_ok=True)

In [11]:
with tarfile.open(os.path.join(DATA_DIR, 'csv-2019-12.tar.gz')) as archive:
    archive.extractall(TEMP_DIR)

In [12]:
!ls {TEMP_DIR}/csv-2019-12/ | head -n5

std-blast-instance-000002d3-5de7b3499975b01756272e65.out.monitor.csv
std-blast-instance-000002d4-5de7b34e9975b01756272e67.out.monitor.csv
std-blast-instance-000002d5-5de7bee49975b01756272e69.out.monitor.csv
std-blast-instance-000002d6-5de7c56a9975b01756272e6b.out.monitor.csv
std-blast-instance-000002d7-5de7d4779975b01756272e6d.out.monitor.csv


## Atributos derivados para classificação bibária

Gera os seguintes atributos:
 - janelas de deltas dos monitoramentos
 - valores instântaneos (anteriores)
 - atributo alvo: classe binária
 - metadados: application e task_id
 

In [14]:
# exemplo:
x = pd.DataFrame({'x': [10,30,20,90,50,40,80,70,80,60]})
x['x-1'] = x['x'].shift(1)
x['diff'] = x['x'].diff(1)
x['x-diff-3'] = x['diff'].shift(3)
x['x-diff-2'] = x['diff'].shift(2)
x['x-diff-1'] = x['diff'].shift(1)
x['x-up'] = (x['diff'] >= 0).astype(np.float)
# x['x-up+1o'] = x['x-up'].shift(-1)
x['x-up+1'] = (x['x'].diff(2) >= 0).astype(np.float)
# x['x-up+2o'] = x['x-up'].shift(-2)
x['x-up+2'] = (x['x'].diff(3) >= 0).astype(np.float)
x['x-baseline'] = x['x-up'].shift(1)
x['x-baseline+1'] = x['x-baseline'].shift(-1)
x['x-baseline+2'] = x['x-baseline'].shift(-2)
x.pop('x')
x.pop('diff')
x #.dropna()

Unnamed: 0,x-1,x-diff-3,x-diff-2,x-diff-1,x-up,x-up+1,x-up+2,x-baseline,x-baseline+1,x-baseline+2
0,,,,,0.0,0.0,0.0,,0.0,1.0
1,10.0,,,,1.0,0.0,0.0,0.0,1.0,0.0
2,30.0,,,20.0,0.0,1.0,0.0,1.0,0.0,1.0
3,20.0,,20.0,-10.0,1.0,1.0,1.0,0.0,1.0,0.0
4,90.0,20.0,-10.0,70.0,0.0,1.0,1.0,1.0,0.0,0.0
5,50.0,-10.0,70.0,-40.0,0.0,0.0,1.0,0.0,0.0,1.0
6,40.0,70.0,-40.0,-10.0,1.0,1.0,0.0,0.0,1.0,0.0
7,80.0,-40.0,-10.0,40.0,0.0,1.0,1.0,1.0,0.0,1.0
8,70.0,-10.0,40.0,-10.0,1.0,1.0,1.0,0.0,1.0,
9,80.0,40.0,-10.0,10.0,0.0,0.0,0.0,1.0,,


In [None]:
state = [1,2]
state.extend((pd.DataFrame({'x': [1,2,3,4,5,6,7,6,9]})
    .diff()
    .iloc[-3:]['x']
    #.shift(3).dropna()
    .values
))
state

[1, 2, 1.0, -1.0, 3.0]

In [5]:
x = pd.DataFrame({'x': [10,30,20,90,50,40,80,70,80,60]})
x['diff'] = x.x.diff()
x['diff-2'] = x.x.diff(2)
x['diff-3'] = x.x.diff(3)
x

Unnamed: 0,x,diff,diff-2,diff-3
0,10,,,
1,30,20.0,,
2,20,-10.0,10.0,
3,90,70.0,60.0,80.0
4,50,-40.0,30.0,20.0
5,40,-10.0,-50.0,20.0
6,80,40.0,30.0,-10.0
7,70,-10.0,30.0,20.0
8,80,10.0,0.0,40.0
9,60,-20.0,-10.0,-20.0


In [15]:
def transform(df, windows=3):
    """Create derived features"""
    original_columns = list(df.columns)
    # add window of deltas
    for column in df.columns:
        df['%s-1' % column] = df[column].shift()
        diff = df[column].diff(1)
        for i in reversed(range(1, 1 + windows)):
            df['%s-diff-%d' % (column, i)] = diff.shift(i)
    # add target (up?) and baseline prediction
    for target in original_columns:
        up = (df[target].diff(1) >= 0).astype(np.float)
        df['%s-up' % target] = up
        df['%s-baseline' % target] = up.shift(1)
        for i in list(range(1, windows)) + [20, 100, 200]:
            # df['%s-up+%d' % (target, i)] = df['%s-up' % target].shift(-i)
            # df['%s-baseline+%d' % (target, i)] = df['%s-baseline' % target].shift(-i)
            df['%s-up+%d' % (target, i)] = (df[target].diff(i + 1) >= 0).astype(np.float)
    # drop original columns
    for column in original_columns:
        df.pop(column)
    df.dropna(inplace=True)
    return df

In [16]:
# mesmo exemplo (para verificação)
transform(pd.DataFrame({'x': [10,30,20,90,50, 40,80,70,80,60]}))

Unnamed: 0,x-1,x-diff-3,x-diff-2,x-diff-1,x-up,x-baseline,x-up+1,x-up+2,x-up+20,x-up+100,x-up+200
4,90.0,20.0,-10.0,70.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
5,50.0,-10.0,70.0,-40.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6,40.0,70.0,-40.0,-10.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
7,80.0,-40.0,-10.0,40.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
8,70.0,-10.0,40.0,-10.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
9,80.0,40.0,-10.0,10.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# for extracting medatada from CSV filename
regex = re.compile(r'std-([^-]+)-instance-([^-]+)-([^-]+).out.monitor.csv')

In [18]:
def read_csv(filepath):
    """Read CSV and add metadata columns."""
    # parse metadata
    basename = os.path.basename(filepath)
    match = re.match(regex, basename)
    if not match:
        raise ValueError('bad formated filename "{}"'.format(filepath))
    app, vm, task = match.groups()
    df = pd.read_csv(filepath)
    # ignore memory due to ballooing
    df.pop('memory')
    # ifnore io parts, use only total
    df.pop('io_read')
    df.pop('io_write')
    # add derived features
    df = transform(df)
    # add metadata
    df['application'] = app
    # ignoring VM id
    #df['vm_id'] = vm
    df['task_id'] = task
    return df

## Gerar dataset

In [19]:
source = os.path.join(TEMP_DIR, 'csv-2019-12', 'std-*.out.monitor.csv')
files = glob.glob(source)
dfs = (read_csv(csv) for csv in files)
fulldf = pd.concat(dfs, ignore_index=True)
fulldf.head()

Unnamed: 0,cpu-1,cpu-diff-3,cpu-diff-2,cpu-diff-1,io_total-1,io_total-diff-3,io_total-diff-2,io_total-diff-1,cpu-up,cpu-baseline,cpu-up+1,cpu-up+2,cpu-up+20,cpu-up+100,cpu-up+200,io_total-up,io_total-baseline,io_total-up+1,io_total-up+2,io_total-up+20,io_total-up+100,io_total-up+200,application,task_id
0,74.62,0.0,-19.03,-6.35,8128739.56,0.0,3573418.67,4555320.89,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,hpl,5de729199975b01756272e47
1,74.84,-19.03,-6.35,0.22,11510518.52,3573418.67,4555320.89,3381778.96,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,hpl,5de729199975b01756272e47
2,67.21,-6.35,0.22,-7.63,10625276.84,4555320.89,3381778.96,-885241.68,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,hpl,5de729199975b01756272e47
3,57.7,0.22,-7.63,-9.51,5230676.28,3381778.96,-885241.68,-5394600.56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,hpl,5de729199975b01756272e47
4,34.23,-7.63,-9.51,-23.47,4585329.43,-885241.68,-5394600.56,-645346.85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,hpl,5de729199975b01756272e47


## Exportar dataset com atributos derivados

In [20]:
destination = os.path.join(DATA_DIR, 'binclf-new.csv')
fulldf.to_csv(destination, index=False)

### Consulta por aplicação e execução

In [None]:
fulldf[fulldf['application'] == 'blast'].head()

Unnamed: 0,cpu-1,cpu-diff-3,cpu-diff-2,cpu-diff-1,io_total-1,io_total-diff-3,io_total-diff-2,io_total-diff-1,cpu-up,cpu-baseline,cpu-up+1,cpu-baseline+1,cpu-up+2,cpu-baseline+2,io_total-up,io_total-baseline,io_total-up+1,io_total-baseline+1,io_total-up+2,io_total-baseline+2,application,task_id
57,61.28,-57.1,32.03,-13.65,5342094.22,-8192.0,2754218.67,2587875.55,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,blast,5de937781b1d9b3336951781
58,55.73,32.03,-13.65,-5.55,6013231.41,2754218.67,2587875.55,671137.19,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,blast,5de937781b1d9b3336951781
59,62.51,-13.65,-5.55,6.78,6193253.14,2587875.55,671137.19,180021.73,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,blast,5de937781b1d9b3336951781
60,71.8,-5.55,6.78,9.29,6675148.38,671137.19,180021.73,481895.24,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,blast,5de937781b1d9b3336951781
61,70.6,6.78,9.29,-1.2,7521348.13,180021.73,481895.24,846199.75,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,blast,5de937781b1d9b3336951781


In [None]:
query_app = fulldf['application'] == 'blast'
query_task = fulldf['task_id'] == '5deb0df01b1d9b3336951805'
sample = fulldf[query_app & query_task]
sample.head()

Unnamed: 0,cpu-1,cpu-diff-3,cpu-diff-2,cpu-diff-1,io_total-1,io_total-diff-3,io_total-diff-2,io_total-diff-1,cpu-up,cpu-baseline,cpu-up+1,cpu-baseline+1,cpu-up+2,cpu-baseline+2,io_total-up,io_total-baseline,io_total-up+1,io_total-baseline+1,io_total-up+2,io_total-baseline+2,application,task_id
11577,60.39,-58.1,33.37,-14.88,4149361.78,-14098944.0,4696917.33,-555747.55,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,blast,5deb0df01b1d9b3336951805
11578,39.46,33.37,-14.88,-20.93,5692624.59,4696917.33,-555747.55,1543262.81,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,blast,5deb0df01b1d9b3336951805
11579,39.79,-14.88,-20.93,0.33,6123248.2,-555747.55,1543262.81,430623.61,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,blast,5deb0df01b1d9b3336951805
11580,61.23,-20.93,0.33,21.44,6239482.73,1543262.81,430623.61,116234.53,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,blast,5deb0df01b1d9b3336951805
11581,63.74,0.33,21.44,2.51,7482451.58,430623.61,116234.53,1242968.85,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,blast,5deb0df01b1d9b3336951805
