In [None]:
# ============================================================
# Installazione librerie
# ============================================================

# %pip install psycopg2_binary
# %pip install ipython-sql
# %pip install stats
# %pip install nltk
# import nltk
# nltk.download('stopwords')
# %pip install cudf-cu11 dask-cudf-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
# %pip install cuml-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
# %pip install numba conda update numba
# %pip install hdbscan
# %pip install xgboost
# %pip install tensorflow

In [None]:
# NOTE(review): bare literal with no effect on later cells; presumably a leftover scratch value.
30

In [2]:
# ============================================================
# Notebook setup
# ============================================================

%load_ext sql
%load_ext autoreload
%autoreload 2

random_state = 42
figsize = (15,12)

import os, json
from sqlalchemy import create_engine
from ast import literal_eval
from datetime import datetime
import pandas as pd
import numpy as np
import scipy.stats as st
from util import query, util

# librerie grafiche
import seaborn as sns
sns.set_theme(palette="Set1")
from matplotlib import pyplot as plt

In [3]:
# ============================================================
# Connessione Postgres DB
# ============================================================

connstring = 'postgresql://accguy:accguy@192.168.1.17/htm'
engine = create_engine(connstring)
%sql postgresql://accguy:accguy@192.168.1.17/htm

In [10]:
# %%sql
# SELECT * from pg_stat_activity;

In [11]:
# %%sql
# SELECT pg_cancel_backend(410855)

# Runtime Prediction

Determinare la "categoria" di job dal comportamento del job nella sua prima ora di "vita".
- Basta un'ora per determinare la categoria del job?
- Il fatto che il job fallisca è rilevante?
- Sarà importante valutare il comportamento del job in relazione allo stato delle macchine
- Che subset prendiamo? Come varia il comportamento di un job nella sua prima ora nelle diverse settimane e mesi da Settembre 2021 a Dicembre 2021?

## Preprocessing

xt <= 180

* preprocessing panel data:
    * job, timestep, ram, swap, disk
    * job, ram_1, swap_1, disk_1, ... , ram_n, swap_n, disk_n

In [None]:
# Location of the zipped CSV cache and names of the columns used downstream.
PATH = '../data/out.zip'
TIME_SERIES_COLUMNS = ['ram', 'img', 'disk']
TIME_STEP_COLUMN = 't'
START_DATE, END_DATE = '2021-09-01', '2021-09-30'
MIN_RUNTIME = 7200

compression_opts = dict(method='zip', archive_name='out.csv')

if os.path.exists(PATH):
    print("CACHE")
    df = pd.read_csv(PATH)
    # The CSV stores every time series as the text of a Python literal;
    # literal_eval turns it back into a Python object.
    for COL in TIME_SERIES_COLUMNS:
        df[COL] = df[COL].apply(literal_eval)
else:
    print("DOWNLOAD")
    df = pd.read_sql(query.jobs_from_date_to_date, engine, params=([START_DATE, MIN_RUNTIME, END_DATE, MIN_RUNTIME, START_DATE, END_DATE, MIN_RUNTIME]))
    # Written before the index is set, so the cache keeps `mint` as a plain column.
    df.to_csv(PATH, index=False, compression=compression_opts)

# Index by `mint` (parsed as epoch seconds) in BOTH branches: the date slices
# used by later cells need a DatetimeIndex, which previously only existed when
# the data came from the cache.
# NOTE(review): assumes `mint` is numeric epoch seconds when read from the DB too — confirm.
df = df.set_index(pd.to_datetime(df['mint'], unit='s'))

DOWNLOAD


In [None]:
def aggregate_time_series(df: pd.DataFrame, columns, sliceTime = slice(None, None)):
    """Collapse each per-row time series to its mean.

    Args:
        df: frame whose `columns` hold one sequence of numbers per row.
        columns: names of the sequence-valued columns to aggregate.
        sliceTime: row slice applied to `df` before aggregating
            (default: all rows).

    Returns:
        DataFrame with one column per entry of `columns`, holding the mean of
        each row's sequence, indexed like the sliced input.
    """
    def _mean_of(values):
        return np.mean(values)

    per_column_means = []
    for col in columns:
        per_column_means.append(df[sliceTime][col].apply(_mean_of))
    return pd.concat(per_column_means, axis=1)

# Per-row means of the time-series columns for rows indexed 2021-09-01 .. 2021-09-02.
aggr_df = aggregate_time_series(
    df,
    columns=TIME_SERIES_COLUMNS,
    sliceTime=slice('2021-09-01', '2021-09-02'),
)

In [None]:
# Regression target: maxt - mint over the same date window used to build aggr_df.
aggr_df['runtime'] = (
    df.loc['2021-09-01':'2021-09-02', 'maxt']
    - df.loc['2021-09-01':'2021-09-02', 'mint']
)

In [None]:
def transform_time_series(df: pd.DataFrame, columns, time_column, sliceTime = slice(None, None)):
    """Reshape per-row time series into long format, one row per time step.

    Every row of the sliced frame is exploded over `columns`, and each
    resulting row is tagged with its position inside the originating series.
    The step index is derived from the data, so series of any length are
    labelled correctly (not only series of exactly 20 samples).

    Args:
        df: frame whose `columns` hold one sequence per row; within a row the
            sequences must have equal length (a requirement of
            `DataFrame.explode` with several columns).
        columns: names of the sequence-valued columns to explode.
        time_column: name of the step-index column added to the result.
        sliceTime: row slice applied to `df` before reshaping
            (default: all rows).

    Returns:
        Long-format DataFrame sorted by `time_column` (ties keep the input row
        order) with a fresh 0..n-1 index.
    """
    # A positional index gives every input row a unique label, so rows that
    # share the same original index value stay distinct after explode.
    rows = df[sliceTime].reset_index(drop=True)
    new_df = rows.explode(columns)
    # explode repeats the row label once per element; counting within each
    # label yields the element's position in its series.
    new_df[time_column] = new_df.groupby(level=0).cumcount()
    return new_df.sort_values(by=time_column, kind="stable").reset_index(drop=True)

# Long-format view (one row per time step) of the rows indexed 2021-09-01 .. 2021-09-02.
transformed_df = transform_time_series(
    df,
    columns=TIME_SERIES_COLUMNS,
    time_column=TIME_STEP_COLUMN,
    sliceTime=slice('2021-09-01', '2021-09-02'),
)

In [None]:
# Target column for the long-format frame: maxt - mint, repeated on every time-step row.
transformed_df['runtime'] = transformed_df['maxt'].sub(transformed_df['mint'])

## Clustering

In [None]:
%%sql
select *
-- Preview: all columns of the first 10 rows returned from htjob.
from htjob
limit 10;

## Valutazione modelli

In [None]:
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBRegressor

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
# Runtime classes, ordered from shortest to longest.
LABELS = ["short", "medium", "long"]

def bin_job_runtime(vect_runtime: pd.Series, lower_bound = 6, upper_bound = 30):
    """Discretise runtimes into the classes listed in LABELS.

    Runtimes are divided by 3600 before binning, so the bounds are expressed
    in units of 3600 runtime units (hours, if runtimes are in seconds).

    Args:
        vect_runtime: runtimes as a pandas Series or NumPy array.
        lower_bound: first value that belongs to "medium".
        upper_bound: first value that belongs to "long".

    Returns:
        Categorical data (a Series for Series input) with
        "short" for [-inf, lower_bound), "medium" for
        [lower_bound, upper_bound) and "long" for [upper_bound, +inf).
    """
    # The last edge must be +inf: using the number of samples as an edge made
    # the bins non-monotonic for small inputs and turned long runtimes into NaN.
    return pd.cut(vect_runtime / 3600.0, bins = [-float("inf"), lower_bound, upper_bound, float("inf")], right=False, labels=LABELS)

def confidence_interval(N, acc, alpha=0.05, verbose=False):
    """Wilson score interval for a proportion observed on N samples.

    Args:
        N: number of samples the proportion was measured on.
        acc: observed proportion (e.g. accuracy), between 0 and 1.
        alpha: significance level; the interval has confidence 1 - alpha.
        verbose: when True, print a header with alpha and N.

    Returns:
        Tuple (p_min, p_max) with the lower and upper bound of the interval.
    """
    if verbose:
        print(f"\n *** Calcolo intervallo di confidenza per alpha: {alpha}, N: {N} ***\n")

    # Two-sided critical value of the standard normal distribution.
    z_crit = st.norm.ppf(1-alpha/2)
    centre = 2 * N * acc + z_crit**2
    spread = z_crit * (z_crit**2 + 4 * N * acc -4 * N * acc**2)**.5
    scale = 2*(N+z_crit**2)

    return (centre - spread)/scale, (centre + spread)/scale

def eval_model(X, y, model, alpha=0.05, verbose=False):
    """Score a runtime regressor as a classifier over the classes in LABELS.

    True runtimes and `model.predict(X)` are both discretised with
    `bin_job_runtime` and compared. Every per-class result (precision, recall,
    F1, confusion-matrix rows/columns) follows the order of LABELS, so the
    heatmap ticks and table headers always describe the right class, and a
    class missing from the data still gets its own (zero) entry.

    Args:
        X: features passed to `model.predict`; `X.shape[0]` is used as the
            sample size for the confidence intervals.
        y: true runtimes, in the unit expected by `bin_job_runtime`.
        model: fitted estimator exposing `predict`.
        alpha: significance level forwarded to `confidence_interval`.
        verbose: when True, plot the confusion matrix and print the per-class
            metrics and the confidence intervals.

    Returns:
        Tuple (accuracy, mean of the per-class F1 scores over LABELS).
    """
    labels = LABELS
    metrics = ["precision", "recall", "f1_measure"]

    # Compare plain label strings instead of integer codes: an encoder would
    # number the classes alphabetically, which does not match LABELS, and
    # would fail on a class that appears only in the predictions.
    y_true = pd.Series(bin_job_runtime(y)).astype(str).to_numpy()
    y_pred = pd.Series(bin_job_runtime(model.predict(X))).astype(str).to_numpy()

    accuracy = accuracy_score(y_true, y_pred)
    # `labels=` pins both the set and the order of the per-class scores.
    precision = precision_score(y_true, y_pred, labels=labels, average=None)
    recall = recall_score(y_true, y_pred, labels=labels, average=None)
    f1_measure = f1_score(y_true, y_pred, labels=labels, average=None)
    all_classes = pd.Series([precision.mean(), recall.mean(), f1_measure.mean()],  index=metrics)

    if verbose:
        print("\n*** Confusion matrix ***\n")
        cf_matrix = confusion_matrix(y_true, y_pred, labels=labels)
        sns.heatmap(cf_matrix, annot=True, cmap = "Blues", fmt="d", xticklabels=labels, yticklabels=labels)
        plt.title('Confusion matrix')
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.show()

        print("\n*** Precision, Recall, F1-measure per classe e media ***\n")
        # One row per metric, one column per class, plus the across-class mean.
        model_stats = pd.concat(
            [pd.DataFrame([precision, recall, f1_measure], index=metrics), all_classes], axis=1)
        model_stats.columns = labels + ["all"]
        print(model_stats)

        print(f"\n*** Calcolo intervallo di confidenza con Confidenza={1-alpha} con N={X.shape[0]} per accuracy e f1-measure ***\n")
        print(f"accuracy: ({accuracy}), intervallo confidenza: {confidence_interval(X.shape[0], accuracy, alpha)}")
        print(f"f1-measure: ({f1_measure.mean()}), intervallo confidenza: {confidence_interval(X.shape[0], f1_measure.mean(), alpha)}")

    return (accuracy, f1_measure.mean())

def eval_difference_two_model(acc1, acc2, N1, N2, alpha=0.05, confidence_level=False):
    """Confidence interval for the difference in error rate of two models.

    Uses a normal approximation: each error rate e = 1 - acc contributes a
    variance of e * (1 - e) / N.

    Args:
        acc1, acc2: accuracies of the two models.
        N1, N2: number of samples each accuracy was measured on.
        alpha: significance level of the interval.
        confidence_level: when True, also print (rounded to two decimals) the
            two-sided tail probability of the observed difference.

    Returns:
        Tuple (d_min, d_max) bounding the absolute error difference.
    """
    print("\n*** Valutazione statistica differenza tra modello 1 e modello 2 ***")
    print(f"(acc: {acc1}, N: {N1}) (acc: {acc2}, N: {N2})\n")

    z_crit = st.norm.ppf(1-alpha/2)
    err1 = 1 - acc1
    err2 = 1 - acc2
    diff = abs(err2-err1)
    variance = (err1*(1-err1))/N1 + (err2*(1-err2))/N2
    std_dev = variance**0.5
    margin = z_crit * std_dev

    if confidence_level:
        print("\n*** Valutazione soglia confidenza che rende significativa la differenza tra i due modelli ***")
        print(f"a: {round(st.norm.sf(diff/std_dev) * 2, 2)}\n")

    return diff - margin, diff + margin

In [None]:
# Alternative input (long-format frame), currently disabled:
# X, y = transformed_df[TIME_SERIES_COLUMNS + [TIME_STEP_COLUMN]], transformed_df['runtime']
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

# Standardise the time-series columns; every other column is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('time_series', StandardScaler(), TIME_SERIES_COLUMNS),
        # ('time_step', OneHotEncoder(sparse=False, handle_unknown='ignore'), [TIME_STEP_COLUMN])
    ],
    remainder="drop",
)

In [None]:
# Features: aggregated time-series columns; target: runtime.
X = aggr_df[TIME_SERIES_COLUMNS]
y = aggr_df['runtime']
# Seeded split so the partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

In [None]:
# Each pipeline gets its own unfitted copy of the preprocessor: Pipeline.fit
# fits its steps in place, so a shared instance would be refitted by whichever
# pipeline trains last and silently change the scaling of the other model.
forest_model = Pipeline([
    ("preprocessor", clone(preprocessor)),
    # Seeded so that the forest is the same on every run.
    ("forest", RandomForestRegressor(random_state=random_state))
])

# NOTE(review): tree_method='gpu_hist' needs a GPU-enabled xgboost build.
xgboost_model = Pipeline([
    ("preprocessor", clone(preprocessor)),
    ("xgboost", XGBRegressor(tree_method='gpu_hist', random_state=random_state))
])

# Both models learn from the training split only; the forest used to be fitted
# on the test split, which made every score on X_test meaningless.
forest_model.fit(X_train, y_train)
xgboost_model.fit(X_train, y_train)

In [None]:
# Class-level evaluation of the forest pipeline on the test split, with plots and tables.
eval_model(X_test, y_test, forest_model, verbose=True)

In [None]:
# Class-level evaluation of the xgboost pipeline on the test split, with plots and tables.
eval_model(X_test, y_test, xgboost_model, verbose=True)

In [None]:
# Incremental model: scaler -> Nystroem kernel feature map -> SGD regressor.
# The preprocessor is cloned so that fitting it here cannot alter the scaler
# used by any other pipeline built from the same `preprocessor` object.
online_model = Pipeline([
    ("preprocessor", clone(preprocessor)),
    ("feature_map_nystroem", Nystroem(gamma=.8, random_state=random_state, n_components=300)),
    # shuffle=False: samples are processed in the order they are supplied.
    ("sgd", SGDRegressor(warm_start=True, shuffle=False))
])

def pipeline_partial_fit(pipeline, X, y):
    """Feed one batch to the SGD step of a preprocessor/Nystroem/SGD pipeline.

    The two transformers are fitted on the first batch only and merely applied
    to later batches. Refitting them on every batch would change the feature
    space under the weights the SGD step has already learned.

    Args:
        pipeline: object whose `named_steps` contains 'preprocessor',
            'feature_map_nystroem' and 'sgd'.
        X: features of the batch.
        y: targets of the batch.
    """
    steps = pipeline.named_steps
    # The SGD step only has `coef_` after its first partial_fit, so its
    # absence marks the first batch.
    first_batch = not hasattr(steps['sgd'], 'coef_')

    features = X
    for name in ('preprocessor', 'feature_map_nystroem'):
        transformer = steps[name]
        if first_batch:
            features = transformer.fit_transform(features)
        else:
            features = transformer.transform(features)
    steps['sgd'].partial_fit(features, y)
    
# First incremental update of the online model, using the training split.
pipeline_partial_fit(online_model, X_train, y_train)

In [None]:
# Score of the forest pipeline on the test split (scikit-learn regressors report R^2 by default).
forest_model.score(X_test, y_test)

In [None]:
# Feature importances of the fitted xgboost step; presumably one value per column kept by the preprocessor.
xgboost_model.named_steps['xgboost'].feature_importances_

In [None]:
# Score of the xgboost pipeline on the test split.
xgboost_model.score(X_test, y_test)

In [None]:
# Second incremental update of the online model, this time with the test split.
# NOTE(review): after this cell X_test is no longer unseen data for online_model.
pipeline_partial_fit(online_model, X_test, y_test)

In [None]:
# Validation data: rows indexed 2021-09-03 .. 2021-09-04, aggregated like the training features.
X_val = aggregate_time_series(df, TIME_SERIES_COLUMNS, sliceTime=slice('2021-09-03', '2021-09-04'))
y_val = (
    df.loc['2021-09-03':'2021-09-04', 'maxt']
    - df.loc['2021-09-03':'2021-09-04', 'mint']
)

In [None]:
# Score of the forest pipeline on the validation window.
forest_model.score(X_val, y_val)

In [None]:
# Score of the xgboost pipeline on the validation window.
xgboost_model.score(X_val, y_val)

In [None]:
# Score of the online pipeline on the validation window; its steps were fitted through pipeline_partial_fit.
online_model.score(X_val, y_val)

In [None]:
%%sql
select *
-- Joins htjob rows to hm rows where jd.exechosts equals m.hn.
from htjob jd INNER JOIN hm m ON
    jd.exechosts = m.hn
-- NOTE(review): both BETWEEN bounds are the same expression, so this only keeps rows whose
-- eventtimeepoch equals epoch(m.ts) exactly; the following query uses a +/-180 window. Confirm intent.
WHERE jd.eventtimeepoch BETWEEN extract(epoch from m.ts)AND extract(epoch from m.ts)
limit 100;

In [None]:
%%sql
select *
-- Pairs each hj row with the hm rows that have the same hn and whose ts,
-- converted to epoch seconds, lies within 180 of j.ts on either side.
from hj j INNER JOIN hm m ON j.hn = m.hn AND
    j.ts BETWEEN extract(epoch from m.ts) - 180 AND extract(epoch from m.ts) + 180
limit 10;

In [None]:
# Forest predictions for the validation window, indexed like X_val so they can be lined up with it.
preds = pd.Series(forest_model.predict(X_val), index=X_val.index)

In [None]:
# Forest feature importances labelled with the time-series column names.
pd.Series(forest_model.named_steps["forest"].feature_importances_, index=TIME_SERIES_COLUMNS)

In [None]:
# First 50 validation rows: features, true runtime and forest prediction.
# The two series are named so the table does not show them as columns 0 and 1.
pd.concat([X_val, y_val.rename('runtime'), preds.rename('predicted_runtime')], axis=1).head(50)

In [None]:
# Raw forest predictions for rows indexed 2021-09-03 .. 2021-09-04; the features are rebuilt inline with the same call pattern used for X_val.
forest_model.predict(aggregate_time_series(df[slice('2021-09-03', '2021-09-04')], TIME_SERIES_COLUMNS))