In [28]:
import pandas as pd
from plotnine import *
import glob
import numpy as np

In [31]:
files = glob.glob("peaks/*.csv")
files = [f for f in files if "extended" not in f]
files

['peaks/cubecraft_peaks.csv',
 'peaks/hive_peaks.csv',
 'peaks/runescape_peaks.csv',
 'peaks/hypixel_peaks.csv',
 'peaks/minehut_peaks.csv']

In [48]:
all_peaks_dfs = []
for f in files:
    df = pd.read_csv(f)
    df["service"] = f.split("/")[1].split("_")[0]
    df["startdate"] = pd.to_datetime(df["startdate"])
    df["enddate"] = pd.to_datetime(df["enddate"])
    all_peaks_dfs.append(df)
peaks_df = pd.concat(all_peaks_dfs).drop(["Unnamed: 0", "group"], axis=1).reset_index(drop=True)
peaks_df

Unnamed: 0,timespan,magnitude,drop,shape,startdate,enddate,service,recoverytime
0,0 days 00:10:00,2486.650000,11.558744,-459.145833,2020-08-10 22:50:00,2020-08-10 23:00:00,cubecraft,
1,0 days 00:18:00,1735.485256,9.978598,-48.050000,2020-08-17 14:48:00,2020-08-17 15:06:00,cubecraft,
2,0 days 00:48:00,7793.525000,43.165228,-57.842000,2020-08-20 15:44:00,2020-08-20 16:32:00,cubecraft,
3,0 days 00:44:00,4883.150000,29.227293,-68.822826,2020-09-03 18:30:00,2020-09-03 19:14:00,cubecraft,
4,0 days 01:18:00,2983.826923,17.977305,-66.978152,2020-09-04 17:22:00,2020-09-04 18:40:00,cubecraft,
...,...,...,...,...,...,...,...,...
2827,0 days 00:10:00,1582.753571,12.733152,-242.485732,2021-05-18 16:06:00,2021-05-18 16:16:00,minehut,
2828,0 days 00:08:00,1876.471622,12.092343,-353.829324,2021-05-18 17:40:00,2021-05-18 17:48:00,minehut,
2829,0 days 00:06:00,2150.940715,16.001674,-545.340620,2021-05-18 17:52:00,2021-05-18 17:58:00,minehut,
2830,0 days 00:16:00,2889.835294,25.156926,-269.857616,2021-05-18 18:14:00,2021-05-18 18:30:00,minehut,


In [43]:
def normalize_status(grp, col):
    p95 = grp[col].quantile(0.95)
    grp[col] = grp[col] / p95
    grp.loc[grp[col] > 1.0, col] = 1.0
    
    reference_time = grp["start_time"].min()
    grp["start_time"] = grp["start_time"] - reference_time
    grp["end_time"] = grp["end_time"] - reference_time
    
    return grp

failure_trace = (peaks_df
                 .sort_values("startdate")
                 .reset_index(drop=True))
failure_trace["start_time"] = failure_trace["startdate"].astype(int) // 1e9
failure_trace["end_time"] = failure_trace["enddate"].astype(int) // 1e9
failure_trace = failure_trace.groupby("service").apply(lambda grp: normalize_status(grp, "drop")).reset_index(drop=True)
failure_trace = failure_trace.rename(columns={
    "drop": "status",
})
failure_trace = failure_trace.loc[:, ["start_time", "end_time", "status", "service"]]

services = failure_trace["service"].unique()
for srv in services:
    subset = failure_trace.loc[failure_trace["service"] == srv, :].reset_index(drop=True)
    subset.to_csv(f"../dante traces/{srv}_online_game.csv", index=False)
failure_trace.to_parquet("../correlation_duration_severity/online_games.parquet", index=False)
failure_trace



Unnamed: 0,start_time,end_time,status,service
0,0.0,600.0,0.153076,cubecraft
1,575880.0,576960.0,0.132150,cubecraft
2,838440.0,841320.0,0.571651,cubecraft
3,2058000.0,2060640.0,0.387067,cubecraft
4,2140320.0,2145000.0,0.238079,cubecraft
...,...,...,...,...
2827,239430600.0,239431080.0,0.161454,runescape
2828,239432760.0,239434200.0,0.407771,runescape
2829,239521800.0,239523240.0,0.204937,runescape
2830,239555640.0,239559840.0,0.613183,runescape


In [38]:
def get_iat_and_dur_med(df, start, end):
    t1 = df[start].sort_values().reset_index(drop=True)
    t2 = np.roll(t1, -1)
    t2[-1] = 0
    iat = t2 - t1
    iat = iat[iat > 0]
    iat_stats = iat.quantile([0.5, 0.95]).to_list()
    dur = df[end] - df[start]
    dur_stats = dur.quantile([0.5, 0.95]).to_list()
    return pd.DataFrame({
        "metric": ["iat", "iat", "dur", "dur"],
        "quantile": ["median", "tail", "median", "tail"],
        "quantity": iat_stats + dur_stats
    })

iat_dur_med_list = []

for service in failure_trace['service'].unique():
    partial_df = failure_trace[failure_trace['service'] == service].reset_index(drop=True)
    
    iat_dur_med_df = get_iat_and_dur_med(partial_df, "start_time", "end_time")
    iat_dur_med_df["vendor_cat"] = service
    iat_dur_med_list.append(iat_dur_med_df)

iat_dur_med_df = pd.concat(iat_dur_med_list).reset_index(drop=True)
iat_dur_med_df.to_parquet("../iat_dur_med/online_games.parquet")
iat_dur_med_df

Unnamed: 0,metric,quantile,quantity,vendor_cat
0,iat,median,208680.0,cubecraft
1,iat,tail,2090514.0,cubecraft
2,dur,median,1140.0,cubecraft
3,dur,tail,5112.0,cubecraft
4,iat,median,220440.0,hive
5,iat,tail,1155672.0,hive
6,dur,median,480.0,hive
7,dur,tail,7032.0,hive
8,iat,median,164160.0,hypixel
9,iat,tail,1008600.0,hypixel


In [40]:
peaks_df["hourofweek"] = peaks_df["startdate"].dt.dayofweek*24 + peaks_df["startdate"].dt.hour
weekly_failures = (peaks_df.groupby(["service", "hourofweek"]).count().reset_index()
                   .rename(columns={"drop": "num_failures", "service": "vendor"})
                   [["vendor", "hourofweek", "num_failures"]])
weekly_failures.to_parquet("../weekly_failures/online_games.parquet")
weekly_failures

Unnamed: 0,vendor,hourofweek,num_failures
0,cubecraft,10,1
1,cubecraft,11,1
2,cubecraft,12,1
3,cubecraft,13,1
4,cubecraft,14,1
...,...,...,...
433,runescape,163,11
434,runescape,164,13
435,runescape,165,13
436,runescape,166,6
