# Overview

Process and prune the following blocks to reduce their size for data analysis.

- w
- io
- df

In [None]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.notebook import tqdm

In [None]:
# Global plot styling: seaborn "ticks" style with the pastel palette, the
# top/right spines hidden, and a 15x15 default figure size.
# `sns.set_theme` re-applies style and palette on every call, so a single
# call carrying the final settings is all that is needed.
custom_params = {
    "axes.spines.right": False,
    "axes.spines.top": False,
    "figure.figsize": (15, 15),
}
sns.set_theme(style="ticks", palette="pastel", rc=custom_params)

## w block

In [None]:
# Load the raw `w` block from its Feather file.
df_w = pd.read_feather("w_block.arrow")

# Display the frame for a first look.
df_w

Look for columns that can be removed.

In [None]:
# Report how many distinct values each column holds; when there are fewer
# than ten, list them so constant or near-constant columns stand out.
for column_name in df_w.columns:
    distinct = df_w[column_name].unique()
    n_distinct = len(distinct)
    print(f"len(unique_vals[{column_name}]) = {n_distinct}")
    if n_distinct >= 10:
        continue
    print(f"\t{distinct}")

All 15 columns are worth investigating, so continue by mapping their data types.

In [None]:
# Inspect the column dtypes before converting anything.
df_w.dtypes

In [None]:
# List the distinct TTY values that do NOT contain the "pts/" marker.
has_pts_marker = df_w["TTY"].str.contains("pts/")
df_w.loc[~has_pts_marker, "TTY"].unique()

we can trim the TTY

In [None]:
# Dry run: strip the "pts/" marker and cast the remainder to int, without
# writing the result back yet.
df_w["TTY"].apply(lambda x: int(x.replace("pts/", "")))

In [None]:
def _pts_number(tty):
    """Return the integer left after removing "pts/" from a TTY name."""
    return int(tty.replace("pts/", ""))


# Store TTY as the bare pseudo-terminal number.
df_w["TTY"] = df_w["TTY"].apply(_pts_number)

we should be able to convert IDLE time

In [None]:
def translate_idle_time(x):
    """
    Convert one duration field printed by ``w`` into seconds.

    The layouts handled are those of the default procps ``w`` output
    (NOTE(review): this assumes the data was not captured with ``w -o``,
    whose old-style layout uses the "m" suffix differently -- confirm):

    * ``"2days"`` -> whole days
    * ``"3:15m"`` -> hours:minutes (the trailing "m" marks this layout)
    * ``"12:34"`` -> minutes:seconds
    * ``"1.23s"`` -> seconds.centiseconds

    :param x: raw field text
    :return: the duration in seconds, as a float
    :raises ValueError, IndexError: if ``x`` cannot be parsed
    """
    # "days" must be tested first because it also contains an "s".
    if "days" in x:
        return float(x.replace("days", "")) * 24 * 3600
    if "m" in x:
        # hours:minutes -- drop the trailing "m" before splitting.
        parts = x[:-1].split(":")
        return float(parts[0]) * 3600 + float(parts[1]) * 60
    if "s" in x:
        # seconds.centiseconds -- drop the trailing "s".
        return float(x[:-1])
    # No suffix: minutes:seconds.
    parts = x.split(":")
    return float(parts[0]) * 60 + float(parts[1])

# Dry run on the IDLE column; the result is displayed, not stored.
df_w["IDLE"].apply(translate_idle_time)

In [None]:
# Replace the textual IDLE values with their numeric conversion.
idle_converted = df_w["IDLE"].apply(translate_idle_time)
df_w["IDLE"] = idle_converted

In [None]:
# Put the unit into the column name now that IDLE is numeric.
df_w.rename({"IDLE": "IDLE/sec"}, axis="columns", inplace=True)

In [None]:
# Display the frame after the IDLE conversion and rename.
df_w

now JCPU

In [None]:
# Dry run: JCPU is converted with the same parser as IDLE; the result is
# displayed, not stored.
df_w["JCPU"].apply(translate_idle_time)

In [None]:
# Replace the textual JCPU values with their numeric conversion.
jcpu_converted = df_w["JCPU"].apply(translate_idle_time)
df_w["JCPU"] = jcpu_converted

In [None]:
# Put the unit into the column name now that JCPU is numeric.
df_w.rename({"JCPU": "JCPU/sec"}, axis="columns", inplace=True)

In [None]:
def translate_time(x):
    """
    Convert one duration field printed by ``w`` into seconds, or NaN.

    The layouts handled are those of the default procps ``w`` output
    (NOTE(review): this assumes the data was not captured with ``w -o``,
    whose old-style layout uses the "m" suffix differently -- confirm):

    * ``"2days"`` -> whole days
    * ``"3:15m"`` -> hours:minutes (the trailing "m" marks this layout)
    * ``"12:34"`` -> minutes:seconds
    * ``"1.23s"`` -> seconds.centiseconds

    :param x: raw field text; may be corrupted or not a string at all
    :return: the duration in seconds as a float, or ``np.nan`` when ``x``
        matches no layout or fails to parse
    """
    try:
        # "days" must be tested first because it also contains an "s".
        if "days" in x:
            return float(x.replace("days", "")) * 24 * 3600
        if "m" in x:
            # hours:minutes -- drop the trailing "m" before splitting.
            parts = x[:-1].split(":")
            return float(parts[0]) * 3600 + float(parts[1]) * 60
        if "s" in x:
            # seconds.centiseconds -- drop the trailing "s".
            return float(x[:-1])
        if ":" in x:
            # No suffix: minutes:seconds.
            parts = x.split(":")
            return float(parts[0]) * 60 + float(parts[1])
    except (TypeError, ValueError, IndexError, AttributeError):
        # Corrupted or non-string entry: report it as missing rather than
        # aborting the whole column conversion.  Only parsing errors are
        # caught, so KeyboardInterrupt and the like still propagate.
        return np.nan

    # The text matched none of the known layouts.
    return np.nan

# Convert PCPU; entries that cannot be parsed come back as NaN.
pcpu_converted = df_w["PCPU"].apply(translate_time)
df_w["PCPU"] = pcpu_converted

Many entries are corrupted, so we have to drop them.

In [None]:
# Display the frame; PCPU entries that failed to parse are now NaN.
df_w

In [None]:
# Preview the frame with every NaN-containing row removed (not in place).
df_w.dropna()

We lose a few thousand records, but it is worth doing anyway.

In [None]:
# Drop, in place, every row that has a NaN in any column.
df_w.dropna(axis="index", how="any", inplace=True)

In [None]:
# Put the unit into the column name now that PCPU is numeric.
df_w.rename({"PCPU": "PCPU/sec"}, axis="columns", inplace=True)

In [None]:
# Display the frame after dropping NaN rows and renaming PCPU.
df_w

In [None]:
# Check the column dtypes after the conversions.
df_w.dtypes

In [None]:
# Renumber the rows from 0 and discard the old index instead of keeping it
# as a column.
df_w.reset_index(drop=True, inplace=True)

In [None]:
# Persist the pruned frame as an LZ4-compressed Feather file.  The output
# directory is created first so the write does not fail when "tmp/" does
# not exist yet.
import os

os.makedirs("tmp", exist_ok=True)
df_w.to_feather("tmp/w_block_pruned.arrow", compression="lz4")