In [1]:
import pandas
import numpy as np

# Load the raw counter log and discard the 'ts' column, which is not useful for this analysis.
df = (
    pandas.read_csv("meta_scan_csvs/kitchen/kitchen_5.csv")
    .drop(columns="ts")
)
df

Unnamed: 0,value,counter_name,t_s
0,2.132433e+01,application_prediction_milliseconds,0.000000
1,1.000000e+00,stale_frames_per_second,0.000198
2,0.000000e+00,last_used_foveation_level,0.000201
3,1.000000e+00,application_layer_count,0.000203
4,7.878000e+03,app_vss_mb,0.000205
...,...,...,...
8179,5.046908e+01,% Linear Filtered,29.870824
8180,0.000000e+00,% Anisotropic Filtered,29.870824
8181,6.768648e+01,Avg Preemption Delay,29.870825
8182,2.240133e+10,cpu-cycles,29.877452


In [2]:
# STEP 1: restructure the data frame before any data analysis/visualization

# STEP 1a: time signatures are stored down to the microsecond, so no single time signature
# carries values for several features at once.
# Binning (rounding) the time signatures first reduces sparsity and memory usage, but the
# number of decimal places to keep has to be chosen.
# To guide that choice, compute the mean gap between consecutive distinct time signatures.
np.diff(np.sort(df["t_s"].unique())).mean()

0.003652819515825492

In [3]:
# mean gap between distinct time signatures is ≈ 0.00365 seconds (see the output above), so the first attempt was to round to the nearest millisecond (3 places)
# update: rounding to 3 places left the data too sparse, so time signatures are rounded to 2 places (10 ms bins) instead
# Bin every sample into a 10 ms bucket by rounding its time signature to 2 decimal places.
df["Time (s)"] = df["t_s"].round(2)

# STEP 1b: turn each counter into its own column; at the moment every counter is stacked
# as rows under counter_name.
features_df = (
    pandas.pivot_table(
        df,
        index="Time (s)",
        columns="counter_name",
        values="value",
        # several rows sharing a counter_name and a time bin are averaged together
        aggfunc="mean",
        observed=True,
    )
    .sort_index()
    .reset_index()
    # remove the leftover "counter_name" label from the column axis
    .rename_axis(None, axis="columns")
)
features_df.head()

Unnamed: 0,Time (s),% Anisotropic Filtered,% Linear Filtered,% Nearest Filtered,% Texture Fetch Stall,% Texture L2 Miss,% Time Compute,% Time Shading Fragments,% Time Shading Vertices,% Vertex Fetch Stall,...,display_refresh_rate,gpu_frequency_mhz,gpu_level,gpu_util,guardian_gpu_ms,last_used_foveation_level,mem_frequency_mhz,screen_tears_per_second,stale_frames_per_second,timewarp_gpu_ms
0,0.0,,,,,,,,,,...,,,,,,0.0,,,1.0,
1,0.01,,,,,,,,,,...,72.0,545.0,2.0,27.249703,0.0,0.0,3196.0,0.0,0.0,2.517901
2,0.04,,,,,,,,,,...,,,,,,0.0,,,0.0,
3,0.24,0.0,62.306778,37.765106,16.398287,25.945395,0.0,93.928055,6.071945,0.665943,...,,,,,,,,,,
4,0.34,0.0,62.802967,37.124123,15.947389,26.412683,0.0,94.104088,5.89591,0.620139,...,,,,,,,,,,


In [4]:
# STEP 1c: fix sparsity
# NaNs are the result of some features only being recorded every second or so, rather than
# in every time bin.
#
# The time axis is kept out of the cleaning on purpose: blanking zeros across the whole frame
# would turn the 0.0 time bin into NaN and back-fill it with the next bin's time, and would
# overwrite every genuine zero reading with a neighbouring non-zero value.
counter_cols = features_df.columns.drop("Time (s)")

# Drop counters that never carry information: every bin is either missing or exactly zero.
uninformative_cols = [
    col for col in counter_cols
    if features_df[col].fillna(0).eq(0).all()
]
features_df = features_df.drop(columns=uninformative_cols)

# Forward fill each remaining counter, then backward fill so the leading rows are not NaN.
# Only true gaps are filled; recorded zeros are left as measured.
kept_cols = features_df.columns.drop("Time (s)")
features_df[kept_cols] = features_df[kept_cols].ffill().bfill()

features_df.to_csv("kitchen_5_cleaned.csv", index=False)
features_df.head()

In [None]:
import matplotlib.pyplot as plt
import math

plt.style.use('ggplot')

# Every column except the time axis gets its own panel.
feature_cols = [col for col in features_df.columns if col != "Time (s)"]
n_features = len(feature_cols)

ncols = 4
# Size the grid from the number of plotted features (counting the time column as well can
# allocate a whole row of empty panels). Keep at least one row so plt.subplots() still
# accepts a frame with no feature columns.
nrows = max(1, math.ceil(n_features / ncols))

fig, axes = plt.subplots(
    nrows=nrows,
    ncols=ncols,
    figsize=(16, 54),
    sharex=True
)

axes = axes.flatten()

for ax, col in zip(axes, feature_cols):
    ax.plot(features_df["Time (s)"], features_df[col], linewidth=1.2, alpha=0.9)
    ax.set_title(col, fontsize=9)
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Value")
    ax.grid(True, alpha=0.3)

# With sharex=True only the bottom grid row gets x tick labels. Panels in that row may be
# removed below, so re-enable the labels on the lowest used panel of each grid column
# (these are the last `ncols` plotted panels).
for ax in axes[max(0, n_features - ncols):n_features]:
    ax.tick_params(labelbottom=True)

# Remove the unused panels; slicing by count avoids relying on a loop variable that would be
# undefined when there is nothing to plot.
for unused_ax in axes[n_features:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

In [None]:
# normalize and fill again
# z-score each counter against its own mean and standard deviation, with time as the index
norm = features_df.set_index("Time (s)")
norm = norm.sub(norm.mean()).div(norm.std())
# Columns containing any NaN after scaling are dropped (presumably constant counters, whose
# standard deviation is 0); the fills that follow therefore have nothing left to fill here.
norm = norm.dropna(axis="columns", how="any").ffill().bfill()
norm

In [None]:
# Draw every normalized counter as a line on one shared set of axes.
axes2 = norm.plot.line(
    title="Performance Counters Over Time",
    figsize=(12, 12),
)
# Anchor the legend just outside the right edge so it does not cover the lines.
axes2.legend(loc='upper left', bbox_to_anchor=(1.05, 1))