In [1]:
import numpy as np
import pandas as pd 
import plotly.express as px
import json 
import os

df_f = pd.read_csv('../data/farm/far_data_2024-02-21.csv').fillna(0.0)
df_f.head()

Unnamed: 0,timestamp,cpu_system,boottime,Pool Size Time_P1,mem_free,Missed Buffers_P1,bytes_out,cpu_user,cpu_idle,Pool Size Data_P1,...,RetransSegs_rate,TCPLostRetransmit_rate,TCPForwardRetrans_rate,TotalRetrans,TCPSlowStartRetrans,RPCRetrans,TCPFastRetrans,TCPLostRetransmit,TCPForwardRetrans,RetransSegs
0,2024-02-21 05:36:00,2.1,0.0,0.0,0.0,0.0,0.0,1.006667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2024-02-21 05:36:15,2.1,0.0,2038.71,0.0,0.0,0.0,1.3,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2024-02-21 05:36:30,1.993333,0.0,2055.49,0.0,0.0,0.0,1.3,0.0,11795.033333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2024-02-21 05:36:45,1.806667,0.0,2049.026,0.0,0.0,0.0,1.3,0.0,11892.133333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2024-02-21 05:37:00,2.1,1705514000.0,2014.643333,0.0,0.0,0.0,1.11,0.0,11854.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Dimensionality Reduction (DR) across time domain, feature domain

In [None]:
from pygam import GAM, s
from scipy.interpolate import BSpline, splrep, splev
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def preprocess(df, value_column):
    return df.loc[:, ['timestamp', 'nodeId', value_column]] \
             .pivot_table(index='timestamp', columns='nodeId', values=value_column) \
             .apply(lambda row: row.fillna(0.0), axis=0).T

# DR1 - applying PCA to each variable across the time domain
def apply_pca_to_feature(col_name, df):
    try:
        # pivot: rows -> timestamps, columns -> nodeId
        X = preprocess(df, col_name)

        # convert to cupy array
        # baseline = cp.array(X.values)
        baseline = X.values

        # normalizing the data (demean)
        mean_hat = baseline.mean(axis=0)
        demeaned = baseline - mean_hat

        # standardize
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(demeaned)

        if (X_scaled.shape[0] < 2 or np.all(np.isnan(X_scaled)) or np.all(X_scaled == 0)):
            print(f"Skipping {col_name} due to insufficient data variance.")
            return None

        # apply PCA
        pca = PCA(n_components=3)
        scores = pca.fit_transform(X_scaled)

        explained_variance_ratio_cumsum = np.cumsum(pca.explained_variance_ratio_)
        npc = np.sum(explained_variance_ratio_cumsum < 0.9999) + 1
        print(f"Number of principal components: {npc}")
        print(f"PC1:",pca.components_[1], len(pca.components_[1]))
        comp = np.array(pca.components_[1])
        # most influential timestamp for this column
        fc_t = X.columns[np.argmax(comp)]

        P_fin = pd.DataFrame({f"PC{k+1}": scores[:, k] if k < npc else np.nan for k in range(3)})
        P_fin['Measurement'] = X.index
        P_fin['FC_t'] = fc_t

        # print(f"{col_name} done...")

        return P_fin 
    
    except Exception as e:
        print(f"Error processing {col_name}: {e}")
        return None

In [14]:
df_f.loc[:, ['timestamp', 'nodeId', 'cpu_idle']] \
             .pivot_table(index='timestamp', columns='nodeId', values='cpu_idle') \
             .apply(lambda row: row.fillna(0.0), axis=0).T

timestamp,2024-02-21 05:36:00,2024-02-21 05:36:15,2024-02-21 05:36:30,2024-02-21 05:36:45,2024-02-21 05:37:00,2024-02-21 05:37:15,2024-02-21 05:37:30,2024-02-21 05:37:45,2024-02-21 05:38:00,2024-02-21 05:38:15,...,2024-02-21 23:57:30,2024-02-21 23:57:45,2024-02-21 23:58:00,2024-02-21 23:58:15,2024-02-21 23:58:30,2024-02-21 23:58:45,2024-02-21 23:59:00,2024-02-21 23:59:15,2024-02-21 23:59:30,2024-02-21 23:59:45
nodeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
novadaq-far-farm-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,99.600000,99.640000,99.900000,99.900000,99.900000,99.90,99.90,99.900000,99.900000,99.900000
novadaq-far-farm-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,100.000000,99.603333,99.603333,99.300000,99.300000,99.30,99.30,99.300000,99.860000,99.900000
novadaq-far-farm-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,100.000000,100.000000,99.943333,99.943333,99.900000,99.90,99.90,99.900000,99.900000,99.900000
novadaq-far-farm-09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,99.700000,99.726667,99.900000,99.900000,99.900000,99.90,99.90,99.786667,99.786667,99.700000
novadaq-far-farm-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,99.900000,99.900000,99.900000,99.900000,99.900000,99.90,99.62,99.600000,99.600000,99.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
novadaq-far-farm-95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,57.146667,58.100000,58.100000,57.073333,54.660000,59.76,60.60,55.746667,60.340000,56.620000
novadaq-far-farm-96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,59.180000,56.626667,54.120000,53.800000,56.880000,57.10,55.14,55.000000,58.400000,56.100000
novadaq-far-farm-97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,54.400000,58.646667,59.300000,59.300000,57.046667,56.70,56.70,56.700000,54.460000,54.300000
novadaq-far-farm-98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,56.413333,54.080000,57.440000,53.800000,53.800000,53.80,53.80,53.800000,54.990000,54.990000


In [15]:
numeric_cols = df_f.drop(columns=['timestamp', 'nodeId']).columns
len(numeric_cols)

46

In [None]:
from concurrent.futures import ThreadPoolExecutor

numeric_cols = df_f.drop(columns=['timestamp', 'nodeId']).columns
P_final = pd.DataFrame()

def process_column(col):
    try:
        P_df = apply_pca_to_feature(col, df_f)
        return pd.DataFrame({
            'Col': col,
            'PC1': P_df['PC1'],
            'FC_t': P_df['FC_t'],
            'Measurement': P_df['Measurement']
        })
    except Exception as e:
        print(f"Error processing {col}: {e}")
        return None

with ThreadPoolExecutor() as executor:
    results = list(executor.map(process_column, numeric_cols))

P_final = pd.concat([df for df in results if df is not None], ignore_index=True)

Skipping mem_shared due to insufficient data variance.
Error processing mem_shared: 'NoneType' object is not subscriptable
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 3
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Skipping RPCRetrans_rate due to insufficient data variance.
Error processing RPCRetrans_rate: 'NoneType' object is not subscriptable
Skipping TotalRetrans_rate due to insufficient data variance.
Error processing TotalRetrans_rate: 'NoneType' object is not subscriptable
Skipping TCPSlowStartRetrans_rate due to insufficient data vari

In [16]:
P_final['Col'].nunique()

31

In [None]:
P_final.to_csv('P_final1.csv')

Second step DR: UMAP

In [6]:
from umap import UMAP
import plotly.express as px

# two-step DR: PCA+UMAP
df_pivot = P_final.pivot(index="Measurement", columns="Col", values="PC1")
reducer = UMAP(n_components=2, random_state=42)
embedding = reducer.fit_transform(df_pivot)
df_pivot['UMAP1'] = embedding[:, 0]
df_pivot['UMAP2'] = embedding[:, 1]
df_pivot['nodeId'] = df_pivot.index 
fig = px.scatter(df_pivot, x='UMAP1', y='UMAP2', hover_data={'nodeId': True})

fig.update_layout(title="UMAP Projection",
                  xaxis_title="UMAP 1",
                  yaxis_title="UMAP 2")
fig.show()


IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.



Second step DR: tSNE

In [7]:
import plotly.express as px
import pandas as pd
from sklearn.manifold import TSNE

# two-step DR: PCA+tSNE
df_tsne = df_pivot.drop(columns=['UMAP1', 'UMAP2', 'nodeId'])
tsne = TSNE(n_components=2, random_state=42)
embedding = tsne.fit_transform(df_tsne)

df_pivot['tSNE1'] = embedding[:, 0]
df_pivot['tSNE2'] = embedding[:, 1]

fig = px.scatter(df_pivot, x='tSNE1', y='tSNE2', hover_data={'nodeId': True})

fig.update_layout(title="t-SNE Projection",
                  xaxis_title="t-SNE 1",
                  yaxis_title="t-SNE 2")

fig.show()



The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



Second step DR: PCA

In [8]:
import plotly.express as px
import pandas as pd

# DR2 - applying PCA to each feature across one node/component 
def apply_pca(X, n_components=2):
    try:
        baseline = X.values

        # normalizing the data (demean)
        mean_hat = baseline.mean(axis=0)
        demeaned = baseline - mean_hat

        # standardize
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(demeaned)

        # apply PCA
        pca = PCA(n_components=n_components)
        scores = pca.fit_transform(X_scaled)

        explained_variance_ratio_cumsum = np.cumsum(pca.explained_variance_ratio_)
        npc = np.sum(explained_variance_ratio_cumsum < 0.9999) + 1
        n_components = scores.shape[1]
        print(f"Number of principal components: {n_components}")
        

        P_fin = pd.DataFrame({f"PC{k+1}": scores[:, k] if k < n_components else np.nan for k in range(3)})
        P_fin['Measurement'] = X.index

        return P_fin

    except Exception as e:
        print(f"Error processing PCA across features: {e}")
        return None

In [9]:
df_pca = df_pivot.drop(columns=['UMAP1', 'UMAP2', 'tSNE1', 'tSNE2', 'nodeId'])
P_df = apply_pca(df_pca)

Number of principal components: 2


In [11]:
P_df

Unnamed: 0,PC1,PC2,PC3,Measurement
0,14.070342,-2.252934,,novadaq-far-farm-06
1,13.898073,-2.905247,,novadaq-far-farm-07
2,14.296000,-4.006608,,novadaq-far-farm-08
3,14.216686,-2.622160,,novadaq-far-farm-09
4,13.550015,-2.195082,,novadaq-far-farm-10
...,...,...,...,...
190,-1.859436,1.464636,,novadaq-far-farm-95
191,1.183160,-0.081462,,novadaq-far-farm-96
192,2.327326,-0.884678,,novadaq-far-farm-97
193,-2.028562,0.979259,,novadaq-far-farm-98


In [12]:
P_df.set_index('Measurement', inplace=True)
df_pca['PC1'] = P_df['PC1']
df_pca['PC2'] = P_df['PC2']
df_pca['nodeId'] = df_pca.index
df_pca

Col,Missed Buffers_P1,Pool Size Data_P1,Pool Size Events_P1,Pool Size Time_P1,boottime,bytes_in,bytes_out,cpu_aidle,cpu_idle,cpu_nice,...,part_max_used,pkts_in,pkts_out,proc_run,proc_total,swap_free,swap_total,PC1,PC2,nodeId
Measurement,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
novadaq-far-farm-06,-24.256130,221.502564,223.920172,223.214097,-22.035973,204.857742,-134.147774,16.808551,127.056379,113.733076,...,5.687101,191.667016,222.143552,97.883786,-70.687349,-7.153418,-7.165049,14.070342,-2.252934,novadaq-far-farm-06
novadaq-far-farm-07,-24.256130,221.502564,223.920172,223.214097,-20.147498,204.986937,-134.194588,-14.473576,125.153507,114.064763,...,-29.307690,192.456964,222.238555,95.922728,-70.837992,-7.153418,-7.165049,13.898073,-2.905247,novadaq-far-farm-07
novadaq-far-farm-08,-24.256130,221.502564,223.920172,223.214097,22.105131,205.294531,-134.363631,-20.789340,75.072762,113.736503,...,24.567064,201.005329,222.180963,98.238420,-86.532456,-7.153418,-7.165049,14.296000,-4.006608,novadaq-far-farm-08
novadaq-far-farm-09,-24.256130,221.502564,223.920172,223.214097,-11.154821,205.093293,-134.319299,11.106108,115.991741,113.735090,...,10.276417,193.845055,222.321312,98.292996,-73.277183,-7.153418,-7.165049,14.216686,-2.622160,novadaq-far-farm-09
novadaq-far-farm-10,-24.256130,221.502564,223.920172,223.214097,-16.779916,204.977334,-134.248648,-15.117958,122.038137,113.735767,...,28.671451,192.887712,222.281727,97.878586,-71.640847,-7.153418,-7.165049,13.550015,-2.195082,novadaq-far-farm-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
novadaq-far-farm-95,-24.256130,-26.012181,-25.092464,-25.399800,11.557186,-27.051852,24.159324,47.763506,14.555998,-23.449652,...,-60.641238,-27.975056,-26.469337,-25.627451,5.713262,-7.153418,-7.165049,-1.859436,1.464636,novadaq-far-farm-95
novadaq-far-farm-96,-24.223770,3.295862,3.401222,3.632307,56.033327,1.961146,-13.428502,25.778116,-11.183348,4.660297,...,-41.043307,5.308201,1.691727,13.884449,-18.087210,-7.153418,-7.165049,1.183160,-0.081462,novadaq-far-farm-96
novadaq-far-farm-97,-22.252491,22.593992,21.756998,22.532210,29.394489,21.438514,-32.353491,-28.771354,-29.200002,23.607357,...,-7.177163,25.577577,20.207151,31.812149,-33.138204,-7.153418,-7.165049,2.327326,-0.884678,novadaq-far-farm-97
novadaq-far-farm-98,-10.655093,-24.494696,-25.051618,-24.958231,-35.669601,-24.990747,19.969042,8.742871,12.635832,-22.770763,...,-36.211488,-26.170819,-26.362507,-22.716321,2.728275,-7.153418,-7.165049,-2.028562,0.979259,novadaq-far-farm-98


In [13]:
fig = px.scatter(df_pca, x="PC1", y="PC2", hover_data={'nodeId': True})

fig.update_layout(title="PCA Projection",
                  xaxis_title="PCA 1",
                  yaxis_title="PCA 2")

fig.show()

In [None]:
df_pivot['PC1'] = df_pca['PC1']
df_pivot['PC2'] = df_pca['PC2']
df_pivot.drop(columns=['nodeId']).to_csv('multiDR_results1.csv')