In [2]:
import numpy as np
import pandas as pd 
import plotly.express as px
import json 
import os

df_f = pd.read_csv('../data/farm/far_data_2024-02-21.csv').fillna(0.0)
df_f.head()

Unnamed: 0,timestamp,cpu_system,boottime,Pool Size Time_P1,mem_free,Missed Buffers_P1,bytes_out,cpu_user,cpu_idle,Pool Size Data_P1,...,RetransSegs_rate,TCPLostRetransmit_rate,TCPForwardRetrans_rate,TotalRetrans,TCPSlowStartRetrans,RPCRetrans,TCPFastRetrans,TCPLostRetransmit,TCPForwardRetrans,RetransSegs
0,2024-02-21 05:36:00,2.1,0.0,0.0,0.0,0.0,0.0,1.006667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2024-02-21 05:36:15,2.1,0.0,2038.71,0.0,0.0,0.0,1.3,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2024-02-21 05:36:30,1.993333,0.0,2055.49,0.0,0.0,0.0,1.3,0.0,11795.033333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2024-02-21 05:36:45,1.806667,0.0,2049.026,0.0,0.0,0.0,1.3,0.0,11892.133333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2024-02-21 05:37:00,2.1,1705514000.0,2014.643333,0.0,0.0,0.0,1.11,0.0,11854.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Dimensionality Reduction (DR) across feature domain, time domain

In [None]:
from pygam import GAM, s
from scipy.interpolate import BSpline, splrep, splev
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def preprocess(df, timestamp):
    return df[df['timestamp']==timestamp].set_index('nodeId').drop(columns='timestamp')
    
# DR1: Applying PCA to each timestamp across features
def apply_pca_to_time(ts, df):
    try:
        # pivot: rows -> features, columns -> nodeId
        X = preprocess(df, ts)

        # convert to cupy array
        # baseline = cp.array(X.values)
        baseline = X.values

        # normalizing the data (demean)
        mean_hat = baseline.mean(axis=0)
        demeaned = baseline - mean_hat

        # standardize
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(demeaned)

        # if (X_scaled.shape[0] < 2 or np.all(np.isnan(X_scaled)) or np.all(X_scaled == 0)):
        if X_scaled.shape[0] < 2 or np.all(np.isnan(X_scaled)):
            print(f"Skipping {ts} due to insufficient data variance.")
            return None

        # apply PCA
        pca = PCA(n_components=3)
        scores = pca.fit_transform(X_scaled)

        explained_variance_ratio_cumsum = np.cumsum(pca.explained_variance_ratio_)
        npc = np.sum(explained_variance_ratio_cumsum < 0.9999) + 1
        print(f"Number of principal components: {npc}")
        # print(f"PC1:",pca.components_[1], len(pca.components_[1]))
        comp = np.array(pca.components_[1])
        # most influential timestamp for this column
        fc_f = X.columns[np.argmax(comp)]

        P_fin = pd.DataFrame({f"PC{k+1}": scores[:, k] if k < npc else np.nan for k in range(3)})
        P_fin['Measurement'] = X.index
        P_fin['FC_f'] = fc_f

        # print(f"{col_name} done...")

        return P_fin 

    except Exception as e:
        print(f"Error processing PCA across features: {e}")
        return None

Some timestamps only return one row (node)

In [3]:
df_f[df_f['timestamp']=='2024-02-21 05:37:00']

Unnamed: 0,timestamp,cpu_system,boottime,Pool Size Time_P1,mem_free,Missed Buffers_P1,bytes_out,cpu_user,cpu_idle,Pool Size Data_P1,...,RetransSegs_rate,TCPLostRetransmit_rate,TCPForwardRetrans_rate,TotalRetrans,TCPSlowStartRetrans,RPCRetrans,TCPFastRetrans,TCPLostRetransmit,TCPForwardRetrans,RetransSegs
4,2024-02-21 05:37:00,2.1,1705514000.0,2014.643333,0.0,0.0,0.0,1.11,0.0,11854.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Only getting timestamps where we have readings from all 195 nodes

In [6]:
num_nodes = 195
valid_ts = valid_ts = df_f.groupby('timestamp').filter(lambda x: x['nodeId'].nunique() == num_nodes)['timestamp'].unique()
print('valid timestamps:', len(valid_ts))
print(valid_ts)

valid timestamps: 1890
['2024-02-21 16:07:30' '2024-02-21 16:07:45' '2024-02-21 16:08:00' ...
 '2024-02-21 23:59:15' '2024-02-21 23:59:30' '2024-02-21 23:59:45']


DR1: Applying PCA across features for each timestamp

In [16]:
timestamp = '2024-02-21 16:07:45'
df_f[df_f['timestamp']==timestamp].set_index('nodeId').drop(columns='timestamp')

Unnamed: 0_level_0,cpu_system,boottime,Pool Size Time_P1,mem_free,Missed Buffers_P1,bytes_out,cpu_user,cpu_idle,Pool Size Data_P1,pkts_out,...,RetransSegs_rate,TCPLostRetransmit_rate,TCPForwardRetrans_rate,TotalRetrans,TCPSlowStartRetrans,RPCRetrans,TCPFastRetrans,TCPLostRetransmit,TCPForwardRetrans,RetransSegs
nodeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
novadaq-far-farm-92,0.300000,1.705517e+09,2057.004000,1672380.0,0.0,128234.700000,0.200000,93.100000,11910.280000,921.73,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
novadaq-far-farm-18,1.940000,1.705517e+09,2033.570000,2803696.0,0.0,133441.976667,1.306667,42.653333,11714.593333,975.50,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
novadaq-far-farm-118,1.426667,1.705513e+09,2036.430000,3245016.0,0.0,100728.280000,0.626667,53.626667,11743.040000,952.52,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
novadaq-far-farm-130,0.400000,1.705517e+09,2003.686667,1305232.0,0.0,118329.970000,0.300000,92.400000,11792.046667,912.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
novadaq-far-farm-111,1.700000,1.705517e+09,2018.370667,2089808.0,0.0,169360.980000,0.800000,53.100000,11663.933333,962.68,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
novadaq-far-farm-82,1.700000,1.705513e+09,2001.786667,2931456.0,0.0,136155.450000,0.800000,58.080000,11654.373333,961.58,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
novadaq-far-farm-93,1.900000,1.705513e+09,2052.254000,2822044.0,0.0,136319.590000,1.200000,60.100000,11767.266667,985.60,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
novadaq-far-farm-196,1.333333,1.705517e+09,2053.529333,3692072.0,0.0,116683.590000,0.800000,53.580000,11655.680000,968.45,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
novadaq-far-farm-61,0.400000,1.705517e+09,2038.066667,537115.2,0.0,157552.938000,0.100000,93.400000,11678.320000,926.80,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
from concurrent.futures import ThreadPoolExecutor

P_final = pd.DataFrame()

def process_ts(ts):
    try:
        P_df = apply_pca_to_time(ts, df_f)
        return pd.DataFrame({
            'Col': ts,
            'PC1': P_df['PC1'],
            'FC_f': P_df['FC_f'],
            'Measurement': P_df['Measurement']
        })
    except Exception as e:
        print(f'Error processing {ts}: {e}')
        return None

with ThreadPoolExecutor() as executer:
    results = list(executer.map(process_ts, valid_ts))

P_final = pd.concat([df for df in results if df is not None], ignore_index=True)

Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of prin

  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var


Number of principal components: 1
Number of principal components: 4
Number of principal components: 1
Number of principal components: 1
Number of principal components: 1
Number of principal components: 1
Number of principal components: 4
Number of principal components: 1
Number of principal components: 1
Number of principal components: 1
Number of principal components: 1
Number of principal components: 1
Number of principal components: 1
Number of principal components: 1
Number of principal components: 4
Number of principal components: 3
Number of principal components: 1
Number of principal components: 1
Number of principal components: 1


  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var


Number of principal components: 1Number of principal components: 1

Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 1


  explained_variance_ratio_ = explained_variance_ / total_var


Number of principal components: 4
Number of principal components: 4
Number of principal components: 3
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4Number of principal components: 4

Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of principal components: 4
Number of prin

In [None]:
P_final.to_csv('P_final2.csv')

In [9]:
P_final

Unnamed: 0,Col,PC1,FC_f,Measurement
0,2024-02-21 16:07:30,-3.482667,cpu_system,novadaq-far-farm-50
1,2024-02-21 16:07:30,-2.199933,cpu_system,novadaq-far-farm-27
2,2024-02-21 16:07:30,-0.126631,cpu_system,novadaq-far-farm-58
3,2024-02-21 16:07:30,0.127282,cpu_system,novadaq-far-farm-87
4,2024-02-21 16:07:30,-1.459689,cpu_system,novadaq-far-farm-153
...,...,...,...,...
368545,2024-02-21 23:59:45,-0.440991,disk_free,novadaq-far-farm-167
368546,2024-02-21 23:59:45,-1.274154,disk_free,novadaq-far-farm-30
368547,2024-02-21 23:59:45,-0.985929,disk_free,novadaq-far-farm-19
368548,2024-02-21 23:59:45,-0.099201,disk_free,novadaq-far-farm-191


Second step DR: UMAP

In [10]:
from umap import UMAP
import plotly.express as px

# two-step DR: PCA+UMAP
# applying UMAP to each node in variable domain to get nodes (Measurements) in 2D
df_pivot = P_final.pivot(index="Measurement", columns="Col", values="PC1")
reducer = UMAP(n_components=2, random_state=42)
embedding = reducer.fit_transform(df_pivot)
df_pivot['UMAP1'] = embedding[:, 0]
df_pivot['UMAP2'] = embedding[:, 1]
df_pivot['nodeId'] = df_pivot.index 
fig = px.scatter(df_pivot, x='UMAP1', y='UMAP2', hover_data={'nodeId': True})

fig.update_layout(title="UMAP Projection",
                  xaxis_title="UMAP 1",
                  yaxis_title="UMAP 2")
fig.show()

  from .autonotebook import tqdm as notebook_tqdm
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


Second step DR: tSNE

In [11]:
import plotly.express as px
import pandas as pd
from sklearn.manifold import TSNE

# two-step DR: PCA+tSNE
df_tsne2 = df_pivot.drop(columns=['UMAP1', 'UMAP2', 'nodeId'])
tsne = TSNE(n_components=2, random_state=42)
embedding = tsne.fit_transform(df_tsne2)

df_pivot['tSNE1'] = embedding[:, 0]
df_pivot['tSNE2'] = embedding[:, 1]

fig = px.scatter(df_pivot, x='tSNE1', y='tSNE2', hover_data={'nodeId': True})

fig.update_layout(title="t-SNE Projection",
                  xaxis_title="t-SNE 1",
                  yaxis_title="t-SNE 2")

fig.show()


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



Second step DR: PCA

In [12]:
import plotly.express as px
import pandas as pd

# DR2 - applying PCA to each feature across one node/component 
def apply_pca(X, n_components=2):
    try:
        baseline = X.values

        # normalizing the data (demean)
        mean_hat = baseline.mean(axis=0)
        demeaned = baseline - mean_hat

        # standardize
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(demeaned)

        # apply PCA
        pca = PCA(n_components=n_components)
        scores = pca.fit_transform(X_scaled)

        explained_variance_ratio_cumsum = np.cumsum(pca.explained_variance_ratio_)
        npc = np.sum(explained_variance_ratio_cumsum < 0.9999) + 1
        n_components = scores.shape[1]
        print(f"Number of principal components: {n_components}")
        

        P_fin = pd.DataFrame({f"PC{k+1}": scores[:, k] if k < n_components else np.nan for k in range(3)})
        P_fin['Measurement'] = X.index

        return P_fin

    except Exception as e:
        print(f"Error processing PCA across features: {e}")
        return None

In [13]:
df_pca = df_pivot.drop(columns=['UMAP1', 'UMAP2', 'tSNE1', 'tSNE2', 'nodeId'])
P_df = apply_pca(df_pca)
P_df

Number of principal components: 2


Unnamed: 0,PC1,PC2,PC3,Measurement
0,163.005566,-23.041363,,novadaq-far-farm-06
1,158.100508,-20.848159,,novadaq-far-farm-07
2,162.805725,-21.531884,,novadaq-far-farm-08
3,163.021252,-22.132281,,novadaq-far-farm-09
4,158.064192,-25.394730,,novadaq-far-farm-10
...,...,...,...,...
190,5.413388,4.231181,,novadaq-far-farm-95
191,5.228870,5.424593,,novadaq-far-farm-96
192,-2.040937,-6.190887,,novadaq-far-farm-97
193,-2.570028,-3.034026,,novadaq-far-farm-98


In [14]:
P_df.set_index('Measurement', inplace=True)
df_pca['PC1'] = P_df['PC1']
df_pca['PC2'] = P_df['PC2']
df_pca['nodeId'] = P_df.index
df_pca

Col,2024-02-21 16:07:30,2024-02-21 16:07:45,2024-02-21 16:08:00,2024-02-21 16:08:15,2024-02-21 16:08:30,2024-02-21 16:08:45,2024-02-21 16:09:00,2024-02-21 16:09:15,2024-02-21 16:09:30,2024-02-21 16:09:45,...,2024-02-21 23:58:15,2024-02-21 23:58:30,2024-02-21 23:58:45,2024-02-21 23:59:00,2024-02-21 23:59:15,2024-02-21 23:59:30,2024-02-21 23:59:45,PC1,PC2,nodeId
Measurement,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
novadaq-far-farm-06,14.204855,14.529020,14.595754,14.527076,14.695802,14.819769,14.771393,15.053273,15.005570,14.985103,...,14.518033,14.260034,14.057485,14.001678,14.051508,14.004384,13.249994,163.005566,-23.041363,novadaq-far-farm-06
novadaq-far-farm-07,14.191538,14.288319,14.280318,14.184694,14.342566,14.321914,14.286223,14.482700,13.994615,13.895074,...,13.996474,13.817163,13.910791,13.989786,13.788973,13.667170,13.743528,158.100508,-20.848159,novadaq-far-farm-07
novadaq-far-farm-08,13.867709,13.980150,14.257958,14.140448,14.328602,14.620714,14.689935,14.963546,14.885335,14.727220,...,14.356937,13.983936,13.894878,13.987314,13.988768,13.935955,14.041698,162.805725,-21.531884,novadaq-far-farm-08
novadaq-far-farm-09,14.365239,14.451469,14.593383,14.473979,14.580755,14.654891,14.666837,15.004310,14.957136,14.849907,...,14.464945,14.188718,14.064464,14.182088,14.417803,14.237016,14.304609,163.021252,-22.132281,novadaq-far-farm-09
novadaq-far-farm-10,14.650544,14.760595,14.738633,14.645543,14.782156,14.891453,14.823037,15.081571,14.920467,14.876750,...,13.563106,13.294218,13.257044,13.355054,13.298596,13.475724,13.900588,158.064192,-25.394730,novadaq-far-farm-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
novadaq-far-farm-95,-1.363917,-1.220481,0.221249,0.422926,0.133480,-0.145304,-0.245454,-0.083779,-0.217807,-0.313418,...,0.610606,0.038265,-0.312534,-0.621805,-1.030904,-0.984981,-1.384969,5.413388,4.231181,novadaq-far-farm-95
novadaq-far-farm-96,-0.780635,-0.132295,-0.086481,-0.178364,-0.335845,-0.040449,-0.329232,-0.455550,-0.921720,-0.752705,...,0.028650,0.190330,0.234107,-0.116123,-0.221321,0.092845,-0.634150,5.228870,5.424593,novadaq-far-farm-96
novadaq-far-farm-97,-0.161367,0.143991,0.214014,0.223463,0.335448,0.178709,-0.030628,-0.020182,-0.059246,-0.183174,...,0.357813,0.233694,0.037796,0.237247,0.709355,0.095341,0.117903,-2.040937,-6.190887,novadaq-far-farm-97
novadaq-far-farm-98,-0.465516,-1.039422,-0.519564,-0.603408,-0.616175,-0.682868,-0.658356,-0.673012,-0.493413,-0.636495,...,-0.117287,-0.374686,-0.351274,-0.615472,-0.423255,0.235899,0.933757,-2.570028,-3.034026,novadaq-far-farm-98


In [15]:
fig = px.scatter(df_pca, x="PC1", y="PC2", hover_data={'nodeId': True})

fig.update_layout(title="PCA Projection",
                  xaxis_title="PCA 1",
                  yaxis_title="PCA 2")

fig.show()

In [None]:
df_pivot['PC1'] = df_pivot['PC1']
df_pivot['PC2'] = df_pivot['PC2']
df_pivot.drop(columns=['nodeId']).to_csv('multiDR_results2.csv')