In [2]:
import numpy as np
import pandas as pd 
import plotly.express as px
import json 
import os

df_f = pd.read_csv('../data/farm/far_data_2024-02-21.csv').fillna(0.0)
df_f.head()

Unnamed: 0,timestamp,cpu_system,boottime,Pool Size Time_P1,mem_free,Missed Buffers_P1,bytes_out,cpu_user,cpu_idle,Pool Size Data_P1,...,RetransSegs_rate,TCPLostRetransmit_rate,TCPForwardRetrans_rate,TotalRetrans,TCPSlowStartRetrans,RPCRetrans,TCPFastRetrans,TCPLostRetransmit,TCPForwardRetrans,RetransSegs
0,2024-02-21 05:36:00,2.1,0.0,0.0,0.0,0.0,0.0,1.006667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2024-02-21 05:36:15,2.1,0.0,2038.71,0.0,0.0,0.0,1.3,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2024-02-21 05:36:30,1.993333,0.0,2055.49,0.0,0.0,0.0,1.3,0.0,11795.033333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2024-02-21 05:36:45,1.806667,0.0,2049.026,0.0,0.0,0.0,1.3,0.0,11892.133333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2024-02-21 05:37:00,2.1,1705514000.0,2014.643333,0.0,0.0,0.0,1.11,0.0,11854.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Dimensionality Reduction (DR) across feature domain, time domain

In [3]:
from pygam import GAM, s
from scipy.interpolate import BSpline, splrep, splev
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def preprocess(df, timestamp):
    return df[df['timestamp']==timestamp].set_index('nodeId').drop(columns='timestamp')
    
# DR1: Applying PCA to each timestamp across features
def apply_pca_to_time(ts, df):
    try:
        # pivot: rows -> features, columns -> nodeId
        X = preprocess(df, ts)

        # convert to cupy array
        # baseline = cp.array(X.values)
        baseline = X.values

        # normalizing the data (demean)
        mean_hat = baseline.mean(axis=0)
        demeaned = baseline - mean_hat

        # standardize
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(demeaned)

        # if (X_scaled.shape[0] < 2 or np.all(np.isnan(X_scaled)) or np.all(X_scaled == 0)):
        if X_scaled.shape[0] < 2 or np.all(np.isnan(X_scaled)):
            print(f"Skipping {ts} due to insufficient data variance.")
            return None

        # apply PCA
        pca = PCA(n_components=1)
        scores = pca.fit_transform(X_scaled)

        explained_variance_ratio_cumsum = np.cumsum(pca.explained_variance_ratio_)
        npc = np.sum(explained_variance_ratio_cumsum < 0.9999) + 1
        # print(f"Number of principal components: {npc}")
        # print(f"PC1:",pca.components_[1], len(pca.components_[1]))
        abs_comp = np.abs(pca.components_[0])
        top_10 = np.argsort(abs_comp)[-10:][::-1]
        # top 10 most influential features for this column
        fc_f = X.columns[top_10]

        P_fin = pd.DataFrame({f"PC{k+1}": scores[:, k] if k < npc else np.nan for k in range(1)})
        P_fin['Measurement'] = X.index

        fc_f_df = pd.DataFrame({'timestamp': ts, 'feature': fc_f})

        # print(f"{col_name} done...")

        return P_fin, fc_f_df

    except Exception as e:
        print(f"Error processing PCA across features: {e}")
        return None

Some timestamps only return one row (node)

In [3]:
df_f[df_f['timestamp']=='2024-02-21 05:37:00']

Unnamed: 0,timestamp,cpu_system,boottime,Pool Size Time_P1,mem_free,Missed Buffers_P1,bytes_out,cpu_user,cpu_idle,Pool Size Data_P1,...,RetransSegs_rate,TCPLostRetransmit_rate,TCPForwardRetrans_rate,TotalRetrans,TCPSlowStartRetrans,RPCRetrans,TCPFastRetrans,TCPLostRetransmit,TCPForwardRetrans,RetransSegs
4,2024-02-21 05:37:00,2.1,1705514000.0,2014.643333,0.0,0.0,0.0,1.11,0.0,11854.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Only getting timestamps where we have readings from all 195 nodes

In [4]:
num_nodes = 195
valid_ts = valid_ts = df_f.groupby('timestamp').filter(lambda x: x['nodeId'].nunique() == num_nodes)['timestamp'].unique()
print('valid timestamps:', len(valid_ts))
print(valid_ts)

valid timestamps: 1890
['2024-02-21 16:07:30' '2024-02-21 16:07:45' '2024-02-21 16:08:00' ...
 '2024-02-21 23:59:15' '2024-02-21 23:59:30' '2024-02-21 23:59:45']


DR1: Applying PCA across features for each timestamp

In [11]:
from concurrent.futures import ThreadPoolExecutor

P_final = []
FC_final = []

start_idx = int(len(valid_ts) * 0.3)
end_idx = int(len(valid_ts) * 0.45)
timestamps = valid_ts[start_idx:end_idx]

def process_ts(ts):
    try:
        P_df, fc_f_df = apply_pca_to_time(ts, df_f)
        
        if P_df is not None:
            P_df.insert(0, 'Col', ts) 
            P_final.append(P_df)

        if fc_f_df is not None:
            FC_final.append(fc_f_df)  
        
    except Exception as e:
        print(f'Error processing {ts}: {e}')

with ThreadPoolExecutor() as executer:
    executer.map(process_ts, timestamps)

# combining all results
P_final = pd.concat(P_final, ignore_index=True) if P_final else pd.DataFrame()
FC_final = pd.concat(FC_final, ignore_index=True) if FC_final else pd.DataFrame()

In [12]:
P_final

Unnamed: 0,Col,PC1,Measurement
0,2024-02-21 18:30:15,13.595053,novadaq-far-farm-92
1,2024-02-21 18:30:15,-2.431512,novadaq-far-farm-184
2,2024-02-21 18:30:15,0.012991,novadaq-far-farm-160
3,2024-02-21 18:30:15,-0.329353,novadaq-far-farm-70
4,2024-02-21 18:30:15,0.391004,novadaq-far-farm-185
...,...,...,...
55180,2024-02-21 19:38:45,5.356843,novadaq-far-farm-92
55181,2024-02-21 19:38:45,-3.186326,novadaq-far-farm-185
55182,2024-02-21 19:38:45,-1.176038,novadaq-far-farm-193
55183,2024-02-21 19:38:45,4.609285,novadaq-far-farm-164


In [13]:
FC_final.head(20)

Unnamed: 0,timestamp,feature
0,2024-02-21 18:30:15,cpu_idle
1,2024-02-21 18:30:15,boottime
2,2024-02-21 18:30:15,part_max_used
3,2024-02-21 18:30:15,cpu_speed
4,2024-02-21 18:30:15,mem_cached
5,2024-02-21 18:30:15,proc_total
6,2024-02-21 18:30:15,mem_total
7,2024-02-21 18:30:15,mem_free
8,2024-02-21 18:30:15,disk_total
9,2024-02-21 18:30:15,cpu_aidle


In [22]:
FC_final.to_csv('FC_final.csv')

In [16]:
P_final.to_csv('P_final2.csv')

Second step DR: UMAP

In [14]:
from umap import UMAP
import plotly.express as px

# two-step DR: PCA+UMAP
# applying UMAP to each node in variable domain to get nodes (Measurements) in 2D
df_pivot = P_final.pivot(index="Measurement", columns="Col", values="PC1")
reducer = UMAP(n_components=2, random_state=42)
embedding = reducer.fit_transform(df_pivot)
df_pivot['UMAP1'] = embedding[:, 0]
df_pivot['UMAP2'] = embedding[:, 1]
df_pivot['nodeId'] = df_pivot.index 
fig = px.scatter(df_pivot, x='UMAP1', y='UMAP2', hover_data={'nodeId': True})

fig.update_layout(title="UMAP Projection",
                  xaxis_title="UMAP 1",
                  yaxis_title="UMAP 2")
fig.show()

  from .autonotebook import tqdm as notebook_tqdm
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


Second step DR: tSNE

In [15]:
import plotly.express as px
import pandas as pd
from sklearn.manifold import TSNE

# two-step DR: PCA+tSNE
df_tsne2 = df_pivot.drop(columns=['UMAP1', 'UMAP2', 'nodeId'])
tsne = TSNE(n_components=2, random_state=42)
embedding = tsne.fit_transform(df_tsne2)

df_pivot['tSNE1'] = embedding[:, 0]
df_pivot['tSNE2'] = embedding[:, 1]

fig = px.scatter(df_pivot, x='tSNE1', y='tSNE2', hover_data={'nodeId': True})

fig.update_layout(title="t-SNE Projection",
                  xaxis_title="t-SNE 1",
                  yaxis_title="t-SNE 2")

fig.show()


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



Second step DR: PCA

In [16]:
import plotly.express as px
import pandas as pd

# DR2 - applying PCA to each feature across one node/component 
def apply_pca(X, n_components=2):
    try:
        baseline = X.values

        # normalizing the data (demean)
        mean_hat = baseline.mean(axis=0)
        demeaned = baseline - mean_hat

        # standardize
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(demeaned)

        # apply PCA
        pca = PCA(n_components=n_components)
        scores = pca.fit_transform(X_scaled)

        explained_variance_ratio_cumsum = np.cumsum(pca.explained_variance_ratio_)
        npc = np.sum(explained_variance_ratio_cumsum < 0.9999) + 1
        n_components = scores.shape[1]
        print(f"Number of principal components: {n_components}")
        

        P_fin = pd.DataFrame({f"PC{k+1}": scores[:, k] if k < n_components else np.nan for k in range(3)})
        P_fin['Measurement'] = X.index

        return P_fin

    except Exception as e:
        print(f"Error processing PCA across features: {e}")
        return None

In [17]:
df_pca = df_pivot.drop(columns=['UMAP1', 'UMAP2', 'tSNE1', 'tSNE2', 'nodeId'])
P_df = apply_pca(df_pca)
P_df

Number of principal components: 2


Unnamed: 0,PC1,PC2,PC3,Measurement
0,1.408272,3.836039,,novadaq-far-farm-06
1,1.676271,1.060038,,novadaq-far-farm-07
2,2.969684,4.789189,,novadaq-far-farm-08
3,2.251699,3.668458,,novadaq-far-farm-09
4,-2.268656,-0.807331,,novadaq-far-farm-10
...,...,...,...,...
190,2.933584,11.804179,,novadaq-far-farm-95
191,3.768397,11.093668,,novadaq-far-farm-96
192,-4.166802,2.229767,,novadaq-far-farm-97
193,-1.523876,2.658653,,novadaq-far-farm-98


In [18]:
P_df.set_index('Measurement', inplace=True)
df_pca['PC1'] = P_df['PC1']
df_pca['PC2'] = P_df['PC2']
df_pca['nodeId'] = P_df.index
df_pca

Col,2024-02-21 18:29:15,2024-02-21 18:29:30,2024-02-21 18:29:45,2024-02-21 18:30:00,2024-02-21 18:30:15,2024-02-21 18:30:30,2024-02-21 18:30:45,2024-02-21 18:31:00,2024-02-21 18:31:15,2024-02-21 18:31:30,...,2024-02-21 19:38:15,2024-02-21 19:38:30,2024-02-21 19:38:45,2024-02-21 19:39:00,2024-02-21 19:39:15,2024-02-21 19:39:30,2024-02-21 19:39:45,PC1,PC2,nodeId
Measurement,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
novadaq-far-farm-06,-0.198384,0.039488,0.023610,-0.237311,-0.212725,0.142633,0.449152,0.596131,0.517030,0.510573,...,1.155812,1.099765,0.900930,0.418770,0.557354,0.470325,0.294177,1.408272,3.836039,novadaq-far-farm-06
novadaq-far-farm-07,0.504821,0.683554,0.803510,0.717423,0.865332,1.053720,1.145219,1.267338,1.163146,1.054525,...,0.669906,0.774441,0.690630,0.660161,0.600747,0.720751,0.852562,1.676271,1.060038,novadaq-far-farm-07
novadaq-far-farm-08,-0.168806,-0.439695,-0.396591,-0.346659,-0.223896,0.016551,0.475218,0.926195,0.759334,0.626266,...,1.006942,1.372481,1.372138,1.509317,1.496844,1.588132,1.663613,2.969684,4.789189,novadaq-far-farm-08
novadaq-far-farm-09,0.686600,0.768580,0.609316,0.268636,0.516960,0.767673,0.942082,1.041299,0.893513,0.776286,...,0.348898,0.670234,0.091758,0.232084,0.444204,0.437782,0.530220,2.251699,3.668458,novadaq-far-farm-09
novadaq-far-farm-10,0.053059,0.137471,0.141747,-0.030123,0.121336,0.305263,0.471781,0.663888,0.552687,0.360966,...,-0.064510,0.068036,-0.221675,-0.393095,-0.340732,-0.282804,-0.103621,-2.268656,-0.807331,novadaq-far-farm-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
novadaq-far-farm-95,-0.339057,-0.314944,-0.231916,-0.284658,-0.040883,-0.047766,-0.461716,-0.863113,-0.904743,-0.737088,...,2.207934,2.449463,2.660063,2.636534,2.620280,2.662446,2.246520,2.933584,11.804179,novadaq-far-farm-95
novadaq-far-farm-96,-0.019654,0.158686,0.236861,0.030676,-0.202409,-0.159199,-0.178469,-0.153871,-0.299224,-0.499804,...,0.605190,0.545090,1.085927,1.347924,1.362362,1.645015,1.804123,3.768397,11.093668,novadaq-far-farm-96
novadaq-far-farm-97,-1.724263,-1.467371,-1.401254,-1.164658,-1.155608,-1.168976,-0.971264,-0.965827,-1.173564,-1.039041,...,0.096184,0.058562,0.052340,-0.216871,0.117224,0.330893,-0.020413,-4.166802,2.229767,novadaq-far-farm-97
novadaq-far-farm-98,-0.565797,-0.559584,-0.795994,-1.109042,-0.883274,-0.662033,-0.709299,-0.917839,-0.865991,-0.740719,...,0.670164,0.484816,0.484195,0.651909,0.535688,0.692403,0.423803,-1.523876,2.658653,novadaq-far-farm-98


In [19]:
fig = px.scatter(df_pca, x="PC1", y="PC2", hover_data={'nodeId': True})

fig.update_layout(title="PCA Projection",
                  xaxis_title="PCA 1",
                  yaxis_title="PCA 2")

fig.show()

In [20]:
df_pivot

Col,2024-02-21 18:29:15,2024-02-21 18:29:30,2024-02-21 18:29:45,2024-02-21 18:30:00,2024-02-21 18:30:15,2024-02-21 18:30:30,2024-02-21 18:30:45,2024-02-21 18:31:00,2024-02-21 18:31:15,2024-02-21 18:31:30,...,2024-02-21 19:38:45,2024-02-21 19:39:00,2024-02-21 19:39:15,2024-02-21 19:39:30,2024-02-21 19:39:45,UMAP1,UMAP2,nodeId,tSNE1,tSNE2
Measurement,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
novadaq-far-farm-06,-0.198384,0.039488,0.023610,-0.237311,-0.212725,0.142633,0.449152,0.596131,0.517030,0.510573,...,0.900930,0.418770,0.557354,0.470325,0.294177,14.855693,7.594246,novadaq-far-farm-06,-3.489189,1.873971
novadaq-far-farm-07,0.504821,0.683554,0.803510,0.717423,0.865332,1.053720,1.145219,1.267338,1.163146,1.054525,...,0.690630,0.660161,0.600747,0.720751,0.852562,14.707959,7.418636,novadaq-far-farm-07,-3.235267,1.475462
novadaq-far-farm-08,-0.168806,-0.439695,-0.396591,-0.346659,-0.223896,0.016551,0.475218,0.926195,0.759334,0.626266,...,1.372138,1.509317,1.496844,1.588132,1.663613,14.768608,7.510266,novadaq-far-farm-08,-3.561736,1.726350
novadaq-far-farm-09,0.686600,0.768580,0.609316,0.268636,0.516960,0.767673,0.942082,1.041299,0.893513,0.776286,...,0.091758,0.232084,0.444204,0.437782,0.530220,14.772607,7.504114,novadaq-far-farm-09,-3.498419,1.706694
novadaq-far-farm-10,0.053059,0.137471,0.141747,-0.030123,0.121336,0.305263,0.471781,0.663888,0.552687,0.360966,...,-0.221675,-0.393095,-0.340732,-0.282804,-0.103621,14.594674,7.033092,novadaq-far-farm-10,-2.469775,0.763147
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
novadaq-far-farm-95,-0.339057,-0.314944,-0.231916,-0.284658,-0.040883,-0.047766,-0.461716,-0.863113,-0.904743,-0.737088,...,2.660063,2.636534,2.620280,2.662446,2.246520,3.708235,19.325142,novadaq-far-farm-95,-9.960502,4.640980
novadaq-far-farm-96,-0.019654,0.158686,0.236861,0.030676,-0.202409,-0.159199,-0.178469,-0.153871,-0.299224,-0.499804,...,1.085927,1.347924,1.362362,1.645015,1.804123,3.994422,18.785683,novadaq-far-farm-96,-9.349502,4.030197
novadaq-far-farm-97,-1.724263,-1.467371,-1.401254,-1.164658,-1.155608,-1.168976,-0.971264,-0.965827,-1.173564,-1.039041,...,0.052340,-0.216871,0.117224,0.330893,-0.020413,16.728779,7.221318,novadaq-far-farm-97,0.401477,5.435856
novadaq-far-farm-98,-0.565797,-0.559584,-0.795994,-1.109042,-0.883274,-0.662033,-0.709299,-0.917839,-0.865991,-0.740719,...,0.484195,0.651909,0.535688,0.692403,0.423803,15.383457,8.083879,novadaq-far-farm-98,-1.716707,4.209938


In [21]:
df_pivot['PC1'] = df_pca['PC1']
df_pivot['PC2'] = df_pca['PC2']
df_pivot.drop(columns=['nodeId']).to_csv('multiDR_results2.csv')