In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Set plot style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

In [2]:
# Load the dataset
df = pd.read_csv("training_data_rolling.csv")

# Identify sensor columns (Features for PCA)
# We select all columns starting with 'Sensed_' to include raw readings, rolling means, and rolling stds.
feature_cols = [c for c in df.columns if c.startswith('Sensed_')]

print(f"Loaded data with shape: {df.shape}")
print(f"Selected {len(feature_cols)} sensor features for dimensionality reduction.")

Loaded data with shape: (8004, 392)
Selected 384 sensor features for dimensionality reduction.


In [8]:
# 1. Standardize the data (Vital for PCA as sensors have different units/scales)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[feature_cols])

# 2. Apply PCA
# We'll calculate enough components to see the variance explaination, but focus on the first few
pca = PCA(n_components=10)
pca_result = pca.fit_transform(X_scaled)

# 3. Create a 'Health Index' from the first Principal Component (PC1)
# Note: PCA sign is arbitrary. We might need to flip it so it correlates with degradation.
# Ideally, we want the index to change monotonically with 'Cycles_Since_New'.
df['Health_Index'] = pca_result[:, 0]

# Calculate correlations to check directions
corr_cycles = df[['Health_Index', 'Cycles_Since_New']].corr().iloc[0, 1]
print(f"Correlation between Health Index (PC1) and Age (Cycles): {corr_cycles:.3f}")

# If negative correlation with age, flip the sign to make "Health Index" typically decrease (or increase) with age 
# depending on preference. Here let's make it track 'Decline' (Higher value = More deviation/Degradation)
if corr_cycles < 0:
    df['Health_Index'] = -df['Health_Index']
    print("Flipped Health Index sign to correlate positively with Age/Degradation.")

print(f"Explained Variance Ratio by Component: {pca.explained_variance_ratio_}")
print(f"Total Variance Explained by PC1+PC2: {sum(pca.explained_variance_ratio_):.2%}")

Correlation between Health Index (PC1) and Age (Cycles): 0.000
Explained Variance Ratio by Component: [0.19041349 0.09628961 0.07325077 0.05923936 0.04089042 0.03934005
 0.03343924 0.03035639 0.02761106 0.02499402]
Total Variance Explained by PC1+PC2: 61.58%


In [4]:
pca_result

array([[ 6.77306811,  8.26908701, -9.28732172, -0.0228766 ,  3.61297751],
       [ 7.0292084 ,  9.10472249, -5.76284657, -0.09992736,  4.77702335],
       [ 6.56912193,  9.61453218, -7.9729705 ,  0.12743041,  4.86152627],
       ...,
       [ 2.07713719, -4.33033602,  5.21589462, -0.917361  ,  1.53264547],
       [ 1.39546039, -6.30620706,  3.30771666, -2.32795021,  3.44775013],
       [ 0.65553223, -2.72116193,  7.15804871, -3.93376642, -1.79548742]],
      shape=(8004, 5))