In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [6]:
# Load the dataset
file_path = '../data/rfcc_tuning_10min.csv'
df = pd.read_csv(file_path, delimiter=',')
df['DateTime'] = pd.to_datetime(df['DateTime'], errors='coerce')
# 🔹 Store DateTime column separately
date_column = df[['DateTime']]
# Drop columns with more than 30% missing values
df.dropna(axis=1, thresh=int(0.7 * len(df)), inplace=True)
# Fill missing values by forward filling
df.ffill(inplace=True)

# Fill any remaining missing values by backward filling
df.bfill(inplace=True)

# 🔹 Drop non-numeric columns for PCA
df_numeric = df.drop(columns=['DateTime'])


In [7]:
# 🔹 Compute correlation matrix
corr_matrix = df_numeric.corr().abs()

# 🔹 Set a threshold for high correlation (e.g., 0.95)
threshold = 0.95

# 🔹 Define target variables (must always be kept)
target_columns = [
    '530R002D02.TI0036.MEAS',
]

# 🔹 Find and remove highly correlated features (except target variables)
columns_to_drop = set()
for i in range(len(corr_matrix.columns)):
    for j in range(i + 1, len(corr_matrix.columns)):  # Only check the upper triangle
        if corr_matrix.iloc[i, j] > threshold:
            col_to_remove = corr_matrix.columns[j]
            # Ensure we don't remove target variables
            if col_to_remove not in target_columns:
                columns_to_drop.add(col_to_remove)

# 🔹 Keep only the selected features (including target variables)
selected_features = [col for col in df_numeric.columns if col not in columns_to_drop or col in target_columns]
df_reduced = df_numeric[selected_features]

# 🔹 Add back the DateTime column
df_reduced = pd.concat([date_column.reset_index(drop=True), df_reduced], axis=1)

# 🔹 Save the reduced dataset
df_reduced.to_csv("../data/dataset_reduced-10min-one-target-variable-TI0036.csv", index=False)

print(f"✅ Feature selection complete! Reduced dataset shape: {df_reduced.shape}")
print(f"📉 Dropped {len(columns_to_drop)} highly correlated features")


✅ Feature selection complete! Reduced dataset shape: (57312, 27)
📉 Dropped 45 highly correlated features
