In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the CSV and parse the timestamp column; values that cannot be parsed
# become NaT because of errors='coerce'
file_path = '../data/3MINUTES_IA_RFCC.csv'
df = pd.read_csv(file_path, delimiter=',')
df['DateTime'] = pd.to_datetime(df['DateTime'], errors='coerce')

# 🔹 Set the timestamps aside now, before any filtering or filling happens
date_column = df[['DateTime']]

# Cleaning pipeline, expressed as one chain instead of in-place mutations:
#   1. keep a column only if at least 70% of its rows are non-missing
#   2. forward-fill the remaining gaps
#   3. backward-fill whatever is still missing (leading gaps)
df = (
    df
    .dropna(axis=1, thresh=int(0.7 * len(df)))
    .ffill()
    .bfill()
)

# 🔹 Feature frame used downstream: every remaining column except the timestamp
df_numeric = df.drop(columns=['DateTime'])


In [3]:
# 🔹 Compute the absolute pairwise correlation matrix of the feature frame
corr_matrix = df_numeric.corr().abs()

# 🔹 Set a threshold for high correlation (e.g., 0.95)
threshold = 0.95

# 🔹 Define target variables (must always be kept)
target_columns = [
    '530R002D02.TI0037.MEAS',
]

# Fail fast when a target is absent from the feature frame (for example because
# the missing-value filter applied at load time removed it). Without this check
# the reduced dataset would be built and saved without the target, silently.
missing_targets = [col for col in target_columns if col not in df_numeric.columns]
if missing_targets:
    raise KeyError(f"Target column(s) not found in df_numeric: {missing_targets}")

# 🔹 Find and remove highly correlated features (except target variables)
# Only the strict upper triangle (k=1) is inspected, so for every pair above the
# threshold it is the later column, in column order, that gets marked.
# NaN correlations compare as False and therefore never mark a column.
# NOTE(review): a column that is itself marked for removal still causes later
# columns correlated with it to be marked as well — confirm this transitive
# behaviour is intended.
above_threshold = np.triu(corr_matrix.to_numpy() > threshold, k=1)
has_earlier_match = above_threshold.any(axis=0)
columns_to_drop = {
    col
    for col, flagged in zip(corr_matrix.columns, has_earlier_match)
    if flagged and col not in target_columns
}

# 🔹 Keep only the selected features (targets are never added to columns_to_drop)
selected_features = [col for col in df_numeric.columns if col not in columns_to_drop]
df_reduced = df_numeric[selected_features]

# 🔹 Add back the DateTime column
# Both sides get a fresh positional index so the rows are joined by position.
df_reduced = pd.concat(
    [date_column.reset_index(drop=True), df_reduced.reset_index(drop=True)],
    axis=1,
)

In [4]:
# Manually exclude this tag from the reduced dataset.
# errors='ignore' keeps the cell safe to re-run: once the column is gone, a
# second execution is a no-op instead of raising KeyError.
df_reduced = df_reduced.drop(columns=['500UZ0009E01.LZI0012A.MEAS'], errors='ignore')

In [5]:
# Inspect the reduced frame; as the last expression of the cell it is rendered
# by the notebook (long frames are shown truncated).
df_reduced

Unnamed: 0,DateTime,505D002D01.TI0012.MEAS,520MX051D01.FIC0028.MEAS,520D007D02.TI0058.MEAS,530K001S01.FI0007.MEAS,530R001D01.FI0043.MEAS,530E001D01.FIC0015.MEAS,530R001D01.FIC0029.MEAS,530R001D01.FIC0030.MEAS,530R001D01.FIC0047.MEAS,...,530F001D01.PIC0023.MEAS,530R002D02.TI0037.MEAS,530F001D01.TIC0012.MEAS,530M105D01.TIC0022.MEAS,530UZ1099E01.TZI0068A.MEAS,530M103D01.ZI2103A.MEAS,530M104D01.ZI2104A.MEAS,530M105D01.ZI2105A.MEAS,535D005D01.LI0011.MEAS,535INT920D01.TI0046.MEAS
0,2024-01-15 00:00:00,20.335938,4.381063,129.626953,56178.312500,152.258789,-1.352372,895.494263,267.269897,-0.107657,...,0.447500,103.289062,148.406250,598.248047,104.145309,7.552734,0.099609,99.542969,22.048666,53.353516
1,2024-01-15 00:03:00,20.234375,4.381063,129.626953,55734.804688,152.288223,-1.232607,895.626343,267.043640,-0.107657,...,0.447500,104.595314,151.390625,598.248047,104.407822,7.552734,0.099609,99.542969,22.054029,53.353516
2,2024-01-15 00:06:00,20.234375,4.381063,129.662109,55495.820312,152.448990,-1.232607,897.076660,267.007263,-0.107657,...,0.551406,106.348442,159.015625,598.248047,104.539078,7.552734,0.099609,99.542969,21.959824,53.353516
3,2024-01-15 00:09:00,20.234375,4.381063,129.662109,55868.722656,152.330978,-1.310423,900.294250,266.871307,-0.107657,...,0.551406,108.153130,169.984375,598.248047,104.670334,7.552734,0.099609,99.542969,21.961956,53.353516
4,2024-01-15 00:12:00,20.339844,4.381063,129.451172,55328.167969,152.399399,-1.310423,898.411011,266.910339,-0.107657,...,0.551406,110.129692,173.312500,598.248047,104.670334,7.552734,0.099609,99.644531,21.971840,53.353516
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191035,2025-02-15 23:45:00,25.339844,3.711992,130.529297,93151.218750,2649.180664,951.240234,-40.892994,2511.096191,299.548462,...,1.901016,723.156250,136.453125,684.993164,150.085480,20.416016,10.869141,69.142578,13.569574,59.121094
191036,2025-02-15 23:48:00,25.542969,3.711992,130.529297,93151.656250,2653.438232,949.061890,-40.794693,2509.720703,299.876617,...,1.901016,723.242188,136.421875,685.570312,150.216736,20.779297,10.869141,69.109375,13.462749,59.121094
191037,2025-02-15 23:51:00,25.644531,3.711992,130.529297,93087.796875,2651.963135,948.824402,-41.088177,2511.453857,299.960480,...,1.901016,723.396912,136.484375,685.573242,150.216736,20.419922,10.869141,68.937500,13.462448,59.121094
191038,2025-02-15 23:54:00,25.750000,3.711992,130.658203,92886.695312,2646.366699,951.803406,-40.621750,2512.105225,300.138580,...,1.901016,723.053162,136.312500,685.655273,149.954224,20.705078,10.869141,69.144531,13.468741,59.121094


In [6]:
# 🔹 Write the reduced dataset to disk as CSV, without the row index
# NOTE(review): the file name mentions "10min" and "TI0036", whereas this
# notebook loads a "3MINUTES" file and protects target TI0037 — confirm the
# name is intended before relying on it downstream.
df_reduced.to_csv(
    path_or_buf="../data/dataset_reduced-10min-one-target-variable-TI0036.csv",
    index=False,
)

# Summary: final shape, and how many columns the correlation filter marked
print(
    f"✅ Feature selection complete! Reduced dataset shape: {df_reduced.shape}",
    f"📉 Dropped {len(columns_to_drop)} highly correlated features",
    sep="\n",
)

✅ Feature selection complete! Reduced dataset shape: (191040, 26)
📉 Dropped 45 highly correlated features
