In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read the source CSV (path is relative to the current working directory)
file_path = '../data/rfcc_tuning_10min.csv'
df = pd.read_csv(file_path, sep=',')

# Parse the timestamp column; errors='coerce' turns unparseable values
# into NaT instead of raising.
df = df.assign(DateTime=pd.to_datetime(df['DateTime'], errors='coerce'))

# One-column DataFrame snapshot of the parsed timestamps
date_column = df.loc[:, ['DateTime']]

# Flag rows where the monitored tag reads at or above 95. NaN readings
# compare as False, so they count as inactive and split a run.
rfcc_active = df['530C021D01.FIC0203.MEAS'] >= 95

# Label runs of consecutive equal flags: the id increases by one every time
# the flag differs from the previous row. The labels live in standalone
# Series (not temporary columns), so no existing column of df can be
# overwritten or dropped by accident.
active_flags = rfcc_active.astype(int)
group_ids = active_flags.ne(active_flags.shift()).cumsum()

# Row count of each active run, indexed by run id in ascending order
active_ids = group_ids[rfcc_active]
segment_lengths = active_ids.groupby(active_ids).size()
if segment_lengths.empty:
    # Without this guard idxmax() fails with an opaque "empty sequence" error
    raise ValueError(
        "No rows with '530C021D01.FIC0203.MEAS' >= 95; "
        "cannot determine an active RFCC segment."
    )

# idxmax returns the first maximum, so ties resolve to the earliest run
longest_active_segment = segment_lengths.idxmax()

# Keep only the rows of that run; copy() detaches the result from the
# original frame so later in-place edits are safe.
df = df[group_ids == longest_active_segment].copy()

# Summarise the retained segment: earliest/latest timestamp and row count
segment_timestamps = df['DateTime']
start_time, end_time = segment_timestamps.min(), segment_timestamps.max()
num_rows = df.shape[0]
print(f"📅 Longest continuous active RFCC segment: {start_time} to {end_time}")
print(f"🔢 Number of rows in the segment: {num_rows}")

# Continue preprocessing
# Snapshot the timestamps before any filling happens, so date_column keeps
# the DateTime values exactly as they are at this point.
date_column = df.loc[:, ['DateTime']]

# Keep only columns with at least 70% non-null values in the segment, then
# fill remaining gaps forward and finally backward (for leading gaps).
min_non_null = int(0.7 * len(df))
df = df.dropna(axis=1, thresh=min_non_null).ffill().bfill()

# Correlation-based feature pruning on every column except the timestamp
df_numeric = df.drop(columns=['DateTime'])
corr_matrix = df_numeric.corr().abs()
threshold = 0.95
target_columns = ['530R002D02.TI0036.MEAS']

# Look only at the strict upper triangle so each pair (i, j) with i < j is
# examined once: column j is flagged when its absolute correlation with any
# earlier column i exceeds the threshold. Masked cells become NaN, and
# NaN > threshold is False.
upper_triangle = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
too_correlated = (corr_matrix.where(upper_triangle) > threshold).any(axis=0)

# Target columns are never dropped, whatever they correlate with
columns_to_drop = {
    col for col in too_correlated[too_correlated].index
    if col not in target_columns
}

selected_features = [col for col in df_numeric.columns if col not in columns_to_drop]
df_reduced = df_numeric[selected_features]

# Re-attach the timestamps by position. pd.concat(axis=1) aligns on the
# index, and df_reduced still carries the row labels of the filtered
# segment, so BOTH frames must be reset to 0..n-1; resetting only one side
# misaligns the rows and pads the result with NaN.
df_reduced = pd.concat(
    [date_column.reset_index(drop=True), df_reduced.reset_index(drop=True)],
    axis=1,
)

# Persist the reduced dataset; the row index is not written to the CSV
output_path = "../data/dataset_reduced-10min-one-target-variable-TI0036.csv"
df_reduced.to_csv(output_path, index=False)

# Report the final shape and how many correlated features were removed
dropped_count = len(columns_to_drop)
print(f"✅ Feature selection complete! Reduced dataset shape: {df_reduced.shape}")
print(f"📉 Dropped {dropped_count} highly correlated features")
