In [60]:
import pandas as pd

# Input CSV for this run (an alternative input path is kept below, disabled)
file_path = '../data/test_set_window.csv'
#file_path = '../data/rfcc_longest_active_window.csv'

# Read the CSV, parse the 'DateTime' column (errors='coerce' turns values that
# cannot be parsed into NaT instead of raising) and use it as the frame's index.
df = (
    pd.read_csv(file_path, delimiter=',')
    .assign(DateTime=lambda frame: pd.to_datetime(frame['DateTime'], errors='coerce'))
    .set_index('DateTime')
)

# Columns whose values drive the anomaly labels
attribute = "530R002D02.TI0036.MEAS"
attribute2 = "530M105D01.TIC0022.MEAS"

# Thresholds: the 0.00155 quantile of `attribute2` (lower) and
# quantile 1 of `attribute` (upper).
lower_bound = df[attribute2].quantile(0.00155)
upper_bound = df[attribute].quantile(1)

# A row is an anomaly (1) when `attribute2` falls below the lower threshold or
# `attribute` exceeds the upper threshold; every other row is normal (0).
# NOTE(review): quantile(1) is the column maximum, so `above_upper` can never
# be True and only the lower-bound test contributes labels — confirm intent.
below_lower = df[attribute2] < lower_bound
above_upper = df[attribute] > upper_bound
df['labels'] = (below_lower | above_upper).astype(int)

# Work on a copy so the labeled frame `df` keeps its original values
df_cleaned = df.copy()

# Every column except the label column is blanked and interpolated
columns_to_interpolate = [name for name in df.columns if name != 'labels']

# Blank out the selected columns on rows flagged as anomalies
is_anomaly = df_cleaned['labels'] == 1
df_cleaned.loc[is_anomaly, columns_to_interpolate] = None

# Fill the gaps by linear interpolation in both directions. The whole column is
# interpolated, so any missing values already present in the data are filled too.
interpolated = df_cleaned[columns_to_interpolate].interpolate(
    method='linear', limit_direction='both'
)
df_cleaned[columns_to_interpolate] = interpolated


In [61]:
# Display the (rows, columns) shape of the cleaned frame
df_cleaned.shape

(7331, 27)

In [62]:
# Display the lower threshold (0.00155 quantile of the `attribute2` column)
lower_bound

674.6922172851563

In [None]:
# NOTE(review): unexecuted cell holding a bare literal — presumably a
# lower_bound value noted from another run or dataset; confirm or remove.
674.7158203125

In [63]:
# Display the upper threshold (quantile 1 of the `attribute` column)
upper_bound

722.640625

In [None]:
# NOTE(review): unexecuted cell holding a bare literal — presumably an
# upper_bound value noted from another run or dataset; confirm or remove.
740.859375

In [64]:
# Share of rows labeled as anomalies, expressed as a percentage.
# 'labels' holds 0/1 values, so its mean is the anomalous fraction.
anomaly_fraction = df['labels'].mean()
anomaly_percentage = 100 * anomaly_fraction
print("📊 Anomalies represent {:.2f}% of the dataset.".format(anomaly_percentage))


📊 Anomalies represent 0.16% of the dataset.


In [65]:
# Save the original data together with the anomaly labels.
output_path = '../data/test_set.csv'
df.to_csv(output_path)

# Alternative exports, currently disabled:
#df.to_csv('../data/preprocessed_data.csv')
# Save cleaned + labeled version
#df_cleaned.to_csv('../data/cleaned_labeled_dataset.csv')

# Report only what was actually written. The previous message named
# 'original_labeled_dataset.csv' and 'cleaned_labeled_dataset.csv', neither of
# which this cell creates.
print("✅ Saved:")
print(f"- '{output_path}' with original data + labels")

✅ Saved:
- 'original_labeled_dataset.csv' with original data + labels
- 'cleaned_labeled_dataset.csv' with anomalies replaced + labels
