In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tqdm import tqdm

# Step 1: Merge all features from different tests
# Prerequisite: `features_1st`, `features_2nd` and `features_3rd` must already
# exist in the kernel. They are not created in this cell, so on a fresh kernel
# this cell raises NameError until whatever builds them has been run.
features_by_test = {
    '1st': features_1st,
    '2nd': features_2nd,
    '3rd': features_3rd,
}

# `assign` adds the `test_set` tag to a copy of each frame, so the per-test
# input frames are left untouched and re-running this cell gives the same result.
all_features = pd.concat(
    [frame.assign(test_set=label) for label, frame in features_by_test.items()],
    ignore_index=True,
)

# Step 2: Create RUL labels based on the timestamp and the order of the measurements
def add_rul(df, test_set_name):
    """Return the rows of one test run, time-ordered, with an added 'RUL' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding at least a 'test_set' column (run label) and a
        'timestamp' column (used for ordering).
    test_set_name : str
        Value of 'test_set' selecting the run to label.

    Returns
    -------
    pandas.DataFrame
        A copy of the selected rows sorted by 'timestamp' (original index
        labels are kept). 'RUL' counts down in steps of one from ``n - 1`` on
        the earliest row to ``0`` on the latest row, where ``n`` is the number
        of selected rows; values are floats. ``df`` itself is not modified.
    """
    in_run = df['test_set'] == test_set_name
    ordered = df[in_run].copy().sort_values('timestamp')
    n_rows = len(ordered)
    # linspace(n-1, 0, n) yields n evenly spaced values: n-1, n-2, ..., 0.
    countdown = np.linspace(n_rows - 1, 0, n_rows)
    return ordered.assign(RUL=countdown)

# Apply RUL labeling
# Label each run on its own via add_rul, then stack the labelled runs back
# into a single frame with a fresh 0..N-1 index.
labelled_runs = [
    add_rul(all_features, run_name) for run_name in ('1st', '2nd', '3rd')
]
all_features = pd.concat(labelled_runs, ignore_index=True)

# Step 3: Preprocess the data
# Drop non-feature columns (timestamp, test_set) and the RUL target itself
X = all_features.drop(columns=['timestamp', 'test_set', 'RUL'])
y = all_features['RUL']

# Step 4: Split the data into training and testing sets
# Split BEFORE scaling so the held-out rows never contribute to the scaler's
# mean/variance; fitting the scaler on the full matrix leaks test statistics
# into training.
# NOTE(review): the split is random over the rows of all three runs, so
# neighbouring measurements of the same run can land on both sides. Holding
# out a whole run would give a stricter estimate -- confirm which is intended.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale features: fit on the training rows only, then reuse that transform
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full scaled matrix, kept in case a later cell still expects `X_scaled`.
X_scaled = scaler.transform(X)

# Step 5: Train a model (Random Forest Regressor example)
# The fixed random_state keeps the fitted forest reproducible between runs.
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Step 6: Evaluate the model
# Score on the held-out split only; RMSE is the square root of the MSE.
y_pred = rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
print(f"RMSE: {rmse:.2f}, MAE: {mae:.2f}")

NameError: name 'features_1st' is not defined