Feature Engineering

1. Time-Based Features

In [3]:
import pandas as pd

# Build a small in-memory sample: four hourly sensor readings for one machine.
# The hourly alias is written as lowercase 'h'; the uppercase 'H' spelling is
# deprecated in recent pandas releases and emits a FutureWarning.
data = pd.DataFrame({
    'timestamp': pd.date_range(start='2024-05-01 00:00:00', periods=4, freq='h'),
    'equipment_id': [1, 1, 1, 1],
    'temperature': [70, 71, 69, 72],
    'pressure': [30, 29, 31, 32],
    'vibration': [0.02, 0.03, 0.02, 0.04],
    'failure': [0, 0, 0, 1],
})

# Extract time-based features
data['hour'] = data['timestamp'].dt.hour
# dayofweek is numbered Monday=0 ... Sunday=6
data['day_of_week'] = data['timestamp'].dt.dayofweek
# between() is inclusive on both ends, so hours 9 through 17 are flagged as 1.
# NOTE(review): this counts 17:00-17:59 as working time — confirm that is intended.
data['is_working_hour'] = data['hour'].between(9, 17).astype(int)

print(data)

            timestamp  equipment_id  temperature  pressure  vibration  \
0 2024-05-01 00:00:00             1           70        30       0.02   
1 2024-05-01 01:00:00             1           71        29       0.03   
2 2024-05-01 02:00:00             1           69        31       0.02   
3 2024-05-01 03:00:00             1           72        32       0.04   

   failure  hour  day_of_week  is_working_hour  
0        0     0            2                0  
1        0     1            2                0  
2        0     2            2                0  
3        1     3            2                0  


2. Rolling Statistics

In [4]:
# Set the index to timestamp.
# Guarded so the cell can be re-run: once 'timestamp' has become the index it is
# no longer a column, and an unconditional set_index would raise KeyError.
if 'timestamp' in data.columns:
    data = data.set_index('timestamp')

# Calculate rolling statistics for temperature, pressure, and vibration
window_size = 2

# Output-column prefix -> source sensor column
sensor_columns = {
    'temp': 'temperature',
    'pressure': 'pressure',
    'vibration': 'vibration',
}
rolling_stats = ('mean', 'std', 'max')

# transform() hands back one value per original row, in the original row order,
# so the assignment does not rely on label alignment. That keeps the cell working
# when several machines share the same timestamps (duplicate index labels).
# The first (window_size - 1) rows of every machine are NaN because the window
# is not yet full.
for prefix, column in sensor_columns.items():
    for stat in rolling_stats:
        data[f'{prefix}_{stat}'] = data.groupby('equipment_id')[column].transform(
            # stat is bound as a default argument so each lambda keeps its own value
            lambda readings, stat=stat: getattr(
                readings.rolling(window=window_size), stat
            )()
        )

print(data)

                     equipment_id  temperature  pressure  vibration  failure  \
timestamp                                                                      
2024-05-01 00:00:00             1           70        30       0.02        0   
2024-05-01 01:00:00             1           71        29       0.03        0   
2024-05-01 02:00:00             1           69        31       0.02        0   
2024-05-01 03:00:00             1           72        32       0.04        1   

                     hour  day_of_week  is_working_hour  temp_mean  temp_std  \
timestamp                                                                      
2024-05-01 00:00:00     0            2                0        NaN       NaN   
2024-05-01 01:00:00     1            2                0       70.5  0.707107   
2024-05-01 02:00:00     2            2                0       70.0  1.414214   
2024-05-01 03:00:00     3            2                0       70.5  2.121320   

                     temp_max  pressur

3. Lag Features

In [5]:
# Add a one-step lag of every sensor reading, computed separately per machine
# so a value is never carried over from one equipment_id to another.
# The first row of each machine has no predecessor and is therefore NaN.
# NOTE(review): shift() works on row order — assumes rows are already sorted
# by time within each machine; confirm upstream.
lag_sources = {
    'temp_lag1': 'temperature',
    'pressure_lag1': 'pressure',
    'vibration_lag1': 'vibration',
}
for lag_column, sensor_column in lag_sources.items():
    data[lag_column] = data.groupby('equipment_id')[sensor_column].shift(1)

print(data)

                     equipment_id  temperature  pressure  vibration  failure  \
timestamp                                                                      
2024-05-01 00:00:00             1           70        30       0.02        0   
2024-05-01 01:00:00             1           71        29       0.03        0   
2024-05-01 02:00:00             1           69        31       0.02        0   
2024-05-01 03:00:00             1           72        32       0.04        1   

                     hour  day_of_week  is_working_hour  temp_mean  temp_std  \
timestamp                                                                      
2024-05-01 00:00:00     0            2                0        NaN       NaN   
2024-05-01 01:00:00     1            2                0       70.5  0.707107   
2024-05-01 02:00:00     2            2                0       70.0  1.414214   
2024-05-01 03:00:00     3            2                0       70.5  2.121320   

                     temp_max  pressur