In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Step 1: Ingest Data from CSVs
def ingest_data(csv_files):
    """Read every CSV in ``csv_files`` and stack the rows into one DataFrame.

    Parameters
    ----------
    csv_files : iterable
        Sources to load, in order. Each item is passed straight to
        ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The rows of all inputs concatenated in the given order, re-indexed
        0..n-1 (``ignore_index=True``).
    """
    frames = [pd.read_csv(source) for source in csv_files]
    return pd.concat(frames, ignore_index=True)

# Step 2: Handle Missing Values
def handle_missing_values(df, method='interpolate'):
    """Fill gaps in the sensor readings without modifying the caller's frame.

    ``Timestamp`` is parsed to datetime and used as the index while the two
    sensor columns are filled; it is turned back into a regular column before
    returning.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Timestamp``, ``Vibration (m/s²)`` and
        ``Temperature (°C)``. It is not modified.
    method : str, default 'interpolate'
        How gaps in the sensor columns are filled:

        * ``'interpolate'`` - linear interpolation, extended in both
          directions so leading and trailing gaps are filled too.
        * ``'ffill'`` / ``'bfill'`` - propagate the previous / next reading.
        * anything else is forwarded as ``method`` to
          ``DataFrame.interpolate`` (for example ``'time'``, which uses the
          ``Timestamp`` index); pandas raises for names it does not know.

    Returns
    -------
    pandas.DataFrame
        New frame with ``Timestamp`` as the first column and a fresh
        0..n-1 index.
    """
    sensor_cols = ['Vibration (m/s²)', 'Temperature (°C)']

    # assign() builds a new frame, so the datetime conversion does not leak
    # back into the DataFrame the caller passed in.
    indexed = df.assign(Timestamp=pd.to_datetime(df['Timestamp'])).set_index('Timestamp')

    readings = indexed[sensor_cols]
    if method == 'interpolate':
        filled = readings.interpolate(method='linear', limit_direction='both')
    elif method == 'ffill':
        filled = readings.ffill()
    elif method == 'bfill':
        filled = readings.bfill()
    else:
        filled = readings.interpolate(method=method, limit_direction='both')
    indexed[sensor_cols] = filled

    # Rows are dropped only when derived lag/rolling columns already exist and
    # contain NaN; on a frame without such columns this step removes nothing.
    derived_cols = [col for col in indexed.columns if 'roll_' in col or 'lag_' in col]
    if derived_cols:
        indexed = indexed.dropna(subset=derived_cols)

    return indexed.reset_index()

# Step 3: Generate Lag Features
def generate_lag_features(df, lags=[1, 2]):
    """Add per-sensor lagged copies of the vibration and temperature readings.

    Rows are sorted by ``Sensor_ID`` then ``Timestamp``. For every sensor
    column and every entry of ``lags`` a column ``'<col>_lag_<lag>'`` is added
    holding the value from ``lag`` rows earlier within the same sensor.

    The first ``lag`` rows of a sensor have no earlier reading. Those gaps are
    back-filled from later rows of the *same* sensor; a sensor with too few
    rows to ever produce a lagged value keeps NaN in that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Sensor_ID``, ``Timestamp``, ``Vibration (m/s²)`` and
        ``Temperature (°C)``. It is not modified (sorting yields a new frame).
    lags : iterable of int, default [1, 2]
        Row offsets to generate.

    Returns
    -------
    pandas.DataFrame
        Sorted copy of ``df`` with the lag columns appended.
    """
    sensor_cols = ['Vibration (m/s²)', 'Temperature (°C)']
    df = df.sort_values(['Sensor_ID', 'Timestamp'])

    lag_cols = []
    for col in sensor_cols:
        for lag in lags:
            name = f'{col}_lag_{lag}'
            df[name] = df.groupby('Sensor_ID')[col].shift(lag)
            if name not in lag_cols:
                lag_cols.append(name)

    # Back-fill only the new lag columns and only within each sensor. A
    # frame-wide fill would copy values across sensor boundaries and would
    # also overwrite gaps in unrelated columns. GroupBy.bfill() replaces the
    # deprecated fillna(method='bfill') spelling.
    if lag_cols:
        df[lag_cols] = df.groupby('Sensor_ID')[lag_cols].bfill()
    return df

# Step 4: Generate Rolling Statistics
def generate_rolling_stats(df, windows=[3, 6, 12]):
    """Append per-sensor rolling means of the vibration and temperature readings.

    Rows are sorted by ``Sensor_ID`` then ``Timestamp``. For each sensor column
    and each window size a column ``'<col>_roll_mean_<window>'`` is added. The
    mean covers the current row and up to ``window - 1`` preceding rows of the
    same sensor; ``min_periods=1`` means the first rows of a sensor still get a
    value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Sensor_ID``, ``Timestamp``, ``Vibration (m/s²)`` and
        ``Temperature (°C)``.
    windows : iterable of int, default [3, 6, 12]
        Window lengths, in rows.

    Returns
    -------
    pandas.DataFrame
        Sorted copy of ``df`` with the rolling-mean columns appended.
    """
    sensor_cols = ['Vibration (m/s²)', 'Temperature (°C)']
    ordered = df.sort_values(['Sensor_ID', 'Timestamp'])

    # Column-major order: all windows for vibration first, then temperature.
    jobs = [(col, window) for col in sensor_cols for window in windows]

    for col, window in jobs:
        per_sensor_mean = (
            ordered.groupby('Sensor_ID')[col]
            .rolling(window, min_periods=1)
            .mean()
        )
        # groupby().rolling() prepends Sensor_ID to the index; drop that level
        # so the values line up with the frame's own index on assignment.
        aligned = per_sensor_mean.reset_index(level=0, drop=True)
        ordered[f'{col}_roll_mean_{window}'] = aligned

    return ordered

def transform_sensor_logs(csv_files, output_file='modeling_dataset.csv'):
    """Run the four preparation steps in order and write the result to CSV.

    The steps are ``ingest_data`` -> ``handle_missing_values`` ->
    ``generate_lag_features`` -> ``generate_rolling_stats``. The frame's shape
    is printed after each step.

    Parameters
    ----------
    csv_files : iterable
        Passed to ``ingest_data``.
    output_file : str, default 'modeling_dataset.csv'
        Destination of the final frame (written without the index).

    Returns
    -------
    pandas.DataFrame
        The frame that was written to ``output_file``.
    """
    raw = ingest_data(csv_files)
    print(f"Step 1: Ingested data shape: {raw.shape}")

    cleaned = handle_missing_values(raw)
    print(f"Step 2: After handling missing values: {cleaned.shape}")

    lagged = generate_lag_features(cleaned)
    print(f"Step 3: After generating lag features: {lagged.shape}")

    featured = generate_rolling_stats(lagged)
    print(f"Step 4: After generating rolling stats: {featured.shape}")

    # Persist the modelling table, then hand it back for interactive use.
    featured.to_csv(output_file, index=False)
    print(f"Dataset saved to {output_file}")

    return featured

if __name__ == "__main__":
    # Run the full pipeline on the single input file. `transformed_df` is kept
    # at module level because the labelling cell that follows reads it.
    csv_files = ['sensor_maintenance_data.csv'] 
    transformed_df = transform_sensor_logs(csv_files)
    print("Transformation complete. Preview of final dataset:")
    # First five rows in the pipeline's sort order (Sensor_ID, then Timestamp).
    print(transformed_df.head())

Step 1: Ingested data shape: (500, 27)
Step 2: After handling missing values: (500, 27)
Step 3: After generating lag features: (500, 31)
Step 4: After generating rolling stats: (500, 37)
Dataset saved to modeling_dataset.csv
Transformation complete. Preview of final dataset:
              Timestamp Sensor_ID  Voltage (V)  Current (A)  Temperature (°C)   
0   2024-01-01 00:00:00       S_1          110          0.5                20  \
9   2024-01-01 09:00:00      S_10          119          0.9                29   
99  2024-01-05 03:00:00     S_100          119          0.9                29   
100 2024-01-05 04:00:00     S_101          110          0.5                20   
101 2024-01-05 05:00:00     S_102          111          0.6                21   

     Power (W)  Humidity (%)  Vibration (m/s²) Equipment_ID   
0         55.0            40              0.20          E_1  \
9        107.1            49              0.40         E_10   
99       107.1            49              0.40  

In [2]:
# Cut-offs: the 90th percentile of each reading over the whole dataset.
temp_threshold = transformed_df['Temperature (°C)'].quantile(0.90)
vib_threshold = transformed_df['Vibration (m/s²)'].quantile(0.90)

print("Temp threshold:", temp_threshold)
print("Vibration threshold:", vib_threshold)

# A row is labelled 1 when either reading is strictly above its cut-off.
# NOTE(review): the label is computed from the same row's readings, not from
# readings 24 hours later, and both source columns stay in the frame as
# features - confirm this matches what 'Failure_next_24h' is meant to be.
transformed_df['Failure_next_24h'] = (
    transformed_df['Temperature (°C)'].gt(temp_threshold)
    | transformed_df['Vibration (m/s²)'].gt(vib_threshold)
).astype(int)

# Class balance of the new label.
print(transformed_df['Failure_next_24h'].value_counts())


Temp threshold: 28.100000000000023
Vibration threshold: 0.4
Failure_next_24h
0    450
1     50
Name: count, dtype: int64


In [3]:
# Persist the labelled frame (index omitted); the modelling cells reload this file.
transformed_df.to_csv("modeling_dataset_with_label.csv", index=False)


In [4]:
# Class balance of the label; repeats the check made when the label was created.
print(transformed_df['Failure_next_24h'].value_counts())


Failure_next_24h
0    450
1     50
Name: count, dtype: int64


In [5]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score


In [6]:
# Reload the labelled dataset from the CSV written by the export cell.
# No date parsing is requested, so 'Timestamp' is read back as text.
df = pd.read_csv("modeling_dataset_with_label.csv")

print("Dataset shape:", df.shape)
# normalize=True reports class proportions instead of raw counts.
print(df['Failure_next_24h'].value_counts(normalize=True))


Dataset shape: (500, 38)
Failure_next_24h
0    0.9
1    0.1
Name: proportion, dtype: float64


In [7]:
target_col = 'Failure_next_24h'

# Columns excluded from the feature matrix: the two identifier columns and the label.
drop_cols = ['Timestamp', 'Sensor_ID', target_col]

# NOTE(review): the label was built by thresholding 'Temperature (°C)' and
# 'Vibration (m/s²)', and both columns are still part of X - confirm this is
# intended, since the target is then a direct function of two features.
# NOTE(review): the preview output shows string values in 'Equipment_ID'
# (e.g. 'E_1'), which is not dropped here - confirm it is encoded or removed
# before fitting a model.
X = df.drop(columns=drop_cols)
y = df[target_col]

# 80/20 split with a fixed seed; stratify=y keeps the label proportions the
# same in both parts.
# NOTE(review): rows are timestamped sensor readings and this split ignores
# time order - confirm a random split is what is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (400, 35)
Test shape: (100, 35)
