# Preprocessing & Target Creation - Failure in 24 Hours

**Goal:** Create the target variable `failure_in_24h`, which is 1 if the machine fails within the next 24 records and 0 otherwise. Each record is treated as one hour, so 24 records simulate 24 hours.

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# Look-ahead window, in rows, used when building the target. The notebook
# treats 24 consecutive records as "24 hours"; change this to use another window.
HORIZON = 24

In [None]:
# Read the raw CSV and put the rows in ascending `UDI` order with a fresh
# 0..n-1 index, so that "next rows" later on means "following positions".
df = (
    pd.read_csv('archive/ai4i2020.csv')
    .sort_values('UDI')
    .reset_index(drop=True)
)
print(f"Shape: {df.shape}")
# Last expression of the cell: renders a preview of the first rows.
df.head()

## Create Target: failure_in_24h

For each row `i`: target = 1 if ANY failure occurs in rows `i+1` to `i+HORIZON` (inclusive). Row `i` itself is not counted.

In [None]:
def create_failure_in_horizon(df: pd.DataFrame, horizon: int = 24) -> pd.Series:
    """Label each row with whether a failure occurs in the rows that follow it.

    For row ``i`` the label is 1 when any of rows ``i+1`` .. ``i+horizon``
    (inclusive) has a truthy ``'Machine failure'`` value, otherwise 0.
    Row ``i`` itself is not part of its own window.

    The last ``horizon`` rows do not have a complete look-ahead window and
    are always labelled 0, regardless of what the remaining rows contain;
    callers should discard those rows before using the labels.

    Args:
        df: Frame containing a ``'Machine failure'`` column, already sorted
            in the order that defines "next".
        horizon: Number of rows to look ahead. Must be >= 0; with 0 every
            label is 0 because the window is empty.

    Returns:
        Integer Series of 0/1 labels sharing ``df.index``.

    Raises:
        ValueError: If ``horizon`` is negative.
    """
    if horizon < 0:
        raise ValueError(f"horizon must be non-negative, got {horizon}")

    # Truthiness of each flag, matching what np.any() would see per element.
    failed = df['Machine failure'].to_numpy().astype(bool)
    n = len(failed)
    target = np.zeros(n, dtype=int)

    # Only rows with a full window of `horizon` following rows get a label.
    n_labelled = n - horizon
    if horizon > 0 and n_labelled > 0:
        # running[k] = number of failures in rows 0..k-1, so the count of
        # failures in rows i+1..i+horizon is running[i+1+horizon] - running[i+1].
        running = np.concatenate(([0], np.cumsum(failed)))
        window_counts = running[horizon + 1:] - running[1:n_labelled + 1]
        target[:n_labelled] = window_counts > 0

    return pd.Series(target, index=df.index)

# Build the look-ahead labels once, then attach them as the target column.
horizon_labels = create_failure_in_horizon(df, HORIZON)
df['failure_in_24h'] = horizon_labels

# Report the class balance: counts per label, then the share of positives.
print("Target distribution:")
print(df['failure_in_24h'].value_counts())
print(f"\nPositive rate: {df['failure_in_24h'].mean()*100:.2f}%")

In [None]:
# Drop the last HORIZON rows: they have no complete look-ahead window, so
# there is no future data to derive their target from.
# An explicit stop index is used instead of `iloc[:-HORIZON]` because with
# HORIZON = 0 the slice `[:-0]` equals `[:0]` and would silently drop every row.
n_keep = max(len(df) - HORIZON, 0)
df_trimmed = df.iloc[:n_keep].copy()
print(f"Rows after trimming: {len(df_trimmed)}")
print(df_trimmed['failure_in_24h'].value_counts())

## Feature Engineering

In [None]:
# Engineered features built from the raw sensor columns (domain knowledge).
torque_nm = df_trimmed['Torque [Nm]']
speed_rpm = df_trimmed['Rotational speed [rpm]']

# Torque times rotational speed; the factor 2*pi/60 converts rpm to rad/s.
df_trimmed['Power_W'] = torque_nm * speed_rpm * (2 * np.pi / 60)

# How far the process temperature sits above the air temperature.
df_trimmed['Temp_diff_K'] = (
    df_trimmed['Process temperature [K]'] - df_trimmed['Air temperature [K]']
)

# Product of tool wear and torque.
df_trimmed['Overstrain_proxy'] = df_trimmed['Tool wear [min]'] * torque_nm

In [None]:
# Integer codes for the `Type` letters: L -> 0, M -> 1, H -> 2.
# Any value not listed in the mapping ends up as NaN after `.map`.
type_codes = {'L': 0, 'M': 1, 'H': 2}
df_trimmed['Type_encoded'] = df_trimmed['Type'].map(type_codes)
# Last expression of the cell: show each distinct (Type, Type_encoded) pair.
df_trimmed[['Type', 'Type_encoded']].drop_duplicates()

In [None]:
# Model inputs: the raw sensor readings first, then the engineered columns.
raw_feature_cols = [
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]',
]
engineered_feature_cols = [
    'Power_W',
    'Temp_diff_K',
    'Overstrain_proxy',
    'Type_encoded',
]
feature_cols = raw_feature_cols + engineered_feature_cols
target_col = 'failure_in_24h'

# Feature matrix and label vector, both taken from the trimmed frame.
X = df_trimmed.loc[:, feature_cols]
y = df_trimmed.loc[:, target_col]

print("Features:", feature_cols)
print("\nX shape:", X.shape)
print("y shape:", y.shape)

## Train/Test Split (Time-Based)

Use the first 80% of rows for training and the last 20% for testing, with no shuffling, so that temporal order is preserved.

In [None]:
# Positional cut: rows before split_idx form the training set, the rest the
# test set. Nothing is shuffled, so row order is kept.
# NOTE(review): the target looks ahead over following rows, so labels of the
# last training rows may depend on rows that belong to the test set - confirm
# whether a gap between the two sets is wanted.
split_idx = int(0.8 * len(df_trimmed))
train_rows = slice(None, split_idx)
test_rows = slice(split_idx, None)

X_train, y_train = X.iloc[train_rows], y.iloc[train_rows]
X_test, y_test = X.iloc[test_rows], y.iloc[test_rows]

print(f"Train: {len(X_train)} samples, {y_train.sum()} positives ({y_train.mean()*100:.2f}%)")
print(f"Test:  {len(X_test)} samples, {y_test.sum()} positives ({y_test.mean()*100:.2f}%)")

In [None]:
import json

# Write the processed frame, the train/test splits and the feature list under
# data/ so the modeling notebook can load them.
Path('data').mkdir(exist_ok=True)

# Target CSV path -> object to write; every file is written without the index.
csv_outputs = {
    'data/processed_data.csv': df_trimmed,
    'data/X_train.csv': X_train,
    'data/X_test.csv': X_test,
    'data/y_train.csv': y_train,
    'data/y_test.csv': y_test,
}
for csv_path, frame in csv_outputs.items():
    frame.to_csv(csv_path, index=False)

# Feature names, in model-input order, stored as a JSON list.
Path('data/feature_cols.json').write_text(json.dumps(feature_cols))

print("Saved to data/")
for summary_line in (
    "  - processed_data.csv",
    "  - X_train.csv, X_test.csv",
    "  - y_train.csv, y_test.csv",
    "  - feature_cols.json",
):
    print(summary_line)