In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Load the raw CSV, normalise the headers, and add the temperature-gap feature
# in a single chain so `df` is only ever bound to the fully prepared frame.
df = (
    pd.read_csv("../data/raw/ai4i2020.csv")
    # Headers: lower-case, spaces -> underscores, square brackets stripped
    # (the code below relies on names such as "process_temperature_k").
    .rename(
        columns=lambda name: (
            name.lower()
            .replace(" ", "_")
            .replace("[", "")
            .replace("]", "")
        )
    )
    # temp_diff = process temperature minus air temperature, appended as the last column.
    .assign(
        temp_diff=lambda frame: frame["process_temperature_k"] - frame["air_temperature_k"]
    )
)

In [3]:
# Label column, plus the individual failure-type columns that are excluded
# from the features (presumably because they encode the label -- confirm
# against the dataset documentation).
target = "machine_failure"
failure_type_cols = ["twf", "hdf", "pwf", "osf", "rnf"]

# y is the label; X is every remaining column except the label and the
# failure-type columns.
y = df[target]
X = df.drop(columns=[target, *failure_type_cols])

In [4]:
# "type" is the only column that gets one-hot encoded; every other column of X
# is treated as numeric.
# NOTE(review): X still contains the identifier columns at this point (they are
# only dropped in a later cell), so they end up in numeric_features here.
categorical_features = ["type"]
numeric_features = X.columns[~X.columns.isin(categorical_features)].tolist()

In [5]:
# Hold out 25% of the rows for evaluation. The split is stratified on y so the
# class proportions match in both parts, and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

In [6]:
# Column-wise preprocessing: one-hot encode the categorical columns (dropping
# the first level of each), and pass the numeric columns through unchanged.
preprocessor = ColumnTransformer(
    [
        ("cat", OneHotEncoder(drop="first"), categorical_features),
        ("num", "passthrough", numeric_features),
    ]
)

# Preprocessing followed by a class-weighted logistic regression.
model = Pipeline(
    [
        ("preprocess", preprocessor),
        ("classifier", LogisticRegression(class_weight="balanced", max_iter=1000)),
    ]
)

The initial baseline model failed because the identifier column `product_id` contains categorical text but was incorrectly treated as a numeric feature. Identifier fields do not represent machine behavior and can introduce noise or leakage, so we remove `udi` and `product_id` and proceed with `type` as the only categorical feature to be one-hot encoded.

In [7]:
# Identifier columns that must not be used as features.
id_cols = ["udi", "product_id"]

# Rebuild the feature matrix from `df` instead of dropping from the existing
# `X`: `X = X.drop(columns=id_cols)` raises KeyError when this cell is run a
# second time because the columns are already gone. Deriving X from `df` gives
# the same columns in the same order and makes the cell idempotent. A genuinely
# missing column in `df` still raises KeyError.
X = df.drop(columns=[target, *failure_type_cols, *id_cols])

# Recompute the feature lists now that the identifiers are gone: "type" is
# one-hot encoded, every other remaining column is treated as numeric.
categorical_features = ["type"]
numeric_features = [col for col in X.columns if col not in categorical_features]

In [8]:
# Re-split after the feature matrix changed: 25% hold-out, stratified on y,
# with the same fixed seed as before.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

In [9]:
# Rebuild the preprocessing step so it picks up the current feature lists:
# one-hot encode the categorical columns (first level dropped), pass the
# numeric columns through unchanged.
preprocessor = ColumnTransformer(
    [
        ("cat", OneHotEncoder(drop="first"), categorical_features),
        ("num", "passthrough", numeric_features),
    ]
)

# Baseline classifier: preprocessing followed by a class-weighted logistic
# regression.
model = Pipeline(
    [
        ("preprocess", preprocessor),
        ("classifier", LogisticRegression(class_weight="balanced", max_iter=1000)),
    ]
)

In [10]:
# Fit the whole pipeline on the training split; the trailing semicolon hides
# the estimator repr in the cell output.
model.fit(X=X_train, y=y_train);

In [11]:
# Predict on the held-out split.
y_pred = model.predict(X_test)

# Print the per-class precision/recall/F1 report, then the confusion matrix
# (rows = true class, columns = predicted class) on the following lines.
print(
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep="\n",
)

              precision    recall  f1-score   support

           0       0.99      0.83      0.90      2415
           1       0.14      0.81      0.24        85

    accuracy                           0.83      2500
   macro avg       0.57      0.82      0.57      2500
weighted avg       0.96      0.83      0.88      2500

[[1995  420]
 [  16   69]]


The baseline logistic regression model achieves high recall for machine failures (81%: 69 of the 85 failures in the test set are caught), indicating a strong ability to identify failure events. Precision for failures is much lower (14%: 420 of the 489 predicted failures are false alarms), reflecting a tradeoff between detection and false alarms. In a manufacturing context, prioritizing recall is often preferable, as missed failures are more costly than additional preventive inspections. This model serves as a reasonable baseline for failure risk estimation.