In [27]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

In [28]:
# Read the equipment-failure CSV into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer="equipment_failure_dataset.csv")
df.head(n=5)


Unnamed: 0,equipment_age,capacity,hours_used_per_week,last_maintenance_months,maintenance_type,equipment_type_Microphone,equipment_type_Projector,equipment_type_Smart Board,failure_occurred
0,4,60,10.933628,4,0,False,True,False,0
1,4,60,10.933628,4,0,False,True,False,1
2,4,60,10.933628,15,1,False,True,False,0
3,4,60,10.933628,15,1,False,True,False,1
4,2,60,27.350395,16,0,False,False,True,0


In [29]:
# The label column is "failure_occurred"; every other column is used as a predictor.
y = df["failure_occurred"]
X = df.drop("failure_occurred", axis=1)

# Hold out 20% of the rows for evaluation. The split is seeded for repeatability
# and stratified on y so both partitions keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Report the relative frequency of each label in the training split
print("Class distribution: ", y_train.value_counts(normalize=True), sep="\n")


Class distribution:
failure_occurred
0    0.685185
1    0.314815
Name: proportion, dtype: float64


In [31]:
# Helper function to evaluate models

def evaluate_model(model, X_test, y_test):
    """Score an already-fitted classifier on held-out data.

    Parameters
    ----------
    model
        Any object exposing ``predict(X)`` (estimator or pipeline).
    X_test
        Features passed unchanged to ``model.predict``.
    y_test
        True labels aligned with ``X_test``.

    Returns
    -------
    dict
        Maps ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1_score"``
        to the corresponding scikit-learn metric computed with its default
        averaging settings.
    """
    y_pred = model.predict(X_test)
    return {
        "accuracy": accuracy_score(y_test, y_pred),
        # zero_division=0 makes the degenerate case explicit: if the model
        # predicts no positives (or y_test has none) the metric is reported
        # as 0.0 instead of raising UndefinedMetricWarning. The numeric
        # result is the same as the library default.
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred, zero_division=0),
        "f1_score": f1_score(y_test, y_pred, zero_division=0),
    }



In [32]:
# Logistic Regression

# Scaling and the classifier are chained in one pipeline, so the scaler is
# fitted only on the data handed to fit() and reused as-is at predict time.
lr_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", LogisticRegression(max_iter=1000)),
    ]
).fit(X_train, y_train)

lr_results = evaluate_model(lr_pipeline, X_test, y_test)
lr_results


{'accuracy': 0.6296296296296297,
 'precision': 0.4,
 'recall': 0.2222222222222222,
 'f1_score': 0.2857142857142857}

In [33]:
# Decision Tree

# Depth-limited tree with a fixed seed, fitted on the raw (unscaled) features
dt_model = DecisionTreeClassifier(random_state=42, max_depth=5).fit(X_train, y_train)

dt_results = evaluate_model(dt_model, X_test, y_test)
dt_results


{'accuracy': 0.5555555555555556,
 'precision': 0.2857142857142857,
 'recall': 0.2222222222222222,
 'f1_score': 0.25}

In [34]:
# Random Forest

# 100 depth-limited trees with a fixed seed, fitted on the raw (unscaled) features
rf_model = RandomForestClassifier(
    random_state=42, max_depth=6, n_estimators=100
).fit(X_train, y_train)

rf_results = evaluate_model(rf_model, X_test, y_test)
rf_results


{'accuracy': 0.48148148148148145,
 'precision': 0.2222222222222222,
 'recall': 0.2222222222222222,
 'f1_score': 0.2222222222222222}

In [35]:
# Comparing results

# One row per model, one column per metric
results_df = pd.DataFrame(
    [lr_results, dt_results, rf_results],
    index=["Logistic Regression", "Decision Tree", "Random Forest"],
)

results_df


Unnamed: 0,accuracy,precision,recall,f1_score
Logistic Regression,0.62963,0.4,0.222222,0.285714
Decision Tree,0.555556,0.285714,0.222222,0.25
Random Forest,0.481481,0.222222,0.222222,0.222222
