# PROJECT: AI4I 2020 Predictive Maintenance System
# AUTHOR: [Senin Adın/Burak]
# DESCRIPTION: End-to-end Machine Learning project to predict machine failures based on sensor data using Logistic Regression and Random Forest.
# DATASET: UCI Machine Learning Repository - AI4I 2020 Predictive Maintenance

In [None]:
# 1. IMPORTING LIBRARIES
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Silence every warning category process-wide so the cell output stays readable.
import warnings
warnings.simplefilter("ignore")

print("Libraries imported successfully.")

In [None]:
# 2. DATA LOADING
# Fetching dataset from UCI Repository
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00601/ai4i2020.csv"
try:
    df = pd.read_csv(url)
except Exception as e:
    # Report the problem, then re-raise: every later step needs `df`, so
    # carrying on would only surface as a misleading NameError below.
    print(f"Error loading data: {e}")
    raise
else:
    # Only announce success when the read actually succeeded.
    print("Dataset loaded successfully!")

# Display first 5 rows to understand the structure
print("\nFirst 5 rows of the dataset:")
df.head()

# A) DATA PREPROCESSING

In [None]:
# Step 1: Dropping Irrelevant Columns
# 'UDI' and 'Product ID' are identifiers and do not contribute to physical failure prediction.
df_clean = df.drop(['UDI', 'Product ID'], axis=1)

# Step 2: Feature Engineering
# Creating a new feature 'Delta_Temp' representing the difference between Process and Air temperature.
# This helps in understanding heat dissipation efficiency.
df_clean['Delta_Temp'] = df_clean['Process temperature [K]'] - df_clean['Air temperature [K]']

# Step 3: Encoding Categorical Variables
# The 'Type' column (L, M, H) is categorical. An explicit ordinal mapping is used
# instead of LabelEncoder: LabelEncoder is intended for target labels and assigns
# codes alphabetically (H=0, L=1, M=2), which scrambles the L < M < H order that a
# linear model would read as a magnitude.
# NOTE(review): assumes L/M/H are the low/medium/high quality variants described
# in the UCI dataset documentation - confirm.
type_order = {'L': 0, 'M': 1, 'H': 2}
encoded_type = df_clean['Type'].map(type_order)
if encoded_type.isna().any():
    # Fail loudly on missing or unexpected categories rather than passing NaN to the models.
    unknown_types = sorted(df_clean.loc[encoded_type.isna(), 'Type'].astype(str).unique())
    raise ValueError(f"Unexpected values in 'Type' column: {unknown_types}")
df_clean['Type'] = encoded_type.astype(int)

# Step 4: Preventing Data Leakage
# Columns like 'TWF', 'HDF', 'PWF', 'OSF', 'RNF' represent specific failure types.
# Including them would cause data leakage as they directly indicate the target.
# We remove them to predict 'Machine failure' solely based on sensor readings.
X = df_clean.drop(['Machine failure', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF'], axis=1)
y = df_clean['Machine failure']

print("\nData Preprocessing completed. Ready for EDA and Modeling.")

# B) EXPLORATORY DATA ANALYSIS (EDA)

In [None]:
# 1. Correlation Matrix Heatmap
# Pairwise correlations of every column in df_clean, drawn on an explicit Axes.
corr_values = df_clean.corr()
corr_fig, corr_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_values, annot=True, fmt=".2f", cmap='coolwarm', ax=corr_ax)
corr_ax.set_title('Correlation Matrix of Features')
corr_fig.savefig('correlation_matrix.png')  # kept on disk for the report
plt.show()

# 2. Boxplot: Torque Distribution by Machine Failure Status
# One box per value of 'Machine failure', showing the spread of 'Torque [Nm]'.
torque_fig, torque_ax = plt.subplots(figsize=(8, 5))
sns.boxplot(
    data=df_clean,
    x='Machine failure',
    y='Torque [Nm]',
    palette='Set2',
    ax=torque_ax,
)
torque_ax.set_title('Torque Distribution by Machine Failure Status')
torque_ax.set_xlabel('Machine Failure (0: No Failure, 1: Failure)')
torque_ax.set_ylabel('Torque [Nm]')
torque_fig.savefig('boxplot_torque.png')  # kept on disk for the report
plt.show()

# C) MODELING & EVALUATION

In [None]:
# 1. Train/Test Split (70% Training, 30% Testing)
# Stratify on the target so the train and test sets keep the same failure ratio.
# NOTE(review): failures are presumably a small minority class, in which case an
# unstratified split can skew that ratio - confirm with y.value_counts().
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# 2. Feature Scaling
# Standardization is crucial for Logistic Regression to perform well.
# The scaler is fitted on the training split only and then applied to the test
# split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Model 1: Logistic Regression ---
print("\n--- Training Logistic Regression ---")
log_model = LogisticRegression()
log_model.fit(X_train_scaled, y_train)
y_pred_log = log_model.predict(X_test_scaled)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_log))
# Per-class precision/recall, matching what is reported for the Random Forest model;
# accuracy alone says little about how well the failure class is detected.
print("\nClassification Report:\n", classification_report(y_test, y_pred_log))

In [None]:
# --- Model 2: Random Forest Classifier ---
# Trained on the unscaled training split and evaluated on the unscaled test split.
print("\n--- Training Random Forest Classifier ---")
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Evaluation Metrics
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)
print("\nClassification Report:\n", rf_report)

# Visualizing Confusion Matrix for Random Forest
# Rows are the actual labels, columns the predicted labels.
cm = confusion_matrix(y_test, y_pred_rf)
cm_fig, cm_ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
cm_ax.set_title('Confusion Matrix (Random Forest)')
cm_ax.set_ylabel('Actual Label')
cm_ax.set_xlabel('Predicted Label')
cm_fig.savefig('confusion_matrix_rf.png')
plt.show()

In [None]:
# Feature Importance Analysis
# Pair each importance score from the fitted forest with its column name,
# then rank from most to least important.
raw_importances = pd.Series(rf_model.feature_importances_, index=X.columns)
feature_imp = raw_importances.sort_values(ascending=False)

imp_fig, imp_ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=feature_imp, y=feature_imp.index, palette='viridis', ax=imp_ax)
imp_ax.set_title('Feature Importance (Random Forest)')
imp_ax.set_xlabel('Importance Score')
imp_ax.set_ylabel('Features')
imp_fig.savefig('feature_importance.png')
plt.show()

print("\nProject execution completed successfully.")