# Import Libraries

In [25]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_val_score

import joblib

# Import Dataset

In [2]:
# Load the dataset
df = pd.read_csv('../data/raw_data.csv')

# Data Transformation

In [3]:
# One-hot encode the 'type' column
df = pd.get_dummies(df, columns=['Type'], prefix='', prefix_sep='')

# Convert the 'product ID' to numeric by extracting the numbers and categorizing the quality
df['product_quality'] = df['Product ID'].str.extract('([LMH])')[0]
df['product_serial'] = df['Product ID'].str.extract('(\d+)').astype(int)
df = df.drop('Product ID', axis=1)

# Feature Engineering

In [9]:
# Add new features for failure modes if needed
conditions = [
    # Tool Wear Failure
    (df['Tool wear [min]'].between(200, 240)),
    # Heat Dissipation Failure
    ((df['Air temperature [K]'] - df['Process temperature [K]']) < 8.6) & (df['Rotational speed [rpm]'] < 1380),
    # Power Failure
    (df['Torque [Nm]'] * df['Rotational speed [rpm]'] / (2 * np.pi / 60)).between(3500, 9000),
    # Overstrain Failure
    (df['Tool wear [min]'] * df['Torque [Nm]'] > df['product_quality'].map({'L': 11000, 'M': 12000, 'H': 13000})),
    # Random Failure
    (np.random.rand(len(df)) < 0.001)
]
# the specified conditions that relate to different types of failures in the machinery
df['failure_mode'] = np.select(conditions, [1, 2, 3, 4, 5], default=0)


# Split Data

In [12]:
X = df.drop(['UDI', 'Machine failure', 'failure_mode'], axis=1)
y = df['Machine failure']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Prepocessing Pipeline

In [13]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = ['product_quality']

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# ML Models

In [23]:
# Define the models
models = {
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42)
}

# Train and evaluate the models
results = {}
for model_name, model in models.items():
    # Create a full pipeline with preprocessing and the model
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', model)])
    
    # Train the model
    pipeline.fit(X_train, y_train)
    
    # Evaluate the model
    y_pred = pipeline.predict(X_test)
    results[model_name] = {
        'accuracy': accuracy_score(y_test, y_pred),
        'report': classification_report(y_test, y_pred)
    }
    # Cross-validation scores
    scores = cross_val_score(pipeline, X, y, cv=5)
    results[model_name]['cross_val_accuracy'] = np.mean(scores)

# Print the results

# Accumulate model results in a list
data = []

for model_name, result in results.items():
    data.append({
        'Model': model_name,
        'Test Accuracy': result['accuracy'],
        'Cross-Validated Accuracy': result['cross_val_accuracy'],
        'Classification Report': result['report']
    })

# Convert list of results to a DataFrame
df_results = pd.DataFrame(data)

# Display the DataFrame
df_results

Unnamed: 0,Model,Test Accuracy,Cross-Validated Accuracy,Classification Report
0,RandomForest,0.999,0.9319,precision recall f1-score ...
1,GradientBoosting,0.9985,0.9855,precision recall f1-score ...
2,LogisticRegression,0.999,0.9991,precision recall f1-score ...


- Logistic Regression has the highest cross-validated accuracy, which suggests that it generalizes best to unseen data out of the three models.
- Both Logistic Regression and Random Forest have a perfect test accuracy of 0.9990, but Random Forest has a significantly lower cross-validated accuracy.

Given these results, Logistic Regression appears to be the best model in terms of generalization to new data, as indicated by its cross-validated accuracy. However, it's essential to look at the precision, recall, and f1-score for each class, especially if the classes are imbalanced or if the cost of false positives/negatives is high. If all metrics are satisfactory, Logistic Regression would be the preferred model based on this data.

# Save the Model

In [26]:
# Assuming 'LogisticRegression' has been determined to be the best model
best_model = LogisticRegression(max_iter=1000, random_state=42)

# Create a pipeline with preprocessing and the best model
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', best_model)])

# Train the model
pipeline.fit(X_train, y_train)

# Save the model to disk
joblib.dump(pipeline, '../model/LogisticRegression_model.joblib')

print("The best model has been saved to disk as 'LogisticRegression_model.joblib'.")


The best model has been saved to disk as 'LogisticRegression_model.joblib'.
