In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


#load Dataset

In [2]:
data = pd.read_csv("/content/predictive_maintenance.csv")

In [3]:
data.head(10)

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,No Failure
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,No Failure
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,No Failure
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,No Failure
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,No Failure
5,6,M14865,M,298.1,308.6,1425,41.9,11,0,No Failure
6,7,L47186,L,298.1,308.6,1558,42.4,14,0,No Failure
7,8,L47187,L,298.1,308.6,1527,40.2,16,0,No Failure
8,9,M14868,M,298.3,308.7,1667,28.6,18,0,No Failure
9,10,M14869,M,298.5,309.0,1741,28.0,21,0,No Failure


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   UDI                      10000 non-null  int64  
 1   Product ID               10000 non-null  object 
 2   Type                     10000 non-null  object 
 3   Air temperature [K]      10000 non-null  float64
 4   Process temperature [K]  10000 non-null  float64
 5   Rotational speed [rpm]   10000 non-null  int64  
 6   Torque [Nm]              10000 non-null  float64
 7   Tool wear [min]          10000 non-null  int64  
 8   Target                   10000 non-null  int64  
 9   Failure Type             10000 non-null  object 
dtypes: float64(3), int64(4), object(3)
memory usage: 781.4+ KB


In [5]:
print(data.describe())

               UDI  Air temperature [K]  Process temperature [K]  \
count  10000.00000         10000.000000             10000.000000   
mean    5000.50000           300.004930               310.005560   
std     2886.89568             2.000259                 1.483734   
min        1.00000           295.300000               305.700000   
25%     2500.75000           298.300000               308.800000   
50%     5000.50000           300.100000               310.100000   
75%     7500.25000           301.500000               311.100000   
max    10000.00000           304.500000               313.800000   

       Rotational speed [rpm]   Torque [Nm]  Tool wear [min]        Target  
count            10000.000000  10000.000000     10000.000000  10000.000000  
mean              1538.776100     39.986910       107.951000      0.033900  
std                179.284096      9.968934        63.654147      0.180981  
min               1168.000000      3.800000         0.000000      0.000000  
25

In [6]:
print(data.isnull().sum())

UDI                        0
Product ID                 0
Type                       0
Air temperature [K]        0
Process temperature [K]    0
Rotational speed [rpm]     0
Torque [Nm]                0
Tool wear [min]            0
Target                     0
Failure Type               0
dtype: int64


In [7]:
data.duplicated().sum()

0

In [8]:
print(data.dtypes)

UDI                          int64
Product ID                  object
Type                        object
Air temperature [K]        float64
Process temperature [K]    float64
Rotational speed [rpm]       int64
Torque [Nm]                float64
Tool wear [min]              int64
Target                       int64
Failure Type                object
dtype: object


#Extract All Categorical Features


In [9]:
numerical_features = data.select_dtypes(include=['float64', 'int64']).columns
categorical_features = data.select_dtypes(include=['object']).columns


In [10]:
numerical_features

Index(['UDI', 'Air temperature [K]', 'Process temperature [K]',
       'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]', 'Target'],
      dtype='object')

In [11]:
categorical_features

Index(['Product ID', 'Type', 'Failure Type'], dtype='object')

# Take Copy From Original Data

In [12]:
df = data.copy()

# One Hot Encoding


In [13]:
encoder = OneHotEncoder()
encoded_features = encoder.fit_transform(data[categorical_features])
encoded_df = pd.DataFrame(encoded_features.toarray(), columns=encoder.get_feature_names_out(categorical_features))
data.drop(columns=categorical_features, inplace=True)
data = pd.concat([data, encoded_df], axis=1)

In [14]:
data

Unnamed: 0,UDI,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Product ID_H29424,Product ID_H29425,Product ID_H29432,...,Product ID_M24859,Type_H,Type_L,Type_M,Failure Type_Heat Dissipation Failure,Failure Type_No Failure,Failure Type_Overstrain Failure,Failure Type_Power Failure,Failure Type_Random Failures,Failure Type_Tool Wear Failure
0,1,298.1,308.6,1551,42.8,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,2,298.2,308.7,1408,46.3,3,0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,3,298.1,308.5,1498,49.4,5,0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,4,298.2,308.6,1433,39.5,7,0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,5,298.2,308.7,1408,40.0,9,0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,298.8,308.4,1604,29.5,14,0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
9996,9997,298.9,308.4,1632,31.8,17,0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9997,9998,299.0,308.6,1645,33.4,22,0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
9998,9999,299.0,308.7,1408,48.5,25,0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


# Remove Target column (Predicted column )

In [15]:
X = data.drop(columns=["Target"])
y = data["Target"]


In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Naive Bayes with Leakage

In [17]:
naive_bayes_model = GaussianNB()
naive_bayes_model.fit(X_train, y_train)

In [18]:
nb_predictions = naive_bayes_model.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_predictions)
print(f"Naive Bayes Accuracy: {nb_accuracy}")
print("Naive Bayes Classification Report:")
print(classification_report(y_test, nb_predictions))

Naive Bayes Accuracy: 0.9955
Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1939
           1       0.89      0.97      0.93        61

    accuracy                           1.00      2000
   macro avg       0.95      0.98      0.96      2000
weighted avg       1.00      1.00      1.00      2000



# Logestic Regression with Leakage

In [19]:
log_reg_model = LogisticRegression(max_iter=1000)
log_reg_model.fit(X_train, y_train)

# Make predictions and evaluate
log_reg_predictions = log_reg_model.predict(X_test)
log_reg_accuracy = accuracy_score(y_test, log_reg_predictions)
print(f"Logistic Regression Accuracy: {log_reg_accuracy}")
print("Logistic Regression Classification Report:")
print(classification_report(y_test, log_reg_predictions))

Logistic Regression Accuracy: 0.999
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1939
           1       1.00      0.97      0.98        61

    accuracy                           1.00      2000
   macro avg       1.00      0.98      0.99      2000
weighted avg       1.00      1.00      1.00      2000



In [20]:
df

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,No Failure
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,No Failure
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,No Failure
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,No Failure
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,No Failure
...,...,...,...,...,...,...,...,...,...,...
9995,9996,M24855,M,298.8,308.4,1604,29.5,14,0,No Failure
9996,9997,H39410,H,298.9,308.4,1632,31.8,17,0,No Failure
9997,9998,M24857,M,299.0,308.6,1645,33.4,22,0,No Failure
9998,9999,H39412,H,299.0,308.7,1408,48.5,25,0,No Failure


In [21]:
numerical_features

Index(['UDI', 'Air temperature [K]', 'Process temperature [K]',
       'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]', 'Target'],
      dtype='object')

In [22]:
categorical_features

Index(['Product ID', 'Type', 'Failure Type'], dtype='object')

In [23]:
encoder = OneHotEncoder()
encoded_features = encoder.fit_transform(df[categorical_features])
encoded_df = pd.DataFrame(encoded_features.toarray(), columns=encoder.get_feature_names_out(categorical_features))
df.drop(columns=categorical_features, inplace=True)
df = pd.concat([df, encoded_df], axis=1)

In [24]:
df

Unnamed: 0,UDI,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Product ID_H29424,Product ID_H29425,Product ID_H29432,...,Product ID_M24859,Type_H,Type_L,Type_M,Failure Type_Heat Dissipation Failure,Failure Type_No Failure,Failure Type_Overstrain Failure,Failure Type_Power Failure,Failure Type_Random Failures,Failure Type_Tool Wear Failure
0,1,298.1,308.6,1551,42.8,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,2,298.2,308.7,1408,46.3,3,0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,3,298.1,308.5,1498,49.4,5,0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,4,298.2,308.6,1433,39.5,7,0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,5,298.2,308.7,1408,40.0,9,0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,298.8,308.4,1604,29.5,14,0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
9996,9997,298.9,308.4,1632,31.8,17,0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9997,9998,299.0,308.6,1645,33.4,22,0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
9998,9999,299.0,308.7,1408,48.5,25,0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [25]:
print("Data Types of Columns:")
print(df.dtypes)

Data Types of Columns:
UDI                                  int64
Air temperature [K]                float64
Process temperature [K]            float64
Rotational speed [rpm]               int64
Torque [Nm]                        float64
                                    ...   
Failure Type_No Failure            float64
Failure Type_Overstrain Failure    float64
Failure Type_Power Failure         float64
Failure Type_Random Failures       float64
Failure Type_Tool Wear Failure     float64
Length: 10016, dtype: object


In [26]:
df.columns


Index(['UDI', 'Air temperature [K]', 'Process temperature [K]',
       'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]', 'Target',
       'Product ID_H29424', 'Product ID_H29425', 'Product ID_H29432',
       ...
       'Product ID_M24859', 'Type_H', 'Type_L', 'Type_M',
       'Failure Type_Heat Dissipation Failure', 'Failure Type_No Failure',
       'Failure Type_Overstrain Failure', 'Failure Type_Power Failure',
       'Failure Type_Random Failures', 'Failure Type_Tool Wear Failure'],
      dtype='object', length=10016)

In [27]:
# numeric_columns = df.select_dtypes(include=['number']).columns

# # Step 3: Compute the correlation matrix with numeric columns only
# correlation_matrix = df[numeric_columns].corr()

# # Display the correlation matrix
# print("Correlation Matrix:")
# print(correlation_matrix)

# # Step 4: Create a heatmap to visualize the correlation matrix
# plt.figure(figsize=(10, 8))
# sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
# plt.title('Correlation Matrix Heatmap')
# plt.show()

In [28]:
columns_to_drop = [
    'Failure Type_Heat Dissipation Failure', 'Failure Type_No Failure',
       'Failure Type_Overstrain Failure', 'Failure Type_Power Failure',
       'Failure Type_Random Failures', 'Failure Type_Tool Wear Failure'
]


In [29]:
df.drop(columns=columns_to_drop, inplace=True)

In [30]:
df.columns

Index(['UDI', 'Air temperature [K]', 'Process temperature [K]',
       'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]', 'Target',
       'Product ID_H29424', 'Product ID_H29425', 'Product ID_H29432',
       ...
       'Product ID_M24846', 'Product ID_M24847', 'Product ID_M24849',
       'Product ID_M24851', 'Product ID_M24855', 'Product ID_M24857',
       'Product ID_M24859', 'Type_H', 'Type_L', 'Type_M'],
      dtype='object', length=10010)

In [31]:
X_df= df.drop(columns=["Target"])
y_df= df["Target"]

In [32]:
X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(X_df,y_df, test_size=0.2, random_state=42)


In [33]:
naive_bayes_model_df= GaussianNB()
naive_bayes_model_df.fit(X_train_df, y_train_df)

In [34]:
nb_predictions_df= naive_bayes_model_df.predict(X_test_df)
nb_accuracy_df= accuracy_score(y_test_df, nb_predictions_df)
print(f"Naive Bayes Accuracy: {nb_accuracy_df}")
print("Naive Bayes Classification Report:")
print(classification_report(y_test_df, nb_predictions_df))

Naive Bayes Accuracy: 0.046
Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.02      0.03      1939
           1       0.03      1.00      0.06        61

    accuracy                           0.05      2000
   macro avg       0.52      0.51      0.05      2000
weighted avg       0.97      0.05      0.03      2000



In [35]:
log_reg_model_df = LogisticRegression(max_iter=1000)
log_reg_model_df.fit(X_train_df, y_train_df)

# Make predictions and evaluate
log_reg_predictions_df= log_reg_model_df.predict(X_test_df)
log_reg_accuracy_df= accuracy_score(y_test_df, log_reg_predictions_df)
print(f"Logistic Regression Accuracy: {log_reg_accuracy_df}")
print("Logistic Regression Classification Report:")
print(classification_report(y_test_df, log_reg_predictions_df))

Logistic Regression Accuracy: 0.9735
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      1939
           1       0.64      0.30      0.40        61

    accuracy                           0.97      2000
   macro avg       0.81      0.64      0.70      2000
weighted avg       0.97      0.97      0.97      2000

