In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
data=pd.read_csv("/content/ai4i2020.csv")

In [4]:
data.head(5)

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   UDI                      10000 non-null  int64  
 1   Product ID               10000 non-null  object 
 2   Type                     10000 non-null  object 
 3   Air temperature [K]      10000 non-null  float64
 4   Process temperature [K]  10000 non-null  float64
 5   Rotational speed [rpm]   10000 non-null  int64  
 6   Torque [Nm]              10000 non-null  float64
 7   Tool wear [min]          10000 non-null  int64  
 8   Machine failure          10000 non-null  int64  
 9   TWF                      10000 non-null  int64  
 10  HDF                      10000 non-null  int64  
 11  PWF                      10000 non-null  int64  
 12  OSF                      10000 non-null  int64  
 13  RNF                      10000 non-null  int64  
dtypes: float64(3), int64(9)

In [7]:
data.drop(columns=["UDI","Product ID","TWF","HDF","PWF","OSF","RNF"],inplace=True)

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Type                     10000 non-null  object 
 1   Air temperature [K]      10000 non-null  float64
 2   Process temperature [K]  10000 non-null  float64
 3   Rotational speed [rpm]   10000 non-null  int64  
 4   Torque [Nm]              10000 non-null  float64
 5   Tool wear [min]          10000 non-null  int64  
 6   Machine failure          10000 non-null  int64  
dtypes: float64(3), int64(3), object(1)
memory usage: 547.0+ KB


In [9]:
data.head(10)

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure
0,M,298.1,308.6,1551,42.8,0,0
1,L,298.2,308.7,1408,46.3,3,0
2,L,298.1,308.5,1498,49.4,5,0
3,L,298.2,308.6,1433,39.5,7,0
4,L,298.2,308.7,1408,40.0,9,0
5,M,298.1,308.6,1425,41.9,11,0
6,L,298.1,308.6,1558,42.4,14,0
7,L,298.1,308.6,1527,40.2,16,0
8,M,298.3,308.7,1667,28.6,18,0
9,M,298.5,309.0,1741,28.0,21,0


In [10]:
data["Type"].unique()

array(['M', 'L', 'H'], dtype=object)

In [11]:
df_one_hot = pd.get_dummies(data, columns=['Type'], prefix='Type')


In [13]:
numerical_columns = ['Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]','Torque [Nm]','Tool wear [min]']

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the selected columns
df_one_hot[numerical_columns] = scaler.fit_transform(df_one_hot[numerical_columns])

In [14]:
df_one_hot.head(10)

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,Type_H,Type_L,Type_M
0,-0.952389,-0.94736,0.068185,0.2822,-1.695984,0,0,0,1
1,-0.902393,-0.879959,-0.729472,0.633308,-1.648852,0,0,1,0
2,-0.952389,-1.014761,-0.22745,0.94429,-1.61743,0,0,1,0
3,-0.902393,-0.94736,-0.590021,-0.048845,-1.586009,0,0,1,0
4,-0.902393,-0.879959,-0.729472,0.001313,-1.554588,0,0,1,0
5,-0.952389,-0.94736,-0.634645,0.191915,-1.523166,0,0,0,1
6,-0.952389,-0.94736,0.107231,0.242073,-1.476034,0,0,1,0
7,-0.952389,-0.94736,-0.065687,0.021376,-1.444613,0,0,1,0
8,-0.852397,-0.879959,0.715235,-1.142297,-1.413191,0,0,0,1
9,-0.752405,-0.677756,1.128009,-1.202487,-1.366059,0,0,0,1


In [16]:
X = df_one_hot.drop('Machine failure', axis=1)
y =df_one_hot['Machine failure']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the resulting sets
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (8000, 8) (8000,)
Testing set shape: (2000, 8) (2000,)


In [19]:
model = LogisticRegression(random_state=42)

In [20]:
model.fit(X_train, y_train)

In [21]:
# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the performance of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on the test set: {accuracy:.2f}")

# Additional evaluation metrics
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy on the test set: 0.97

Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1939
           1       0.67      0.26      0.38        61

    accuracy                           0.97      2000
   macro avg       0.82      0.63      0.68      2000
weighted avg       0.97      0.97      0.97      2000


Confusion Matrix:
[[1931    8]
 [  45   16]]
