In [1]:
import pandas as pd

In [2]:
# Load the drilling dataset from a CSV file located next to the notebook.
DATA_PATH = "XAI_Drilling_Dataset.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# Overview of the raw frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        20000 non-null  int64  
 1   Cutting speed vc [m/min]  20000 non-null  float64
 2   Spindle speed n [1/min]   20000 non-null  int64  
 3   Feed f [mm/rev]           20000 non-null  float64
 4   Feed rate vf [mm/min]     20000 non-null  int64  
 5   Power Pc [kW]             20000 non-null  float64
 6   Cooling [%]               20000 non-null  int64  
 7   Material                  20000 non-null  object 
 8   Drill Bit Type            20000 non-null  object 
 9   Process Time [sec]        20000 non-null  float64
 10  Main Failure              20000 non-null  int64  
 11  BEF                       20000 non-null  int64  
 12  CCF                       20000 non-null  int64  
 13  FWF                       20000 non-null  int64  
 14  WDF   

I will drop the last four columns (BEF, CCF, FWF, WDF) because the model only classifies whether a failure occurs or not; the specific type of failure is not of interest here. The ID column is dropped as well.

In [5]:
# List all column names of the raw frame.
df.columns

Index(['ID', 'Cutting speed vc [m/min]', 'Spindle speed n [1/min]',
       'Feed f [mm/rev]', 'Feed rate vf [mm/min]', 'Power Pc [kW]',
       'Cooling [%]', 'Material', 'Drill Bit Type', 'Process Time [sec]',
       'Main Failure', 'BEF', 'CCF', 'FWF', 'WDF'],
      dtype='object')

In [7]:
# Remove the four per-failure-type flag columns and the ID column.
columns_to_drop = ["BEF", "CCF", "FWF", "WDF", "ID"]
df = df.drop(columns=columns_to_drop)

In [8]:
# Re-check the remaining columns and their dtypes after the drop.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Cutting speed vc [m/min]  20000 non-null  float64
 1   Spindle speed n [1/min]   20000 non-null  int64  
 2   Feed f [mm/rev]           20000 non-null  float64
 3   Feed rate vf [mm/min]     20000 non-null  int64  
 4   Power Pc [kW]             20000 non-null  float64
 5   Cooling [%]               20000 non-null  int64  
 6   Material                  20000 non-null  object 
 7   Drill Bit Type            20000 non-null  object 
 8   Process Time [sec]        20000 non-null  float64
 9   Main Failure              20000 non-null  int64  
dtypes: float64(4), int64(4), object(2)
memory usage: 1.5+ MB


In [9]:
# Count the samples in each class of the target column.
df["Main Failure"].value_counts()

Main Failure
0    18999
1     1001
Name: count, dtype: int64

The dataset is imbalanced: only 1,001 of the 20,000 samples (about 5%) are failures.

In [11]:
# Distinct category labels in the "Material" column.
df["Material"].unique()

array(['N', 'P', 'K'], dtype=object)

In [12]:
# Distinct category labels in the "Drill Bit Type" column.
df["Drill Bit Type"].unique()

array(['W', 'N', 'H'], dtype=object)

In [14]:
# One-hot encode the two categorical columns in a single call.
# Each listed column is removed and replaced by indicator columns named
# "<column>_<category>", appended after the remaining columns in the order
# given here. This avoids mutating the frame with inplace=True and
# re-concatenating it once per column.
df = pd.get_dummies(df, columns=["Material", "Drill Bit Type"])




In [15]:
# Column names after one-hot encoding the categorical columns.
df.columns

Index(['Cutting speed vc [m/min]', 'Spindle speed n [1/min]',
       'Feed f [mm/rev]', 'Feed rate vf [mm/min]', 'Power Pc [kW]',
       'Cooling [%]', 'Process Time [sec]', 'Main Failure', 'Material_K',
       'Material_N', 'Material_P', 'Drill Bit Type_H', 'Drill Bit Type_N',
       'Drill Bit Type_W'],
      dtype='object')

In [16]:
# Preview the first two rows of the encoded frame.
df.head(2)

Unnamed: 0,Cutting speed vc [m/min],Spindle speed n [1/min],Feed f [mm/rev],Feed rate vf [mm/min],Power Pc [kW],Cooling [%],Process Time [sec],Main Failure,Material_K,Material_N,Material_P,Drill Bit Type_H,Drill Bit Type_N,Drill Bit Type_W
0,29.92,794,0.218,173,194.87,75,19.26,0,False,True,False,False,False,True
1,29.85,792,0.241,191,214.71,75,19.55,0,False,True,False,False,False,True


In [20]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [21]:
# Separate the target column from the feature matrix.
target_col = "Main Failure"
y = df[target_col]
X = df.drop(columns=[target_col])

In [22]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified, so the class ratio in train and
# test may differ slightly on this imbalanced target - consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=12,
)


In [23]:
# (rows, columns) of the training feature matrix.
X_train.shape

(14000, 13)

In [24]:
# Class counts in the training labels before any resampling.
y_train.value_counts()

Main Failure
0    13285
1      715
Name: count, dtype: int64

In [25]:
from imblearn.over_sampling import RandomOverSampler

# Rebalance the training split only; the test split is left as it is.
# The fixed seed keeps the resampling repeatable.
oversampler = RandomOverSampler(sampling_strategy="auto", random_state=42)
X_train_oversampled, y_train_oversampled = oversampler.fit_resample(X_train, y_train)

In [26]:
# Class counts in the training labels after oversampling.
y_train_oversampled.value_counts()

Main Failure
0    13285
1    13285
Name: count, dtype: int64

In [27]:
# Learn the scaling parameters from the oversampled training features only,
# then apply that same fitted transform to both the training and test features.
scaler = StandardScaler().fit(X_train_oversampled)
X_Trained_oversampled_scaled = scaler.transform(X_train_oversampled)
X_test_scaled = scaler.transform(X_test)

In [28]:
# Display the scaled training feature array.
X_Trained_oversampled_scaled

array([[ 1.23846823,  1.23740475,  0.42316928, ..., -0.66064299,
        -0.73881387,  1.38382633],
       [ 1.20933934,  1.20812195,  0.54756282, ..., -0.66064299,
        -0.73881387,  1.38382633],
       [ 1.19768778,  1.20080125, -0.12416225, ..., -0.66064299,
        -0.73881387,  1.38382633],
       ...,
       [-1.32099037, -1.32483985, -0.2983132 , ...,  1.51367686,
        -0.73881387, -0.72263403],
       [ 1.23846823,  1.23740475,  0.32365446, ...,  1.51367686,
        -0.73881387, -0.72263403],
       [-1.23748755, -1.23699146,  0.14950352, ...,  1.51367686,
        -0.73881387, -0.72263403]])

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [30]:
# Logistic regression classifier, constructed with no arguments (library defaults).
model = LogisticRegression()

In [37]:
# Train on the oversampled, scaled training data.
model.fit(X_Trained_oversampled_scaled, y_train_oversampled)

# Predict labels for the scaled test features.
y_pred = model.predict(X_test_scaled)

# Score the predictions against the true test labels with each metric in turn.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print(f" accuracy = {accuracy} and precision={precision} and recall = {recall} and f1 = {f1}")

 accuracy = 0.9343333333333333 and precision=0.40217391304347827 and recall = 0.7762237762237763 and f1 = 0.5298329355608592
