## Random Forest Classifier

### Setup and Data Preprocessing

In [93]:
# Silence every Python warning for the rest of the notebook
import warnings
warnings.filterwarnings('ignore')
# Redirect stderr to the null device for the rest of the session.
# NOTE(review): the file handle is never closed and anything written to stderr is discarded.
import sys
import os
sys.stderr = open(os.devnull, 'w')
import numpy as np; import pandas as pd; from matplotlib import pyplot as plt; import seaborn as sns  # Data processing and visualisation
from sklearn.model_selection import train_test_split  # Train-test split
from sklearn.tree import DecisionTreeClassifier  # Decision tree classifier
from sklearn.ensemble import RandomForestClassifier  # Random forest classifier
from sklearn.model_selection import GridSearchCV  # Cross-validation
from sklearn.metrics import accuracy_score, recall_score, f1_score  # Classification performance metrics

In [94]:
# Load the maintenance records, using the 'Product ID' column as the row index
data = pd.read_csv('machine_maintenance.csv', index_col = 'Product ID')
data.head()  # Preview the first rows

Unnamed: 0_level_0,Type,Air Temp [K],Process Temp [K],Rotational Speed [rpm],Torque [Nm],Tool wear [min],Failure,Failure Type
Product ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
M14860,M,298.1,308.6,1551,42.8,0,0,No Failure
L47181,L,298.2,308.7,1408,46.3,3,0,No Failure
L47182,L,298.1,308.5,1498,49.4,5,0,No Failure
L47183,L,298.2,308.6,1433,39.5,7,0,No Failure
L47184,L,298.2,308.7,1408,40.0,9,0,No Failure


In [95]:
data.shape  # (number of rows, number of columns)

(10000, 8)

There are 10000 observations and 8 columns. Of these 8 columns, 6 are predictors and 2 are targets (`'Failure'` and `'Failure Type'`).

In [96]:
data['Failure'].value_counts()  # Class counts of the binary target

Failure
0    9661
1     339
Name: count, dtype: int64

In [97]:
data['Failure Type'].value_counts()  # Class counts of the multiclass target

Failure Type
No Failure                  9652
Heat Dissipation Failure     112
Power Failure                 95
Overstrain Failure            78
Tool Wear Failure             45
Random Failures               18
Name: count, dtype: int64

Both `'Failure'` (binary) and `'Failure Type'` (multiclass) show very high class imbalance: only 339 of the 10000 observations (about 3.4%) are failures.

In [98]:
data.info()  # Column dtypes and non-null counts

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, M14860 to M24859
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Type                    10000 non-null  object 
 1   Air Temp [K]            10000 non-null  float64
 2   Process Temp [K]        10000 non-null  float64
 3   Rotational Speed [rpm]  10000 non-null  int64  
 4   Torque [Nm]             10000 non-null  float64
 5   Tool wear [min]         10000 non-null  int64  
 6   Failure                 10000 non-null  int64  
 7   Failure Type            10000 non-null  object 
dtypes: float64(3), int64(3), object(2)
memory usage: 703.1+ KB


There are no missing values in the dataset

In [99]:
# Convert the two string columns to categories, one column at a time
data['Type'] = data['Type'].astype('category')
data['Failure Type'] = data['Failure Type'].astype('category')
# One-hot encode the categorical predictor 'Type'; 'Failure Type' is left as a single column
df = pd.get_dummies(data, columns = ['Type'])

In [100]:
# Separate the predictors from the two target columns
X = df.drop(columns = ['Failure', 'Failure Type'])  # Predictors
y_b = df['Failure']  # Binary target

In [101]:
# Hold out 40% of the rows for validation, stratified on the binary target
X_train_b, X_val_b, y_train_b, y_val_b = train_test_split(
    X, y_b,
    test_size = 0.4,
    stratify = y_b,
    random_state = 0,
)
print(f'Training data size = {len(y_train_b)}\nValidation data size = {len(y_val_b)}')

Training data size = 6000
Validation data size = 4000


### Decision Tree - Binary Case

In [102]:
# Fitting a decision tree for binary case.
# random_state is fixed because scikit-learn randomly permutes the candidate features at each split,
# so an unseeded tree can differ between runs; 0 matches the seed used for the train-validation split.
dt_b = DecisionTreeClassifier(random_state = 0)
dt_b.fit(X_train_b, y_train_b)

In [103]:
# Tree characteristics: size of the fitted tree
print(f'Number of leaves = {dt_b.get_n_leaves()}\nTree depth = {dt_b.get_depth()}')

Number of leaves = 119
Tree depth = 17


In [104]:
# Predictions of the fitted tree on the training and validation splits
y_train_pred_b = dt_b.predict(X_train_b)
y_val_pred_b = dt_b.predict(X_val_b)

In [105]:
# Accuracy score, with arguments in (y_true, y_pred) order; accuracy is symmetric, so the values are unchanged
print(f'Training accuracy = {np.round(accuracy_score(y_train_b, y_train_pred_b), 2)}\n'
      f'Validation accuracy = {np.round(accuracy_score(y_val_b, y_val_pred_b), 2)}')

Training accuracy = 1.0
Validation accuracy = 0.98


Let's look at specific classes of interest in each case. For the binary case, we will take machine failure `1` as the class of interest.

In [106]:
# Sensitivity (recall of class 1), specificity (recall of class 0), F1 of class 1, and
# support-weighted average recall (which, by construction, equals the overall accuracy)
print(f'Training sensitivity = {np.round(recall_score(y_train_b, y_train_pred_b, pos_label = 1), 2)}\n'
      f'Validation sensitivity = {np.round(recall_score(y_val_b, y_val_pred_b, pos_label = 1), 2)}\n')
print(f'Training specificity = {np.round(recall_score(y_train_b, y_train_pred_b, pos_label = 0), 2)}\n'
      f'Validation specificity = {np.round(recall_score(y_val_b, y_val_pred_b, pos_label = 0), 2)}\n')
print(f'Training F1-score = {np.round(f1_score(y_train_b, y_train_pred_b, pos_label = 1), 2)}\n'
      f'Validation F1-score = {np.round(f1_score(y_val_b, y_val_pred_b, pos_label = 1), 2)}\n')
print(f"Training recall [average] = {np.round(recall_score(y_train_b, y_train_pred_b, average = 'weighted'), 2)}\n"
      f"Validation recall [average] = {np.round(recall_score(y_val_b, y_val_pred_b, average = 'weighted'), 2)}")

Training sensitivity = 1.0
Validation sensitivity = 0.56

Training specificity = 1.0
Validation specificity = 0.99

Training F1-score = 1.0
Validation F1-score = 0.6

Training recall [average] = 1.0
Validation recall [average] = 0.98


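The tree is overfitting (every training metric is 1.0, but validation sensitivity is only 0.56) and shows a clear bias towards the negative class (validation specificity is 0.99).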

In [108]:
# Storing the decision tree's metrics for later comparison, one (train, validation) pair per metric.
# This must run before y_train_pred_b / y_val_pred_b are overwritten by another model's predictions.
acc_dt_b_train, acc_dt_b_val = (
    accuracy_score(y_train_b, y_train_pred_b),
    accuracy_score(y_val_b, y_val_pred_b),
)
sens_dt_b_train, sens_dt_b_val = (
    recall_score(y_train_b, y_train_pred_b, pos_label = 1),
    recall_score(y_val_b, y_val_pred_b, pos_label = 1),
)
spec_dt_b_train, spec_dt_b_val = (
    recall_score(y_train_b, y_train_pred_b, pos_label = 0),
    recall_score(y_val_b, y_val_pred_b, pos_label = 0),
)
f1_dt_b_train, f1_dt_b_val = (
    f1_score(y_train_b, y_train_pred_b, pos_label = 1),
    f1_score(y_val_b, y_val_pred_b, pos_label = 1),
)
avg_rec_dt_b_train, avg_rec_dt_b_val = (
    recall_score(y_train_b, y_train_pred_b, average = 'weighted'),
    recall_score(y_val_b, y_val_pred_b, average = 'weighted'),
)

### Random Forest - Binary Case

In [109]:
# Fitting a random forest for binary case: each tree is drawn from 75% of the training rows,
# and oob_score = True makes the out-of-bag score available after fitting
rf_b = RandomForestClassifier(oob_score = True, max_samples = 0.75, random_state = 0).fit(X_train_b, y_train_b)

There are a variety of attributes of the trained object that we can look at. For example, we can see the number of estimators, the feature importances, and so on.

In [110]:
# Number of trees in the fitted forest
print(f'Number of estimators = {rf_b.n_estimators}')

Number of estimators = 100


In [111]:
# Importance of each predictor as reported by the fitted forest, largest first
(
    pd.DataFrame({'Feature': X.columns, 'Importance': rf_b.feature_importances_})
    .set_index('Feature')
    .sort_values(by = 'Importance', ascending = False)
)

Unnamed: 0_level_0,Importance
Feature,Unnamed: 1_level_1
Torque [Nm],0.307268
Rotational Speed [rpm],0.253707
Tool wear [min],0.169139
Air Temp [K],0.137594
Process Temp [K],0.105707
Type_M,0.01092
Type_L,0.01006
Type_H,0.005606


In [112]:
# OOB score returns accuracy by default
print(f'OOB Accuracy = {np.round(rf_b.oob_score_, 2)}')

OOB Accuracy = 0.98


In [113]:
# Performance metrics of the untuned random forest
# Predictions (these overwrite the decision tree's predictions held in the same names)
y_train_pred_b = rf_b.predict(X_train_b)
y_val_pred_b = rf_b.predict(X_val_b)
print(f'Training accuracy = {np.round(accuracy_score(y_train_b, y_train_pred_b), 2)}\n'
      f'Validation accuracy = {np.round(accuracy_score(y_val_b, y_val_pred_b), 2)}\n')
print(f'Training sensitivity = {np.round(recall_score(y_train_b, y_train_pred_b, pos_label = 1), 2)}\n'
      f'Validation sensitivity = {np.round(recall_score(y_val_b, y_val_pred_b, pos_label = 1), 2)}\n')
print(f'Training specificity = {np.round(recall_score(y_train_b, y_train_pred_b, pos_label = 0), 2)}\n'
      f'Validation specificity = {np.round(recall_score(y_val_b, y_val_pred_b, pos_label = 0), 2)}\n')
print(f'Training F1-score = {np.round(f1_score(y_train_b, y_train_pred_b, pos_label = 1), 2)}\n'
      f'Validation F1-score = {np.round(f1_score(y_val_b, y_val_pred_b, pos_label = 1), 2)}\n')
print(f"Training recall [average] = {np.round(recall_score(y_train_b, y_train_pred_b, average = 'weighted'), 2)}\n"
      f"Validation recall [average] = {np.round(recall_score(y_val_b, y_val_pred_b, average = 'weighted'), 2)}")

Training accuracy = 1.0
Validation accuracy = 0.98

Training sensitivity = 0.98
Validation sensitivity = 0.41

Training specificity = 1.0
Validation specificity = 1.0

Training F1-score = 0.99
Validation F1-score = 0.56

Training recall [average] = 1.0
Validation recall [average] = 0.98


The random forest clearly needs tuning. We will do this using `GridSearchCV`.

### Random Forest Tuning - Binary Case

In [114]:
# Candidate hyperparameters: 2 x 2 x 2 = 8 combinations
param_grid = {'n_estimators': [100, 250], 'max_depth': [15, 20], 'max_features': [4, 5]}
# Fresh, unfitted forest used as the base estimator (this rebinds rf_b)
rf_b = RandomForestClassifier(max_samples = 0.75, random_state = 0)
# 5-fold cross-validated grid search scored with 'f1'
grid_rf = GridSearchCV(estimator = rf_b, param_grid = param_grid, scoring = 'f1', cv = 5, verbose = 5)
grid_rf_b = grid_rf.fit(X_train_b, y_train_b)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END max_depth=15, max_features=4, n_estimators=100;, score=0.743 total time=   0.6s
[CV 2/5] END max_depth=15, max_features=4, n_estimators=100;, score=0.783 total time=   0.6s
[CV 3/5] END max_depth=15, max_features=4, n_estimators=100;, score=0.646 total time=   0.5s
[CV 4/5] END max_depth=15, max_features=4, n_estimators=100;, score=0.727 total time=   0.5s
[CV 5/5] END max_depth=15, max_features=4, n_estimators=100;, score=0.789 total time=   0.5s
[CV 1/5] END max_depth=15, max_features=4, n_estimators=250;, score=0.750 total time=   1.3s
[CV 2/5] END max_depth=15, max_features=4, n_estimators=250;, score=0.806 total time=   1.3s
[CV 3/5] END max_depth=15, max_features=4, n_estimators=250;, score=0.697 total time=   1.4s
[CV 4/5] END max_depth=15, max_features=4, n_estimators=250;, score=0.746 total time=   1.3s
[CV 5/5] END max_depth=15, max_features=4, n_estimators=250;, score=0.754 total time=   1.5s
[CV 1/5] E

`GridSearchCV` refits the best parameter combination on the whole training set by default (`refit=True`), so `grid_rf_b` can be used directly for prediction.

In [115]:
# Best hyperparameter combination found by the grid search
print(f"Optimal number of trees =  {grid_rf_b.best_params_['n_estimators']}")
print(f"Optimal tree depth =  {grid_rf_b.best_params_['max_depth']}")
print(f"Optimal number of features =  {grid_rf_b.best_params_['max_features']}")

Optimal number of trees =  250
Optimal tree depth =  20
Optimal number of features =  4


In [116]:
# Performance metrics of the tuned random forest
# Predictions come from the best model refit by the grid search
y_train_pred_b = grid_rf_b.predict(X_train_b)
y_val_pred_b = grid_rf_b.predict(X_val_b)
print(f'Training accuracy = {np.round(accuracy_score(y_train_b, y_train_pred_b), 2)}\n'
      f'Validation accuracy = {np.round(accuracy_score(y_val_b, y_val_pred_b), 2)}\n')
print(f'Training sensitivity = {np.round(recall_score(y_train_b, y_train_pred_b, pos_label = 1), 2)}\n'
      f'Validation sensitivity = {np.round(recall_score(y_val_b, y_val_pred_b, pos_label = 1), 2)}\n')
print(f'Training specificity = {np.round(recall_score(y_train_b, y_train_pred_b, pos_label = 0), 2)}\n'
      f'Validation specificity = {np.round(recall_score(y_val_b, y_val_pred_b, pos_label = 0), 2)}\n')
print(f'Training F1-score = {np.round(f1_score(y_train_b, y_train_pred_b, pos_label = 1), 2)}\n'
      f'Validation F1-score = {np.round(f1_score(y_val_b, y_val_pred_b, pos_label = 1), 2)}\n')
print(f"Training recall [average] = {np.round(recall_score(y_train_b, y_train_pred_b, average = 'weighted'), 2)}\n"
      f"Validation recall [average] = {np.round(recall_score(y_val_b, y_val_pred_b, average = 'weighted'), 2)}")

Training accuracy = 1.0
Validation accuracy = 0.98

Training sensitivity = 1.0
Validation sensitivity = 0.6

Training specificity = 1.0
Validation specificity = 1.0

Training F1-score = 1.0
Validation F1-score = 0.71

Training recall [average] = 1.0
Validation recall [average] = 0.98


In [117]:
# Metrics for tuned random forest, one (train, validation) pair per metric.
# y_train_pred_b / y_val_pred_b must hold the tuned forest's predictions when this runs.
acc_rf_b_train, acc_rf_b_val = (
    accuracy_score(y_train_b, y_train_pred_b),
    accuracy_score(y_val_b, y_val_pred_b),
)
sens_rf_b_train, sens_rf_b_val = (
    recall_score(y_train_b, y_train_pred_b, pos_label = 1),
    recall_score(y_val_b, y_val_pred_b, pos_label = 1),
)
spec_rf_b_train, spec_rf_b_val = (
    recall_score(y_train_b, y_train_pred_b, pos_label = 0),
    recall_score(y_val_b, y_val_pred_b, pos_label = 0),
)
f1_rf_b_train, f1_rf_b_val = (
    f1_score(y_train_b, y_train_pred_b, pos_label = 1),
    f1_score(y_val_b, y_val_pred_b, pos_label = 1),
)
avg_rec_rf_b_train, avg_rec_rf_b_val = (
    recall_score(y_train_b, y_train_pred_b, average = 'weighted'),
    recall_score(y_val_b, y_val_pred_b, average = 'weighted'),
)

In [119]:
# Comparing performance: one row per model, one column per metric
metric_labels = ['Train Acc', 'Val Acc', 'Train Sens', 'Val sens', 'Train Spec', 'Val Spec',
                 'Train F1', 'Val F1', 'Train Rec [avg]', 'Val Rec [avg]']
dt_scores = [acc_dt_b_train, acc_dt_b_val, sens_dt_b_train, sens_dt_b_val, spec_dt_b_train, spec_dt_b_val,
             f1_dt_b_train, f1_dt_b_val, avg_rec_dt_b_train, avg_rec_dt_b_val]
rf_scores = [acc_rf_b_train, acc_rf_b_val, sens_rf_b_train, sens_rf_b_val, spec_rf_b_train, spec_rf_b_val,
             f1_rf_b_train, f1_rf_b_val, avg_rec_rf_b_train, avg_rec_rf_b_val]
# Same mapping as before: metric label -> [decision tree value, tuned random forest value]
metrics_comparison = {label: [dt_value, rf_value]
                      for label, dt_value, rf_value in zip(metric_labels, dt_scores, rf_scores)}
df_metrics = pd.DataFrame(metrics_comparison, index = ['dt_b', 'rf_b'])
df_metrics.round(2)

Unnamed: 0,Train Acc,Val Acc,Train Sens,Val sens,Train Spec,Val Spec,Train F1,Val F1,Train Rec [avg],Val Rec [avg]
dt_b,1.0,0.98,1.0,0.56,1.0,0.99,1.0,0.6,1.0,0.98
rf_b,1.0,0.98,1.0,0.6,1.0,1.0,1.0,0.71,1.0,0.98


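The tuned random forest performs better than the decision tree on the validation set, especially for the class of interest (machine failures): validation F1 rises from 0.60 to 0.71 and validation sensitivity from 0.56 to 0.60. Both models still fit the training set perfectly, so some overfitting remains.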

### Decision Tree - Multiclass Case

In [29]:
# Setting up predictors, output, and train-test split for multiclass case
y_m = df['Failure']
X_train_m, X_val_m, y_train_m, y_val_m = train_test_split(X, y_m, test_size = 0.4, random_state = 0, stratify = y_m)

In [30]:
dt_m = DecisionTreeClassifier(); dt_m.fit(X_train_m, y_train_m)  # Fitting a decision tree for multiclass case

In [31]:
# Performance metrics
y_train_pred_m = dt_m.predict(X_train_b); y_val_pred_m = dt_m.predict(X_val_b)  # Predictions
print('Training accuracy = {}\nValidation accuracy = {}\n'.format(np.round(accuracy_score(y_train_pred_m, y_train_m), 2), np.round(accuracy_score(y_val_pred_m, y_val_m), 2)))
print('Training recall [average] = {}\nValidation recall [average] = {}\n'.format(np.round(recall_score(y_train_m, y_train_pred_m, average = 'weighted'), 2), np.round(recall_score(y_val_m, y_val_pred_m, average = 'weighted'), 2)))

Training accuracy = 1.0
Validation accuracy = 0.97

Training recall [average] = 1.0
Validation recall [average] = 0.97



### Random Forest Tuning - Multiclass Case

In [24]:
rf_m = RandomForestClassifier(random_state = 0, max_samples = 0.75)
param_grid = {'n_estimators': [100, 250], 'max_depth': [15, 20], 'max_features': [4, 5]}
grid_rf = GridSearchCV(rf_m, param_grid, cv = 5, scoring = 'f1', verbose = 5)
grid_rf_m = grid_rf.fit(X_train_m, y_train_m)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END max_depth=15, max_features=4, n_estimators=100;, score=0.650 total time=   0.5s
[CV 2/5] END max_depth=15, max_features=4, n_estimators=100;, score=0.675 total time=   0.5s
[CV 3/5] END max_depth=15, max_features=4, n_estimators=100;, score=0.512 total time=   0.6s
[CV 4/5] END max_depth=15, max_features=4, n_estimators=100;, score=0.585 total time=   0.5s
[CV 5/5] END max_depth=15, max_features=4, n_estimators=100;, score=0.683 total time=   0.6s
[CV 1/5] END max_depth=15, max_features=4, n_estimators=250;, score=0.675 total time=   1.5s
[CV 2/5] END max_depth=15, max_features=4, n_estimators=250;, score=0.725 total time=   1.4s
[CV 3/5] END max_depth=15, max_features=4, n_estimators=250;, score=0.561 total time=   1.4s
[CV 4/5] END max_depth=15, max_features=4, n_estimators=250;, score=0.610 total time=   1.5s
[CV 5/5] END max_depth=15, max_features=4, n_estimators=250;, score=0.634 total time=   1.3s
[CV 1/5] E

In [25]:
# Performance metrics of the tuned multiclass random forest
# Predictions come from the best model refit by the grid search
y_train_pred_m = grid_rf_m.predict(X_train_m)
y_val_pred_m = grid_rf_m.predict(X_val_m)
print(f'Training accuracy = {np.round(accuracy_score(y_train_m, y_train_pred_m), 2)}\n'
      f'Validation accuracy = {np.round(accuracy_score(y_val_m, y_val_pred_m), 2)}\n')
print(f"Training recall [average] = {np.round(recall_score(y_train_m, y_train_pred_m, average = 'weighted'), 2)}\n"
      f"Validation recall [average] = {np.round(recall_score(y_val_m, y_val_pred_m, average = 'weighted'), 2)}\n")

Training accuracy = 1.0
Validation accuracy = 0.98

Training recall [average] = 1.0
Validation recall [average] = 0.98

