# Models : SGD, Random forest, Naive Bayes

In [49]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sea
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

In [2]:
##initilize seed and random values 

seed_value = 44
os.environ['PYTHONHASHSEED']=str(seed_value)

# Set `python` built-in pseudo-random generator at a fixed value
random.seed(seed_value)

# Set `numpy` pseudo-random generator at a fixed value
np.random.seed(seed_value)

In [3]:
data_file = 'ai4i2020.csv'  # modify the name of the data file as required  
data = pd.read_csv(data_file )
data

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,M24855,M,298.8,308.4,1604,29.5,14,0,0,0,0,0,0
9996,9997,H39410,H,298.9,308.4,1632,31.8,17,0,0,0,0,0,0
9997,9998,M24857,M,299.0,308.6,1645,33.4,22,0,0,0,0,0,0
9998,9999,H39412,H,299.0,308.7,1408,48.5,25,0,0,0,0,0,0


## Split our dataset

In [4]:
train_data,test_data = train_test_split(data, test_size = 0.33, random_state = seed_value)

In [5]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6700 entries, 4330 to 3491
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   UDI                      6700 non-null   int64  
 1   Product ID               6700 non-null   object 
 2   Type                     6700 non-null   object 
 3   Air temperature [K]      6700 non-null   float64
 4   Process temperature [K]  6700 non-null   float64
 5   Rotational speed [rpm]   6700 non-null   int64  
 6   Torque [Nm]              6700 non-null   float64
 7   Tool wear [min]          6700 non-null   int64  
 8   Machine failure          6700 non-null   int64  
 9   TWF                      6700 non-null   int64  
 10  HDF                      6700 non-null   int64  
 11  PWF                      6700 non-null   int64  
 12  OSF                      6700 non-null   int64  
 13  RNF                      6700 non-null   int64  
dtypes: float64(3), int64(

## look at our split dataset 

In [6]:
test_data.head()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
3472,3473,L50652,L,301.8,310.7,1523,43.3,60,0,0,0,0,0,0
5095,5096,M19955,M,304.1,313.5,1428,39.9,88,0,0,0,0,0,0
9504,9505,L56684,L,299.2,310.3,1440,47.0,21,0,0,0,0,0,0
5786,5787,L52966,L,301.7,311.2,1521,36.7,130,0,0,0,0,0,0
8758,8759,M23618,M,297.5,308.7,1446,53.0,168,0,0,0,0,0,0


In [8]:
# Split to X_train, y_train
X_train = pd.DataFrame(train_data.drop(['Product ID', 'Machine failure', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF'], axis = 1)) 
y_train = train_data[['TWF', 'HDF', 'PWF', 'OSF', 'RNF']]

In [9]:
X_train.shape, y_train.shape

((6700, 7), (6700, 5))

In [11]:
X_test = pd.DataFrame(test_data.drop(['Product ID', 'Machine failure', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF'], axis = 1 )) 
y_test = test_data[['TWF', 'HDF', 'PWF', 'OSF', 'RNF']]

In [12]:
X_test.shape, y_test.shape

((3300, 7), (3300, 5))

## Normalization operation for numerical stability
### Encoding Type column

In [13]:
enco = LabelEncoder()
X_train['Type'] = enco.fit_transform(X_train['Type'].values.reshape(-1, 1).ravel())
X_test['Type'] = enco.transform(X_test['Type'].values.reshape(-1, 1).ravel())

In [14]:
std = StandardScaler()
X_train = std.fit_transform(X_train)
X_test = std.transform(X_test)

In [15]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Modeling

## SGD

In [17]:
sgd_clf = MultiOutputClassifier(SGDClassifier(random_state=seed_value, loss= 'log'))
sgd_clf.fit(X_train, y_train)

MultiOutputClassifier(estimator=SGDClassifier(loss='log', random_state=44))

In [18]:
sgd_pred = sgd_clf.predict(X_test)

In [19]:
accuracy = accuracy_score(y_test, sgd_pred)
print('Accuracy: {}'.format(round(accuracy*100), 2))

Accuracy: 97


In [20]:
cross_val_score(sgd_clf, X_train, y_train, cv = 3, scoring= 'accuracy')

array([0.96195166, 0.96506941, 0.96775638])

In [57]:
roc_auc_score(y_test, sgd_pred)

0.5

## Random forest

In [106]:
RF_clf = MultiOutputClassifier(RandomForestClassifier(random_state=seed_value))
RF_clf.fit(X_train, y_train)

MultiOutputClassifier(estimator=RandomForestClassifier(random_state=44))

In [108]:
RF_pred = RF_clf.predict(X_test)

In [110]:
accuracy = accuracy_score(y_test, RF_pred)
print('Accuracy: {:.2%}'.format(accuracy))

cross_val_score(RF_clf, X_train, y_train, cv = 3, scoring= 'accuracy')

Accuracy: 98.24%


array([0.97806625, 0.98074339, 0.98119122])

### Improve RandomForest

In [26]:
# Define parameter possibilities as lists
n_estimators = [50, 100, 300, 500, 1000]

min_samples_split = [2, 3, 5, 7, 9]

min_samples_leaf = [1, 2, 4, 6, 8]

max_features = ['auto', 'sqrt', 'log2', None]

In [27]:
from sklearn.model_selection import RandomizedSearchCV

# Create the random grid
hyperparameter_grid = {'n_estimators': n_estimators,
                       'min_samples_split': min_samples_split,
                       'min_samples_leaf': min_samples_leaf,
                       'max_features': max_features}

In [28]:
# Use the hyperparameter_grid to search for best hyperparameters
RF_search = RandomizedSearchCV(estimator = RF_clf, param_distributions = hyperparameter_grid,
                               n_iter = 10, cv = 3, random_state= seed_value)
# Fit the random search model
RF_search.fit(X_train, y_train)

RF_search.best_params_

{'n_estimators': 1000,
 'min_samples_split': 9,
 'min_samples_leaf': 6,
 'max_features': None}

In [32]:
# Tune best parameters on new model.
RF_clf_tune = MultiOutputClassifier (RandomForestClassifier(n_estimators= 1000, min_samples_split= 9,
                                     min_samples_leaf=  6,
                                     max_features= None,
                                     random_state= seed_value))

RF_clf_tune.fit(X_train, y_train)

RF_pred_tune = RF_clf_tune.predict(X_test)

accuracy = accuracy_score(y_test, RF_pred_tune)
print('Accuracy: {:.2%}'.format(accuracy))

cross_val_score(RF_clf_tune, X_train, y_train, cv = 3, scoring= 'accuracy')

Accuracy: 98.48%


array([0.97985676, 0.98611733, 0.98029557])

In [87]:
# F1-Score
f1 = f1_score(y_test, RF_pred_tune, average= 'weighted')
print('F1: {}'.format(round(f1*100), 2))

F1: 71


In [88]:
roc_auc_score(y_test, RF_pred_tune)

0.7444582410221187

## Naive Bayes [ Gaussian | Multinominal ]

## Gaussian NB

In [96]:
from sklearn.naive_bayes import GaussianNB
gnb_clf = MultiOutputClassifier(GaussianNB())
gnb_clf.fit(X_train, y_train)

MultiOutputClassifier(estimator=GaussianNB())

In [97]:
gnb_pred = gnb_clf.predict(X_test)

In [98]:
# Accuracy and cross validation
accuracy = accuracy_score(y_test, gnb_pred)
print('Accuracy: {}'.format(round(accuracy*100), 2))
cross_val_score(gnb_clf, X_train, y_train, cv = 3, scoring= 'accuracy')

Accuracy: 95


array([0.87466428, 0.96148679, 0.95745634])

In [99]:
# F1-Score
f1 = f1_score(y_test, gnb_pred, average= 'weighted')
print('F1: {}'.format(round(f1*100), 2))

F1: 44


In [100]:
roc_auc_score(y_test, gnb_pred)

0.7085523984144199

## Multinominal NB

In [101]:
from sklearn.naive_bayes import MultinomialNB
MNB_clf = MultiOutputClassifier(MultinomialNB())
MNB_clf.fit(X_train, y_train)

MultiOutputClassifier(estimator=MultinomialNB())

In [102]:
MNB_pred = MNB_clf.predict(X_test)

In [103]:
# Accuracy and cross validation
accuracy = accuracy_score(y_test, MNB_pred)
print('Accuracy: {}'.format(round(accuracy*100), 2))
cross_val_score(MNB_clf, X_train, y_train, cv = 3, scoring= 'accuracy')

Accuracy: 97


array([0.96195166, 0.96462159, 0.96730855])

In [104]:
# F1-Score
f1 = f1_score(y_test, MNB_pred, average= 'weighted')
print('F1: {}'.format(round(f1*100), 2))

F1: 0


In [105]:
roc_auc_score(y_test, MNB_pred)

0.5