In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sea
%matplotlib inline
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
# Load the predictive-maintenance CSV into a DataFrame and preview its first rows
DATASET_CSV = '/kaggle/input/predictive-maintenance/ai4i2020.csv'
preMain = pd.read_csv(DATASET_CSV)
preMain.head()

In [4]:
# Print the column names, dtypes and non-null counts of the loaded frame.
preMain.info()

We have **2 object** columns (Product ID, Type), **3 float** columns (Air Temp, Process Temp, Torque), and **int** for the others.

In [5]:
# Count how many rows fall into each product-quality type
type_counts = preMain['Type'].value_counts()
type_counts

Good, Type looks like -->
**Low:** 60%, **medium:** ~30%, **high:**  ~10% as product quality variants

In [6]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
preMain.describe()

In [7]:
# Count the distinct product IDs to see whether any product appears more than once
distinct_products = preMain['Product ID'].nunique()
distinct_products

In [8]:
# Number of missing values in each column
preMain.isna().sum()

In [9]:
# Look for rows where several failure modes are flagged at the same time
# (here TWF and OSF; the same check can be tried with PWF/RNF)
modes = preMain.columns[preMain.columns.isin(['TWF','OSF'])]

# True only for rows in which every selected mode column equals 1
both_flagged = preMain[modes].eq(1).all(axis=1)
print (preMain[both_flagged])

That's right: several failure modes can occur in the same row, but the machine failure is flagged only once, whether one or multiple modes are present.

## Look at some Graphs and Distributions

In [10]:
# Histogram of air temperature; Series.hist returns the Axes, which is then
# labelled so the figure can be read on its own.
ax = preMain['Air temperature [K]'].hist(bins=20, figsize=(10, 5))
ax.set(title='Air temperature distribution', xlabel='Air temperature [K]', ylabel='Count')
plt.show()

In [11]:
# Histogram of process temperature; Series.hist returns the Axes, which is then
# labelled so the figure can be read on its own.
ax = preMain['Process temperature [K]'].hist(bins=20, figsize=(10, 5))
ax.set(title='Process temperature distribution', xlabel='Process temperature [K]', ylabel='Count')
plt.show()

In [12]:
# Histogram of rotational speed; Series.hist returns the Axes, which is then
# labelled so the figure can be read on its own.
ax = preMain['Rotational speed [rpm]'].hist(bins=20, figsize=(10, 5))
ax.set(title='Rotational speed distribution', xlabel='Rotational speed [rpm]', ylabel='Count')
plt.show()

The rotational speed is overlaid with normally distributed noise.

In [13]:
# Histogram of torque; Series.hist returns the Axes, which is then
# labelled so the figure can be read on its own.
ax = preMain['Torque [Nm]'].hist(bins=20, figsize=(10, 5))
ax.set(title='Torque distribution', xlabel='Torque [Nm]', ylabel='Count')
plt.show()

Right this is Normally distributed

In [14]:
# Histogram of tool wear (finer 50-bin resolution); Series.hist returns the
# Axes, which is then labelled so the figure can be read on its own.
ax = preMain['Tool wear [min]'].hist(bins=50, figsize=(10, 5))
ax.set(title='Tool wear distribution', xlabel='Tool wear [min]', ylabel='Count')
plt.show()

## Look at some Relationships(for failure modes)

## TWF

In [15]:

# Stacked distribution of tool wear, coloured by the TWF flag
sea.displot(data=preMain, x="Tool wear [min]", hue='TWF', multiple="stack");

If Tool wear [min] is between 200 and 240, the tool is either replaced or fails (**TWF**)

## HDF

In [16]:
# Temperature margin used by the HDF rule: process temperature minus air temperature.
# The process runs hotter than the air, so this order gives a positive difference
# that can be compared directly with the 8.6 K threshold on the plot axis.
preMain['diffTemp'] = preMain['Process temperature [K]'] - preMain['Air temperature [K]']

In [17]:
# Joint view of temperature difference vs. rotational speed, coloured by the HDF flag
sea.jointplot(x="diffTemp", y="Rotational speed [rpm]", hue='HDF', data=preMain);

If the difference between air-and process temperature is below 8.6 K and the tool’s rotational speed is below 1380 rpm, **HDF** occurs

## PWF

In [18]:
# Mechanical power in watts = torque [Nm] * angular speed [rad/s].
# The speed column is in rpm, so convert with 2*pi/60 before multiplying;
# otherwise the values cannot be compared with the 3500 W / 9000 W PWF limits.
RPM_TO_RAD_PER_S = 2 * np.pi / 60
preMain['TorqRot'] = preMain['Torque [Nm]'] * preMain['Rotational speed [rpm]'] * RPM_TO_RAD_PER_S

In [19]:
# Step-style distribution of the torque-speed product, coloured by the PWF flag
sea.displot(data=preMain, x='TorqRot', hue='PWF', element="step");

If the power (Torque [Nm] × rotational speed in rad/s) is below 3500 W or above 9000 W, **PWF** occurs

## OSF

In [20]:
# Overstrain indicator: element-wise product of torque and tool wear (minNm)
preMain['toolTorq'] = preMain['Torque [Nm]'].mul(preMain['Tool wear [min]'])

In [21]:
# Torque x tool-wear product per product type, coloured by the OSF flag
sea.stripplot(data=preMain, x='Type', y="toolTorq", hue="OSF");

If the product of tool wear and torque exceeds 11,000 minNm for the L product variant (12,000 for M, 13,000 for H) **OSF** occurs

## RNF
Random failures occur more frequently than would be expected for the 10,000 datapoints in our dataset.

### Remove the last 3 columns that we added for visualization

In [22]:
# Preview the frame; at this point it also carries the helper columns
# added earlier for plotting (diffTemp, TorqRot, toolTorq).
preMain.head()

In [23]:
# Remove the plotting-only helper columns by name rather than by position,
# so the cell does not depend on the frame having exactly 14 original columns.
# errors='ignore' keeps the cell safe to re-run after the columns are gone.
helper_cols = ['diffTemp', 'TorqRot', 'toolTorq']
preMain = preMain.drop(columns=helper_cols, errors='ignore')
preMain.head()

# Some Explanations from our analysis :
- **Product ID** *product quality* variants and a variant-specific serial number
- **Type** Product Quality [L, M, H]
- **Air temperature [K]**  normalized to a standard deviation of 2 K around 300 K
- **Process temperature [K]** normalized to a standard deviation of 1 K, added to the air temperature plus 10 K.
- **Rotational speed [rpm]** calculated from a power of 2860 W, overlaid with a normally distributed noise
- **Torque [Nm]** normally distributed around 40 Nm with a σ = 10 Nm and *no negative values*
- **Tool wear [min]** The quality variants H/M/L add 5/3/2 minutes of tool wear to the used tool in the process.
- **Machine failure** indicates whether the machine has failed in this particular datapoint, consists of five independent failure modes:
- **TWF** (tool wear failure): the tool will be replaced or fail at a randomly selected tool wear time between 200 – 240 mins (**120** times in our dataset). At this point in time, the tool is **replaced 74** times, and **fails 46** times (randomly assigned).
- **HDF** (heat dissipation failure) heat dissipation causes a process failure, if the difference between air-and process temperature is below 8.6 K and the tool’s rotational speed is below 1380 rpm. This is the case for **115** data points
- **PWF** (power failure) the product of torque and rotational speed (in rad/s) equals the power required for the process. If this power is below 3500 W or above 9000 W, the process fails, which is the case **95** times in our dataset.
- **OSF** (overstrain failure) if the product of tool wear and torque exceeds 11,000 minNm for the L product variant (12,000 for M, 13,000 for H), the process fails due to overstrain. This is true for **98** datapoints
- **RNF** (random failure) each process has a chance of 0.1 % to fail regardless of its process parameters. This is the case for **19** datapoints, more frequent than *could be expected* for 10,000 datapoints in our dataset.

## Data Preparation

In [24]:
# Target column and the frame with the target removed
target_col = 'Machine failure'
y = preMain[target_col]
x = preMain.drop(columns=[target_col])

# Categorical feature (product type) and the numeric sensor readings
numeric_cols = [
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]',
]
varCats = preMain['Type']
varNums = preMain[numeric_cols]


### Normalization operation for numerical stability

In [25]:
from sklearn.preprocessing import StandardScaler

# Standardize the numeric features to zero mean and unit variance.
# Fitting and transforming are done as two explicit steps on the same data.
std = StandardScaler()
std.fit(varNums)
x = std.transform(varNums)

In [28]:
from sklearn.preprocessing import OrdinalEncoder

# Replace the product-type labels with ordinal codes.
# The encoder expects a 2-D input, so the column is passed as an (n, 1) array.
enco = OrdinalEncoder()
type_column = preMain[['Type']].to_numpy()
preMain['Type'] = enco.fit_transform(type_column)

In [33]:
# Show the encoded type values explicitly: a bare expression that is not the
# last line of a cell is evaluated but never displayed.
print(preMain['Type'].unique())

# Row counts per encoded type, split by the machine-failure flag
sea.histplot(x="Type", hue="Machine failure", data=preMain);

### Now: L(Low) = 1, M(Medium) = 2, H(High) = 0

In [34]:
# Split our data to train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=84)

In [36]:
# Display the held-out feature matrix produced by the train/test split.
X_test

In [38]:
# Display the training feature matrix produced by the train/test split.
X_train

## Modeling

In [39]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier with default settings on the training split
logReg = LogisticRegression()
logReg.fit(X=X_train, y=y_train)

In [40]:
# Predict the machine-failure label for every row of the test split
pred = logReg.predict(X=X_test)

###  Calculate the accuracy score by comparing the actual values and predicted values.

In [42]:
from sklearn.metrics import confusion_matrix

# Compute the confusion matrix once and reuse it.
# labels=[0, 1] forces a 2x2 matrix even if one class is missing from
# y_test / pred, so the four-way unpacking below cannot fail.
# NOTE(review): assumes the target is coded 0 = no failure, 1 = failure — confirm.
cnfMax = confusion_matrix(y_test, pred, labels=[0, 1])

TN, FP, FN, TP = cnfMax.ravel()

print('True Positive(TP) = ', TP)
print('False Positive(FP) = ', FP)
print('True Negative(TN) = ', TN)
print('False Negative(FN) = ', FN)

accuracy = (TP+TN) / (TP+FP+TN+FN)

print('Accuracy of binary classification = {:0.3f}'.format(accuracy))

# Accuracy alone is dominated by the majority (no-failure) class, so also
# report how well the rare failure class is found. Guard against division
# by zero when the model predicts no failures or the test set has none.
precision = TP / (TP + FP) if (TP + FP) else 0.0
recall = TP / (TP + FN) if (TP + FN) else 0.0

print('Precision of failure class = {:0.3f}'.format(precision))
print('Recall of failure class = {:0.3f}'.format(recall))

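# Well (97%), I think this is good for now :) — but note that failures are only a small fraction of the rows, so accuracy alone overstates how well failures are detected.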