### Import

In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split


In [3]:
import warnings

warnings.filterwarnings(action='ignore')

In [2]:
# Relative path of the input CSV that is loaded into `df` below.
FILE_PATH = '../data/predictive_maintenance_A.csv'

### Read file data

In [4]:
# Load the raw dataset; the 'UDI' column becomes the row index.
df = pd.read_csv(filepath_or_buffer=FILE_PATH, index_col='UDI')

# Preview the first five rows.
df.head(n=5)

Unnamed: 0_level_0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type,Difference temperature [K],Power
UDI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,M,298.1,308.6,1551,42.8,0,0,No Failure,10.5,66382.8
2,L,298.2,308.7,1408,46.3,3,0,No Failure,10.5,65190.4
3,L,298.1,308.5,1498,49.4,5,0,No Failure,10.4,74001.2
4,L,298.2,308.6,1433,39.5,7,0,No Failure,10.4,56603.5
5,L,298.2,308.7,1408,40.0,9,0,No Failure,10.5,56320.0


Delete the rows whose 'Failure Type' is 'Random Failures' while Target == 0 (a failure type is recorded, but the row is not flagged as a failure)

In [5]:
# Keep only the rows that are not flagged as a failure (Target == 0) ...
not_failed = df['Target'] == 0
df_tmp = df.loc[not_failed]

# ... and count how often each 'Failure Type' label occurs among them.
df_tmp['Failure Type'].value_counts()

Failure Type
No Failure         9643
Random Failures      18
Name: count, dtype: int64

In [6]:
# Rows labelled 'Random Failures' although Target == 0 are inconsistent and are removed.
# The mask is computed from `df` itself rather than from the earlier `df_tmp` snapshot:
# when this cell is re-run, the mask simply matches nothing and the drop is a no-op,
# instead of raising KeyError on index labels that were already removed.
inconsistent_rows = (df['Target'] == 0) & (df['Failure Type'] == 'Random Failures')
idx = df.index[inconsistent_rows]

# Rebind `df` to the filtered frame (no in-place mutation).
df = df.drop(index=idx)

Delete the rows whose 'Failure Type' is 'No Failure' while Target == 1 (the row is flagged as a failure, but no failure type is recorded)

In [7]:
# Keep only the rows that are flagged as a failure (Target == 1) ...
failed = df['Target'] == 1
df_tmp = df.loc[failed]

# ... and count how often each 'Failure Type' label occurs among them.
df_tmp['Failure Type'].value_counts()

Failure Type
Heat Dissipation Failure    112
Power Failure                95
Overstrain Failure           78
Tool Wear Failure            45
No Failure                    9
Name: count, dtype: int64

In [8]:
# Rows flagged as a failure (Target == 1) but labelled 'No Failure' are inconsistent
# and are removed.
# The mask is computed from `df` itself rather than from the earlier `df_tmp` snapshot:
# when this cell is re-run, the mask simply matches nothing and the drop is a no-op,
# instead of raising KeyError on index labels that were already removed.
unlabelled_failures = (df['Target'] == 1) & (df['Failure Type'] == 'No Failure')
idx = df.index[unlabelled_failures]

# Rebind `df` to the filtered frame (no in-place mutation).
df = df.drop(index=idx)

In [9]:
# Display the (rows, columns) size of `df` at this point of the notebook.
df.shape

(9973, 10)

# Data Formatting
## Conversion of the 'Failure Type' variable to discrete values

In [12]:
# Distinct failure labels, in order of first appearance in the column.
lst_failure = df['Failure Type'].unique()

# Assign each label its position in that list as an integer code.
dict_failure = {label: code for code, label in enumerate(lst_failure)}

# Show the encoded column; the result is only displayed, `df` is left unchanged.
df['Failure Type'].map(dict_failure)

UDI
1        0
2        0
3        0
4        0
5        0
        ..
9996     0
9997     0
9998     0
9999     0
10000    0
Name: Failure Type, Length: 9973, dtype: int64

## List of categorical variables
Collect the names of the features with 'object' dtype, excluding the target variables.

In [10]:
# Names of all columns stored with object dtype.
object_columns = df.select_dtypes(include='O').columns
cat_cols = list(object_columns)

# 'Failure Type' is a target, not a feature, so it is taken out of the list.
cat_cols.remove('Failure Type')
cat_cols

['Type']

## List of numerical variables
Collect the names of the features with a numerical dtype, excluding the target variables.

In [11]:
# Select the columns by numeric dtype instead of "everything that is not object":
# selecting by negation would also pick up any bool, datetime, category or string
# column and hand it to the scaler as if it were numerical.
num_cols = df.select_dtypes(include='number').columns.to_list()

# 'Target' is a target variable, not a feature, so it must not be part of the
# scaled inputs.
num_cols.remove('Target')
num_cols

['Air temperature [K]',
 'Process temperature [K]',
 'Rotational speed [rpm]',
 'Torque [Nm]',
 'Tool wear [min]',
 'Difference temperature [K]',
 'Power']

## Preprocessing of variables

#### Pipeline creation

Define the preprocessing applied to each group of columns:
1. OneHotEncoder for the variables listed in 'cat_cols'
2. StandardScaler for the variables listed in 'num_cols'
3. OrdinalEncoder for the 'Failure Type' variable

In [15]:
# Each step is (name, transformer, columns); the step name is used as the prefix
# of the generated output column names (e.g. 'fail_type__Failure Type').
onehot_step = ("onehot", OneHotEncoder(), cat_cols)
scale_step = ("scale", StandardScaler(), num_cols)
fail_type_step = ("fail_type", OrdinalEncoder(), ['Failure Type'])

# Columns that are not listed in any step are handled by ColumnTransformer's
# default `remainder` setting, since none is configured here.
ct = ColumnTransformer(transformers=[onehot_step, scale_step, fail_type_step])


Apply the preprocessing transformer and build the 'df_preprocess' DataFrame

In [16]:
# Fit the transformer on `df` and transform it in one pass. This must happen
# before the output column names are requested from the fitted transformer.
transformed = ct.fit_transform(df)

df_preprocess = (
    pd.DataFrame(transformed, columns=ct.get_feature_names_out())
    # The ordinal-encoded failure type becomes the 'Target' column of the output.
    .rename(columns={'fail_type__Failure Type': 'Target'})
    # Discard any row that contains a missing value.
    .dropna(axis=0)
)

In [17]:
# Write the preprocessed dataset (including its index) to a CSV file.
df_preprocess.to_csv(path_or_buf='../data/predictive_maintenance_preprocess.csv')