### Import

In [34]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split


In [35]:
import warnings

warnings.filterwarnings(action='ignore')

In [36]:
# Relative path of the CSV dataset loaded by this notebook.
FILE_PATH = "../data/predictive_maintenance_A.csv"

### Read file data

In [37]:
# Load the CSV, using its 'UDI' column as the row index, then preview the first rows.
df = pd.read_csv(filepath_or_buffer=FILE_PATH, index_col='UDI')
df.head(n=5)

Unnamed: 0_level_0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type,Difference temperature [K],Power
UDI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,M,298.1,308.6,1551,42.8,0,0,No Failure,10.5,66382.8
2,L,298.2,308.7,1408,46.3,3,0,No Failure,10.5,65190.4
3,L,298.1,308.5,1498,49.4,5,0,No Failure,10.4,74001.2
4,L,298.2,308.6,1433,39.5,7,0,No Failure,10.4,56603.5
5,L,298.2,308.7,1408,40.0,9,0,No Failure,10.5,56320.0


Delete the rows whose 'Failure Type' is 'Random Failures' even though 'Target' == 0 (the two labels contradict each other).

In [38]:
# Rows whose Target is 0, followed by the distribution of their 'Failure Type' values.
df_tmp = df.loc[df['Target'].eq(0)]
df_tmp.loc[:, 'Failure Type'].value_counts()

No Failure         9643
Random Failures      18
Name: Failure Type, dtype: int64

In [39]:
# Row labels flagged 'Random Failures' although Target is 0 (contradictory labels).
# The mask is computed from `df` itself rather than from `df_tmp`, so the result does
# not depend on which cell last assigned `df_tmp`.
idx = df.index[(df['Target'] == 0) & (df['Failure Type'] == 'Random Failures')]
# Rebind instead of dropping in place: re-running this cell is then a harmless no-op
# (idx is empty) instead of a KeyError for labels that were already removed.
df = df.drop(index=idx)

Delete the rows whose 'Failure Type' is 'No Failure' even though 'Target' == 1 (the two labels contradict each other).

In [40]:
# Rows whose Target is 1, followed by the distribution of their 'Failure Type' values.
df_tmp = df.loc[df['Target'].eq(1)]
df_tmp.loc[:, 'Failure Type'].value_counts()

Heat Dissipation Failure    112
Power Failure                95
Overstrain Failure           78
Tool Wear Failure            45
No Failure                    9
Name: Failure Type, dtype: int64

In [41]:
# Row labels flagged 'No Failure' although Target is 1 (contradictory labels).
# The mask is computed from `df` itself rather than from `df_tmp`, so the result does
# not depend on which cell last assigned `df_tmp`.
idx = df.index[(df['Target'] == 1) & (df['Failure Type'] == 'No Failure')]
# Rebind instead of dropping in place: re-running this cell is then a harmless no-op
# (idx is empty) instead of a KeyError for labels that were already removed.
df = df.drop(index=idx)

In [42]:
# Check the frame's (rows, columns) dimensions after the two clean-up steps above.
df.shape

(9973, 10)

## List of categorical variables
Remove 'Failure Type' from the list so that it is not encoded as a categorical feature.

In [43]:
# Object-dtype columns are treated as categorical; 'Failure Type' is then taken out
# of the list (list.remove raises ValueError if the name is absent).
cat_cols = list(df.select_dtypes(include=['O']).columns)
cat_cols.remove('Failure Type')
cat_cols

['Type']

## List of numerical variables

In [44]:
# Every non-object column is treated as numerical; note that this includes 'Target'.
num_cols = list(df.select_dtypes(exclude=['O']).columns)
num_cols

['Air temperature [K]',
 'Process temperature [K]',
 'Rotational speed [rpm]',
 'Torque [Nm]',
 'Tool wear [min]',
 'Target',
 'Difference temperature [K]',
 'Power']

## Preprocessing of variables

#### Pipeline creation

Define a separate preprocessing step for each kind of variable:
1. OneHotEncoder for the categorical variables listed in 'cat_cols'
2. StandardScaler for the numerical variables taken from 'num_cols'

In [45]:
# Column-wise preprocessing:
#   * "onehot": one-hot encode the categorical columns in `cat_cols`;
#   * "scale":  standardise the numerical columns in `num_cols`, except 'Target';
#   * "target": pass 'Target' through untouched.
# 'Target' is part of `num_cols`, but it is the label to predict: standardising it
# would turn the 0/1 classes into arbitrary floats. The pass-through output column
# still has 'Target' in its name, so it can be located by name afterwards.
ct = ColumnTransformer([
    ("onehot", OneHotEncoder(), cat_cols),
    ("scale", StandardScaler(), [col for col in num_cols if col != 'Target']),
    ("target", "passthrough", ['Target']),
])

Apply the preprocessing transformer and store the result in the dataframe 'df_preprocess'.

In [46]:
# Fit the transformer on `df` and rebuild a labelled DataFrame from the result.
# `index=df.index` keeps the original row labels; without it the frame would get a
# fresh 0..n-1 range and rows could no longer be matched back to `df`.
# NOTE(review): the transformer is fitted on all rows before the train/test split, so
# the fitted statistics also use the rows that later form the test set — consider
# fitting on the training rows only.
df_preprocess = pd.DataFrame(
    ct.fit_transform(df),
    columns=ct.get_feature_names_out(),
    index=df.index,
)


Extract the name of the transformed column that holds 'Target'.

In [47]:
# Collect every transformed column whose name contains the substring 'Target'.
name_target_col = [col_name for col_name in df_preprocess.columns if 'Target' in col_name]

Separate the explanatory variables (x) from the variable to predict (y).

In [48]:
# y: the target column(s) only (kept as a DataFrame); x: every remaining column.
y = df_preprocess.loc[:, name_target_col]
x = df_preprocess.drop(labels=name_target_col, axis=1)

In [49]:

# Hold out 20 % of the rows for testing, with a fixed seed for reproducibility.
# `stratify=y` keeps the share of each target class the same in both subsets; the
# value counts earlier in this notebook show that failures are a small minority, so
# a purely random split could leave the test set with a distorted class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1, stratify=y
)