### CONSTANTS

In [3]:
# Location of the feature-engineered dataset consumed by this notebook.
DATA_PATH = "data/data_feature_engineered.csv"

# Continuous columns; these are the ones standardised in the scaling step.
NUMERICAL_FEATURES = [
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]',
    'feature_1',
    'feature_2',
    'feature_3',
    'feature_4',
]

# Indicator columns produced by one-hot encoding the raw 'Type' column.
CATEGORICAL_FEATURES = [
    'Type_M',
    'Type_L',
    'Type_H',
]

# Name of the label column.
TARGET = "Target"

### Importing Libraries

In [4]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from category_encoders import OneHotEncoder

### Importing Data

In [5]:
# Load the feature-engineered dataset from disk and preview its first rows.
data = pd.read_csv(filepath_or_buffer=DATA_PATH)
data.head(n=5)

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,feature_1,feature_2,feature_3,feature_4
0,M,298.1,308.6,1551,42.8,0,0,91993.66,36.238318,0,0.0
1,L,298.2,308.7,1408,46.3,3,0,92054.34,30.410367,4224,138.9
2,L,298.1,308.5,1498,49.4,5,0,91963.85,30.323887,7490,247.0
3,L,298.2,308.6,1433,39.5,7,0,92024.52,36.278481,10031,276.5
4,L,298.2,308.7,1408,40.0,9,0,92054.34,35.2,12672,360.0


### 01. Data Encoding

In [6]:
# One-hot encode the categorical 'Type' column; `use_cat_names=True` names the
# new columns after the category values (Type_M, Type_L, Type_H).
ohe = OneHotEncoder(use_cat_names=True)
encoded_columns = ohe.fit_transform(data["Type"])

# Swap the raw 'Type' column for its indicator columns, appended on the right.
# NOTE: `data` is rebound here, so re-running this cell without reloading the
# data fails because 'Type' has already been removed.
data = pd.concat([data.drop(columns="Type"), encoded_columns], axis=1)
data.head()

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,feature_1,feature_2,feature_3,feature_4,Type_M,Type_L,Type_H
0,298.1,308.6,1551,42.8,0,0,91993.66,36.238318,0,0.0,1,0,0
1,298.2,308.7,1408,46.3,3,0,92054.34,30.410367,4224,138.9,0,1,0
2,298.1,308.5,1498,49.4,5,0,91963.85,30.323887,7490,247.0,0,1,0
3,298.2,308.6,1433,39.5,7,0,92024.52,36.278481,10031,276.5,0,1,0
4,298.2,308.7,1408,40.0,9,0,92054.34,35.2,12672,360.0,0,1,0


### 02. Data Splitting

In [7]:
# Fixed seed so the train/test partition (and therefore every exported CSV and
# any metric computed downstream) is identical on each Restart & Run All.
RANDOM_STATE = 42

# Separate the features from the label column.
X = data.drop(columns=TARGET)
y = data[TARGET]

# Hold out 20% of the rows for testing; `stratify=y` keeps the class
# proportions of the target the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Sanity-check the resulting split sizes.
print(f'X_train shape: {X_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'y_test shape: {y_test.shape}')

X_train shape: (8000, 12)
X_test shape: (2000, 12)
y_train shape: (8000,)
y_test shape: (2000,)


### 03. Data Scaling

In [9]:
# Standardise the numerical features. The scaler is fitted on the training
# split only and then applied unchanged to the test split, so no test-set
# statistics influence the transformation.
ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train[NUMERICAL_FEATURES])
X_test_scaled = ss.transform(X_test[NUMERICAL_FEATURES])

# The frames returned by train_test_split are row selections of `X`; take
# explicit copies before assigning columns so pandas does not emit
# SettingWithCopyWarning and the writes cannot touch the original frame.
X_train = X_train.copy()
X_test = X_test.copy()

# Replace the raw numerical columns with their scaled values; the one-hot
# columns are left untouched.
X_train[NUMERICAL_FEATURES] = X_train_scaled
X_test[NUMERICAL_FEATURES] = X_test_scaled

### 04. Exporting Preprocessed Data

In [11]:
# Write every split to its own CSV file. `index=False` omits the row index,
# so only the column values (plus a header row) are stored.
exports = {
    "data/X_train.csv": X_train,
    "data/X_test.csv": X_test,
    "data/y_train.csv": y_train,
    "data/y_test.csv": y_test,
}
for csv_path, split in exports.items():
    split.to_csv(csv_path, index=False)