In [2]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTENC
import random
import pandas as pd

# Hex colour constants; they are not referenced anywhere in this cell.
BLUE = "#0D0887"
RED = "#FCD025"

# Seeds Python's built-in RNG only; the split and the oversampler below are
# given their own random_state.
random.seed(42)

# Short, bracket-free names for the sensor columns.
renamed_columns = {
    "Air temperature [K]": "AirTemperature",
    "Process temperature [K]": "ProcessTemp",
    "Rotational speed [rpm]": "RotationalSpeed",
    "Torque [Nm]": "Torque",
    "Tool wear [min]": "ToolWear",
}

# Load the raw data, discard the columns that are not used further on,
# and apply the short names.
df = (
    pd.read_csv("data/predictive_maintenance_original.csv")
    .drop(columns=["UDI", "Product ID", "Failure Type"])
    .rename(columns=renamed_columns)
)

# One-hot encode Type: the indicator columns are appended on the right and
# the original Type column is removed.
ohe_type = pd.get_dummies(df["Type"], prefix="Type")
df = pd.concat([df.drop(columns="Type"), ohe_type], axis=1)

# split data: 70 % of the rows go to train, the rest to test; stratifying on
# Target keeps the class ratio the same in both parts.
train_df, test_df = train_test_split(
    df,
    train_size=0.7,
    stratify=df["Target"],
    random_state=42,
)

# oversample train
# SMOTENC adds synthetic rows for the minority class of Target (train only).
# Feature positions 5-7 are declared categorical; these are meant to be the
# one-hot Type_* columns.
# NOTE(review): the positions assume the CSV holds exactly five sensor columns
# besides the ones dropped above - confirm if the input schema changes.
sm = SMOTENC(random_state=42, sampling_strategy="minority", categorical_features=[5, 6, 7])
y_train = train_df["Target"].copy(deep=True)
print(y_train.value_counts())
x_train_unscaled = train_df.drop("Target", axis=1)
x_res, y_res = sm.fit_resample(x_train_unscaled, y_train)
# drop=True discards the old row labels instead of inserting them as an
# "index" column, which would otherwise be treated (and scaled) as a feature.
x_res = x_res.reset_index(drop=True)
y_res = y_res.reset_index(drop=True)
train_df = pd.concat([x_res, y_res], axis=1)
print(train_df["Target"].value_counts())

# Columns that bypass the scaler: the one-hot indicators and the label.
passthrough_columns = ["Type_M", "Type_L", "Type_H", "Target"]

# scale train
train_scale_exclude = train_df[passthrough_columns]
train_scale_include = train_df.drop(passthrough_columns, axis=1)

# The scaler is fitted on the (oversampled) training features only.
mms_train = MinMaxScaler()
train_scale_include_names = train_scale_include.columns
train_scale_include = pd.DataFrame(
    data=mms_train.fit_transform(train_scale_include),
    columns=train_scale_include_names,
    index=train_scale_include.index,
)
# Final column order: scaled features, then Type_M, Type_L, Type_H, Target.
train_df = pd.concat([train_scale_include, train_scale_exclude], axis=1)

# scale test
# The split leaves test_df with its original row labels; they are reset so the
# two halves below line up positionally when concatenated.
test_scale_exclude = test_df[passthrough_columns].reset_index(drop=True)
test_scale_include = test_df.drop(passthrough_columns, axis=1)

# Reuse the scaler fitted on train. Fitting a second scaler on the test set
# would put train and test on different scales and let test-set statistics
# leak into preprocessing. Test values may therefore fall slightly outside
# [0, 1]. The name mms_test is kept as an alias for any later cell using it.
mms_test = mms_train
test_scale_include_names = test_scale_include.columns
test_scale_include = pd.DataFrame(
    data=mms_test.transform(test_scale_include),
    columns=test_scale_include_names,
)
test_df = pd.concat([test_scale_include, test_scale_exclude], axis=1)

# Write both prepared frames to disk; index=False keeps the row labels out of
# the CSV files.
for frame, csv_path in (
    (train_df, "data/predictive_maintenance_training.csv"),
    (test_df, "data/predictive_maintenance_test.csv"),
):
    frame.to_csv(csv_path, index=False)

0    6763
1     237
Name: Target, dtype: int64
0    6763
1    6763
Name: Target, dtype: int64
