In [9]:
# 02_preprocessing.ipynb

# 📦 Imports
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 📥 Read the cleaned tempering dataset from disk
df = pd.read_csv("../data/clean_tempering_data.csv")

# 🗑️ Remove the post-quench hardness column entirely (the column is dropped,
# not imputed); the shape is printed on both sides of the drop for comparison
print("Before dropping Initial HRC:", df.shape)
df = df.drop('Initial hardness (HRC) - post quenching', axis=1)
print("After dropping Initial HRC:", df.shape)


# 🧽 Keep only the rows whose target value is present
print("Before dropping NA:", df.shape)
df = df[df['Final hardness (HRC) - post tempering'].notna()]
print("After dropping NA:", df.shape)

# 🎯 Target: the post-tempering hardness column
y = df['Final hardness (HRC) - post tempering']

# 🔢 Features: every remaining column, with the two categorical columns
# expanded into indicator columns (drop_first=True omits the first level of
# each category)
X = pd.get_dummies(
    df.drop('Final hardness (HRC) - post tempering', axis=1),
    columns=['Steel type', 'Source'],
    drop_first=True,
)

# ✂️ Train/Val/Test Split (60/20/20)
# The *unscaled* features are split first so the scaler below can be fitted on
# the training rows only. Fitting it on the full dataset lets validation/test
# statistics leak into the training features. Row count and random_state are
# the same as when the scaled frame was split, so the row assignment should be
# unchanged; only the scaling statistics differ.
X_trainval_raw, X_test_raw, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X_trainval_raw, y_trainval, test_size=0.25, random_state=42  # 0.25 x 0.8 = 0.2
)

# 📊 Scale features
# The scaler learns mean/std from the training split only. It is applied to
# every column of X, including the one-hot indicator columns.
scaler = StandardScaler()
scaler.fit(X_train_raw)


def _scale(frame):
    """Return `frame` transformed by the train-fitted scaler as a DataFrame.

    Column names and the row index of `frame` are preserved, so each scaled
    split keeps the same index as its matching target series.
    """
    return pd.DataFrame(
        scaler.transform(frame), columns=frame.columns, index=frame.index
    )


X_train = _scale(X_train_raw)
X_val = _scale(X_val_raw)
X_test = _scale(X_test_raw)

# Kept so code that still refers to these names keeps working; both are
# produced with the train-fitted scaler, so they carry no leakage either.
X_trainval = _scale(X_trainval_raw)
X_scaled = _scale(X)

# 🧪 Report the (rows, columns) shape of each feature split
for label, split in (
    ("Train shape:", X_train),
    ("Validation shape:", X_val),
    ("Test shape:", X_test),
):
    print(label, split.shape)

# 💾 Save processed data to /processed_data
# Create the output directory if it does not exist yet (no error if it does).
os.makedirs("../processed_data", exist_ok=True)

# One CSV per split; index=False keeps the row index out of the files.
outputs = {
    "../processed_data/X_train.csv": X_train,
    "../processed_data/X_val.csv": X_val,
    "../processed_data/X_test.csv": X_test,
    "../processed_data/y_train.csv": y_train,
    "../processed_data/y_val.csv": y_val,
    "../processed_data/y_test.csv": y_test,
}
for path, split in outputs.items():
    split.to_csv(path, index=False)

print("✅ Data preprocessing completed and files saved.")


Before dropping Initial HRC: (1466, 17)
After dropping Initial HRC: (1466, 16)
Before dropping NA: (1466, 16)
After dropping NA: (1466, 16)
Train shape: (879, 48)
Validation shape: (293, 48)
Test shape: (294, 48)
✅ Data preprocessing completed and files saved.
