In [2]:
# preprocessing_and_split.py

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Load the raw dataset; the path is relative to the current working directory.
df = pd.read_csv("training data.csv")


# Separate the feature columns (everything except the target) from the target.
X = df.drop(columns=["condition"])
y = df["condition"]


# Encode the textual condition labels as integer class ids.
label_mapping = {"Healthy": 0, "Monitor": 1, "Repair Required": 2}

# Series.map() returns NaN for every value that is not a key of the mapping
# (including missing values), which would silently put NaN targets into the
# stratified split below. Check first and fail loudly instead.
_unmapped_mask = ~y.isin(list(label_mapping))
if _unmapped_mask.any():
    raise ValueError(
        "{} row(s) in column 'condition' have labels that are not in "
        "label_mapping: {}".format(
            int(_unmapped_mask.sum()),
            y[_unmapped_mask].unique().tolist(),
        )
    )
y = y.map(label_mapping)


# Columns that are standardised (zero mean, unit variance) before modelling.
numeric_features = [
    "corrosion rate (Mpy)",
    "Diameter Size (inch)",
    "Line Length (KM)",
    "Pigging Frequency (Days)",
    "Surfactant Dosing (Litres)",
]
# Columns that are one-hot encoded.
categorical_features = ["Probe Type"]


# Each step is a (name, transformer, columns) triple. drop="first" omits the
# first category of every encoded feature, so k categories yield k - 1 columns.
_numeric_step = ("num", StandardScaler(), numeric_features)
_categorical_step = ("cat", OneHotEncoder(drop="first"), categorical_features)

# Only the columns listed above are routed through a transformer; no
# `remainder` is given, so the ColumnTransformer default applies to the rest.
preprocessor = ColumnTransformer(
    transformers=[_numeric_step, _categorical_step]
)


# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions comparable between the two splits, and the fixed seed makes
# the split reproducible.
_split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **_split_options)


# Fit the scaler/encoder on the training rows only, then apply the already
# fitted transformers to the test rows so no test statistics leak into them.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)


# Relative class frequencies of the encoded target in each split.
_train_distribution = y_train.value_counts(normalize=True)
_test_distribution = y_test.value_counts(normalize=True)

# Summarise the resulting matrix shapes and class balance.
print("After preprocessing and split:")
print("X_train_transformed shape:", X_train_transformed.shape)
print("X_test_transformed shape:", X_test_transformed.shape)
print("y_train distribution:\n", _train_distribution)
print("y_test distribution:\n", _test_distribution)


After preprocessing and split:
X_train_transformed shape: (144, 7)
X_test_transformed shape: (37, 7)
y_train distribution:
 1    0.826389
0    0.104167
2    0.069444
Name: condition, dtype: float64
y_test distribution:
 1    0.810811
0    0.108108
2    0.081081
Name: condition, dtype: float64
