In [4]:
# tune_random_forest.py
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer 


# Load the labelled training table. Every column except "condition" is kept
# as a candidate input; "condition" is the prediction target.
df = pd.read_csv("training data.csv")
X = df.drop(columns=["condition"])

# Integer class id for each condition label.
condition_codes = {"Healthy": 0, "Monitor": 1, "Repair Required": 2}
y = df["condition"].map(condition_codes)

# Series.map returns NaN for any value that is not a key of the mapping
# (typos, stray whitespace, blank cells). Stop here and name the offending
# values instead of letting NaN targets surface later inside the split/fit.
unmapped_rows = y.isna()
if unmapped_rows.any():
    unknown_labels = sorted(df.loc[unmapped_rows, "condition"].astype(str).unique())
    raise ValueError(
        f"Unrecognised 'condition' labels in training data: {unknown_labels}; "
        f"expected one of {list(condition_codes)}"
    )

# Columns routed to the numeric and the categorical preprocessing branches.
numeric_features = [
    "corrosion rate (Mpy)", "Diameter Size (inch)",
    "Line Length (KM)", "Pigging Frequency (Days)",
    "Surfactant Dosing (Litres)"]
categorical_features = ["Probe Type"]


# Numeric branch: fill missing values with the column median, then scale.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. drop="first" omits the first category's column, and
# handle_unknown="ignore" keeps transform from failing on unseen categories.
probe_type_encoder = OneHotEncoder(
    handle_unknown="ignore",
    drop="first",
    sparse_output=False,
)
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", probe_type_encoder),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own branch; the "num" block is listed
# first, followed by the "cat" block.
column_branches = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)


# Hold out 20% of the rows for final evaluation, stratified on the target so
# the split preserves the class proportions; seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Full modelling pipeline: preprocessing -> SMOTE oversampling -> random forest.
# Pipeline is the imbalanced-learn class (see imports), which is what permits
# a sampler step such as SMOTE between the preprocessor and the classifier.
model_steps = [
    ("preprocessor", preprocessor),
    ("smote", SMOTE(random_state=42)),
    (
        "classifier",
        RandomForestClassifier(class_weight="balanced", random_state=42),
    ),
]
pipeline = Pipeline(steps=model_steps)


# Search space for the random forest. The "classifier__" prefix targets the
# pipeline step named "classifier". 3 * 3 * 2 * 2 = 36 combinations.
param_grid = dict(
    classifier__n_estimators=[50, 100, 200],
    classifier__max_depth=[None, 10, 20],
    classifier__min_samples_split=[2, 5],
    classifier__min_samples_leaf=[1, 2],
)

# Exhaustive search settings: 5-fold cross-validation, candidates ranked by
# weighted F1, all available cores, per-fit progress logging.
search_settings = {
    "cv": 5,
    "scoring": "f1_weighted",
    "n_jobs": -1,
    "verbose": 2,
}
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    **search_settings,
)


# Run the hyper-parameter search on the training split only; the test split
# is used solely for the final report below.
grid_search.fit(X_train, y_train)

print("Best parameters found:", grid_search.best_params_)
print("Best cross‐val score:", grid_search.best_score_)


# best_estimator_ is the pipeline configured with the best parameters found
# (available because GridSearchCV refits by default).
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)
print("\nTest‐set classification report for the best RF:")
# Pass the class ids explicitly so the three target_names always line up with
# them. Without `labels`, the report infers the classes from y_test/y_pred and
# raises if one of the three happens to be absent from this small test split.
print(
    classification_report(
        y_test,
        y_pred,
        labels=[0, 1, 2],
        target_names=["Healthy", "Monitor", "Repair"],
        zero_division=0,
    )
)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters found: {'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}
Best cross‐val score: 0.9723702716596542

Test‐set classification report for the best RF:
              precision    recall  f1-score   support

     Healthy       1.00      0.75      0.86         4
     Monitor       0.97      1.00      0.98        30
      Repair       1.00      1.00      1.00         3

    accuracy                           0.97        37
   macro avg       0.99      0.92      0.95        37
weighted avg       0.97      0.97      0.97        37



In [5]:

# Pull the fitted components back out of the tuned pipeline.
fitted_preprocessor = best_rf.named_steps["preprocessor"]
categorical_pipeline = fitted_preprocessor.named_transformers_["cat"]
ohe = categorical_pipeline.named_steps["onehot"]

# Categories learned for the single categorical input column.
cat0 = ohe.categories_[0]

# Ask the fitted encoder which category index it dropped instead of assuming
# "the first one". That keeps the names aligned with the encoder's real output
# if its `drop` setting is ever changed (drop_idx_ is None when nothing is
# dropped).
dropped_idx = None if ohe.drop_idx_ is None else ohe.drop_idx_[0]
dummy_names = [
    f"ProbeType_{val}" for idx, val in enumerate(cat0) if idx != dropped_idx
]

rf_classifier = best_rf.named_steps["classifier"]
importances = rf_classifier.feature_importances_

# The preprocessor lists its "num" branch before its "cat" branch, so the
# numeric names come first, followed by the one-hot dummy names.
all_features = numeric_features + dummy_names

# Guard against a name/importance mismatch (for example a column removed
# during preprocessing) before building the table.
if len(all_features) != len(importances):
    raise ValueError(
        f"Expected {len(importances)} feature names to match the classifier's "
        f"importances, but derived {len(all_features)}: {all_features}"
    )

feat_imp_df = pd.DataFrame({
    "feature": all_features,
    "importance": importances
}).sort_values(by="importance", ascending=False)

print("\nFeature Importances (Descending):")
print(feat_imp_df)


Feature Importances (Descending):
                      feature  importance
0        corrosion rate (Mpy)    0.579873
5               ProbeType_LPR    0.162463
6               ProbeType_WLC    0.083239
1        Diameter Size (inch)    0.077456
2            Line Length (KM)    0.064702
4  Surfactant Dosing (Litres)    0.025565
3    Pigging Frequency (Days)    0.006701


In [3]:
import joblib

# Persist the complete tuned pipeline (preprocessing + SMOTE + classifier).
# NOTE(review): this cell's execution count (In [3]) is out of sequence with
# the cells above it — re-run the notebook top to bottom before relying on
# the saved files.
joblib.dump(best_rf, "best_pipeline_rf_model.pkl")
print("Saved best model to best_pipeline_rf_model.pkl")

# Take the fitted preprocessor straight from the pipeline being saved, so this
# cell depends only on `best_rf` and both files come from the same fitted
# object.
joblib.dump(best_rf.named_steps["preprocessor"], "preprocessor.pkl")
print("Saved fitted preprocessor to preprocessor.pkl")


Saved best model to best_pipeline_rf_model.pkl
Saved fitted preprocessor to preprocessor.pkl
