In [61]:
import time

import numpy as np
import pandas as pd
from sklearn.ensemble import (
    BaggingRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from tqdm.notebook import tqdm

In [62]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 40)

In [63]:
# Load data
input_df = pd.read_csv("input_df.csv", index_col=0)
input_df_pca = pd.read_csv("input_df_pca.csv", index_col=0)
output_df_class = pd.read_csv("output_df_class.csv", index_col=0)
output_df_score = pd.read_csv("output_df_score.csv", index_col=0)
q1, q3 = output_df_score["score"].quantile([0.25, 0.75])

In [64]:
# Drop rows with missing values in the output for supervised learning

supervised_learning_mask = output_df_score.notna().all(axis=1)

input_df = input_df[supervised_learning_mask]
input_df_pca = input_df_pca[supervised_learning_mask]
output_df_class = output_df_class[supervised_learning_mask]
output_df_score = output_df_score[supervised_learning_mask]

In [65]:
# Prepare input data
X_base = input_df.values
X_pca = input_df_pca.values
y = output_df_score.values.ravel()

# Set a random seed for reproducibility
random_state = 123

# Define cross-validation strategy
kf = KFold(n_splits=10, random_state=random_state, shuffle=True)

# Define a scoring metric
scoring = [
    "neg_mean_absolute_error",
    "neg_mean_squared_error",
    "neg_root_mean_squared_error",
    "r2",
]

In [66]:
# Define models and hyperparameters for tuning
models = {
    "Linear Regression": {"model": LinearRegression(), "params": {}},
    "Ridge": {
        "model": Ridge(),
        "params": {"ridge__alpha": [0.001, 0.01, 0.1, 1.0, 10, 100, 1000]},
    },
    "Lasso": {
        "model": Lasso(),
        "params": {"lasso__alpha": [0.001, 0.01, 0.1, 1.0, 10, 100, 1000]},
    },
    "ElasticNet": {
        "model": ElasticNet(),
        "params": {
            "elasticnet__alpha": [0.01, 0.01, 0.1, 1.0, 10, 100],
            "elasticnet__l1_ratio": [0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
        },
    },
    "Decision Tree": {
        "model": DecisionTreeRegressor(),
        "params": {
            "decisiontree__max_depth": [5, 10, 20],
            "decisiontree__min_samples_split": [2, 5, 10],
        },
    },
    "Random Forest": {
        "model": RandomForestRegressor(random_state=42),
        "params": {
            "randomforest__n_estimators": [50, 100],
            "randomforest__max_depth": [5, 10],
            "randomforest__min_samples_split": [2, 5],
        },
    },
    "Gradient Boosting": {
        "model": GradientBoostingRegressor(random_state=42),
        "params": {
            "gradientboosting__n_estimators": [50, 100],
            "gradientboosting__learning_rate": [0.01, 0.1],
            "gradientboosting__max_depth": [3, 5, 10],
        },
    },
    "Support Vector Regressor": {
        "model": SVR(),
        "params": {
            "supportvectorregressor__C": [0.1, 1.0, 10.0],
            "supportvectorregressor__kernel": ["linear", "rbf"],
            "supportvectorregressor__epsilon": [0.01, 0.1, 0.5],
        },
    },
    "K-Nearest Neighbors": {
        "model": KNeighborsRegressor(),
        "params": {"k-nearestneighbors__n_neighbors": [3, 5, 10]},
    },
    "Bagging Regressor": {
        "model": BaggingRegressor(random_state=42),
        "params": {
            "baggingregressor__n_estimators": [10, 50],
            "baggingregressor__max_samples": [0.5, 1.0],
        },
    },
}

In [67]:
# Function to clean the best params dictionary
def clean_params(params):
    return ", ".join(f"{key.split('__')[1]}: {value}" for key, value in params.items())


# Function to categorize the regression output
def categorize_values(y, q1, q3):
    y_categorized = np.empty_like(y)
    y_categorized[y < q1] = 1
    y_categorized[(y >= q1) & (y <= q3)] = 2
    y_categorized[y > q3] = 3
    return y_categorized


# Function to convert the one hot encoding back to multiclass ecoding
def one_hot_encoding_to_multiclass(y):
    return np.einsum("ij,j->i", y, np.arange(1, y.shape[1] + 1))


# 1 dimension multiclass label
y_categorized = one_hot_encoding_to_multiclass(output_df_class.values)

In [68]:
# Create an empty list to store results
results_list = []

# Create an empty list to store results
results_list_base_input, model_dict_base_input = [], {}
results_list_pca_input, model_dict_pca_input = [], {}

input_setups = {
    "Base input": (X_base, results_list_base_input, model_dict_base_input),
    "PCA input": (X_pca, results_list_pca_input, model_dict_pca_input),
}

# Test models on base input and PCA-preprocessed input
for input_name, input_config in input_setups.items():
    X, results_list, model_dict = input_config
    print(f"Starting GridSearch with {input_name} ...")

    # Loop over each model
    for model_name, config in tqdm(models.items()):
        print(f"Training and tuning {model_name}...")

        # Track start time
        start_time = time.time()

        # Create a pipeline with StandardScaler (necessary for some models)
        pipeline = Pipeline(
            [
                (
                    "scaler",
                    StandardScaler(),
                ),
                (model_name.lower().replace(" ", ""), config["model"]),
            ]
        )

        # Set up GridSearchCV
        grid_search = GridSearchCV(
            estimator=pipeline,
            param_grid=config["params"],
            cv=kf,
            scoring=scoring,
            refit="neg_mean_squared_error",
            return_train_score=False,
            verbose=1,
        )

        # Fit the grid search
        grid_search.fit(X, y)

        # Get the best estimator's metrics
        best_estimator_index = grid_search.best_index_

        mae = -grid_search.cv_results_["mean_test_neg_mean_absolute_error"][
            best_estimator_index
        ]
        mse = -grid_search.cv_results_["mean_test_neg_mean_squared_error"][
            best_estimator_index
        ]
        rmse = -grid_search.cv_results_["mean_test_neg_root_mean_squared_error"][
            best_estimator_index
        ]
        r2 = grid_search.cv_results_["mean_test_r2"][best_estimator_index]

        # Also determine classification metrics by categorizing the output
        y_pred = grid_search.predict(X)
        y_pred_categorized = categorize_values(y_pred, q1, q3)

        acc = accuracy_score(y_categorized, y_pred_categorized)
        f1_score_micro = f1_score(y_categorized, y_pred_categorized, average="micro")
        f1_score_macro = f1_score(y_categorized, y_pred_categorized, average="macro")
        precision_macro = precision_score(
            y_categorized, y_pred_categorized, average="macro"
        )
        recal_macro = recall_score(y_categorized, y_pred_categorized, average="macro")

        # Append results to list
        results_list.append(
            {
                "Model": model_name,
                "Best Params": clean_params(grid_search.best_params_),
                "MAE": mae,
                "MSE": mse,
                "RMSE": rmse,
                "R2": r2,
                "Accuracy": acc,
                "F1 Score (micro)": f1_score_micro,
                "F1 Score (macro)": f1_score_macro,
                "Precision (macro)": precision_macro,
                "Recall (macro)": recal_macro,
            }
        )

        # Append results to list
        model_dict[model_name] = grid_search.best_estimator_

        # Print duration
        print(f"Done in {time.time() - start_time:.2f} seconds\n")


Starting GridSearch with Base input ...


  0%|          | 0/10 [00:00<?, ?it/s]

Training and tuning Linear Regression...
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Done in 0.03 seconds

Training and tuning Ridge...
Fitting 10 folds for each of 7 candidates, totalling 70 fits
Done in 0.08 seconds

Training and tuning Lasso...
Fitting 10 folds for each of 7 candidates, totalling 70 fits
Done in 0.21 seconds

Training and tuning ElasticNet...
Fitting 10 folds for each of 36 candidates, totalling 360 fits
Done in 2.01 seconds

Training and tuning Decision Tree...
Fitting 10 folds for each of 9 candidates, totalling 90 fits
Done in 0.34 seconds

Training and tuning Random Forest...
Fitting 10 folds for each of 8 candidates, totalling 80 fits
Done in 10.59 seconds

Training and tuning Gradient Boosting...
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Done in 18.03 seconds

Training and tuning Support Vector Regressor...
Fitting 10 folds for each of 18 candidates, totalling 180 fits
Done in 5.51 seconds

Training and tuning K-Nearest Ne

  0%|          | 0/10 [00:00<?, ?it/s]

Training and tuning Linear Regression...
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Done in 0.02 seconds

Training and tuning Ridge...
Fitting 10 folds for each of 7 candidates, totalling 70 fits
Done in 0.09 seconds

Training and tuning Lasso...
Fitting 10 folds for each of 7 candidates, totalling 70 fits
Done in 0.17 seconds

Training and tuning ElasticNet...
Fitting 10 folds for each of 36 candidates, totalling 360 fits
Done in 1.40 seconds

Training and tuning Decision Tree...
Fitting 10 folds for each of 9 candidates, totalling 90 fits
Done in 0.25 seconds

Training and tuning Random Forest...
Fitting 10 folds for each of 8 candidates, totalling 80 fits
Done in 7.92 seconds

Training and tuning Gradient Boosting...
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Done in 13.57 seconds

Training and tuning Support Vector Regressor...
Fitting 10 folds for each of 18 candidates, totalling 180 fits
Done in 2.51 seconds

Training and tuning K-Nearest Nei

In [69]:
pd.DataFrame(results_list_base_input).sort_values("MSE", ignore_index=True)

Unnamed: 0,Model,Best Params,MAE,MSE,RMSE,R2,Accuracy,F1 Score (micro),F1 Score (macro),Precision (macro),Recall (macro)
0,Gradient Boosting,"learning_rate: 0.1, max_depth: 5, n_estimators...",0.988418,2.067627,1.402171,0.69151,0.924266,0.924266,0.922508,0.944457,0.906356
1,Support Vector Regressor,"C: 10.0, epsilon: 0.5, kernel: rbf",1.060218,2.488346,1.541305,0.634829,0.846986,0.846986,0.841409,0.874687,0.819886
2,Random Forest,"max_depth: 10, min_samples_split: 2, n_estimat...",1.139196,2.611451,1.586426,0.610904,0.865533,0.865533,0.859224,0.9082,0.831244
3,Bagging Regressor,"max_samples: 1.0, n_estimators: 50",1.12079,2.624444,1.586774,0.613148,0.90881,0.90881,0.90638,0.932915,0.887831
4,K-Nearest Neighbors,n_neighbors: 3,1.226023,3.202574,1.757716,0.525717,0.799073,0.799073,0.791586,0.825618,0.770458
5,Decision Tree,"max_depth: 5, min_samples_split: 5",1.532682,4.428579,2.08338,0.309847,0.670788,0.670788,0.660316,0.6998,0.644804
6,ElasticNet,"alpha: 0.1, l1_ratio: 0.1",1.563207,4.563852,2.106795,0.330484,0.612056,0.612056,0.554521,0.650717,0.535846
7,Lasso,alpha: 0.01,1.569736,4.56954,2.109412,0.328002,0.612056,0.612056,0.566258,0.638569,0.547128
8,Ridge,alpha: 10,1.570945,4.578013,2.111232,0.326894,0.618238,0.618238,0.574303,0.646888,0.554332
9,Linear Regression,,1.598319,4.663134,2.13041,0.314746,0.621329,0.621329,0.577443,0.649902,0.557422


In [70]:
pd.DataFrame(results_list_pca_input).sort_values("MSE", ignore_index=True)

Unnamed: 0,Model,Best Params,MAE,MSE,RMSE,R2,Accuracy,F1 Score (micro),F1 Score (macro),Precision (macro),Recall (macro)
0,Bagging Regressor,"max_samples: 1.0, n_estimators: 50",1.448717,4.117147,2.009347,0.380375,0.860896,0.860896,0.853721,0.903679,0.826097
1,Random Forest,"max_depth: 10, min_samples_split: 2, n_estimat...",1.466164,4.143328,2.015749,0.377576,0.789799,0.789799,0.767398,0.868407,0.731446
2,Gradient Boosting,"learning_rate: 0.1, max_depth: 5, n_estimators...",1.501722,4.437829,2.079683,0.335907,0.774343,0.774343,0.749348,0.852012,0.713947
3,K-Nearest Neighbors,n_neighbors: 5,1.637551,5.148803,2.244553,0.230382,0.658423,0.658423,0.625527,0.697463,0.599625
4,Support Vector Regressor,"C: 10.0, epsilon: 0.5, kernel: rbf",1.587746,5.18938,2.235572,0.242111,0.64915,0.64915,0.574791,0.726885,0.556511
5,ElasticNet,"alpha: 0.1, l1_ratio: 0.3",1.79423,5.834601,2.388714,0.137159,0.536321,0.536321,0.333956,0.749622,0.391947
6,Ridge,alpha: 100,1.790012,5.841311,2.389759,0.136398,0.537867,0.537867,0.336921,0.751932,0.394004
7,Lasso,alpha: 0.01,1.796103,5.848136,2.390283,0.13659,0.534776,0.534776,0.344183,0.729296,0.397068
8,Linear Regression,,1.800292,5.87155,2.394565,0.133791,0.536321,0.536321,0.356683,0.533278,0.403228
9,Decision Tree,"max_depth: 5, min_samples_split: 10",1.765355,5.906456,2.416402,0.087143,0.595054,0.595054,0.459635,0.713295,0.474238


In [71]:
pd.DataFrame(
    model_dict_base_input["Gradient Boosting"][1].feature_importances_,
    index=input_df.columns,
    columns=["Feature Importance"],
).sort_values("Feature Importance", ascending=False)

Unnamed: 0,Feature Importance
manganese_pct,0.181395
post_weld_temp_c,0.13194
nickel_pct,0.097315
molybdenum_pct,0.065612
silicon_pct,0.062672
carbon_pct,0.061737
chromium_pct,0.047638
niobium_ppm,0.044593
phosphorus_pct,0.042794
oxygen_ppm,0.041361
