In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import xgboost as xgb
import shap

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error


import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)




  @jit
  @jit
  @jit
  @jit
  @jit
  @numba.jit
  @numba.jit
  @numba.jit
  @numba.jit
  @jit # we can't use this when using a custom link function...
  @jit
  @jit
  @jit
  @jit
  @jit
  @jit
  from .autonotebook import tqdm as notebook_tqdm
  @jit
[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.[0m
[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.[0m


In [2]:
root_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(root_dir)

from utils.utils import *
from utils.constants import *

# Data

To make valid comparison across different methods, we split the original `df_train` into new train and validation data sets.

In [3]:
# Import data

df_train = pd.read_csv(get_absolute_path('X_train.csv', 'data'))
y_train = pd.read_csv(get_absolute_path('y_train.csv', 'data'))
df_test = pd.read_csv(get_absolute_path('X_test.csv', 'data'))
y_test = pd.read_csv(get_absolute_path('y_test.csv', 'data'))


stack_train = pd.read_csv(get_absolute_path('stacked_X_tr.csv', 'data'))
stack_test  = pd.read_csv(get_absolute_path('stacked_X_te.csv', 'data'))



In [4]:
df_train.columns = df_test.columns = column_names_raw

In [5]:
df_train.head()

Unnamed: 0,"Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Maximum)","pH, water, unfiltered, field, standard units (Maximum)","pH, water, unfiltered, field, standard units (Minimum)","Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Minimum)","Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Mean)","Dissolved oxygen, water, unfiltered, milligrams per liter (Maximum)","Dissolved oxygen, water, unfiltered, milligrams per liter (Mean)","Dissolved oxygen, water, unfiltered, milligrams per liter (Minimum)","Temperature, water, degrees Celsius (Mean)","Temperature, water, degrees Celsius (Minimum)","Temperature, water, degrees Celsius (Maximum)"
0,0.001131,0.884615,0.00112,0.001113,0.677632,0.841463,0.765152,0.787402,0.29375,0.298077,0.276163
1,0.00117,0.871795,0.001159,0.001152,0.703947,0.829268,0.772727,0.795276,0.29375,0.301282,0.276163
2,0.001326,0.884615,0.001198,0.00125,0.677632,0.853659,0.75,0.755906,0.3,0.298077,0.287791
3,0.014094,0.858974,0.001238,0.003926,0.697368,0.829268,0.772727,0.771654,0.296875,0.294872,0.27907
4,0.088109,0.858974,0.010766,0.029297,0.684211,0.853659,0.765152,0.755906,0.296875,0.291667,0.281977


# XGBoost

Adding hyperparameter tuning.

In [6]:

# Converting the data to DMatrix format for XGBoost
dtrain = xgb.DMatrix(df_train, label=y_train)

# Define XGBoost parameters grid for tuning
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 200]
}

# Create an XGBoost model
model_xgb = xgb.XGBRegressor(objective='reg:squarederror')

# Define a custom scoring function (negative RMSE since GridSearchCV minimizes the score)
scoring = make_scorer(lambda y_true, y_pred: -mean_squared_error(y_true, y_pred, squared=False))

# Perform hyperparameter tuning using GridSearchCV
grid_search = GridSearchCV(model_xgb, param_grid, cv=5, scoring=scoring, n_jobs=6)
grid_search.fit(df_train, y_train)

# Get the best hyperparameters and best model
best_xgb_params = grid_search.best_params_
best_xgb_model = grid_search.best_estimator_
best_xgb_score = -grid_search.best_score_

print("Best Hyperparameters:", best_xgb_params)

# Making predictions on the validation data using the best model
y_pred_xgb = best_xgb_model.predict(df_test)

# Calculating RMSE on the validation data
rmse_xgb = mean_squared_error(y_test, y_pred_xgb, squared=False)
print("XGBoost RMSE on Validation Data with Best Model:", rmse_xgb)


Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100}
XGBoost RMSE on Validation Data with Best Model: 0.011138312949183595


In [7]:
df_train

Unnamed: 0,"Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Maximum)","pH, water, unfiltered, field, standard units (Maximum)","pH, water, unfiltered, field, standard units (Minimum)","Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Minimum)","Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Mean)","Dissolved oxygen, water, unfiltered, milligrams per liter (Maximum)","Dissolved oxygen, water, unfiltered, milligrams per liter (Mean)","Dissolved oxygen, water, unfiltered, milligrams per liter (Minimum)","Temperature, water, degrees Celsius (Mean)","Temperature, water, degrees Celsius (Minimum)","Temperature, water, degrees Celsius (Maximum)"
0,0.001131,0.884615,0.001120,0.001113,0.677632,0.841463,0.765152,0.787402,0.293750,0.298077,0.276163
1,0.001170,0.871795,0.001159,0.001152,0.703947,0.829268,0.772727,0.795276,0.293750,0.301282,0.276163
2,0.001326,0.884615,0.001198,0.001250,0.677632,0.853659,0.750000,0.755906,0.300000,0.298077,0.287791
3,0.014094,0.858974,0.001238,0.003926,0.697368,0.829268,0.772727,0.771654,0.296875,0.294872,0.279070
4,0.088109,0.858974,0.010766,0.029297,0.684211,0.853659,0.765152,0.755906,0.296875,0.291667,0.281977
...,...,...,...,...,...,...,...,...,...,...,...
15646,0.005049,0.910256,0.004676,0.004883,0.888158,0.939024,0.719697,0.535433,0.553125,0.490385,0.584302
15647,0.002456,0.884615,0.002417,0.002422,0.644737,0.865854,0.651515,0.622047,0.556250,0.522436,0.584302
15648,0.003821,0.935897,0.003615,0.003750,0.559211,0.890244,0.636364,0.645669,0.571875,0.554487,0.561047
15649,0.001287,0.897436,0.001277,0.001289,0.631579,0.853659,0.712121,0.724409,0.468750,0.464744,0.462209


In [11]:
best_xgb_model_info = {'best_params': best_xgb_params, 'best_score': best_xgb_score}

best_xgb_file = get_absolute_path(
    file_name = 'best_xgb_model.joblib'
    , rel_path = 'results'
)

save_model(best_xgb_file, best_xgb_model, best_xgb_model_info)


In [12]:
best_xgb_model

In [13]:
# Get feature importance scores
xgb_feature_importance = best_xgb_model.feature_importances_

# Create a list of feature names
feature_names = df_train.columns

# Create a dictionary mapping feature names to their importance scores
xgb_feature_importance_dict = dict(zip(feature_names, xgb_feature_importance))

# Sort feature importance scores in descending order
xgb_sorted_feature_importance = sorted(xgb_feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

# Print feature importance scores
print("Feature Importance:")
for feature, importance in xgb_sorted_feature_importance:
    print(f"{feature}: {importance}")


Feature Importance:
Dissolved oxygen, water, unfiltered, milligrams per liter (Maximum): 0.8866821527481079
Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Mean): 0.02711363695561886
Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Maximum): 0.014233870431780815
pH, water, unfiltered, field, standard units (Minimum): 0.011226195842027664
Temperature, water, degrees Celsius (Maximum): 0.010878270491957664
Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Minimum): 0.01075922604650259
Dissolved oxygen, water, unfiltered, milligrams per liter (Minimum): 0.010540307499468327
pH, water, unfiltered, field, standard units (Maximum): 0.008121411316096783
Dissolved oxygen, water, unfiltered, milligrams per liter (Mean): 0.006991205736994743
Temperature, water, degrees Celsius (Mean): 0.006967008113861084
Temperature, water, degrees Celsius (Minimum): 0.006486687343567

In [14]:
best_xgb_model_info = {'best_params': best_xgb_params, 'best_score': best_xgb_score}

# Create a dictionary containing the feature importance results
feature_importance_dict = dict(xgb_sorted_feature_importance)

# Add the feature importance dictionary to best_xgb_model_info
best_xgb_model_info['feature_importance'] = feature_importance_dict



best_xgb_file = get_absolute_path(
    file_name = 'best_xgb_model.joblib'
    , rel_path = 'results'
)

# Save the updated best_xgb_model_info using save_model function
save_model(best_xgb_file, best_xgb_model, best_xgb_model_info)

In [15]:
# # Load the model and its info
# best_xgb_file = get_absolute_path(
#     file_name = 'best_xgb_model.joblib'
#     , rel_path = 'results'
# )
# best_xgb_model, best_xgb_model_info = load_model(best_xgb_file)

In [18]:
best_xgb_model

In [16]:
best_xgb_model_info

{'best_params': {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100},
 'best_score': 0.011145224390469813,
 'feature_importance': {'Dissolved oxygen, water, unfiltered, milligrams per liter (Maximum)': 0.88668215,
  'Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Mean)': 0.027113637,
  'Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Maximum)': 0.01423387,
  'pH, water, unfiltered, field, standard units (Minimum)': 0.011226196,
  'Temperature, water, degrees Celsius (Maximum)': 0.0108782705,
  'Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Minimum)': 0.010759226,
  'Dissolved oxygen, water, unfiltered, milligrams per liter (Minimum)': 0.0105403075,
  'pH, water, unfiltered, field, standard units (Maximum)': 0.008121411,
  'Dissolved oxygen, water, unfiltered, milligrams per liter (Mean)': 0.0069912057,
  'Temperature, water, degrees Celsius (Mean

In [None]:
# import shap

# # Wrap the XGBoost model in a function
# def xgb_predictor(data):
#     return best_xgb_model.predict(data)

# # Initialize a SHAP explainer with the predictor function
# xgb_explainer = shap.Explainer(xgb_predictor, data=df_test)

# # Calculate SHAP values for a set of data (e.g., df_test)
# xgb_shap_values = xgb_explainer.shap_values(df_test)

# # Create a summary plot of feature importances using SHAP
# shap.summary_plot(xgb_shap_values, df_test, plot_type="bar")


In [35]:
# best_xgb_model

# _xgb_model = best_xgb_model.fit(df_train, y_train)

# # explain the model's predictions using SHAP
# # (same syntax works for LightGBM, CatBoost, scikit-learn, transformers, Spark, etc.)
# _xgb_explainer = shap.Explainer(_xgb_model)
# _xgb_shap_values = _xgb_explainer(df_train)

# # visualize the first prediction's explanation
# shap.plots.waterfall(_xgb_shap_values[0])

# Random Forest

In [24]:
# Define RandomForest parameters grid for tuning
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a RandomForest model
model_rf = RandomForestRegressor()

# Define a custom scoring function (negative RMSE since GridSearchCV minimizes the score)
scoring = make_scorer(lambda y_true, y_pred: -mean_squared_error(y_true, y_pred, squared=False))

# Perform hyperparameter tuning using GridSearchCV
grid_search = GridSearchCV(model_rf, param_grid, cv=5, scoring=scoring, n_jobs=6)
# grid_search.fit(df_train, y_train)
grid_search.fit(df_train, y_train.measurement)

# Get the best hyperparameters and best model
best_rf_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_
best_rf_score = -grid_search.best_score_

print("Best Hyperparameters:", best_rf_params)

# Making predictions on the validation data using the best model
y_pred_rf = best_rf_model.predict(df_test)

# Calculating RMSE on the validation data
rmse_rf = mean_squared_error(y_test, y_pred_rf, squared=False)
print("RandomForest RMSE on Validation Data with Best Model:", rmse_rf)


Best Hyperparameters: {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 100}
RandomForest RMSE on Validation Data with Best Model: 0.01148450307834224


In [25]:
# # Save the best model and results
# best_rf_model_info = {'best_params': best_rf_params, 'best_score': best_rf_score, 'rmse': rmse_rf}
# best_rf_file = get_absolute_path(
#     file_name='best_rf_model.joblib',
#     rel_path='results'
# )
# save_model(best_rf_file, best_rf_model, best_rf_model_info)

In [28]:
# Get feature importance scores for the best Random Forest model
rf_feature_importance = best_rf_model.feature_importances_

# Create a dictionary to map feature names to their importance scores
feature_importance_dict = {feature_name: importance_score for feature_name, importance_score in zip(df_train.columns, rf_feature_importance)}

# Sort feature importance scores in descending order
rf_sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

# Print feature importance scores for Random Forest
print("Random Forest Feature Importance:")
for feature, importance in rf_sorted_feature_importance:
    print(f"{feature}: {importance}")


Random Forest Feature Importance:
Dissolved oxygen, water, unfiltered, milligrams per liter (Maximum): 0.8114432607956025
Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Mean): 0.05523080506466742
Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Maximum): 0.03645052629807038
pH, water, unfiltered, field, standard units (Minimum): 0.024152677368460457
Temperature, water, degrees Celsius (Maximum): 0.01893949261600433
Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Minimum): 0.011631621147613309
pH, water, unfiltered, field, standard units (Maximum): 0.0103268770299261
Dissolved oxygen, water, unfiltered, milligrams per liter (Minimum): 0.009883523970297617
Temperature, water, degrees Celsius (Mean): 0.008015312669312775
Temperature, water, degrees Celsius (Minimum): 0.00788698311428718
Dissolved oxygen, water, unfiltered, milligrams per liter (Mean): 0.00603

In [29]:
# Save the best model and results
best_rf_model_info = {'best_params': best_rf_params, 'best_score': best_rf_score, 'rmse': rmse_rf}

# Add the feature importance dictionary to best_rf_model_info
best_rf_model_info['feature_importance'] = feature_importance_dict


best_rf_file = get_absolute_path(
    file_name='best_rf_model.joblib',
    rel_path='results'
)
save_model(best_rf_file, best_rf_model, best_rf_model_info)

In [30]:
best_rf_model_info

{'best_params': {'max_depth': 20,
  'min_samples_leaf': 4,
  'min_samples_split': 5,
  'n_estimators': 100},
 'best_score': 0.011567736990528689,
 'rmse': 0.01148450307834224,
 'feature_importance': {'Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Maximum)': 0.03645052629807038,
  'pH, water, unfiltered, field, standard units (Maximum)': 0.0103268770299261,
  'pH, water, unfiltered, field, standard units (Minimum)': 0.024152677368460457,
  'Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Minimum)': 0.011631621147613309,
  'Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Mean)': 0.05523080506466742,
  'Dissolved oxygen, water, unfiltered, milligrams per liter (Maximum)': 0.8114432607956025,
  'Dissolved oxygen, water, unfiltered, milligrams per liter (Mean)': 0.006038919925758083,
  'Dissolved oxygen, water, unfiltered, milligrams per liter (Minimum)': 0.0

In [23]:
best_rf_model

In [36]:
# import shap

# # Initialize a SHAP explainer
# rf_explainer = shap.Explainer(best_rf_model)

# # Calculate SHAP values for a set of data (e.g., df_test)
# rf_shap_values = rf_explainer.shap_values(df_test)

# # Create a summary plot of feature importances using SHAP
# shap.summary_plot(rf_shap_values, df_test, plot_type="bar")
