In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import xgboost as xgb

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)




In [2]:
root_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(root_dir)

from utils.utils import *
from utils.constants import *

# Data

To make valid comparison across different methods, we split the original `stack_train` into new train and validation data sets.

In [3]:
# Import data


y_train = pd.read_csv(get_absolute_path('y_train.csv', 'data'))
y_test = pd.read_csv(get_absolute_path('y_test.csv', 'data'))


stack_train = pd.read_csv(get_absolute_path('stacked_X_tr.csv', 'data'))
stack_test  = pd.read_csv(get_absolute_path('stacked_X_te.csv', 'data'))



In [4]:
stack_train.columns

Index(['Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Maximum)',
       'pH, water, unfiltered, field, standard units (Maximum)',
       'pH, water, unfiltered, field, standard units (Minimum)',
       'Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Minimum)',
       'Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Mean)',
       'Dissolved oxygen, water, unfiltered, milligrams per liter (Maximum)',
       'Dissolved oxygen, water, unfiltered, milligrams per liter (Mean)',
       'Dissolved oxygen, water, unfiltered, milligrams per liter (Minimum)',
       'Temperature, water, degrees Celsius (Mean)',
       'Temperature, water, degrees Celsius (Minimum)',
       'Temperature, water, degrees Celsius (Maximum)', 'Date', 'Location_ID',
       'Month', 'Week', 'Weekday', 'Season', 'Season_Num'],
      dtype='object')

In [5]:
stack_train = stack_train.astype(column_data_extended_types)
stack_test = stack_test.astype(column_data_extended_types)

# Feature Engineering

In [6]:


# Select numeric and categorical columns
numeric_columns = stack_train.select_dtypes(include=['float64']).columns
categorical_columns = [#'Date', 
                       'Location_ID',
                    #    'Year',
                       'Month',
                       'Week',
                       'Weekday']  # Add any categorical columns here

# Create preprocessing transformers
numeric_transformer = StandardScaler()  # You can use other scalers as well
categorical_transformer = OneHotEncoder(drop=None)  # Use one-hot encoding for categorical columns

# Create a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns)
    ]
)



# Fit the preprocessor on training data and transform both train and test data
X_train_preprocessed = preprocessor.fit_transform(stack_train)
X_test_preprocessed  = preprocessor.transform(stack_test)


# Get the column names after one-hot encoding
categorical_encoded_columns = preprocessor.named_transformers_['cat']\
                                    .get_feature_names_out(input_features=categorical_columns)

# # Get the column names after one-hot encoding
# categorical_encoded_columns = preprocessor.named_transformers_['cat']\
#                                     .named_steps['onehotencoder']\
#                                     .get_feature_names_out(input_features=categorical_columns)

# Convert X_train_preprocessed and X_test_preprocessed to DataFrames

X_train_preprocessed_df = pd.DataFrame(X_train_preprocessed.toarray(), columns=np.concatenate([numeric_columns, categorical_encoded_columns]))
X_test_preprocessed_df = pd.DataFrame(X_test_preprocessed.toarray(), columns=np.concatenate([numeric_columns, categorical_encoded_columns]))


# Converting the preprocessed data to DMatrix format for XGBoost
dtrain = xgb.DMatrix(X_train_preprocessed, label=y_train, feature_names=np.concatenate([numeric_columns, categorical_encoded_columns]))
dval = xgb.DMatrix(X_test_preprocessed, label=y_test, feature_names=np.concatenate([numeric_columns, categorical_encoded_columns]))


## NOTE: 

Pretend we have the best model already.

This is a TODO!

# XGboost

In [7]:
# Defining XGBoost parameters
params = {
    'objective': 'reg:squarederror',
    'max_depth': 7,
    'learning_rate': 0.1,
    'n_estimators': 100
}

# Training the XGBoost model
model_xgb = xgb.train(params, dtrain, num_boost_round=100, evals=[(dval, 'validation')], early_stopping_rounds=10)

# Making predictions
y_pred_xgb = model_xgb.predict(dval)

# Calculating RMSE
rmse_xgb = mean_squared_error(y_test, y_pred_xgb, squared=False)
print("XGBoost RMSE:", rmse_xgb)

Parameters: { "n_estimators" } are not used.

[0]	validation-rmse:0.14855
[1]	validation-rmse:0.13389
[2]	validation-rmse:0.12068
[3]	validation-rmse:0.10880
[4]	validation-rmse:0.09815
[5]	validation-rmse:0.08857
[6]	validation-rmse:0.07998
[7]	validation-rmse:0.07223
[8]	validation-rmse:0.06530
[9]	validation-rmse:0.05911
[10]	validation-rmse:0.05353
[11]	validation-rmse:0.04851
[12]	validation-rmse:0.04406
[13]	validation-rmse:0.04008
[14]	validation-rmse:0.03653
[15]	validation-rmse:0.03336
[16]	validation-rmse:0.03054
[17]	validation-rmse:0.02804
[18]	validation-rmse:0.02580
[19]	validation-rmse:0.02384
[20]	validation-rmse:0.02211
[21]	validation-rmse:0.02061
[22]	validation-rmse:0.01932
[23]	validation-rmse:0.01819
[24]	validation-rmse:0.01719
[25]	validation-rmse:0.01633
[26]	validation-rmse:0.01561
[27]	validation-rmse:0.01499
[28]	validation-rmse:0.01444
[29]	validation-rmse:0.01399
[30]	validation-rmse:0.01359
[31]	validation-rmse:0.01325
[32]	validation-rmse:0.01297
[33]	va

In [8]:
dtrain.feature_names

['Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Maximum)',
 'pH, water, unfiltered, field, standard units (Maximum)',
 'pH, water, unfiltered, field, standard units (Minimum)',
 'Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Minimum)',
 'Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Mean)',
 'Dissolved oxygen, water, unfiltered, milligrams per liter (Maximum)',
 'Dissolved oxygen, water, unfiltered, milligrams per liter (Mean)',
 'Dissolved oxygen, water, unfiltered, milligrams per liter (Minimum)',
 'Temperature, water, degrees Celsius (Mean)',
 'Temperature, water, degrees Celsius (Minimum)',
 'Temperature, water, degrees Celsius (Maximum)',
 'Location_ID_2198840.0',
 'Location_ID_2198920.0',
 'Location_ID_2198950.0',
 'Location_ID_21989715.0',
 'Location_ID_21989773.0',
 'Location_ID_21989792.0',
 'Location_ID_21989793.0',
 'Location_ID_219897945.

In [9]:
# Get feature importance scores
xgb_feature_importance = model_xgb.get_score(importance_type='weight')

# Sort feature importance scores in descending order
xgb_sorted_feature_importance = sorted(xgb_feature_importance.items(), key=lambda x: x[1], reverse=True)

# Print feature importance scores
print("Feature Importance:")
for feature, importance in xgb_sorted_feature_importance:
    print(f"{feature}: {importance}")

Feature Importance:
Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Maximum): 1200.0
Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Mean): 803.0
pH, water, unfiltered, field, standard units (Minimum): 652.0
Dissolved oxygen, water, unfiltered, milligrams per liter (Maximum): 515.0
Temperature, water, degrees Celsius (Maximum): 418.0
pH, water, unfiltered, field, standard units (Maximum): 409.0
Temperature, water, degrees Celsius (Mean): 383.0
Dissolved oxygen, water, unfiltered, milligrams per liter (Minimum): 375.0
Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Minimum): 351.0
Temperature, water, degrees Celsius (Minimum): 244.0
Dissolved oxygen, water, unfiltered, milligrams per liter (Mean): 214.0
Location_ID_2344630.0: 116.0
Month_3: 69.0
Location_ID_21989792.0: 61.0
Weekday_3: 57.0
Location_ID_2336120.0: 55.0
Weekday_2: 39.0
Weekday_5: 39.0
Location

# Random Forest

In [10]:
# Creating and training the RandomForest model
model_rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
model_rf.fit(X_train_preprocessed_df, y_train)

# Making predictions
y_pred_rf = model_rf.predict(X_test_preprocessed_df)

# Calculating RMSE
rmse_rf = mean_squared_error(y_test, y_pred_rf, squared=False)
print("RandomForest RMSE:", rmse_rf)

# Get feature importances
rf_feature_importance = model_rf.feature_importances_

# Create a dictionary mapping feature names to their importances
rf_feature_importance_dict = dict(zip(X_train_preprocessed_df.columns, rf_feature_importance))

# Sort feature importances in descending order
rf_sorted_feature_importance = sorted(rf_feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

# Print feature importances
print("Feature Importances:")
for feature, importance in rf_sorted_feature_importance:
    print(f"{feature}: {importance}")

  model_rf.fit(X_train_preprocessed_df, y_train)


RandomForest RMSE: 0.012304093390213964
Feature Importances:
Dissolved oxygen, water, unfiltered, milligrams per liter (Maximum): 0.8095464679098008
Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Mean): 0.03131644954359078
Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Maximum): 0.029235365047592367
Location_ID_21989792.0: 0.026377008068810028
pH, water, unfiltered, field, standard units (Minimum): 0.01942097864060695
Temperature, water, degrees Celsius (Maximum): 0.009725502270943352
Dissolved oxygen, water, unfiltered, milligrams per liter (Minimum): 0.007736037621157964
Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Minimum): 0.006903398126047333
pH, water, unfiltered, field, standard units (Maximum): 0.006703333370896553
Temperature, water, degrees Celsius (Mean): 0.005258924220281276
Location_ID_2336120.0: 0.005171545694019389
Location_ID_2344630.0

### Note:

The numbers you see in the feature importance scores from XGBoost and RandomForest are indeed at different scales, and this is because they are calculated differently by these two algorithms.

1. XGBoost's Feature Importance (Weight):
    
    In XGBoost, the "weight" importance type represents the number of times a feature appears in the trees of the boosted ensemble. Features that are frequently used in the tree splits contribute more to the prediction process, and hence, they are assigned higher importance scores.

2. RandomForest's Feature Importance:
    
    In RandomForest, the feature importance is usually calculated using the Gini impurity or Mean Decrease in Impurity (MDI). The importance of a feature is computed as the average (or total) reduction in impurity across all the decision trees in the forest, that is achieved by splitting the data based on that feature.

Due to these different calculation methods, the importance scores are inherently on different scales. In RandomForest's case, the importance scores are relative to the Gini impurity reduction, while in XGBoost's case, they are based on the frequency of feature use in the ensemble of decision trees.

It's important to note that the exact values of importance scores are not directly comparable between different algorithms like XGBoost and RandomForest. The key takeaway is to interpret the importance scores within each algorithm's context to identify which features are most influential within the specific model's prediction mechanism.


We can unify the feature importance scores from different algorithms for better comparison, but keep in mind that their underlying meanings are different due to the distinct calculation methods used by each algorithm. Unifying the scores could help provide a relative sense of feature importance across different models, but the absolute values might not be directly comparable.

One approach to unify the scores is to normalize them across each algorithm's scores so that they are on the same scale (e.g., between 0 and 1). Here's a general idea of how you might do it:

1. Normalize Feature Importance Scores within Algorithm:
    
    Normalize the importance scores within each algorithm so that they are scaled between 0 and 1. You can use the Min-Max scaling formula.

2. Compare Normalized Scores:

    After normalizing the scores for each algorithm, you can compare the relative importance of features across algorithms. Keep in mind that the absolute values are no longer meaningful in this context, but the relative order of importance can still provide insights.

In [11]:
from sklearn.preprocessing import MinMaxScaler

# Get feature importance scores from XGBoost
feature_importance_xgb = model_xgb.get_score(importance_type='weight')

# Normalize XGBoost feature importance scores
scaler = MinMaxScaler()
normalized_importance_xgb = scaler.fit_transform(np.array(list(feature_importance_xgb.values())).reshape(-1, 1))

# Create a dictionary mapping features to normalized importance scores
normalized_feature_importance_xgb = dict(zip(feature_importance_xgb.keys(), normalized_importance_xgb.flatten()))

# Get feature importances from RandomForest
feature_importance_rf = model_rf.feature_importances_

# Normalize RandomForest feature importance scores
normalized_importance_rf = scaler.fit_transform(feature_importance_rf.reshape(-1, 1))

# Create a dictionary mapping features to normalized importance scores
normalized_feature_importance_rf = dict(zip(X_train_preprocessed_df.columns, normalized_importance_rf.flatten()))

# Compare normalized importance scores across algorithms
print("Normalized Feature Importance:")
for feature in X_train_preprocessed_df.columns:
    importance_xgb = normalized_feature_importance_xgb.get(feature, 0)
    importance_rf = normalized_feature_importance_rf.get(feature, 0)
    print(f"{feature}: XGBoost - {importance_xgb:.4f}, RandomForest - {importance_rf:.4f}")


Normalized Feature Importance:
Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Maximum): XGBoost - 1.0000, RandomForest - 0.0361
pH, water, unfiltered, field, standard units (Maximum): XGBoost - 0.3403, RandomForest - 0.0083
pH, water, unfiltered, field, standard units (Minimum): XGBoost - 0.5430, RandomForest - 0.0240
Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Minimum): XGBoost - 0.2919, RandomForest - 0.0085
Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Mean): XGBoost - 0.6689, RandomForest - 0.0387
Dissolved oxygen, water, unfiltered, milligrams per liter (Maximum): XGBoost - 0.4287, RandomForest - 1.0000
Dissolved oxygen, water, unfiltered, milligrams per liter (Mean): XGBoost - 0.1776, RandomForest - 0.0061
Dissolved oxygen, water, unfiltered, milligrams per liter (Minimum): XGBoost - 0.3119, RandomForest - 0.0096
Temperature, water, degrees Ce

In [12]:
normalized_feature_importance_rf

{'Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Maximum)': 0.036107566070087284,
 'pH, water, unfiltered, field, standard units (Maximum)': 0.008274494395761275,
 'pH, water, unfiltered, field, standard units (Minimum)': 0.02398417994030114,
 'Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Minimum)': 0.008521627754595373,
 'Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Mean)': 0.03867826074904784,
 'Dissolved oxygen, water, unfiltered, milligrams per liter (Maximum)': 1.0,
 'Dissolved oxygen, water, unfiltered, milligrams per liter (Mean)': 0.006122910034533292,
 'Dissolved oxygen, water, unfiltered, milligrams per liter (Minimum)': 0.009550159716507087,
 'Temperature, water, degrees Celsius (Mean)': 0.0064902636618009895,
 'Temperature, water, degrees Celsius (Minimum)': 0.005525781941610396,
 'Temperature, water, degrees Celsius (Maximum)': 0.012007

### Shap: Feature Importance

In [15]:
import shap
import numpy as np


# from numba import jit

# @jit(nopython=True)  # Explicitly set nopython=True
# def my_function():
#     # Function code here



# the below code is only for shap package
np.bool = np.bool_
np.int = np.int_

In [None]:
# Shap Values

explainer_lin = shap.LinearExplainer(reg_lin, X_train)

shap_values_lin = explainer_lin.shap_values(X_train)

shap.summary_plot(shap_values_lin, X_train, plot_type="bar")

shap.summary_plot(shap_values_lin, X_train)

In [None]:
reg_lin_lasso_best = ElasticNet(**clf_cv_lasso.best_params_)

reg_lin_lasso_best.fit(X_train,y_train)

explainer_lin = shap.LinearExplainer(reg_lin_lasso_best, X_train)

shap_values_lin = explainer_lin.shap_values(X_train)

shap.summary_plot(shap_values_lin, X_train, plot_type="bar")

shap.summary_plot(shap_values_lin, X_train)

In [None]:
# Get the best hyperparameters and best model
best_params_lgb = clf_cv_lgb.best_params_
best_model_lgb = clf_cv_lgb.best_estimator_


print("Best Hyperparameters:", best_params_lgb)

# Making predictions on the validation data using the best model
y_pred_lgb = best_model_lgb.predict(X_test)


preds['lightGBM'] = y_pred_NN

# Calculating RMSE on the validation data
rmse_lgb = mean_squared_error(y_test, y_pred_lgb, squared=False)
print("LightGBM RMSE on Validation Data with Best Model:", rmse_lgb)


reg_lgb = lgb.LGBMRegressor(objective='regression', metric='rmse', **best_params_lgb)



LGBM_model=reg_lgb.fit(X_train, y_train)


explainer_lightGBM = shap.TreeExplainer(LGBM_model)
shap_values_lightGBM = explainer_lightGBM.shap_values(X_train)
shap.summary_plot(shap_values_lightGBM, X_train, plot_type="bar")

shap.summary_plot(shap_values_lightGBM, X_train)s