In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import xgboost as xgb

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error,mean_absolute_error


from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


import warnings
# Silence DeprecationWarning noise emitted while importing the libraries below.
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Make the parent of the notebook's working directory importable so the
# `utils` and `src` packages can be resolved. The membership check keeps the
# cell idempotent: re-running it no longer appends a duplicate entry each time.
root_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
if root_dir not in sys.path:
    sys.path.append(root_dir)

from utils.utils import *
from utils.constants import *
from src.shap import *





  @jit
  @jit
  @jit
  @jit
  @jit
  @numba.jit
  @numba.jit
  @numba.jit
  @numba.jit
  @jit # we can't use this when using a custom link function...
  @jit
  @jit
  @jit
  @jit
  @jit
  @jit
  from .autonotebook import tqdm as notebook_tqdm
  @jit
[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.[0m
[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.[0m


In [2]:
######## Import data ########

# Targets for the train and test splits.
y_train = pd.read_csv(get_absolute_path('y_train.csv', 'data'))
y_test = pd.read_csv(get_absolute_path('y_test.csv', 'data'))

# Stacked feature tables for the train and test splits.
stack_train = pd.read_csv(get_absolute_path('stacked_X_tr.csv', 'data'))
stack_test  = pd.read_csv(get_absolute_path('stacked_X_te.csv', 'data'))

# Apply the project-wide dtype mapping so that the dtype-based column
# selection below sees the same types in both splits.
stack_train = stack_train.astype(column_data_extended_types)
stack_test = stack_test.astype(column_data_extended_types)


######## Feature Engineering ##########

# Numeric features are every float64 column; categorical features are listed
# explicitly ('Date' and 'Year' are currently excluded).
numeric_columns = stack_train.select_dtypes(include=['float64']).columns
categorical_columns = [
    'Location_ID',
    'Month',
    'Week',
    'Weekday',
    'Season',
]  # Add any categorical columns here

# Create preprocessing transformers
numeric_transformer = StandardScaler()  # we can use other scalers as well
categorical_transformer = OneHotEncoder(drop=None)  # Use one-hot encoding for categorical columns

# Create a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns)
    ]
)

# Fit the preprocessor on training data only, then apply it to both splits
X_train_preprocessed = preprocessor.fit_transform(stack_train)
X_test_preprocessed  = preprocessor.transform(stack_test)


# Column names after one-hot encoding, in the transformer's output order
# (numeric block first, then the encoded categorical block).
categorical_encoded_columns = preprocessor.named_transformers_['cat']\
                                    .get_feature_names_out(input_features=categorical_columns)
preprocessed_columns = np.concatenate([numeric_columns, categorical_encoded_columns])


def _to_dense_frame(transformed, columns):
    """Wrap ColumnTransformer output in a DataFrame.

    ColumnTransformer returns a scipy sparse matrix only when the stacked
    output's density is below its ``sparse_threshold``; otherwise it already
    returns a dense ndarray, on which ``.toarray()`` does not exist. Densify
    only when needed so both cases work.
    """
    dense = transformed.toarray() if hasattr(transformed, 'toarray') else transformed
    return pd.DataFrame(dense, columns=columns)


# Convert X_train_preprocessed and X_test_preprocessed to DataFrames
X_train_preprocessed_df = _to_dense_frame(X_train_preprocessed, preprocessed_columns)
X_test_preprocessed_df = _to_dense_frame(X_test_preprocessed, preprocessed_columns)

In [3]:
########### Generate SHAP results on test set

# Models to process. Only 'mlp' is enabled; 'xgb', 'rf' and 'lin' are
# currently switched off.
model_names = ['mlp']

# Relative path passed to the path helper
rel_path = 'results'

# Resolve and report the file paths for every enabled model. `file_paths`
# from the last iteration stays in scope and is read by the following cells.
for model_name in model_names:
    file_paths = generate_file_paths_for_shap_2(model_name, rel_path)
    print(f"{model_name.capitalize()} Model File Paths:", file_paths)

    # Disabled one-shot call; the cells below set up the same arguments by hand:
    # gen_shap_results(
    #     load_file_path=file_paths[0],
    #     save_file_path_1=file_paths[1],
    #     save_file_path_2=file_paths[2],
    #     refit_X=X_test_preprocessed_df,
    #     refit_y=y_test,
    #     figure_dpi=300,
    # )




Mlp Model File Paths: ['/Users/yinpuli/Documents/python-projects/water-quality-prediction/results/best_mlp_model.joblib', '/Users/yinpuli/Documents/python-projects/water-quality-prediction/results/shap_on_test/best_mlp_shap_bar.png', '/Users/yinpuli/Documents/python-projects/water-quality-prediction/results/shap_on_test/best_mlp_shap_val.png']


In [4]:
# Arguments for the SHAP workflow, unpacked from the path list produced above:
# element 0 is the model file to load, elements 1 and 2 are the output figures.
load_file_path, save_file_path_1, save_file_path_2 = (
    file_paths[0],
    file_paths[1],
    file_paths[2],
)

# Data to explain (preprocessed test features and test targets) and the
# resolution used when saving figures.
refit_X, refit_y = X_test_preprocessed_df, y_test
figure_dpi = 300

In [5]:
# Load the saved model from `load_file_path` via the project helper; the call
# returns two values, the model and a second object kept as `best_model_info`.
# NOTE(review): the path ends in .joblib, so this presumably unpickles the
# file — only load artifacts from a trusted source.
best_model, best_model_info = load_model(load_file_path)

In [6]:
# Preview the first 100 rows that the SHAP cell below explains.
refit_X.head(100)

Unnamed: 0,"Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Maximum)","pH, water, unfiltered, field, standard units (Maximum)","pH, water, unfiltered, field, standard units (Minimum)","Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Minimum)","Specific conductance, water, unfiltered, microsiemens per centimeter at 25 degrees Celsius (Mean)","Dissolved oxygen, water, unfiltered, milligrams per liter (Maximum)","Dissolved oxygen, water, unfiltered, milligrams per liter (Mean)","Dissolved oxygen, water, unfiltered, milligrams per liter (Minimum)","Temperature, water, degrees Celsius (Mean)","Temperature, water, degrees Celsius (Minimum)",...,Weekday_1,Weekday_2,Weekday_3,Weekday_4,Weekday_5,Weekday_6,Season_Fall,Season_Spring,Season_Summer,Season_Winter
0,-0.389811,0.599225,-0.223405,-0.308976,0.092114,0.233673,0.317689,0.483692,0.072549,0.174000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.105840,0.967615,-0.218852,0.505253,0.037530,1.411519,-0.198485,-0.432755,-0.019343,0.023018,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.888414,-0.137555,-0.173973,0.392389,-0.017054,1.018904,-0.146868,-0.249466,-0.004028,0.038116,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2.023862,1.336005,0.265881,1.390570,-0.399141,2.196750,-0.508190,-0.341110,-0.095920,-0.037374,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,2.676141,2.072784,1.776506,2.328655,-0.672061,2.196750,-0.508190,-0.341110,-0.126551,-0.037374,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.395246,0.599225,-0.228284,-0.313959,-0.180806,0.233673,0.111219,0.300403,0.210388,0.158902,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
96,-0.396213,-1.979504,-0.229584,-0.314985,-0.235390,-2.122020,-0.250103,-0.157821,-0.080605,-0.097767,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
97,-0.384617,-0.874335,-0.218202,-0.303259,-0.344557,-0.944174,-0.301720,-0.341110,0.133811,0.098509,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
98,-0.392589,-0.137555,-0.225194,-0.310881,0.365033,-0.158943,-0.095250,-0.249466,0.087865,0.007920,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [8]:
# KernelExplainer is model-agnostic but expensive, so only the first rows of
# the data are explained and the background set is compressed with k-means.
N_EXPLAIN_ROWS = 100          # rows of refit_X to explain
N_BACKGROUND_CLUSTERS = 10    # k-means centroids used as background data
N_KERNEL_SAMPLES = 100        # model evaluations (coalitions) per explained row

refit_X_sample = refit_X.iloc[:N_EXPLAIN_ROWS, :]

# Background data: a weighted k-means summary of the sample. It is only the
# reference distribution used to "switch off" features; it is not what gets
# explained, which is why the real rows are passed to shap_values() and
# summary_plot() below instead of refit_X_summary.
refit_X_summary = shap.kmeans(refit_X_sample, N_BACKGROUND_CLUSTERS)

# Explain the loaded model through its predict method.
predict_fn = best_model.predict
explainer = shap.KernelExplainer(predict_fn, refit_X_summary)

# The keyword is `nsamples` (plural). The previous `nsample=` spelling is not
# a KernelExplainer argument, so the requested sample budget never took effect.
shap_values = explainer.shap_values(refit_X_sample, nsamples=N_KERNEL_SAMPLES)

# show=False keeps the figure open: with the default show=True the plot is
# displayed (and, with the inline backend, closed) inside summary_plot, so the
# resize and savefig below would act on a new, empty figure.
shap.summary_plot(shap_values, refit_X_sample, plot_type='bar', show=False)
plt.gcf().set_size_inches(15, 10)

# Create the directory if it doesn't exist
os.makedirs(os.path.dirname(save_file_path_1), exist_ok=True)

plt.savefig(save_file_path_1, dpi=figure_dpi)
plt.show()