# Imports

In [1]:
pip install joblib


Note: you may need to restart the kernel to use updated packages.


In [24]:
## Our standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as miss

## Preprocessing tools
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

## Models & evaluation metrics
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import joblib
import shap

## setting random state for reproducibility
SEED = 321
np.random.seed(SEED)
## Matplotlib style
fav_style = ('ggplot','tableau-colorblind10')
fav_context  ={'context':'notebook', 'font_scale':1.1}
plt.style.use(fav_style)
sns.set_context(**fav_context)
plt.rcParams['savefig.transparent'] = False
plt.rcParams['savefig.bbox'] = 'tight'

# Modeling codes

In [27]:
def evaluate_regression(model, X_train,y_train, X_test, y_test,for_slides=True): 
    """Evaluates a scikit learn regression model using r-squared and RMSE
    FOR SLIDES VERS DOES MULTIPLE PRINT STATEMENTS FOR VERTICAL DISPLAY OF INFO"""
    
    ## Training Data
    y_pred_train = model.predict(X_train)
    r2_train = metrics.r2_score(y_train, y_pred_train)
    rmse_train = metrics.mean_squared_error(y_train, y_pred_train, 
                                            squared=False)
    mae_train = metrics.mean_absolute_error(y_train, y_pred_train)
    

    ## Test Data
    y_pred_test = model.predict(X_test)
    r2_test = metrics.r2_score(y_test, y_pred_test)
    rmse_test = metrics.mean_squared_error(y_test, y_pred_test, 
                                            squared=False)
    mae_test = metrics.mean_absolute_error(y_test, y_pred_test)
    
    if for_slides:
        df_version =[['Split','R^2','MAE','RMSE']]
        df_version.append(['Train',r2_train, mae_train, rmse_train])
        df_version.append(['Test',r2_test, mae_test, rmse_test])
        df_results = pd.DataFrame(df_version[1:], columns=df_version[0])
        df_results = df_results.round(2)
        display(df_results.style.hide(axis='index').format(precision=2, thousands=','))
        
    else: 
        print(f"Training Data:\tR^2 = {r2_train:,.2f}\tRMSE = {rmse_train:,.2f}\tMAE = {mae_train:,.2f}")
        print(f"Test Data:\tR^2 = {r2_test:,.2f}\tRMSE = {rmse_test:,.2f}\tMAE = {mae_test:,.2f}")

def get_coefficients(lin_reg):
    coeffs = pd.Series(lin_reg.coef_, index= lin_reg.feature_names_in_)
    coeffs.loc['intercept'] = lin_reg.intercept_
    return coeffs

def plot_coefficients(coeffs, sort_values=True, top_n=None, figsize=(6,4),
                     title="Linear Regression Coefficients", xlabel='Coefficient'):
    """Plots a Series of coefficients as horizotal bar chart, with option to sort
    and to only keep top_n coefficients"""
        
    if top_n is not None:
        top_n = coeffs.abs().rank().sort_values(ascending=False).head(top_n)
        coeffs = coeffs.loc[top_n.index]
        
    if sort_values:
        coeffs = coeffs.sort_values()

        
        
    ax = coeffs.plot(kind='barh', figsize=figsize)
    ax.axvline(0, color='k')
    ax.set(xlabel=xlabel, title=title);
    plt.show()
    return ax


def get_importances(rf_reg):
    importances = pd.Series(rf_reg.feature_importances_, index= rf_reg.feature_names_in_)
    return importances


def plot_importances(importances, sort_values=True, top_n=None, figsize=(6,4),
                     title="Feature Importance", xlabel='Importance'):
    if sort_values:
        importances = importances.sort_values()
        
    if top_n is not None:
        importances = importances.tail(top_n)
        
        
    ax = importances.plot(kind='barh', figsize=figsize)
    ax.axvline(0, color='k')
    ax.set(xlabel=xlabel, title=title);
    plt.show()
    return ax

In [28]:
## Saving the loaded objects as separate varaibles
X_train = loaded['X_train']
X_test = loaded['X_test']
y_train = loaded['y_train']
y_test = loaded['y_test']

preprocessor = loaded['preprocessor']
lin_reg = loaded['LinearRegression']
rf_reg = loaded['RandomForestRegressor']

In [29]:
## Use our evaluate_regression function to evalaute the linear regression
evaluate_regression(lin_reg,X_train,y_train, X_test, y_test)

Split,R^2,MAE,RMSE
Train,0.49,808.14,1086.63
Test,0.53,768.94,1064.4


# Global Explanations

In [2]:
# Define the file path
file_path = "G:\\Github Desktop Reps\\Coding Dojo\\Prediction-of-Product-Sales\\best-models.joblib"

# Load the joblib file
loaded = joblib.load(file_path)

In [16]:
loaded.keys()

dict_keys(['preprocessor', 'X_train', 'X_test', 'y_train', 'y_test', 'LinearRegression', 'RandomForestRegressor'])

In [19]:
# Extract objects and save them as separate variables

X_train = loaded['X_train']
X_test = loaded['X_test']
y_train = loaded['y_train']
y_test = loaded['y_test']
lin_reg = loaded['LinearRegression']
rf_reg = loaded['RandomForestRegressor']

In [18]:
# Create a subset of your training data for SHAP explanation
X_shap = X_train.sample(n=100, random_state=42)
y_shap = y_train.loc[X_shap.index]

KeyError: '[3501, 4913, 3720, 3414, 4915, 4504, 3063, 3768, 2503, 79, 101, 4662, 3842, 4392, 2874, 4643, 3395, 3007, 577, 2053, 2589, 4083, 2927, 1194, 1485, 2926, 4734, 5336, 3366, 2543, 3333, 168, 2899, 23, 333, 5169] not in index'

I've encountered many errors trying to fit my data here, aybe because I got rid of missing values in the previous notebook, then did the joblib

In [22]:
# Specify the number of samples you want for SHAP explanation
num_samples = 100  # Adjust this as needed

# Randomly select a subset of your training data for SHAP explanation
sample_indices = X_train.sample(num_samples, random_state=42).index

# Create X_shap and y_shap from the randomly selected indices
X_shap = X_train.loc[sample_indices]
y_shap = y_train.loc[sample_indices]

KeyError: '[3501, 4913, 3720, 3414, 4915, 4504, 3063, 3768, 2503, 79, 101, 4662, 3842, 4392, 2874, 4643, 3395, 3007, 577, 2053, 2589, 4083, 2927, 1194, 1485, 2926, 4734, 5336, 3366, 2543, 3333, 168, 2899, 23, 333, 5169] not in index'

In [10]:
explainer = shap.Explainer(rf_reg, X_shap)

In [11]:
shap_values = explainer.shap_values(X_shap)

ExplainerError: Additivity check failed in TreeExplainer! Please ensure the data matrix you passed to the explainer is the same shape that the model was trained on. If your data shape is correct then please report this on GitHub. This check failed because for one of the samples the sum of the SHAP values was 1418.636106, while the model output was 1441.863138. If this difference is acceptable you can set check_additivity=False to disable this check.

In [None]:
shap.summary_plot(shap_values, X_shap, plot_type='bar')

In [None]:
# Save the bar summary plot as a .png file
shap.summary_plot(shap_values, X_shap, plot_type='bar', show=False)
plt.savefig('bar_summary_plot.png', bbox_inches='tight')

In [None]:
shap.summary_plot(shap_values, X_shap, plot_type='dot')

In [None]:
# Save the dot summary plot as a .png file
shap.summary_plot(shap_values, X_shap, plot_type='dot', show=False)
plt.savefig('dot_summary_plot.png', bbox_inches='tight')

# Local Explanations

In [30]:
X_shap_local = X_shap.reset_index(drop=True)
y_shap_local = y_shap.reset_index(drop=True)
X_shap_local.head()

NameError: name 'y_shap' is not defined

Will choose X_low_sales and X_high_sales once my code above works

In [31]:
from lime.lime_tabular import LimeTabularExplainer

# Create a Lime explainer
explainer = LimeTabularExplainer(X_train.values, mode="regression")

# Generate Lime explanation for the example with low sales
explanation_low_sales = explainer.explain_instance(X_low_sales, lin_reg.predict)

# Generate Lime explanation for the example with high sales
explanation_high_sales = explainer.explain_instance(X_high_sales, lin_reg.predict)


count    100.000000
mean       0.114022
std        1.028314
min       -1.741691
25%       -0.709794
50%        0.118291
75%        0.888107
max        1.978781
Name: Item_MRP, dtype: float64

In [None]:
import shap

# Create a SHAP explainer
explainer = shap.Explainer(lin_reg, X_train)

# Calculate SHAP values for the example with low sales
shap_values_low_sales = explainer.shap_values(X_low_sales)

# Calculate SHAP values for the example with high sales
shap_values_high_sales = explainer.shap_values(X_high_sales)


In [None]:
# Save Lime tabular explanations as .png files
explanation_low_sales.save_to_file("lime_explanation_low_sales.png")
explanation_high_sales.save_to_file("lime_explanation_high_sales.png")

# Create and save SHAP Force Plots as .png files
shap.summary_plot(shap_values_low_sales, X_low_sales, show=False)
plt.savefig("shap_force_plot_low_sales.png")

shap.summary_plot(shap_values_high_sales, X_high_sales, show=False)
plt.savefig("shap_force_plot_high_sales.png")
