PLEASE NOTE:

This file only checks how the output of the LIMEExplainer and SHAPExplainer classes looks in the notebook for both models, i.e. logistic regression and XGBoost.

This should only be run after creating the .pkl artifacts from the models using src/models/logistic_baseline.py and src/models/xgboost_model.py.

In [None]:
from lime.lime_text import LimeTextExplainer
import shap
import numpy as np
from pathlib import Path

In [16]:
class LIMEExplainer:
    """Word-level LIME explanations for a multi-label text classifier.

    ``model`` must expose ``predict_proba(texts)`` returning one row per input
    text and one column per entry of ``class_names`` (column ``i`` is looked
    up as ``class_names[i]``).
    """

    def __init__(self, model, class_names):
        self.model = model
        self.class_names = class_names
        self.explainer = LimeTextExplainer(class_names=class_names)

    def explain(self, text, num_features=10, num_samples=500):
        """Explain every class that receives a non-zero probability for ``text``.

        Args:
            text: The raw string to explain.
            num_features: Maximum number of words LIME reports per class.
            num_samples: Number of perturbed samples LIME draws around
                ``text``. Defaults to 500, the value that used to be
                hard-coded; raise it for more stable weights, lower it for
                speed.

        Returns:
            A dict keyed by class name. Each value is a dict with
            ``"word_weights"`` (list of ``(word, weight)`` pairs from LIME)
            and ``"score"`` (the model probability for that class).
        """
        probs = self.model.predict_proba([text])[0]

        # Only classes with a strictly positive probability are explained.
        # Indices are converted to plain ints so they are ordinary dict keys
        # inside the LIME explanation object.
        labels = [int(idx) for idx in np.flatnonzero(probs > 0.0)]

        exp = self.explainer.explain_instance(
            text,
            self.model.predict_proba,
            labels=labels,
            num_features=num_features,
            num_samples=num_samples,
        )

        return {
            self.class_names[class_idx]: {
                "word_weights": exp.as_list(label=class_idx),
                "score": probs[class_idx],
            }
            for class_idx in exp.local_exp
        }

    def get_top_words(self, explanation, class_name, k=10):
        """Return the ``k`` words with the largest absolute weight for a class.

        Args:
            explanation: The dict returned by :meth:`explain`.
            class_name: Key of the class to inspect; a missing key raises
                ``KeyError``.
            k: Number of ``(word, weight)`` pairs to keep.
        """
        word_weights = explanation[class_name]['word_weights']
        ranked = sorted(word_weights, key=lambda pair: abs(pair[1]), reverse=True)
        return ranked[:k]

# Usage example: LIME on the logistic-regression baseline.
if __name__ == "__main__":
    import pickle

    # Load model.
    # The artifacts are looked up relative to the PARENT of the current
    # working directory.  The previous spelling, Path(__name__).resolve(),
    # built a path from the string "__main__" and only landed in the right
    # place because resolve() anchors relative paths at the cwd; this states
    # the same location directly.
    artifact_folder = Path.cwd().resolve().parent / "results/model_artifacts"
    # NOTE: pickle.load executes code embedded in the file; only load
    # artifacts that were produced locally by the training scripts.
    with open(artifact_folder / "logistic_baseline.pkl", 'rb') as f:
        model_data = pickle.load(f)

    # Mock model wrapper
    class ModelWrapper:
        """Expose the pickled per-class models behind one ``predict_proba``."""

        def __init__(self, model_data):
            self.vectorizer = model_data['vectorizer']
            self.models = model_data['models']
            self.mlb = model_data['mlb']

        def predict_proba(self, texts):
            """Return an array of shape (len(texts), n_classes).

            Column ``i`` holds column 1 of the ``predict_proba`` output of
            the model stored under ``mlb.classes_[i]``.
            """
            X_vec = self.vectorizer.transform(texts)
            probas = np.zeros((len(texts), len(self.mlb.classes_)))
            for i, class_name in enumerate(self.mlb.classes_):
                probas[:, i] = self.models[class_name].predict_proba(X_vec)[:, 1]
            return probas

    model = ModelWrapper(model_data)
    explainer = LIMEExplainer(model, model_data['mlb'].classes_)

    # Test
    test_text = "Market volatility and economic uncertainty may adversely affect our business."
    explanation = explainer.explain(test_text)

    print("LIME Explanation:")
    for class_name in explanation:
        print(f"\n{class_name}:")
        top_words = explainer.get_top_words(explanation, class_name, k=6)
        for word, weight in top_words:
            print(f"  {word}: {weight:.3f}")

LIME Explanation:

Credit Risk:
  economic: 0.040
  volatility: -0.033
  affect: 0.032
  adversely: 0.031
  Market: 0.023
  and: -0.019

Legal/Regulatory Risk:
  business: -0.023
  economic: 0.023
  and: 0.017
  volatility: -0.010
  our: 0.009
  Market: -0.002

Liquidity Risk:
  affect: 0.031
  adversely: 0.030
  economic: 0.025
  volatility: -0.020
  our: 0.008
  uncertainty: 0.007

Market Risk:
  Market: 0.060
  economic: 0.043
  and: -0.035
  volatility: -0.025
  adversely: 0.022
  affect: 0.017

Operational Risk:
  affect: 0.046
  business: 0.041
  volatility: -0.035
  adversely: 0.031
  economic: 0.013
  Market: -0.005

Reputational Risk:
  business: 0.039
  volatility: -0.032
  affect: 0.027
  adversely: 0.026
  economic: -0.025
  Market: -0.022

Strategic Risk:
  adversely: 0.022
  business: 0.021
  uncertainty: 0.018
  volatility: -0.018
  affect: 0.016
  Market: 0.014


In [None]:
class LIMEExplainerXGBoost:
    """LIME text explanations for the XGBoost multi-label classifier.

    ``model`` only needs a ``predict_proba(texts)`` method that returns one
    row per text and one column per entry of ``class_names``.
    """

    def __init__(self, model, class_names):
        self.model = model
        self.class_names = class_names
        self.explainer = LimeTextExplainer(class_names=class_names)

    def explain(self, text, num_features=10, num_samples=500):
        """Build a per-class LIME explanation of ``text``.

        Args:
            text: The raw string to explain.
            num_features: Upper bound on the words reported for each class.
            num_samples: How many perturbed copies of ``text`` LIME scores.
                The default (500) is the value that was previously fixed in
                the call.

        Returns:
            ``{class_name: {"word_weights": [(word, weight), ...],
            "score": probability}}`` for every class whose predicted
            probability is strictly greater than zero.
        """
        probs = self.model.predict_proba([text])[0]

        # Hand LIME plain Python ints so the labels behave as ordinary keys.
        positive_labels = [int(idx) for idx in np.flatnonzero(probs > 0.0)]

        exp = self.explainer.explain_instance(
            text,
            self.model.predict_proba,
            labels=positive_labels,
            num_features=num_features,
            num_samples=num_samples,
        )

        explanations = {}
        for class_idx in exp.local_exp:
            explanations[self.class_names[class_idx]] = {
                "word_weights": exp.as_list(label=class_idx),
                "score": probs[class_idx],
            }
        return explanations

    def get_top_words(self, explanation, class_name, k=10):
        """Get the top-k words for a class, ranked by absolute weight.

        ``explanation`` is the dict produced by :meth:`explain`; an unknown
        ``class_name`` raises ``KeyError``.
        """
        word_weights = explanation[class_name]['word_weights']
        by_magnitude = sorted(word_weights, key=lambda pair: abs(pair[1]), reverse=True)
        return by_magnitude[:k]

# Usage example: LIME on the XGBoost model.
if __name__ == "__main__":
    import pickle

    # Load model.
    # Artifacts are resolved relative to the parent of the current working
    # directory.  This replaces Path(__name__).resolve().parent.parent, which
    # reached the same folder only by resolving the literal string
    # "__main__" against the cwd.
    artifact_folder = Path.cwd().resolve().parent / "results/model_artifacts"
    # NOTE: unpickling runs code stored in the file; load trusted, locally
    # generated artifacts only.
    with open(artifact_folder / "xgboost.pkl", 'rb') as f:
        model_data = pickle.load(f)

    # Mock model wrapper
    class ModelWrapper:
        """Bundle the pickled vectorizer and per-class models for LIME."""

        def __init__(self, model_data):
            self.vectorizer = model_data['vectorizer']
            self.models = model_data['models']
            self.mlb = model_data['mlb']

        def predict_proba(self, texts):
            """Return one row per text and one column per ``mlb.classes_`` entry.

            Each column is column 1 of the corresponding per-class model's
            ``predict_proba`` output.
            """
            X_vec = self.vectorizer.transform(texts)
            probas = np.zeros((len(texts), len(self.mlb.classes_)))
            for i, class_name in enumerate(self.mlb.classes_):
                probas[:, i] = self.models[class_name].predict_proba(X_vec)[:, 1]
            return probas

    model = ModelWrapper(model_data)
    explainer = LIMEExplainerXGBoost(model, model_data['mlb'].classes_)

    # Test
    test_text = "Market volatility and economic uncertainty may adversely affect our business."
    explanation = explainer.explain(test_text)

    print("LIME Explanation:")
    for class_name in explanation:
        print(f"\n{class_name}:")
        top_words = explainer.get_top_words(explanation, class_name, k=6)
        for word, weight in top_words:
            print(f"  {word}: {weight:.3f}")

LIME Explanation:

Credit Risk:
  This: 0.000
  will: 0.000
  tarnish: 0.000
  our: 0.000
  image: 0.000

Legal/Regulatory Risk:
  This: 0.000
  will: 0.000
  tarnish: 0.000
  our: 0.000
  image: 0.000

Liquidity Risk:
  This: -0.000
  tarnish: -0.000
  image: -0.000
  will: -0.000
  our: 0.000

Market Risk:
  This: 0.000
  will: 0.000
  tarnish: 0.000
  our: 0.000
  image: 0.000

Operational Risk:
  This: 0.000
  tarnish: 0.000
  image: 0.000
  will: 0.000
  our: -0.000

Reputational Risk:
  This: 0.000
  tarnish: 0.000
  image: 0.000
  will: 0.000
  our: -0.000

Strategic Risk:
  This: 0.000
  will: 0.000
  tarnish: 0.000
  our: 0.000
  image: 0.000


In [30]:
# src/explainers/shap_explainer.py

class SHAPExplainer:
    """SHAP feature attributions for the vectorizer-based text classifiers.

    ``model`` is expected to expose ``vectorizer`` (with ``transform`` and
    ``get_feature_names_out``) and ``models`` (a mapping from class name to
    the fitted estimator for that class).
    """

    def __init__(self, model, model_type='logistic'):
        """
        model_type: 'logistic', 'xgboost', or 'neural'
        """
        self.model = model
        self.model_type = model_type
        self.explainer = None
        # Set by fit() to 'single' or 'multi'; defined here so the attribute
        # exists even before fit() has run.
        self.explainer_type = None
        # class name -> explainer, filled by fit() for the logistic model so
        # explain() can honour an explicit class_name.
        self._class_explainers = {}

    def fit(self, X_train, max_samples=100):
        """Initialize explainer with background data.

        Args:
            X_train: Sequence of raw training texts; at most the first
                ``max_samples`` entries are used as background data.
            max_samples: Cap on the number of background samples.
        """
        if self.model_type == 'logistic':
            X_train_vec = self.model.vectorizer.transform(X_train[:max_samples])
            # One LinearExplainer per class, all sharing the same background.
            self._class_explainers = {
                class_name: shap.LinearExplainer(estimator, X_train_vec)
                for class_name, estimator in self.model.models.items()
            }
            # self.explainer keeps pointing at the first class, which is what
            # explain() uses when no class_name is given.
            first_class = list(self.model.models.keys())[0]
            self.explainer = self._class_explainers[first_class]
            self.explainer_type = 'single'

        elif self.model_type == 'xgboost':
            # Use TreeExplainer for each class
            self.explainer = {}
            for class_name in self.model.models:
                self.explainer[class_name] = shap.TreeExplainer(self.model.models[class_name])
            self.explainer_type = 'multi'

        else:  # neural
            # Use KernelExplainer (slower but model-agnostic)
            # NOTE(review): the background here is raw text while explain()
            # passes vectorized input to the explainer - confirm this branch
            # against a real neural model before relying on it.
            def predict_fn(texts):
                return self.model.predict_proba(texts)

            self.explainer = shap.KernelExplainer(predict_fn, X_train[:max_samples])
            self.explainer_type = 'single'

    def _select_explainer(self, class_name):
        """Return the explainer object that should handle ``class_name``."""
        if self.explainer is None:
            raise RuntimeError("SHAPExplainer.fit() must be called before explain().")
        if isinstance(self.explainer, dict):
            # Per-class explainers (xgboost): a class must be chosen.
            if class_name is None:
                raise ValueError(
                    "class_name is required when one explainer per class is used; "
                    f"choose one of {list(self.explainer)}"
                )
            return self.explainer[class_name]
        if class_name is not None and self._class_explainers:
            return self._class_explainers[class_name]
        return self.explainer

    @staticmethod
    def _first_row(shap_values):
        """Reduce a SHAP result to the 1-D attribution vector of sample 0."""
        if hasattr(shap_values, 'values'):
            shap_values = shap_values.values
        if isinstance(shap_values, list):
            # One matrix per model output; assumes binary estimators with the
            # positive class last - TODO confirm for other model types.
            shap_values = shap_values[-1]
        row = np.asarray(shap_values)[0]
        if row.ndim == 2:
            # Presumably (n_features, n_outputs), as some SHAP versions return
            # for multi-output models; keep the last output - TODO confirm.
            row = row[:, -1]
        return row

    def explain(self, text, class_name=None, threshold=0.001):
        """Generate SHAP explanation.

        Args:
            text: Raw text to explain.
            class_name: Class whose explainer should be used. Required for
                the xgboost model; for the logistic model ``None`` selects
                the first class.
            threshold: Features whose absolute SHAP value is not greater
                than this are dropped.

        Returns:
            List of ``(feature_name, shap_value)`` pairs sorted by
            decreasing absolute value.

        Raises:
            RuntimeError: If ``fit`` has not been called.
            ValueError: If per-class explainers are in use and
                ``class_name`` is ``None``.
            KeyError: If ``class_name`` has no explainer.
        """
        explainer = self._select_explainer(class_name)

        X_vec = self.model.vectorizer.transform([text])
        values = self._first_row(explainer.shap_values(X_vec))

        # Get feature names
        feature_names = self.model.vectorizer.get_feature_names_out()

        # Keep only features with a non-negligible attribution.
        word_weights = [
            (feature_names[idx], float(val))
            for idx, val in enumerate(values)
            if abs(val) > threshold
        ]

        # Sort by absolute value
        word_weights.sort(key=lambda pair: abs(pair[1]), reverse=True)

        return word_weights

    def get_top_words(self, explanation, k=10):
        """Get top-k words from a list returned by :meth:`explain`."""
        return explanation[:k]

# Usage: SHAP on the logistic-regression baseline.
if __name__ == "__main__":
    import pickle

    import pandas as pd

    # Load model.
    # Resolved relative to the parent of the current working directory, the
    # same anchor the CSV path below relies on.  The former
    # Path(__name__).resolve().parent.parent reached this folder only by
    # resolving the literal string "__main__" against the cwd.
    artifact_folder = Path.cwd().resolve().parent / "results/model_artifacts"
    # NOTE: pickle.load runs code contained in the file; only load artifacts
    # generated locally by the training scripts.
    with open(artifact_folder / 'logistic_baseline.pkl', 'rb') as f:
        model_data = pickle.load(f)

    class ModelWrapper:
        """Expose the pickled vectorizer and per-class models to the explainer."""

        def __init__(self, model_data):
            self.vectorizer = model_data['vectorizer']
            self.models = model_data['models']
            self.mlb = model_data['mlb']

        def predict_proba(self, texts):
            """Return an array with one row per text and one column per class.

            Column ``i`` is column 1 of the ``predict_proba`` output of the
            model stored under ``mlb.classes_[i]``.
            """
            X_vec = self.vectorizer.transform(texts)
            probas = np.zeros((len(texts), len(self.mlb.classes_)))
            for i, class_name in enumerate(self.mlb.classes_):
                probas[:, i] = self.models[class_name].predict_proba(X_vec)[:, 1]
            return probas

    model = ModelWrapper(model_data)

    # Need some training data for background
    df = pd.read_csv('../data/risk_data.csv')
    X_train = df['paragraph_cleaned'].values[:100]

    explainer = SHAPExplainer(model, model_type='logistic')
    explainer.fit(X_train, max_samples=50)

    # Test
    test_text = "Loan facilities are hard to come by therefore we may have credit risk."
    explanation = explainer.explain(test_text)

    print("SHAP Explanation:")
    for word, weight in explanation[:10]:
        print(f"  {word}: {weight:.4f}")

SHAP Explanation:
  credit: 0.3339
  credit risk: 0.0846
  risk: 0.0655
  loan: 0.0642
  facilities: 0.0589
  come: -0.0257
  debt: -0.0219
  financial: -0.0215
  nrg: -0.0194
  product: 0.0133


In [None]:
class SHAPExplainer:
    """Compute SHAP word attributions for a vectorizer-backed classifier.

    The wrapped ``model`` must provide ``vectorizer`` (``transform`` and
    ``get_feature_names_out``) and ``models``, a mapping from class name to
    the estimator trained for that class.
    """

    def __init__(self, model, model_type='logistic'):
        """
        model_type: 'logistic', 'xgboost', or 'neural'
        """
        self.model = model
        self.model_type = model_type
        self.explainer = None
        # 'single' or 'multi' once fit() has run; None until then.
        self.explainer_type = None
        # Per-class explainers for the logistic model (class name -> explainer).
        self._class_explainers = {}

    def fit(self, X_train, max_samples=100):
        """Initialize explainer with background data.

        Args:
            X_train: Sequence of raw training texts. Only the first
                ``max_samples`` entries are used, and only by the logistic
                and neural branches.
            max_samples: Maximum number of background samples.
        """
        if self.model_type == 'logistic':
            X_train_vec = self.model.vectorizer.transform(X_train[:max_samples])
            # Build an explainer for every class so explain() can answer for
            # the class the caller asks about.
            self._class_explainers = {
                class_name: shap.LinearExplainer(estimator, X_train_vec)
                for class_name, estimator in self.model.models.items()
            }
            # The first class remains the default when no class_name is given.
            first_class = list(self.model.models.keys())[0]
            self.explainer = self._class_explainers[first_class]
            self.explainer_type = 'single'

        elif self.model_type == 'xgboost':
            # Use TreeExplainer for each class
            self.explainer = {}
            for class_name in self.model.models:
                self.explainer[class_name] = shap.TreeExplainer(self.model.models[class_name])
            self.explainer_type = 'multi'

        else:  # neural
            # Use KernelExplainer (slower but model-agnostic)
            # NOTE(review): background data is raw text here, whereas
            # explain() feeds the explainer vectorized input - verify before
            # using this branch.
            def predict_fn(texts):
                return self.model.predict_proba(texts)

            self.explainer = shap.KernelExplainer(predict_fn, X_train[:max_samples])
            self.explainer_type = 'single'

    def _select_explainer(self, class_name):
        """Pick the explainer responsible for ``class_name``."""
        if self.explainer is None:
            raise RuntimeError("SHAPExplainer.fit() must be called before explain().")
        if isinstance(self.explainer, dict):
            # xgboost: one TreeExplainer per class, so a class is mandatory.
            if class_name is None:
                raise ValueError(
                    "class_name is required when one explainer per class is used; "
                    f"choose one of {list(self.explainer)}"
                )
            return self.explainer[class_name]
        if class_name is not None and self._class_explainers:
            return self._class_explainers[class_name]
        return self.explainer

    @staticmethod
    def _first_row(shap_values):
        """Return the 1-D attribution vector for the first (only) sample."""
        if hasattr(shap_values, 'values'):
            shap_values = shap_values.values
        if isinstance(shap_values, list):
            # List of per-output matrices; assumes the positive class is the
            # last entry - TODO confirm for non-binary estimators.
            shap_values = shap_values[-1]
        # XGBoost binary classifier -> shape (1, n_features)
        row = np.asarray(shap_values)[0]
        if row.ndim == 2:
            # Presumably (n_features, n_outputs) from a multi-output model;
            # keep the last output - TODO confirm.
            row = row[:, -1]
        return row

    def explain(self, text, class_name=None, threshold=0.001):
        """Return ``(feature, shap_value)`` pairs for ``text``.

        Args:
            text: Raw text to explain.
            class_name: Class to explain. Mandatory for the xgboost model;
                optional otherwise (``None`` uses the default explainer,
                which for the logistic model is the first class).
            threshold: Minimum absolute SHAP value (exclusive) a feature
                needs to be reported.

        Returns:
            A list of ``(feature_name, shap_value)`` tuples ordered by
            decreasing absolute SHAP value.

        Raises:
            RuntimeError: If called before ``fit``.
            ValueError: If the xgboost model is used without ``class_name``.
            KeyError: If no explainer exists for ``class_name``.
        """
        explainer = self._select_explainer(class_name)

        X_vec = self.model.vectorizer.transform([text])
        feature_names = self.model.vectorizer.get_feature_names_out()
        values = self._first_row(explainer.shap_values(X_vec))

        word_weights = [
            (feature_names[i], float(value))
            for i, value in enumerate(values)
            if abs(value) > threshold
        ]

        word_weights.sort(key=lambda pair: abs(pair[1]), reverse=True)
        return word_weights

    def get_top_words(self, explanation, k=10):
        """Get top-k words from a list returned by :meth:`explain`."""
        return explanation[:k]

# Usage: SHAP on the XGBoost model.
if __name__ == "__main__":
    import pickle

    import pandas as pd

    # Load model.
    # The artifact folder is located relative to the parent of the current
    # working directory (the CSV below is cwd-relative as well).  The old
    # Path(__name__).resolve().parent.parent resolved the literal string
    # "__main__" against the cwd to end up in the same place.
    artifact_folder = Path.cwd().resolve().parent / "results/model_artifacts"
    # NOTE: unpickling executes code stored in the file; load only trusted,
    # locally produced artifacts.
    with open(artifact_folder / 'xgboost.pkl', 'rb') as f:
        model_data = pickle.load(f)

    class ModelWrapper:
        """Group the pickled vectorizer, per-class models and label binarizer."""

        def __init__(self, model_data):
            self.vectorizer = model_data['vectorizer']
            self.models = model_data['models']
            self.mlb = model_data['mlb']

        def predict_proba(self, texts):
            """Return one row per text and one column per ``mlb.classes_`` entry.

            Every column is column 1 of the matching per-class model's
            ``predict_proba`` output.
            """
            X_vec = self.vectorizer.transform(texts)
            probas = np.zeros((len(texts), len(self.mlb.classes_)))
            for i, class_name in enumerate(self.mlb.classes_):
                probas[:, i] = self.models[class_name].predict_proba(X_vec)[:, 1]
            return probas

    model = ModelWrapper(model_data)

    # Need some training data for background
    df = pd.read_csv('../data/risk_data.csv')
    X_train = df['paragraph_cleaned'].values[:100]

    explainer = SHAPExplainer(model, model_type='xgboost')
    explainer.fit(X_train, max_samples=50)

    # Test
    # NOTE(review): this reads the raw 'paragraph' column while the background
    # data above comes from 'paragraph_cleaned' - confirm which form the
    # vectorizer was trained on.
    test_text = df['paragraph'].iloc[-1]
    explanation = explainer.explain(test_text, class_name="Legal/Regulatory Risk")

    print("SHAP Explanation:")
    for word, weight in explanation[:10]:
        print(f"  {word}: {weight:.4f}")

SHAP Explanation:
  law: 3.4330
  regulation: -0.5073
  business: 0.4261
  regulatory: -0.3301
  operation: 0.3134
  cost: -0.2651
  include: -0.2512
  adversely affect: -0.1193
  operating: 0.1049
  meet: 0.0993
