In [1]:
!pip install shap




In [7]:
import shap
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Tiny hand-labelled corpus; it exists only so the explainer below has a fitted
# vectorizer and classifier to probe.
dummy_corpus = [
    "This is a positive sentence about machine learning.",
    "This is a negative sentence about traditional methods.",
    "Another positive example, machine learning is great.",
    "Another negative example, traditional methods are slow.",
    "Neutral sentence for testing."
]
# Label scheme: 1 = positive, 0 = negative/neutral.
# The last (neutral) sentence is therefore labelled 0, matching the scheme.
dummy_labels = np.array([1, 0, 1, 0, 0])

# Learn the vocabulary and IDF weights from the corpus and vectorize it in one pass.
tfidf = TfidfVectorizer()
X_dummy = tfidf.fit_transform(dummy_corpus)

# Fit a small linear classifier on the TF-IDF features.
# random_state is fixed because the liblinear solver shuffles the data; pinning
# it keeps the fitted coefficients identical across Restart & Run All.
model = LogisticRegression(solver='liblinear', random_state=0)
model.fit(X_dummy, dummy_labels)

# Define a prediction function that takes raw text and returns probabilities
def predict_proba_from_text(texts):
    """Score a batch of raw text documents with the fitted TF-IDF + model pipeline.

    This is the callable handed to the SHAP explainer, so the masked variants of
    the explained text produced by the SHAP text masker arrive here as `texts`.

    Args:
        texts: Iterable of raw strings to score.

    Returns:
        The result of `model.predict_proba` on the TF-IDF features of `texts`.
    """
    # Reuse the already-fitted vectorizer (transform, not fit_transform) so the
    # feature columns line up with those the model was trained on.
    return model.predict_proba(tfidf.transform(texts))

# Text masker: hides parts of the input string so SHAP can measure their effect.
# No tokenizer is supplied, so SHAP falls back to its built-in default tokenizer.
# NOTE(review): shap.maskers.Text documents a `tokenizer` parameter, and the
# default tokenizer appears to split on non-word characters rather than only on
# spaces — confirm against the installed SHAP version before customizing.
masker = shap.maskers.Text()

# Explainer wrapping the raw-text prediction function. output_names labels the
# two predict_proba columns (class 0 first, then class 1) in the rendered plot.
explainer = shap.Explainer(
    predict_proba_from_text,
    masker,
    output_names=["negative", "positive"]
)

# Text to explain. It deliberately reuses words from dummy_corpus: a sentence with
# no overlap with the TF-IDF vocabulary is vectorized to all zeros for every
# masked variant, so the prediction never changes and every attribution is 0.
sample_text = [
    "Machine learning is great but traditional methods are slow"
]

# Compute SHAP values for each token of the sample text.
shap_values = explainer(sample_text)

# Render the token-level explanation for the first (and only) sample.
shap.plots.text(shap_values[0])