### Imports

In [None]:
import re
import unicodedata
from typing import Iterable, List, Callable
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from datasets import Dataset, Features, ClassLabel, Value
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    DataCollatorWithPadding, Trainer, TrainingArguments
)
from inspect import signature
import shap
import numpy as np
import matplotlib.pyplot as plt
from lime.lime_text import LimeTextExplainer
from collections import defaultdict
from scipy.special import softmax

## Cleaning Pipeline

In [None]:
# Precompiled patterns used by the text-cleaning helpers in this notebook.
_URL_RE = re.compile(r'https?://\S+|www\.\S+', flags=re.IGNORECASE)  # http(s)://... or www.... up to the next whitespace
_EMAIL_RE = re.compile(r'\b[\w.+-]+@[\w-]+\.[\w.-]+\b')  # simple local@domain.tld shape
_HTML_RE = re.compile(r'<[^>]+>')  # any <...> span; note this also matches a <NEWPAGE> marker
_NEWPAGE_RE = re.compile(r'<\s*NEWPAGE\s*>', flags=re.IGNORECASE)  # page-break marker, optional inner spaces
_CTRL_RE = re.compile(r'[\u0000-\u001F\u007F]')  # U+0000-U+001F and DEL; the range includes \t, \n and \r
_MULTISPACE_RE = re.compile(r'\s+')  # any run of whitespace
_DANGLING_PUNCT_RE = re.compile(r'\s+([.,;:!?])')  # whitespace directly before . , ; : ! ?
# One or more characters that are NOT word characters, whitespace, or the listed punctuation/symbols.
# Square brackets are not in the kept set, so they count as "weird" characters.
_WEIRD_CHARS_RE = re.compile(r'[^\w\s\-\’\'.,;:!?()“”"…–—°€£$%&/@]+', flags=re.UNICODE)

def normalize_unicode(text: str, form: str = "NFC") -> str:
    """Return *text* converted to the Unicode normalization form *form*.

    *form* is passed straight to ``unicodedata.normalize`` (for example
    "NFC", "NFD", "NFKC" or "NFKD").
    """
    normalized = unicodedata.normalize(form, text)
    return normalized

def strip_control_chars(text: str) -> str:
    """Replace every character matched by ``_CTRL_RE`` with a single space."""
    without_controls = _CTRL_RE.sub(' ', text)
    return without_controls

# Translation table: typographic characters -> plain ASCII equivalents.
_SMART_CHAR_TABLE = str.maketrans({
    "“": "\"", "”": "\"", "‘": "'", "’": "'",
    "–": "-", "—": "-", "…": "...", "•": "*",
    "¬": "-", "ﬁ": "fi", "ﬂ": "fl", "½": "1/2", "¼": "1/4", "¾": "3/4",
})


def replace_smart_quotes(text: str) -> str:
    """Map curly quotes, dashes, ellipsis, bullets, ligatures and vulgar
    fractions in *text* to ASCII spellings.

    Every source character is a single code point and no replacement text
    contains another source character, so a single translate pass gives the
    same result as replacing them one after another.
    """
    return text.translate(_SMART_CHAR_TABLE)

# word, hyphen, optional whitespace, newline, optional whitespace, word
_LINEBREAK_HYPHEN_RE = re.compile(r'(\w+)-\s*\n\s*(\w+)')


def dehyphenate_linebreaks(text: str) -> str:
    """Re-join words that were split with a hyphen at a line break.

    "exam-\\nple" becomes "example"; hyphens not followed by a newline are
    left untouched.
    """
    return _LINEBREAK_HYPHEN_RE.sub(r'\1\2', text)

def basic_whitespace(text: str) -> str:
    """Tidy spacing: drop whitespace before . , ; : ! ? and collapse
    whitespace runs to one space, trimming both ends."""
    # Punctuation is tightened first, then the remaining runs are collapsed.
    tightened = _DANGLING_PUNCT_RE.sub(r'\1', text)
    return _MULTISPACE_RE.sub(' ', tightened).strip()

def remove_urls_emails(text: str) -> str:
    """Blank out URLs, then e-mail addresses, replacing each match with a space."""
    without_urls = _URL_RE.sub(' ', text)
    return _EMAIL_RE.sub(' ', without_urls)

def strip_html_like(text: str) -> str:
    """Replace every ``<...>`` span matched by ``_HTML_RE`` with a space."""
    stripped = _HTML_RE.sub(' ', text)
    return stripped

def remove_garbage(text: str, keep_charset:str="broad") -> str:
    """
    Replace characters outside the chosen character set with spaces.

    keep_charset:
      - "broad": keep letters, digits, underscores, whitespace, modest punctuation
        (whatever ``_WEIRD_CHARS_RE`` does not match)
      - "letters_only": keep letters (incl. Latin-1 accented) + whitespace + basic
        punctuation; digits are removed

    Any value other than "broad" takes the "letters_only" path.
    """
    if keep_charset != "broad":
        # Each disallowed character becomes its own space.
        return re.sub(r"[^A-Za-zÀ-ÖØ-öø-ÿ\s'.,;:!?-]", " ", text)
    return _WEIRD_CHARS_RE.sub(' ', text)


### Light Cleaning for RoBERTa

In [None]:
def clean_for_roberta(text: str) -> str:
    """Lightly clean *text* for a transformer model, keeping case and accents.

    Steps: NFC normalization, smart-quote replacement, re-joining of words
    hyphenated across line breaks, then per page segment: removal of
    HTML-like tags, URLs/e-mails, control characters and characters outside
    the "broad" set. Page breaks (``<NEWPAGE>``) are rendered as the literal
    token ``[NEWPAGE]`` and whitespace is normalized at the end.

    The text is split on the page marker and the token is inserted only
    after garbage removal: square brackets are not in the "broad" kept set,
    so inserting ``[NEWPAGE]`` earlier would reduce it to the bare word
    ``NEWPAGE``.
    """
    x = normalize_unicode(text, "NFC")
    x = replace_smart_quotes(x)
    x = dehyphenate_linebreaks(x)

    cleaned_segments = []
    # _NEWPAGE_RE has no capture groups, so split() yields only the text between markers.
    for segment in _NEWPAGE_RE.split(x):
        segment = strip_html_like(segment)
        segment = remove_urls_emails(segment)
        segment = strip_control_chars(segment)
        segment = remove_garbage(segment, keep_charset="broad")
        cleaned_segments.append(segment)

    return basic_whitespace(' [NEWPAGE] '.join(cleaned_segments))

### Heavy Cleaning for SVM

In [None]:
def clean_for_svm(text: str, lowercase:bool=True, ascii_fold:bool=True) -> str:
    """Aggressively clean *text* for bag-of-words / TF-IDF features.

    Steps: NFKC normalization, smart-quote replacement, re-joining of words
    hyphenated across line breaks, removal of ``<NEWPAGE>`` markers,
    HTML-like tags, URLs/e-mails and control characters, reduction to the
    "letters_only" character set, optional lowercasing, optional ASCII
    folding, and whitespace normalization.

    Args:
        text: Raw input text.
        lowercase: Lowercase the result when True.
        ascii_fold: Fold accented letters to ASCII and drop any remaining
            non-ASCII characters when True.

    Returns:
        The cleaned text.
    """
    x = text
    x = normalize_unicode(x, "NFKC")
    x = replace_smart_quotes(x)
    x = dehyphenate_linebreaks(x)
    x = _NEWPAGE_RE.sub(' ', x)
    x = strip_html_like(x)
    x = remove_urls_emails(x)
    x = strip_control_chars(x)
    if ascii_fold:
        # Strip combining marks BEFORE the letters-only filter. That filter only
        # knows Latin-1 accented letters, so a letter such as "ř" would otherwise
        # be replaced by a space and split the word ("Dvořák" -> "dvo ak").
        # Characters with no decomposition are unaffected by this step.
        x = ''.join(
            ch for ch in unicodedata.normalize("NFKD", x)
            if not unicodedata.combining(ch)
        )
    x = remove_garbage(x, keep_charset="letters_only")
    if lowercase:
        x = x.lower()
    if ascii_fold:
        # Final pass drops any non-ASCII characters that survived the filter.
        x = unicodedata.normalize("NFKD", x).encode("ascii","ignore").decode("ascii")
    x = basic_whitespace(x)
    return x

In [None]:
def batch_clean(texts: Iterable[str], fn: Callable[[str], str]) -> List[str]:
    """Apply the cleaning function *fn* to each item of *texts*, in order,
    and return the results as a list."""
    return list(map(fn, texts))

## Data Preprocessing

### Read Files

In [None]:
# Build the CSV paths with pathlib instead of hard-coded backslashes so the
# notebook also runs on Linux/macOS, where "\" is not a path separator.
from pathlib import Path

DATA_DIR = Path("..") / "Data"

# One CSV per review category.
art_df = pd.read_csv(DATA_DIR / "Art_exhibitions_gale.csv")
books_df = pd.read_csv(DATA_DIR / "Books_gale.csv")
concerts_df = pd.read_csv(DATA_DIR / "Concerts_gale.csv")
dance_df = pd.read_csv(DATA_DIR / "Dance_gale.csv")
operas_df = pd.read_csv(DATA_DIR / "Operas_gale.csv")
poetry_df = pd.read_csv(DATA_DIR / "Poetry_gale.csv")
theater_df = pd.read_csv(DATA_DIR / "Theater_gale.csv")

### Merge All Dataframes

In [None]:
# Class label -> DataFrame holding that category's reviews.
sources = dict(
    opera=operas_df,
    theater=theater_df,
    book=books_df,
    art=art_df,
    concert=concerts_df,
    dance=dance_df,
    poetry=poetry_df,
)

# Keep only the review text from each source and tag it with its label.
frames = []
for label, df in sources.items():
    assert "Full_text" in df.columns, f"{label} is missing 'Full_text'"
    frames.append(df[["Full_text"]].assign(label=label))

big_df = pd.concat(frames, ignore_index=True)

# Drop rows without text, then exact (text, label) duplicates.
big_df = big_df.dropna(subset=["Full_text"])
big_df = big_df.drop_duplicates(subset=["Full_text", "label"])
big_df = big_df.reset_index(drop=True)
print("Class balance:\n", big_df["label"].value_counts())

# Two cleaned variants of every review: light (RoBERTa) and heavy (SVM).
for column, cleaner in (("text_roberta", clean_for_roberta), ("text_svm", clean_for_svm)):
    big_df[column] = batch_clean(big_df["Full_text"], cleaner)

big_df.to_parquet("big_cleaned.parquet", index=False)

## SVM

In [None]:

# The frame must carry the raw text and its label before anything else runs.
assert {"Full_text", "label"} <= set(big_df.columns)
big_df = big_df.dropna(subset=["Full_text"]).reset_index(drop=True)

# (Re)build both cleaned text columns from the raw text.
for column, cleaner in (("text_roberta", clean_for_roberta), ("text_svm", clean_for_svm)):
    big_df[column] = batch_clean(big_df["Full_text"], cleaner)

print(big_df[["label","Full_text","text_roberta","text_svm"]].head(2))

# Stratified 80/20 split on the heavily cleaned text.
X_train, X_val, y_train, y_val = train_test_split(
    big_df["text_svm"],
    big_df["label"],
    test_size=0.2,
    random_state=42,
    stratify=big_df["label"],
)

# Word-level (1-2 gram) and character-level (3-5 gram) TF-IDF features,
# concatenated in this order and fed to a linear SVM.
word_tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.9,
    stop_words="english",
    sublinear_tf=True,
)
char_tfidf = TfidfVectorizer(
    analyzer="char",
    ngram_range=(3, 5),
    min_df=2,
    sublinear_tf=True,
)
feature_union = FeatureUnion(transformer_list=[
    ("word_tfidf", word_tfidf),
    ("char_tfidf", char_tfidf),
])
svm_pipeline = Pipeline([
    ("feats", feature_union),
    ("clf", LinearSVC(C=1.0)),
])

svm_pipeline.fit(X_train, y_train)
pred = svm_pipeline.predict(X_val)
print(classification_report(y_val, pred, digits=3))

## RoBERTa

In [None]:
# label_names = sorted(big_df["label"].unique())
# label2id = {l:i for i,l in enumerate(label_names)}
# id2label = {i:l for l,i in label2id.items()}

# hf_df = big_df[["text_roberta","label"]].rename(columns={"text_roberta":"text"}).copy()
# hf_df["labels"] = hf_df["label"].map(label2id)

# features = Features({"text": Value("string"), "labels": ClassLabel(names=label_names)})
# ds = Dataset.from_pandas(hf_df[["text","labels"]], preserve_index=False, features=features)
# ds = ds.train_test_split(test_size=0.2, seed=42, stratify_by_column="labels")

# MODEL = "roberta-base"
# tokenizer = AutoTokenizer.from_pretrained(MODEL)

# def tok(batch):
#     return tokenizer(batch["text"], truncation=True, padding=False, max_length=256)

# tokenized = ds.map(tok, batched=True, remove_columns=["text"])
# collator = DataCollatorWithPadding(tokenizer=tokenizer)

# model = AutoModelForSequenceClassification.from_pretrained(
#     MODEL, num_labels=len(label_names), id2label=id2label, label2id=label2id
# )

# init_params = signature(TrainingArguments.__init__).parameters
# has = lambda k: k in init_params

# STRATEGY = "steps"  
# args_kwargs = dict(
#     output_dir="roberta_out",
#     learning_rate=2e-5,
#     num_train_epochs=3,
# )

# if has("per_device_train_batch_size"): args_kwargs["per_device_train_batch_size"] = 16
# if has("per_device_eval_batch_size"):  args_kwargs["per_device_eval_batch_size"]  = 32

# if has("logging_steps"): args_kwargs["logging_steps"] = 50

# if STRATEGY == "steps":
#     if has("eval_steps"): args_kwargs["eval_steps"] = 500
#     if has("save_steps"): args_kwargs["save_steps"] = 1000  
#     if has("save_strategy"): args_kwargs["save_strategy"] = "steps"
#     if has("evaluation_strategy"): args_kwargs["evaluation_strategy"] = "steps"
#     elif has("eval_strategy"): args_kwargs["eval_strategy"] = "steps"
#     elif has("evaluate_during_training"): args_kwargs["evaluate_during_training"] = True
# else:  
#     if has("save_strategy"): args_kwargs["save_strategy"] = "epoch"
#     if has("evaluation_strategy"): args_kwargs["evaluation_strategy"] = "epoch"
#     elif has("eval_strategy"): args_kwargs["eval_strategy"] = "epoch"
#     elif has("evaluate_during_training"): args_kwargs["evaluate_during_training"] = True

# enable_load_best = has("load_best_model_at_end") and (
#     has("evaluation_strategy") or has("eval_strategy") or has("evaluate_during_training")
# )
# if enable_load_best:
#     args_kwargs["load_best_model_at_end"] = True
#     if has("metric_for_best_model"): args_kwargs["metric_for_best_model"] = "eval_loss"
#     if has("save_total_limit"): args_kwargs["save_total_limit"] = 2

# args = TrainingArguments(**args_kwargs)
# print("Resolved TrainingArguments:", args)

# trainer = Trainer(
#     model=model,
#     args=args,
#     train_dataset=tokenized["train"],
#     eval_dataset=tokenized["test"],
#     tokenizer=tokenizer,
#     data_collator=collator
# )

# trainer.train()
# try:
#     eval_res = trainer.evaluate()
#     print("Eval:", eval_res)
# except TypeError:
#     pass

## SHAP Analysis

In [None]:
def model_margin(raw_texts):
    """Clean each raw text with ``clean_for_svm`` and return the SVM
    pipeline's ``decision_function`` output for the batch."""
    return svm_pipeline.decision_function(list(map(clean_for_svm, raw_texts)))


# Explain the raw-text -> margin function with SHAP's text masker.
masker = shap.maskers.Text()
text_explainer = shap.Explainer(model_margin, masker)

# Three fixed example reviews (raw, uncleaned text).
raw_examples = big_df["Full_text"].sample(3, random_state=0).tolist()
text_explanations = text_explainer(raw_examples)

# Highest-margin class per example.
margins = model_margin(raw_examples)
pred_idx = margins.argmax(axis=1)

# Show, for each example, the token attributions for its predicted class.
for i, cls in enumerate(pred_idx):
    shap.plots.text(text_explanations[i, :, cls])

In [None]:
# Pull the fitted feature extractor and the linear classifier out of the pipeline.
union = svm_pipeline.named_steps["feats"]
clf   = svm_pipeline.named_steps["clf"]

# Background sample (200 cleaned texts) used as the reference distribution for the explainer.
bg_texts = big_df["text_svm"].sample(200, random_state=42).tolist()
X_bg = union.transform(bg_texts)

lin_explainer = shap.LinearExplainer(clf, X_bg)

# 200 cleaned texts to explain. They are sampled from the whole of big_df, not from a
# held-out split. NOTE(review): X_val here rebinds the X_val produced by
# train_test_split in the SVM cell.
val_texts = big_df["text_svm"].sample(200, random_state=0).tolist()
X_val = union.transform(val_texts)


# NOTE(review): the code below indexes this as shap_vals_list[class][sample]; the actual
# return shape depends on the installed shap version — confirm.
shap_vals_list = lin_explainer.shap_values(X_val)  
expected_vals  = lin_explainer.expected_value     

# Feature names in the same order as the FeatureUnion output: word n-grams, then char n-grams.
word_names = svm_pipeline.named_steps["feats"].transformer_list[0][1].get_feature_names_out()
char_names  = svm_pipeline.named_steps["feats"].transformer_list[1][1].get_feature_names_out()
feature_names = np.concatenate([word_names, char_names])

# n_classes is not referenced again in the visible cells.
n_classes = shap_vals_list[0].shape[1] if shap_vals_list[0].ndim == 2 else len(shap_vals_list)
# Sorted label names; presumably this matches the classifier's class order — TODO confirm against clf.classes_.
class_labels = np.array(sorted(big_df["label"].unique().tolist())) 
def top_global_shap(class_i, topk=20):
    """Return a DataFrame of the *topk* features with the largest mean
    absolute SHAP value for class index *class_i*, in descending order."""
    importance = np.abs(shap_vals_list[class_i]).mean(axis=0)
    ranked = np.argsort(-importance)[:topk]
    columns = {
        "feature": feature_names[ranked],
        "mean(|SHAP|)": importance[ranked],
    }
    return pd.DataFrame(columns)

# Global view: top 15 features per class by mean |SHAP|.
# `display` is not imported in this file; assumes a Jupyter/IPython session provides it.
for i, cls in enumerate(class_labels):
    print(f"\n=== Top features for class: {cls} ===")
    display(top_global_shap(i, topk=15))

# Local view: explain the first sampled text.
i = 0
x_row = X_val[i]
raw_text = val_texts[i]  # taken from the "text_svm" column, i.e. already-cleaned text
pred_margin = clf.decision_function(x_row) 
pred_class = int(np.argmax(pred_margin))

# SHAP values of row i for the predicted class, densified when the container is sparse.
row_sv = shap_vals_list[pred_class][i].toarray().ravel() if hasattr(shap_vals_list[pred_class], "toarray") \
         else shap_vals_list[pred_class][i]

# 15 most positive and 15 most negative contributions.
pos_idx = np.argsort(-row_sv)[:15]
neg_idx = np.argsort(row_sv)[:15]

# First 15 rows are the positive contributors, the remaining rows the negative ones.
local_df = pd.DataFrame({
    "feature": np.r_[feature_names[pos_idx], feature_names[neg_idx]],
    "shap_value": np.r_[row_sv[pos_idx], row_sv[neg_idx]]
})
print(f"\nRaw text:\n{raw_text[:800]}...\n")
print(f"Predicted class: {class_labels[pred_class]}")
print("\nTop local contributors (+):")
display(local_df.iloc[:15])
print("\nTop local contributors (−):")
display(local_df.iloc[15:])

In [None]:
from pylab import rcParams
# Bind `plt` to pyplot, not to the top-level matplotlib package: the plotting
# cells call plt.gcf(), plt.gca(), plt.yticks() and plt.show(), which exist only
# on matplotlib.pyplot. pyplot also exposes `style` and `rcParams`.
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
plt.rcParams['lines.linewidth'] = 1
# Dark theme: navy backgrounds, no spines, thin white grid, light text.
dark_style = {
    'figure.facecolor': '#122b38',
    'axes.facecolor': '#122b38',
    'savefig.facecolor':'#122b38',
    'axes.grid': True,
    'axes.grid.which': 'both',
    'axes.spines.left': False,
    'axes.spines.right': False,
    'axes.spines.top': False,
    'axes.spines.bottom': False,
    'grid.color': 'white',
    'grid.linewidth': '0.3',
    'text.color': '0.9',
    'axes.labelcolor': '0.9',
    'xtick.color': '0.9',
    'ytick.color': '0.9',
    'font.size': 12 }
plt.rcParams.update(dark_style)
rcParams['figure.figsize'] = (18, 4)

In [None]:
# SHAP summary plot of the most influential features for one class.
target_name = "theater"
top_n = 10  # number of features plotted; the title is derived from it so the two cannot disagree

target_cls = np.where(class_labels == target_name)[0][0]
shap_vals = shap_vals_list[target_cls]

# Rank features by mean |SHAP| and keep the top_n largest (ascending order).
mean_abs = np.abs(shap_vals).mean(axis=0)
top_idx = np.argsort(mean_abs)[-top_n:]

X_val_top = X_val[:, top_idx]
shap_vals_top = shap_vals[:, top_idx]
feature_names_top = np.array(feature_names)[top_idx]

# show=False keeps the figure open so it can be restyled below.
shap.summary_plot(
    shap_vals_top,
    features=X_val_top,
    feature_names=feature_names_top,
    show=False
)

fig, ax = plt.gcf(), plt.gca()

ax.set_xlabel("SHAP value (impact on model output)", color="white", fontsize=12)
ax.set_ylabel("", color="white")
ax.set_title(f"Top {top_n} SHAP Features for '{target_name}'", color="white", fontsize=14)
ax.tick_params(colors="white")
for spine in ax.spines.values():
    spine.set_color("white")
plt.yticks(color="white")

# Recolor every plotted collection in one flat color.
for coll in ax.collections:
    coll.set_facecolor("lightblue")
    coll.set_edgecolor("lightblue")

plt.show()

In [None]:
# SHAP summary plot of the most influential features for one class.
target_name = "book"
top_n = 10  # number of features plotted; the title is derived from it so the two cannot disagree

target_cls = np.where(class_labels == target_name)[0][0]
shap_vals = shap_vals_list[target_cls]

# Rank features by mean |SHAP| and keep the top_n largest (ascending order).
mean_abs = np.abs(shap_vals).mean(axis=0)
top_idx = np.argsort(mean_abs)[-top_n:]

X_val_top = X_val[:, top_idx]
shap_vals_top = shap_vals[:, top_idx]
feature_names_top = np.array(feature_names)[top_idx]

# show=False keeps the figure open so it can be restyled below.
shap.summary_plot(
    shap_vals_top,
    features=X_val_top,
    feature_names=feature_names_top,
    show=False
)

fig, ax = plt.gcf(), plt.gca()

ax.set_xlabel("SHAP value (impact on model output)", color="white", fontsize=12)
ax.set_ylabel("", color="white")
ax.set_title(f"Top {top_n} SHAP Features for '{target_name}'", color="white", fontsize=14)
ax.tick_params(colors="white")
for spine in ax.spines.values():
    spine.set_color("white")
plt.yticks(color="white")

# Recolor every plotted collection in one flat color.
for coll in ax.collections:
    coll.set_facecolor("lightblue")
    coll.set_edgecolor("lightblue")

plt.show()

In [None]:
# SHAP summary plot of the most influential features for one class.
target_name = "concert"
top_n = 10  # number of features plotted; the title is derived from it so the two cannot disagree

target_cls = np.where(class_labels == target_name)[0][0]
shap_vals = shap_vals_list[target_cls]

# Rank features by mean |SHAP| and keep the top_n largest (ascending order).
mean_abs = np.abs(shap_vals).mean(axis=0)
top_idx = np.argsort(mean_abs)[-top_n:]

X_val_top = X_val[:, top_idx]
shap_vals_top = shap_vals[:, top_idx]
feature_names_top = np.array(feature_names)[top_idx]

# show=False keeps the figure open so it can be restyled below.
shap.summary_plot(
    shap_vals_top,
    features=X_val_top,
    feature_names=feature_names_top,
    show=False
)

fig, ax = plt.gcf(), plt.gca()

ax.set_xlabel("SHAP value (impact on model output)", color="white", fontsize=12)
ax.set_ylabel("", color="white")
ax.set_title(f"Top {top_n} SHAP Features for '{target_name}'", color="white", fontsize=14)
ax.tick_params(colors="white")
for spine in ax.spines.values():
    spine.set_color("white")
plt.yticks(color="white")

# Recolor every plotted collection in one flat color.
for coll in ax.collections:
    coll.set_facecolor("lightblue")
    coll.set_edgecolor("lightblue")

plt.show()

In [None]:
# SHAP summary plot of the most influential features for one class.
target_name = "dance"
top_n = 10  # number of features plotted; the title is derived from it so the two cannot disagree

target_cls = np.where(class_labels == target_name)[0][0]
shap_vals = shap_vals_list[target_cls]

# Rank features by mean |SHAP| and keep the top_n largest (ascending order).
mean_abs = np.abs(shap_vals).mean(axis=0)
top_idx = np.argsort(mean_abs)[-top_n:]

X_val_top = X_val[:, top_idx]
shap_vals_top = shap_vals[:, top_idx]
feature_names_top = np.array(feature_names)[top_idx]

# show=False keeps the figure open so it can be restyled below.
shap.summary_plot(
    shap_vals_top,
    features=X_val_top,
    feature_names=feature_names_top,
    show=False
)

fig, ax = plt.gcf(), plt.gca()

ax.set_xlabel("SHAP value (impact on model output)", color="white", fontsize=12)
ax.set_ylabel("", color="white")
ax.set_title(f"Top {top_n} SHAP Features for '{target_name}'", color="white", fontsize=14)
ax.tick_params(colors="white")
for spine in ax.spines.values():
    spine.set_color("white")
plt.yticks(color="white")

# Recolor every plotted collection in one flat color.
for coll in ax.collections:
    coll.set_facecolor("lightblue")
    coll.set_edgecolor("lightblue")

plt.show()

In [None]:
# SHAP summary plot of the most influential features for one class.
target_name = "opera"
top_n = 10  # number of features plotted; the title is derived from it so the two cannot disagree

target_cls = np.where(class_labels == target_name)[0][0]
shap_vals = shap_vals_list[target_cls]

# Rank features by mean |SHAP| and keep the top_n largest (ascending order).
mean_abs = np.abs(shap_vals).mean(axis=0)
top_idx = np.argsort(mean_abs)[-top_n:]

X_val_top = X_val[:, top_idx]
shap_vals_top = shap_vals[:, top_idx]
feature_names_top = np.array(feature_names)[top_idx]

# show=False keeps the figure open so it can be restyled below.
shap.summary_plot(
    shap_vals_top,
    features=X_val_top,
    feature_names=feature_names_top,
    show=False
)

fig, ax = plt.gcf(), plt.gca()

ax.set_xlabel("SHAP value (impact on model output)", color="white", fontsize=12)
ax.set_ylabel("", color="white")
ax.set_title(f"Top {top_n} SHAP Features for '{target_name}'", color="white", fontsize=14)
ax.tick_params(colors="white")
for spine in ax.spines.values():
    spine.set_color("white")
plt.yticks(color="white")

# Recolor every plotted collection in one flat color.
for coll in ax.collections:
    coll.set_facecolor("lightblue")
    coll.set_edgecolor("lightblue")

plt.show()

In [None]:
# SHAP summary plot of the most influential features for one class.
target_name = "poetry"
top_n = 10  # number of features plotted; the title is derived from it so the two cannot disagree

target_cls = np.where(class_labels == target_name)[0][0]
shap_vals = shap_vals_list[target_cls]

# Rank features by mean |SHAP| and keep the top_n largest (ascending order).
mean_abs = np.abs(shap_vals).mean(axis=0)
top_idx = np.argsort(mean_abs)[-top_n:]

X_val_top = X_val[:, top_idx]
shap_vals_top = shap_vals[:, top_idx]
feature_names_top = np.array(feature_names)[top_idx]

# show=False keeps the figure open so it can be restyled below.
shap.summary_plot(
    shap_vals_top,
    features=X_val_top,
    feature_names=feature_names_top,
    show=False
)

fig, ax = plt.gcf(), plt.gca()

ax.set_xlabel("SHAP value (impact on model output)", color="white", fontsize=12)
ax.set_ylabel("", color="white")
ax.set_title(f"Top {top_n} SHAP Features for '{target_name}'", color="white", fontsize=14)
ax.tick_params(colors="white")
for spine in ax.spines.values():
    spine.set_color("white")
plt.yticks(color="white")

# Recolor every plotted collection in one flat color.
for coll in ax.collections:
    coll.set_facecolor("lightblue")
    coll.set_edgecolor("lightblue")

plt.show()

In [None]:
# SHAP summary plot of the most influential features for one class.
target_name = "art"
top_n = 10  # number of features plotted; the title is derived from it so the two cannot disagree

target_cls = np.where(class_labels == target_name)[0][0]
shap_vals = shap_vals_list[target_cls]

# Rank features by mean |SHAP| and keep the top_n largest (ascending order).
mean_abs = np.abs(shap_vals).mean(axis=0)
top_idx = np.argsort(mean_abs)[-top_n:]

X_val_top = X_val[:, top_idx]
shap_vals_top = shap_vals[:, top_idx]
feature_names_top = np.array(feature_names)[top_idx]

# show=False keeps the figure open so it can be restyled below.
shap.summary_plot(
    shap_vals_top,
    features=X_val_top,
    feature_names=feature_names_top,
    show=False
)

fig, ax = plt.gcf(), plt.gca()

ax.set_xlabel("SHAP value (impact on model output)", color="white", fontsize=12)
ax.set_ylabel("", color="white")
ax.set_title(f"Top {top_n} SHAP Features for '{target_name}'", color="white", fontsize=14)
ax.tick_params(colors="white")
for spine in ax.spines.values():
    spine.set_color("white")
plt.yticks(color="white")

# Recolor every plotted collection in one flat color.
for coll in ax.collections:
    coll.set_facecolor("lightblue")
    coll.set_edgecolor("lightblue")

plt.show()

## LIME

In [None]:
def lime_predict_proba(raw_texts):
    """Score *raw_texts* for LIME: clean each text with ``clean_for_svm``,
    take the SVM pipeline's decision margins and softmax them row-wise.

    The softmax only rescales margins into a distribution per row; it is
    not a calibrated probability.
    """
    scores = svm_pipeline.decision_function(list(map(clean_for_svm, raw_texts)))
    return softmax(scores, axis=1)


# Bag-of-words explainer; tokens are split on runs of non-word characters.
lime_explainer = LimeTextExplainer(
    class_names=class_labels, split_expression=r"\W+", bow=True
)

target_class_name = "theater"
target_class_idx = np.where(class_labels == target_class_name)[0][0]

n_samples_to_explain = 50
num_features = 20

# Collect every weight LIME assigns to each word across the explained texts.
word_importance = defaultdict(list)
for doc in val_texts[:n_samples_to_explain]:
    explanation = lime_explainer.explain_instance(
        text_instance=doc,
        classifier_fn=lime_predict_proba,
        labels=[target_class_idx],
        num_features=num_features,
    )
    for token, token_weight in explanation.as_list(label=target_class_idx):
        word_importance[token].append(token_weight)

# Aggregate to one score per word: mean absolute weight.
lime_word_scores = {}
for token, collected in word_importance.items():
    lime_word_scores[token] = np.mean(np.abs(collected))

top_k = 10
ranked_words = sorted(lime_word_scores.items(), key=lambda item: item[1], reverse=True)
top_words = ranked_words[:top_k]

print(f"\nTop {top_k} LIME words for class '{target_class_name}':\n")
for token, mean_weight in top_words:
    print(f"{token:<15} {mean_weight:.4f}")


In [None]:
def lime_predict_proba(raw_texts):
    """Score *raw_texts* for LIME: clean each text with ``clean_for_svm``,
    take the SVM pipeline's decision margins and softmax them row-wise.

    The softmax only rescales margins into a distribution per row; it is
    not a calibrated probability.
    """
    scores = svm_pipeline.decision_function(list(map(clean_for_svm, raw_texts)))
    return softmax(scores, axis=1)


# Bag-of-words explainer; tokens are split on runs of non-word characters.
lime_explainer = LimeTextExplainer(
    class_names=class_labels, split_expression=r"\W+", bow=True
)

target_class_name = "art"
target_class_idx = np.where(class_labels == target_class_name)[0][0]

n_samples_to_explain = 50
num_features = 20

# Collect every weight LIME assigns to each word across the explained texts.
word_importance = defaultdict(list)
for doc in val_texts[:n_samples_to_explain]:
    explanation = lime_explainer.explain_instance(
        text_instance=doc,
        classifier_fn=lime_predict_proba,
        labels=[target_class_idx],
        num_features=num_features,
    )
    for token, token_weight in explanation.as_list(label=target_class_idx):
        word_importance[token].append(token_weight)

# Aggregate to one score per word: mean absolute weight.
lime_word_scores = {}
for token, collected in word_importance.items():
    lime_word_scores[token] = np.mean(np.abs(collected))

top_k = 10
ranked_words = sorted(lime_word_scores.items(), key=lambda item: item[1], reverse=True)
top_words = ranked_words[:top_k]

print(f"\nTop {top_k} LIME words for class '{target_class_name}':\n")
for token, mean_weight in top_words:
    print(f"{token:<15} {mean_weight:.4f}")


In [None]:
def lime_predict_proba(raw_texts):
    """Score *raw_texts* for LIME: clean each text with ``clean_for_svm``,
    take the SVM pipeline's decision margins and softmax them row-wise.

    The softmax only rescales margins into a distribution per row; it is
    not a calibrated probability.
    """
    scores = svm_pipeline.decision_function(list(map(clean_for_svm, raw_texts)))
    return softmax(scores, axis=1)


# Bag-of-words explainer; tokens are split on runs of non-word characters.
lime_explainer = LimeTextExplainer(
    class_names=class_labels, split_expression=r"\W+", bow=True
)

target_class_name = "concert"
target_class_idx = np.where(class_labels == target_class_name)[0][0]

n_samples_to_explain = 50
num_features = 20

# Collect every weight LIME assigns to each word across the explained texts.
word_importance = defaultdict(list)
for doc in val_texts[:n_samples_to_explain]:
    explanation = lime_explainer.explain_instance(
        text_instance=doc,
        classifier_fn=lime_predict_proba,
        labels=[target_class_idx],
        num_features=num_features,
    )
    for token, token_weight in explanation.as_list(label=target_class_idx):
        word_importance[token].append(token_weight)

# Aggregate to one score per word: mean absolute weight.
lime_word_scores = {}
for token, collected in word_importance.items():
    lime_word_scores[token] = np.mean(np.abs(collected))

top_k = 10
ranked_words = sorted(lime_word_scores.items(), key=lambda item: item[1], reverse=True)
top_words = ranked_words[:top_k]

print(f"\nTop {top_k} LIME words for class '{target_class_name}':\n")
for token, mean_weight in top_words:
    print(f"{token:<15} {mean_weight:.4f}")

In [None]:
def lime_predict_proba(raw_texts):
    """Score *raw_texts* for LIME: clean each text with ``clean_for_svm``,
    take the SVM pipeline's decision margins and softmax them row-wise.

    The softmax only rescales margins into a distribution per row; it is
    not a calibrated probability.
    """
    scores = svm_pipeline.decision_function(list(map(clean_for_svm, raw_texts)))
    return softmax(scores, axis=1)


# Bag-of-words explainer; tokens are split on runs of non-word characters.
lime_explainer = LimeTextExplainer(
    class_names=class_labels, split_expression=r"\W+", bow=True
)

target_class_name = "dance"
target_class_idx = np.where(class_labels == target_class_name)[0][0]

n_samples_to_explain = 50
num_features = 20

# Collect every weight LIME assigns to each word across the explained texts.
word_importance = defaultdict(list)
for doc in val_texts[:n_samples_to_explain]:
    explanation = lime_explainer.explain_instance(
        text_instance=doc,
        classifier_fn=lime_predict_proba,
        labels=[target_class_idx],
        num_features=num_features,
    )
    for token, token_weight in explanation.as_list(label=target_class_idx):
        word_importance[token].append(token_weight)

# Aggregate to one score per word: mean absolute weight.
lime_word_scores = {}
for token, collected in word_importance.items():
    lime_word_scores[token] = np.mean(np.abs(collected))

top_k = 10
ranked_words = sorted(lime_word_scores.items(), key=lambda item: item[1], reverse=True)
top_words = ranked_words[:top_k]

print(f"\nTop {top_k} LIME words for class '{target_class_name}':\n")
for token, mean_weight in top_words:
    print(f"{token:<15} {mean_weight:.4f}")


In [None]:
def lime_predict_proba(raw_texts):
    """Score *raw_texts* for LIME: clean each text with ``clean_for_svm``,
    take the SVM pipeline's decision margins and softmax them row-wise.

    The softmax only rescales margins into a distribution per row; it is
    not a calibrated probability.
    """
    scores = svm_pipeline.decision_function(list(map(clean_for_svm, raw_texts)))
    return softmax(scores, axis=1)


# Bag-of-words explainer; tokens are split on runs of non-word characters.
lime_explainer = LimeTextExplainer(
    class_names=class_labels, split_expression=r"\W+", bow=True
)

target_class_name = "opera"
target_class_idx = np.where(class_labels == target_class_name)[0][0]

n_samples_to_explain = 50
num_features = 20

# Collect every weight LIME assigns to each word across the explained texts.
word_importance = defaultdict(list)
for doc in val_texts[:n_samples_to_explain]:
    explanation = lime_explainer.explain_instance(
        text_instance=doc,
        classifier_fn=lime_predict_proba,
        labels=[target_class_idx],
        num_features=num_features,
    )
    for token, token_weight in explanation.as_list(label=target_class_idx):
        word_importance[token].append(token_weight)

# Aggregate to one score per word: mean absolute weight.
lime_word_scores = {}
for token, collected in word_importance.items():
    lime_word_scores[token] = np.mean(np.abs(collected))

top_k = 10
ranked_words = sorted(lime_word_scores.items(), key=lambda item: item[1], reverse=True)
top_words = ranked_words[:top_k]

print(f"\nTop {top_k} LIME words for class '{target_class_name}':\n")
for token, mean_weight in top_words:
    print(f"{token:<15} {mean_weight:.4f}")


In [None]:
def lime_predict_proba(raw_texts):
    """Score *raw_texts* for LIME: clean each text with ``clean_for_svm``,
    take the SVM pipeline's decision margins and softmax them row-wise.

    The softmax only rescales margins into a distribution per row; it is
    not a calibrated probability.
    """
    scores = svm_pipeline.decision_function(list(map(clean_for_svm, raw_texts)))
    return softmax(scores, axis=1)


# Bag-of-words explainer; tokens are split on runs of non-word characters.
lime_explainer = LimeTextExplainer(
    class_names=class_labels, split_expression=r"\W+", bow=True
)

target_class_name = "poetry"
target_class_idx = np.where(class_labels == target_class_name)[0][0]

n_samples_to_explain = 50
num_features = 20

# Collect every weight LIME assigns to each word across the explained texts.
word_importance = defaultdict(list)
for doc in val_texts[:n_samples_to_explain]:
    explanation = lime_explainer.explain_instance(
        text_instance=doc,
        classifier_fn=lime_predict_proba,
        labels=[target_class_idx],
        num_features=num_features,
    )
    for token, token_weight in explanation.as_list(label=target_class_idx):
        word_importance[token].append(token_weight)

# Aggregate to one score per word: mean absolute weight.
lime_word_scores = {}
for token, collected in word_importance.items():
    lime_word_scores[token] = np.mean(np.abs(collected))

top_k = 10
ranked_words = sorted(lime_word_scores.items(), key=lambda item: item[1], reverse=True)
top_words = ranked_words[:top_k]

print(f"\nTop {top_k} LIME words for class '{target_class_name}':\n")
for token, mean_weight in top_words:
    print(f"{token:<15} {mean_weight:.4f}")
