In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
from captum.attr import IntegratedGradients
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [None]:
model = "cardiffnlp/twitter-roberta-base-sentiment"

tokenizer = AutoTokenizer.from_pretrained(model)
model = AutoModelForSequenceClassification.from_pretrained(model)
model.eval()
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)


In [11]:

label_mapping = {
    "LABEL_0": "negative",
    "LABEL_1": "neutral",
    "LABEL_2": "positive"
}

text = "I do not want to do it!"

inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
input_ids = inputs['input_ids'].long()  
attention_mask = inputs['attention_mask']


result = sentiment_pipeline(text)
label = result[0]['label']
mapped_label = label_mapping[label]
confidence = result[0]['score']


label_to_index = {"LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2}
target = label_to_index[label]

embedding_layer = model.roberta.embeddings  
with torch.no_grad():
    embeddings = embedding_layer(input_ids).clone().detach().requires_grad_(True)

def forward_func(embeddings, attention_mask):
    model_inputs = {
        'inputs_embeds': embeddings,
        'attention_mask': attention_mask
    }
    outputs = model(**model_inputs)
    return outputs.logits

ig = IntegratedGradients(forward_func)

attributions, delta = ig.attribute(
    inputs=embeddings,
    additional_forward_args=(attention_mask,),
    target=target,
    return_convergence_delta=True
)

attributions_sum = attributions.sum(dim=-1).squeeze(0)
attributions_sum = attributions_sum.detach().numpy()
tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze(0))
tokens = [token.replace("Ġ", "") for token in tokens]

attributions_sum = attributions_sum / np.max(np.abs(attributions_sum))
top_indices = np.argsort(np.abs(attributions_sum))[-3:]  

print(f"Text: {text}")
print(f"Predicted label: {mapped_label}")
print(f"Confidence: {confidence:.4f}")

print("\nTop 3 tokens with the strongest attributions:")

for idx in top_indices:
    token = tokens[idx]
    
    attr = attributions_sum[idx]
    print(f"{token:20} | Attribution: {attr:.4f}")



Text: I do not want to do it!
Predicted label: negative
Confidence: 0.9490

Top 3 tokens with the strongest attributions:
I                    | Attribution: 0.3822
not                  | Attribution: 0.9909
want                 | Attribution: 1.0000
