In [2]:
# imports
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForMaskedLM, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from torch.utils.data import Dataset
from transformers import AutoConfig
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import EvalPrediction
from transformers import Trainer
from transformers import TrainingArguments
import evaluate
import torch
from typing import Any
from typing import Dict
from typing import Optional
from torch.utils.data import Dataset
from datasets import load_dataset
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## Load and preprocess News dataset

In [3]:
data = load_dataset('masakhane/masakhanews', 'swa') 

display(data['train'][0])

def change_label(data_point):
    if data_point["label"] == 3:
        data_point["label"] = 1
    else:
        data_point["label"] = 0
    return data_point

politics_label_data = data.map(change_label)

display(politics_label_data)
display(politics_label_data["train"][:5])

{'label': 5,
 'headline': 'Tetesi za soka Ulaya Jumatatu 26.04.2021: Varane, Camara, Nagelsmann, Willock, Azpilicueta',
 'text': 'Chelsea wapo mbele ya  Manchester United na Paris St-Germain katika mbio za kutaka kumsajili beki wa Real Madrid na timu ya taifa ya Ufaransa Raphael Varane, 28. (Mundo Deportivo - in Spanish) Beki wa Guinea Ali Camara, 23, ambaye anachezea klabu ya Young Boys ya Switzerland, amezivutia klabu kadhaa za Ligi ya Primia zikiwemo  Liverpool, Arsenal, Crystal Palace, West Ham United na Norwich. (Team Talk) Bayern Munich imeanzisha mazungumzo ya kutaka kumsajili kocha wa RB Leipzig Julian Nagelsmann. (Independent) Arsenal wapo njia panda juu ya mustakabali wa kiungo wao Joe Willock, 21, ambaye yupo Newcastle kwa mkopo. Arsenal wanahitaji kuuza baadhi ya wachezaji ili kujiimarisha kifedha. (Football London) Kocha wa Atletico Madrid  Diego Simeone ana nia ya kumsajili beki raia wa Uhispania Cesar Azpilicueta, 31, kutoka  Chelsea. (El Gol Digital - in Spanish) Manche

DatasetDict({
    train: Dataset({
        features: ['label', 'headline', 'text', 'headline_text', 'url'],
        num_rows: 1658
    })
    validation: Dataset({
        features: ['label', 'headline', 'text', 'headline_text', 'url'],
        num_rows: 237
    })
    test: Dataset({
        features: ['label', 'headline', 'text', 'headline_text', 'url'],
        num_rows: 476
    })
})

{'label': [0, 0, 1, 0, 0],
 'headline': ['Tetesi za soka Ulaya Jumatatu 26.04.2021: Varane, Camara, Nagelsmann, Willock, Azpilicueta',
  'Je chanjo ya corona ni salama?',
  'Matokeo ya uchaguzi Marekani 2020: Donald Trump amfuta kazi Waziri wa Ulinzi Mark Esper',
  'Je wajua mwanamke na mwanaume hawapaswi kufanya mazoezi pamoja?',
  'Watoto waliolazimika kuwa kimya kuhusu baba zao wakutana na maaskofu jijini Paris'],
 'text': ['Chelsea wapo mbele ya  Manchester United na Paris St-Germain katika mbio za kutaka kumsajili beki wa Real Madrid na timu ya taifa ya Ufaransa Raphael Varane, 28. (Mundo Deportivo - in Spanish) Beki wa Guinea Ali Camara, 23, ambaye anachezea klabu ya Young Boys ya Switzerland, amezivutia klabu kadhaa za Ligi ya Primia zikiwemo  Liverpool, Arsenal, Crystal Palace, West Ham United na Norwich. (Team Talk) Bayern Munich imeanzisha mazungumzo ya kutaka kumsajili kocha wa RB Leipzig Julian Nagelsmann. (Independent) Arsenal wapo njia panda juu ya mustakabali wa kiungo w

## Load Pre-Trained Model
### AfriBerta

In [8]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification, PreTrainedModel

tokenizer = AutoTokenizer.from_pretrained("castorini/afriberta_base")
model = AutoModelForSequenceClassification.from_pretrained("castorini/afriberta_base", num_labels=2, problem_type = "single_label_classification")

tokenizer.model_max_length = 512

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at castorini/afriberta_base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Tokenize Data

In [9]:

def tokenize_function(datapoints):
    return tokenizer(datapoints["headline_text"], padding="max_length", truncation=True)


tokenized_datasets = politics_label_data.map(tokenize_function, batched=True)

#small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
#small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

tokenized_datasets['train']

Dataset({
    features: ['label', 'headline', 'text', 'headline_text', 'url', 'input_ids', 'attention_mask'],
    num_rows: 1658
})

## Train Baseline on News Dataset

In [10]:

from transformers import DataCollatorWithPadding

# Define custom loss function
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

def compute_metrics(pred: EvalPrediction) -> Dict[str, float]:
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}


training_args = TrainingArguments(output_dir="test_trainer", eval_strategy="epoch")

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer= tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

  0%|          | 0/624 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

{'eval_loss': 0.19558237493038177, 'eval_accuracy': 0.9282700421940928, 'eval_f1': 0.8495575221238938, 'eval_precision': 0.7619047619047619, 'eval_recall': 0.96, 'eval_runtime': 2.7403, 'eval_samples_per_second': 86.486, 'eval_steps_per_second': 10.948, 'epoch': 1.0}


  0%|          | 0/30 [00:00<?, ?it/s]

{'eval_loss': 0.15381917357444763, 'eval_accuracy': 0.9578059071729957, 'eval_f1': 0.9038461538461539, 'eval_precision': 0.8703703703703703, 'eval_recall': 0.94, 'eval_runtime': 2.7326, 'eval_samples_per_second': 86.732, 'eval_steps_per_second': 10.979, 'epoch': 2.0}


Non-default generation parameters: {'max_length': 512}


{'loss': 0.1875, 'grad_norm': 0.0465824194252491, 'learning_rate': 9.935897435897435e-06, 'epoch': 2.4}


Non-default generation parameters: {'max_length': 512}


  0%|          | 0/30 [00:00<?, ?it/s]

{'eval_loss': 0.1827808916568756, 'eval_accuracy': 0.9578059071729957, 'eval_f1': 0.9056603773584906, 'eval_precision': 0.8571428571428571, 'eval_recall': 0.96, 'eval_runtime': 2.783, 'eval_samples_per_second': 85.16, 'eval_steps_per_second': 10.78, 'epoch': 3.0}
{'train_runtime': 192.5664, 'train_samples_per_second': 25.83, 'train_steps_per_second': 3.24, 'train_loss': 0.1640978012329493, 'epoch': 3.0}


TrainOutput(global_step=624, training_loss=0.1640978012329493, metrics={'train_runtime': 192.5664, 'train_samples_per_second': 25.83, 'train_steps_per_second': 3.24, 'total_flos': 875500023730176.0, 'train_loss': 0.1640978012329493, 'epoch': 3.0})

In [11]:
import re
from collections import OrderedDict
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset
from transformers import pipeline
import nltk



def get_top_attended_tokens(input_ids, attention_weights, top_k=5):
    # Sum attention weights across all layers and heads
    aggregated_attentions = attention_weights.sum(dim=(0, 2)).mean(dim=0)

    tokens = tokenizer.convert_ids_to_tokens(input_ids.tolist())
    token_attention_pairs = list(zip(tokens, aggregated_attentions.tolist()))
    
        # Filter out special tokens and punctuation
    filtered_pairs = [
        (token, attn) for token, attn in token_attention_pairs
        if token not in tokenizer.all_special_tokens
        and not re.match(r'[^\w\s]', token)  # Remove punctuation
    ]
    
    
    
    sorted_pairs = sorted(filtered_pairs, key=lambda x: x[1], reverse=True)
    
    unique_tokens = OrderedDict()
    for token, _ in sorted_pairs:
        if token not in unique_tokens and len(unique_tokens) < top_k:
            unique_tokens[token] = None
    
    return list(unique_tokens.keys())



def get_sentences_with_tokens(full_text, tokens):
    sentences = nltk.sent_tokenize(full_text)
    relevant_sentences = []
    for sentence in sentences:
        highlighted_sentence = sentence
        for token in tokens:
            pattern = re.compile(re.escape(token), re.IGNORECASE)
            highlighted_sentence = pattern.sub(f"**{token}**", highlighted_sentence)
        if highlighted_sentence != sentence:
            relevant_sentences.append(highlighted_sentence)
    return relevant_sentences


# Set the model to evaluation mode
model.eval()

pipe = pipeline(task ="text-classification", model = model, tokenizer=tokenizer, device=device,max_length=512,  truncation=True)
# Disable gradient calculations for inference
with torch.no_grad():
    for example in tokenized_datasets["test"]:
        
        result = pipe(example['headline_text'], truncation=True, max_length=512)
        
        
        #print(result)
        if result[0]['label'] == 'LABEL_1':
        

            # Prepare inputs
            inputs = {
                'input_ids': torch.tensor(example['input_ids']).unsqueeze(0).to(device),
                'attention_mask': torch.tensor(example['attention_mask']).unsqueeze(0).to(device),
            }
            # Handle labels
            if 'label' in example:
                labels = torch.tensor([example['label']]).to(device)
            else:
                labels = None
            # Forward pass
            outputs = model(**inputs, labels=labels, output_attentions=True)
            # Process the outputs
            attentions = outputs.attentions

            # Stack all attention layers
            all_attentions = torch.stack(attentions).squeeze(1)

            # Get top attended tokens

            top_tokens = get_top_attended_tokens(inputs['input_ids'][0], all_attentions, top_k=5)
            print(f"Top 5 attended tokens: {top_tokens}")
            # Get the full text
            full_text = tokenizer.decode(inputs['input_ids'][0])

            # Get sentences containing the top words
            relevant_sentences = get_sentences_with_tokens(full_text, top_tokens)
            
            print("Relevant sentences:")
            for sentence in relevant_sentences:
                print(f"- {sentence}")
            print("\n")
        

Top 5 attended tokens: ['iongoza', 'cheza', '1', 'sikika', 'ambatanish']
Relevant sentences:
- <s> Siku **1**00 za utawala wa Ruto: Uzito kwa rais wa Kenya ku**ambatanish**a maneno na vitendo Siku zina kasi.Na hazina huruma iwapo ndimi za viongozi wa kisiasa zinatoa cheche na ahadi na ku**sikika** kama wimbo mtamu katika masikio ya wapiga kura na wananchi kwa jumla.
- Kwa rais wa Kenya William Ruto siku zake **1**00 akiwa usukani ku**iongoza** nchi hiyo zimetimia leo tarehe 22 Disemba 2022.
- Hata hivyo sio rahisi kukadiria mafanikio ya utawala wake katika siku **1**00 bila kuzingatia mambo kadhaa.
- Kuna tofauti ya m**cheza** densi na mpiga gita-mmoja ana utaalamu wa kusuka sauti za nyaya za gita kutoa mlio wa kuvutia kuwa muziki.
- Awali ahadi zake zilikuwa rahisi kuzitoa na zili**sikika** kama wimbo kwa wafuasi wake.Lakini sasa,kibarua kimebadilika baada ya muziki kukoma na kazi ngumu ya kutekeleza ahadi hizo kuanza.


Top 5 attended tokens: ['nte', 'tics', 'orodheshwa', 'nch', 'idh