# Evaluating Transformer Models for Text Classification of Cryptocurrency News through a Sustainability Lens

In [None]:
# Log in to the Hugging Face Hub; login() renders an interactive login widget in the notebook
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### 1. Install all requirements

In [None]:
# Install transformers
!pip install git+https://github.com/huggingface/transformers

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-i9umodpi
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-i9umodpi
  Resolved https://github.com/huggingface/transformers to commit eed11f34abe558c76590366a21316318e5f820ed
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
# Install weights & Bias for model tracking
!pip install wandb



In [None]:
# Install other dependencies
try:
  import datasets, evaluate, accelerate
  import gradio as gr
except ModuleNotFoundError:
  %pip install -U datasets evaluate accelerate gradio # -U --> Update
  import datasets, evaluate, accelerate
  import gradio as gr

import random

import numpy as np
import pandas as pd

import torch
import transformers

print(f"Using transformers version: {transformers.__version__}")
print(f"Using datasets version: {datasets.__version__}")
print(f"Using torch version: {torch.__version__}")

Using transformers version: 4.47.0.dev0
Using datasets version: 3.1.0
Using torch version: 2.5.1+cu121


### 2. Import dataset from Hugging Face
> link to data: https://huggingface.co/datasets/arad1367/sustainability_impact_crypto_data

In [None]:
from datasets import load_dataset
import random

dataset = load_dataset("arad1367/sustainability_impact_crypto_data")

# Peek at five randomly chosen training rows to sanity-check texts and labels
train_split = dataset["train"]
random_indexs = random.sample(range(len(train_split)), 5)
random_samples = train_split[random_indexs]

print(f"[INFO] Random samples from dataset:\n")
for text, label in zip(random_samples["text"], random_samples["label"]):
    print(f"Text: {text} | Label: {label}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


[INFO] Random samples from dataset:

Text: Investigations reveal that crypto companies often downplay their environmental impact in public statements. | Label: negative
Text: Environmental groups express outrage over the lack of regulations governing the environmental impact of crypto mining. | Label: negative
Text: Blockchain startup AN has committed to funding clean water projects through a percentage of transaction fees. | Label: positive
Text: Research finds that many cryptocurrency projects lack effective plans to mitigate their environmental impact. | Label: negative
Text: Blockchain technology is being tested in various sectors, including real estate, for its potential benefits. | Label: neutral


### 3. Text tokenizing function

In [None]:
def tokenize_text(examples):
    """
    Run the notebook-level `tokenizer` over the "text" field of `examples`.

    Padding and truncation are both switched on; the tokenizer output is returned unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

### 4. Evaluation function

In [None]:
import evaluate
import numpy as np
from typing import Tuple

accuracy_metric = evaluate.load("accuracy")

def compute_accuracy(predictions_and_labels: Tuple[np.ndarray, np.ndarray]) -> dict:
  """
  Computes the accuracy of a model by comparing the predictions and labels.

  Args:
    predictions_and_labels: A (predictions, labels) pair. `predictions` may hold
      either class indices or per-class scores with two or more dimensions.

  Returns:
    Whatever `accuracy_metric.compute` returns for the given predictions and labels.
  """
  predictions, labels = predictions_and_labels

  # Per-class scores (2+ dimensions) are reduced to the index of the highest score per row
  if len(predictions.shape) >= 2:
    predictions = np.argmax(predictions, axis=1)

  return accuracy_metric.compute(predictions=predictions, references=labels)

In [None]:
# Authenticate with Weights & Biases before the wandb.init()/wandb.log() calls in later cells
import wandb
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mpejdaniel1000[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

### 5. All steps in one shot


### 5.1. roberta-large-mnli model

In [None]:
# 1. Import necessary packages
import pprint
from pathlib import Path

import numpy as np
import torch

import datasets
import evaluate

from transformers import pipeline
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from scipy.stats import entropy
import time
import psutil
from typing import Dict, List, Union


# Start a W&B run for this model in the shared comparison project
# (a manual wandb.finish() beforehand is left disabled).
# wandb.finish()
wandb.init(project="crypto-sustainability-news-5-comparison-main", reinit=True)

# 2. Setup variables for model training and saving pipeline
DATASET_NAME = "arad1367/sustainability_impact_crypto_data"
MODEL_NAME = "FacebookAI/roberta-large-mnli"
MODEL_SAVE_DIR_NAME = "models_colab/sustainability_news_roberta-large-mnli_five_epochs"

# 3. Create a directory for saving models
# Note: exist_ok=True reuses the directory when it already exists, so a model saved
# into it later lands in the same place as any previously saved one.
print(f"[INFO] Creating directory for saving models: {MODEL_SAVE_DIR_NAME}")
model_save_dir = Path(MODEL_SAVE_DIR_NAME)
model_save_dir.mkdir(parents=True, exist_ok=True)

# 4. Load and preprocess the dataset from Hugging Face Hub
print(f"[INFO] Downloading dataset from Hugging Face Hub, name: {DATASET_NAME}")
dataset = datasets.load_dataset(path=DATASET_NAME)

# Create the id -> label and label -> id mappings (the two dicts are inverses of each other)
id2label = {0: "negative", 1: "neutral", 2: "positive"}
label2id = {"negative": 0, "neutral": 1, "positive": 2}

# Convert each example's string label into its integer class ID
def map_labels_to_number(example):
    """Replace the string in example["label"] with its ID from `label2id` (in place) and return the example."""
    label_name = example["label"]
    example["label"] = label2id[label_name]
    return example

# Map preprocessing function to dataset (only the "train" split is used from here on)
dataset = dataset["train"].map(map_labels_to_number)

# Split the dataset into train/test sets (80/20, fixed seed so the split is repeatable)
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# 5. Import a tokenizer and map it to our dataset
print(f"[INFO] Tokenizing text for model training with tokenizer: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=MODEL_NAME,
                                          use_fast=True)

# Create a preprocessing function to tokenize text
def tokenize_text(examples):
    """Tokenize the "text" field of `examples` with the module-level `tokenizer`, padding and truncating."""
    texts = examples["text"]
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

# Tokenize both splits in batches; tokenize_text pads with padding=True inside each mapped batch
tokenized_dataset = dataset.map(function=tokenize_text,
                                batched=True,
                                batch_size=1000)

# 6. Set up an evaluation metric & function to evaluate our model
accuracy_metric = evaluate.load("accuracy")

def compute_accuracy(predictions_and_labels):
    """Score a (predictions, labels) pair with the module-level accuracy metric."""
    raw_predictions, references = predictions_and_labels

    # Anything with two or more dimensions is treated as per-class scores
    is_score_matrix = len(raw_predictions.shape) >= 2
    predicted_ids = np.argmax(raw_predictions, axis=1) if is_score_matrix else raw_predictions

    # The metric's keyword for the ground truth is "references", not "labels"
    return accuracy_metric.compute(predictions=predicted_ids, references=references)


# 7. Import a model and prepare it for training
# NOTE(review): the load log for this checkpoint only reports unused pooler weights, so its
# existing 3-way classification head appears to be loaded as-is and then fine-tuned under the
# negative/neutral/positive label names given here — confirm that reusing that head is intended.
print(f"[INFO] Loading model: {MODEL_NAME}")
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=MODEL_NAME,
    num_labels=3,
    id2label=id2label,
    label2id=label2id
)
print(f"[INFO] Model loading complete!")

# Create training arguments
# Evaluation, checkpoint saving and logging all happen once per epoch.
training_args = TrainingArguments(
    output_dir=model_save_dir,
    learning_rate=0.00001,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    use_cpu=False,
    seed=42,
    load_best_model_at_end=True,
    logging_strategy="epoch",
    report_to="wandb",  # the MyTrainer.log override defined below additionally calls wandb.log itself
    hub_private_repo=False,
)

# Model tracking with wandb: record the run's hyperparameters in the W&B run config
wandb.config.update({
    "dataset_name": DATASET_NAME,
    "model_name": MODEL_NAME,
    "learning_rate": training_args.learning_rate,
    "weight_decay": training_args.weight_decay,
    "batch_size": training_args.per_device_train_batch_size,
    "num_epochs": training_args.num_train_epochs,
    "seed": training_args.seed
})

print(f"[INFO] Model tracking started ...")

# Setup Trainer
class MyTrainer(Trainer):
    """Trainer subclass that also sends every logged metrics dict to Weights & Biases."""

    def log(self, logs, *args, **kwargs):
        """
        Log `logs` through the base Trainer, then mirror them with `wandb.log`.

        Extra positional/keyword arguments are forwarded untouched, so the override
        keeps working with transformers releases whose training loop calls
        `self.log(logs, start_time)` as well as with those that call `self.log(logs)`.
        """
        super().log(logs, *args, **kwargs)
        wandb.log(logs)

# NOTE(review): the captured run output shows a warning pointing at this constructor call;
# presumably the `tokenizer=` argument deprecation in recent transformers — confirm.
trainer = MyTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_accuracy
)

# 8. Train the model on our text dataset
print(f"[INFO] Commencing model training...")
results = trainer.train()

# 9. Save the trained model (*** note: this will overwrite our previous model ***)
print(f"[INFO] Model training complete, saving model to local path: {model_save_dir}")
trainer.save_model(output_dir=model_save_dir)


# ------------------------ Push Model ------------------------------
# 10. Push the model to the Hugging Face Hub
print(f"[INFO] Uploading model to Hugging Face Hub...")
model_upload_url = trainer.push_to_hub(
    commit_message="Uploading crypto sustainability news text classifier model",
    # token="YOUR_HF_TOKEN_HERE" # requires a "write" HF token
)
print(f"[INFO] Model upload complete, model available at: {model_upload_url}")
# ------------------------ Push Model ------------------------------


# 11. Evaluate the model on the test data
# Note: this is the same split that was passed as eval_dataset above, so these numbers
# repeat the per-epoch evaluation rather than measuring a separate held-out set.
print(f"[INFO] Performing evaluation on test dataset...")
predictions_all = trainer.predict(tokenized_dataset["test"])
prediction_values = predictions_all.predictions
prediction_metrics = predictions_all.metrics

wandb.log(prediction_metrics)

print(f"[INFO] Prediction metrics on the test data:")
pprint.pprint(prediction_metrics)

# Metrics
class ModernEvaluationMetrics:
    """
    Post-training evaluation suite for a fine-tuned sequence-classification model.

    Collects four groups of metrics: computational cost, prediction confidence,
    robustness to random word deletion ("faithfulness") and classification correctness.

    Args:
        model: Model called as ``model(**tokenizer_output)`` whose result exposes ``.logits``.
        tokenizer: Callable tokenizer matching ``model``.
        dataset: Mapping with a ``"test"`` split providing ``"text"`` and ``"label"`` columns.
        id2label: Mapping from class index to class name.
    """

    def __init__(self, model, tokenizer, dataset, id2label):
        self.model = model
        self.tokenizer = tokenizer
        self.dataset = dataset
        self.id2label = id2label
        self.metrics = {}
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Move model to appropriate device and put it in eval mode, so dropout is off
        # and repeated forward passes over the same text give the same logits.
        self.model = self.model.to(self.device)
        self.model.eval()

        # Initialize modern evaluation metrics
        self.accuracy = evaluate.load("accuracy")
        self.precision = evaluate.load("precision")
        self.recall = evaluate.load("recall")
        self.f1 = evaluate.load("f1")

    def _predict_logits(self, texts, batch_size=32):
        """Run the model over `texts` in mini-batches and return the stacked logits as a NumPy array."""
        texts = list(texts)
        collected = []
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = self.tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = self.model(**inputs)
            collected.append(outputs.logits.cpu().numpy())

        if not collected:
            return np.empty((0, len(self.id2label)), dtype=np.float32)
        return np.concatenate(collected, axis=0)

    def _predict_single(self, text):
        """Return ``(predicted class index, highest softmax probability)`` for one text."""
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            logits = self.model(**inputs).logits
        confidence = torch.nn.functional.softmax(logits, dim=1).max().cpu().item()
        return torch.argmax(logits, dim=1).item(), confidence

    def measure_computational_cost(self, batch_size=32):
        """
        Measures computational cost and efficiency metrics for one batch.

        Tokenizes and runs the first `batch_size` test texts once and reports:
          - ``inference_time_per_sample``: wall-clock seconds per text actually processed
            (tokenization + forward pass),
          - ``cpu_memory_usage_mb``: change in process RSS across the run,
          - ``gpu_memory_usage_mb`` (CUDA only): peak additional GPU memory allocated
            during the run, relative to what was allocated before it.

        Raises:
            ValueError: if there are no test texts to measure on.
        """
        texts = list(self.dataset["test"]["text"][:batch_size])
        n_texts = len(texts)
        if n_texts == 0:
            raise ValueError("measure_computational_cost needs at least one test text")

        use_cuda = torch.cuda.is_available()
        process = psutil.Process()

        if use_cuda:
            torch.cuda.empty_cache()
            # Reset the peak counter so the peak read afterwards belongs to this run only
            torch.cuda.reset_peak_memory_stats()
            gpu_memory_before = torch.cuda.memory_allocated() / 1024 / 1024
            torch.cuda.synchronize()

        start_memory = process.memory_info().rss / 1024 / 1024
        start_time = time.perf_counter()

        inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            self.model(**inputs)

        if use_cuda:
            # CUDA kernels are launched asynchronously; wait for them before stopping the clock
            torch.cuda.synchronize()

        end_time = time.perf_counter()
        end_memory = process.memory_info().rss / 1024 / 1024

        metrics = {
            # Divide by the number of texts processed, which can be below `batch_size`
            "inference_time_per_sample": (end_time - start_time) / n_texts,
            "cpu_memory_usage_mb": end_memory - start_memory,
        }

        if use_cuda:
            gpu_memory_peak = torch.cuda.max_memory_allocated() / 1024 / 1024
            metrics["gpu_memory_usage_mb"] = gpu_memory_peak - gpu_memory_before

        return metrics

    def measure_prediction_confidence(self, logits: np.ndarray,
                                      low_confidence_threshold: float = 0.7) -> Dict[str, Union[float, int]]:
        """
        Analyzes how confident the model's predictions are.

        Args:
            logits: Array of raw class scores, one row per sample.
            low_confidence_threshold: Predictions whose top softmax probability is below
                this value are counted as low-confidence.

        Returns:
            Mean/std of the top softmax probability and of the softmax entropy, the number of
            samples whose entropy exceeds mean + one standard deviation, and the number of
            low-confidence predictions.
        """
        logits_tensor = torch.tensor(logits).to(self.device)
        softmax_probs = torch.nn.functional.softmax(logits_tensor, dim=1)

        # Move to CPU for numpy operations
        softmax_probs_cpu = softmax_probs.cpu().numpy()
        prediction_entropy = entropy(softmax_probs_cpu, axis=1)
        confidence_scores = softmax_probs.max(dim=1).values.cpu().numpy()

        # Calculate uncertainty threshold using standard deviation
        uncertainty_threshold = np.mean(prediction_entropy) + np.std(prediction_entropy)

        return {
            "mean_confidence": float(np.mean(confidence_scores)),
            "mean_entropy": float(np.mean(prediction_entropy)),
            "high_uncertainty_samples": int(np.sum(prediction_entropy > uncertainty_threshold)),
            "low_confidence_predictions": int(np.sum(confidence_scores < low_confidence_threshold)),
            "confidence_std": float(np.std(confidence_scores)),
            "entropy_std": float(np.std(prediction_entropy))
        }

    def measure_faithfulness(self, n_samples=100, removal_fraction=0.2, seed=42):
        """
        Measures model faithfulness through input perturbation tests.

        For each of the first `n_samples` test texts, a random `removal_fraction` of its
        words (at least one) is deleted and the prediction on the shortened text is compared
        with the prediction on the original.

        Word selection uses a generator seeded with `seed`, so repeated calls give the same
        result. At least one word is always kept; texts with fewer than two words cannot be
        perturbed that way and are skipped.

        Returns:
            Dict with the fraction of unchanged predictions (also as a percentage) and the
            mean/std of the absolute change in top softmax probability. All values are NaN
            when no text could be perturbed.
        """
        test_texts = self.dataset["test"]["text"][:n_samples]
        rng = np.random.default_rng(seed)

        original_predictions = []
        perturbed_predictions = []
        confidence_changes = []

        for text in test_texts:
            words = text.split()

            # Remove at least one word but never all of them
            n_remove = min(max(1, int(len(words) * removal_fraction)), len(words) - 1)
            if n_remove < 1:
                continue

            removed = set(rng.choice(len(words), n_remove, replace=False).tolist())
            perturbed_text = " ".join(w for i, w in enumerate(words) if i not in removed)

            original_pred, original_conf = self._predict_single(text)
            perturbed_pred, perturbed_conf = self._predict_single(perturbed_text)

            original_predictions.append(original_pred)
            perturbed_predictions.append(perturbed_pred)
            confidence_changes.append(abs(original_conf - perturbed_conf))

        if not original_predictions:
            nan = float("nan")
            return {
                "prediction_stability": nan,
                "stability_score": nan,
                "mean_confidence_change": nan,
                "confidence_change_std": nan
            }

        prediction_stability = np.mean(np.array(original_predictions) == np.array(perturbed_predictions))

        return {
            "prediction_stability": float(prediction_stability),
            "stability_score": float(prediction_stability * 100),
            "mean_confidence_change": float(np.mean(confidence_changes)),
            "confidence_change_std": float(np.std(confidence_changes))
        }

    def measure_answer_correctness(self, predictions: np.ndarray, labels: List[int]) -> Dict:
        """
        Computes overall and per-class classification metrics.

        Args:
            predictions: Array of class scores, one row per sample; the predicted class is the argmax of each row.
            labels: Ground-truth class indices.

        Returns:
            Overall accuracy, weighted precision/recall/F1, and one-vs-rest
            precision/recall/F1 for every class in `id2label`.
        """
        pred_labels = np.argmax(predictions, axis=1)

        # Calculate metrics using evaluate library
        accuracy = self.accuracy.compute(predictions=pred_labels, references=labels)
        precision = self.precision.compute(predictions=pred_labels, references=labels, average='weighted')
        recall = self.recall.compute(predictions=pred_labels, references=labels, average='weighted')
        f1 = self.f1.compute(predictions=pred_labels, references=labels, average='weighted')

        # Calculate per-class metrics by recasting the task as "this class vs. the rest"
        class_metrics = {}
        for idx, class_name in self.id2label.items():
            class_predictions = (pred_labels == idx).astype(int)
            class_labels = (np.array(labels) == idx).astype(int)

            class_metrics[class_name] = {
                "precision": float(self.precision.compute(predictions=class_predictions, references=class_labels)['precision']),
                "recall": float(self.recall.compute(predictions=class_predictions, references=class_labels)['recall']),
                "f1": float(self.f1.compute(predictions=class_predictions, references=class_labels)['f1'])
            }

        return {
            "overall_accuracy": accuracy['accuracy'],
            "weighted_precision": precision['precision'],
            "weighted_recall": recall['recall'],
            "weighted_f1": f1['f1'],
            "per_class_metrics": class_metrics
        }

    def evaluate_all(self, batch_size=32):
        """
        Run all evaluation metrics and return comprehensive results.

        Test-set logits are computed in mini-batches of `batch_size` rather than in a single
        forward pass, so memory use does not grow with the size of the test split.

        Returns:
            The `metrics` dict with keys "computational_cost", "confidence_analysis",
            "faithfulness" and "correctness".
        """
        test_texts = self.dataset["test"]["text"]
        test_labels = list(self.dataset["test"]["label"])

        predictions = self._predict_logits(test_texts, batch_size=batch_size)

        self.metrics["computational_cost"] = self.measure_computational_cost(batch_size=batch_size)
        self.metrics["confidence_analysis"] = self.measure_prediction_confidence(predictions)
        self.metrics["faithfulness"] = self.measure_faithfulness()
        self.metrics["correctness"] = self.measure_answer_correctness(predictions, test_labels)

        return self.metrics

# Initialize and run comprehensive evaluation
# The evaluator reads the "text" and "label" columns of the "test" split of the dataset it is given.
evaluator = ModernEvaluationMetrics(model, tokenizer, tokenized_dataset, id2label)
evaluation_results = evaluator.evaluate_all()

# Log results to wandb (the whole nested results dict goes under one "advanced_metrics" key)
wandb.log({
    "advanced_metrics": evaluation_results
})

# Print detailed results
print(evaluation_results)

# Close the W&B run that was opened at the top of this cell
wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33mpejdaniel1000[0m. Use [1m`wandb login --relogin`[0m to force relogin


[INFO] Creating directory for saving models: models_colab/sustainability_news_roberta-large-mnli_five_epochs
[INFO] Downloading dataset from Hugging Face Hub, name: arad1367/sustainability_impact_crypto_data


Map:   0%|          | 0/300 [00:00<?, ? examples/s]

[INFO] Tokenizing text for model training with tokenizer: FacebookAI/roberta-large-mnli


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/688 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

[INFO] Loading model: FacebookAI/roberta-large-mnli


model.safetensors:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at FacebookAI/roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  trainer = MyTrainer(


[INFO] Model loading complete!
[INFO] Model tracking started ...
[INFO] Commencing model training...




Epoch,Training Loss,Validation Loss,Accuracy
1,1.1736,0.938152,0.583333
2,0.5799,0.31281,0.95
3,0.1978,0.065173,0.983333
4,0.0455,0.018596,1.0
5,0.0161,0.012381,1.0


[INFO] Model training complete, saving model to local path: models_colab/sustainability_news_roberta-large-mnli_five_epochs
[INFO] Uploading model to Hugging Face Hub...


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

[INFO] Model upload complete, model available at: https://huggingface.co/arad1367/sustainability_news_roberta-large-mnli_five_epochs/tree/main/
[INFO] Performing evaluation on test dataset...


[INFO] Prediction metrics on the test data:
{'test_accuracy': 1.0,
 'test_loss': 0.012381193228065968,
 'test_runtime': 0.1365,
 'test_samples_per_second': 439.5,
 'test_steps_per_second': 14.65}


Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

{'computational_cost': {'inference_time_per_sample': 0.003427363932132721, 'cpu_memory_usage_mb': 0.0, 'gpu_memory_usage_mb': 0.01318359375}, 'confidence_analysis': {'mean_confidence': 0.9884418249130249, 'mean_entropy': 0.05006236210465431, 'high_uncertainty_samples': 2, 'low_confidence_predictions': 0, 'confidence_std': 0.03493757173418999, 'entropy_std': 0.07914083451032639}, 'faithfulness': {'prediction_stability': 0.9833333333333333, 'stability_score': 98.33333333333333, 'mean_confidence_change': 0.01726954976717631, 'confidence_change_std': 0.05526188991876136}, 'correctness': {'overall_accuracy': 1.0, 'weighted_precision': 1.0, 'weighted_recall': 1.0, 'weighted_f1': 1.0, 'per_class_metrics': {'negative': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0}, 'neutral': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0}, 'positive': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0}}}}


0,1
epoch,▁▁▃▃▅▅▆▆███
eval/accuracy,▁▇███
eval/loss,█▃▁▁▁
eval/runtime,█▇▁▆▅
eval/samples_per_second,▁▂█▃▄
eval/steps_per_second,▁▂█▃▄
eval_accuracy,▁▇███
eval_loss,█▃▁▁▁
eval_runtime,█▇▁▆▅
eval_samples_per_second,▁▂█▃▄

0,1
epoch,5.0
eval/accuracy,1.0
eval/loss,0.01238
eval/runtime,0.1312
eval/samples_per_second,457.148
eval/steps_per_second,15.238
eval_accuracy,1.0
eval_loss,0.01238
eval_runtime,0.1312
eval_samples_per_second,457.148


### 5.2. distilbert/distilbert-base-uncased model

In [None]:
# 1. Import necessary packages
import pprint
from pathlib import Path

import numpy as np
import torch

import datasets
import evaluate

from transformers import pipeline
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from scipy.stats import entropy
import time
import psutil
from typing import Dict, List, Union


# Start a W&B run for this model in the shared comparison project
# (a manual wandb.finish() beforehand is left disabled).
# wandb.finish()
wandb.init(project="crypto-sustainability-news-5-comparison-main", reinit=True)

# 2. Setup variables for model training and saving pipeline
DATASET_NAME = "arad1367/sustainability_impact_crypto_data"
MODEL_NAME = "distilbert/distilbert-base-uncased"
MODEL_SAVE_DIR_NAME = "models_colab/distilbert_sustainability_news_five_epochs"

# 3. Create a directory for saving models
# Note: exist_ok=True reuses the directory when it already exists, so a model saved
# into it later lands in the same place as any previously saved one.
print(f"[INFO] Creating directory for saving models: {MODEL_SAVE_DIR_NAME}")
model_save_dir = Path(MODEL_SAVE_DIR_NAME)
model_save_dir.mkdir(parents=True, exist_ok=True)

# 4. Load and preprocess the dataset from Hugging Face Hub
print(f"[INFO] Downloading dataset from Hugging Face Hub, name: {DATASET_NAME}")
dataset = datasets.load_dataset(path=DATASET_NAME)

# Create the id -> label and label -> id mappings (the two dicts are inverses of each other)
id2label = {0: "negative", 1: "neutral", 2: "positive"}
label2id = {"negative": 0, "neutral": 1, "positive": 2}

# Convert each example's string label into its integer class ID
def map_labels_to_number(example):
    """Replace the string in example["label"] with its ID from `label2id` (in place) and return the example."""
    label_name = example["label"]
    example["label"] = label2id[label_name]
    return example

# Map preprocessing function to dataset (only the "train" split is used from here on)
dataset = dataset["train"].map(map_labels_to_number)

# Split the dataset into train/test sets (80/20, fixed seed so the split is repeatable)
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# 5. Import a tokenizer and map it to our dataset
print(f"[INFO] Tokenizing text for model training with tokenizer: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=MODEL_NAME,
                                          use_fast=True)

# Create a preprocessing function to tokenize text
def tokenize_text(examples):
    """Tokenize the "text" field of `examples` with the module-level `tokenizer`, padding and truncating."""
    texts = examples["text"]
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

# Tokenize both splits in batches; tokenize_text pads with padding=True inside each mapped batch
tokenized_dataset = dataset.map(function=tokenize_text,
                                batched=True,
                                batch_size=1000)

# 6. Set up an evaluation metric & function to evaluate our model
accuracy_metric = evaluate.load("accuracy")

def compute_accuracy(predictions_and_labels):
    """Score a (predictions, labels) pair with the module-level accuracy metric."""
    raw_predictions, references = predictions_and_labels

    # Anything with two or more dimensions is treated as per-class scores
    is_score_matrix = len(raw_predictions.shape) >= 2
    predicted_ids = np.argmax(raw_predictions, axis=1) if is_score_matrix else raw_predictions

    # The metric's keyword for the ground truth is "references", not "labels"
    return accuracy_metric.compute(predictions=predicted_ids, references=references)


# 7. Import a model and prepare it for training
print(f"[INFO] Loading model: {MODEL_NAME}")
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=MODEL_NAME,
    num_labels=3,
    id2label=id2label,
    label2id=label2id
)
print(f"[INFO] Model loading complete!")

# Create training arguments
# Evaluation, checkpoint saving and logging all happen once per epoch.
training_args = TrainingArguments(
    output_dir=model_save_dir,
    learning_rate=0.00001,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    use_cpu=False,
    seed=42,
    load_best_model_at_end=True,
    logging_strategy="epoch",
    report_to="wandb",  # the MyTrainer.log override defined below additionally calls wandb.log itself
    hub_private_repo=False,
)

# Model tracking with wandb: record the run's hyperparameters in the W&B run config
wandb.config.update({
    "dataset_name": DATASET_NAME,
    "model_name": MODEL_NAME,
    "learning_rate": training_args.learning_rate,
    "weight_decay": training_args.weight_decay,
    "batch_size": training_args.per_device_train_batch_size,
    "num_epochs": training_args.num_train_epochs,
    "seed": training_args.seed
})

print(f"[INFO] Model tracking started ...")

# Setup Trainer
class MyTrainer(Trainer):
    """Trainer subclass that also sends every logged metrics dict to Weights & Biases."""

    def log(self, logs, *args, **kwargs):
        """
        Log `logs` through the base Trainer, then mirror them with `wandb.log`.

        Extra positional/keyword arguments are forwarded untouched, so the override
        keeps working with transformers releases whose training loop calls
        `self.log(logs, start_time)` as well as with those that call `self.log(logs)`.
        """
        super().log(logs, *args, **kwargs)
        wandb.log(logs)

trainer = MyTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_accuracy
)

# 8. Train the model on our text dataset
print(f"[INFO] Commencing model training...")
results = trainer.train()

# 9. Save the trained model (*** note: this will overwrite our previous model ***)
print(f"[INFO] Model training complete, saving model to local path: {model_save_dir}")
trainer.save_model(output_dir=model_save_dir)


# ------------------------ Push Model ------------------------------
# The Hub upload is disabled for this model; the calls are kept below, commented out.
# 10. Push the model to the Hugging Face Hub
# print(f"[INFO] Uploading model to Hugging Face Hub...")
# model_upload_url = trainer.push_to_hub(
#     commit_message="Uploading crypto sustainability news text classifier model",
#     # token="YOUR_HF_TOKEN_HERE" # requires a "write" HF token
# )
# print(f"[INFO] Model upload complete, model available at: {model_upload_url}")
# ------------------------ Push Model ------------------------------


# 11. Evaluate the model on the test data
# Note: this is the same split that was passed as eval_dataset above, so these numbers
# repeat the per-epoch evaluation rather than measuring a separate held-out set.
print(f"[INFO] Performing evaluation on test dataset...")
predictions_all = trainer.predict(tokenized_dataset["test"])
prediction_values = predictions_all.predictions
prediction_metrics = predictions_all.metrics

wandb.log(prediction_metrics)

print(f"[INFO] Prediction metrics on the test data:")
pprint.pprint(prediction_metrics)

# Metrics
class ModernEvaluationMetrics:
    """Post-training diagnostics for a text-classification model.

    Works on the ``"test"`` split of ``dataset`` and groups its measurements
    into four families: computational cost, prediction confidence, stability
    under word deletion, and classification correctness.
    """

    def __init__(self, model, tokenizer, dataset, id2label):
        """Store the evaluation inputs and load the ``evaluate`` metrics.

        Args:
            model: Model invoked as ``model(**inputs)``; its output must expose
                a ``logits`` attribute.
            tokenizer: Callable that turns text into a dict of tensors.
            dataset: Mapping with a ``"test"`` split that provides ``"text"``
                and ``"label"`` columns.
            id2label: Mapping from class index to class name.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.dataset = dataset
        self.id2label = id2label
        self.metrics = {}
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Move model to appropriate device and put it in inference mode, so
        # that dropout cannot make the measurements non-deterministic.
        self.model = self.model.to(self.device)
        self.model.eval()

        # Initialize modern evaluation metrics
        self.accuracy = evaluate.load("accuracy")
        self.precision = evaluate.load("precision")
        self.recall = evaluate.load("recall")
        self.f1 = evaluate.load("f1")

    def _forward_logits(self, texts):
        """Tokenize ``texts`` (a string or list of strings) and return the logits tensor."""
        inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            return self.model(**inputs).logits

    def _predict_with_confidence(self, text):
        """Return ``(predicted class index, highest softmax probability)`` for one text."""
        logits = self._forward_logits(text)
        confidence = torch.nn.functional.softmax(logits, dim=1).max().cpu().item()
        return torch.argmax(logits, dim=1).item(), confidence

    def measure_computational_cost(self, batch_size=32):
        """Time one tokenize + forward pass over the first ``batch_size`` test texts.

        Args:
            batch_size: Maximum number of test texts used for the measurement.

        Returns:
            Dict with ``inference_time_per_sample`` (seconds, averaged over the
            texts actually processed), ``cpu_memory_usage_mb`` (change in
            process RSS) and, on CUDA, ``gpu_memory_usage_mb`` (peak allocated
            memory above the level measured before the pass).

        Raises:
            ValueError: If there is no test text to measure.
        """
        texts = list(self.dataset["test"]["text"][:batch_size])
        if not texts:
            raise ValueError("measure_computational_cost needs at least one test text")

        on_gpu = self.device.type == "cuda"
        process = psutil.Process()

        if on_gpu:
            torch.cuda.empty_cache()
            torch.cuda.synchronize()
            # Track the peak so activations freed during the pass still count.
            torch.cuda.reset_peak_memory_stats()
            gpu_memory_before = torch.cuda.memory_allocated() / 1024 / 1024

        start_memory = process.memory_info().rss / 1024 / 1024
        start_time = time.perf_counter()

        self._forward_logits(texts)
        if on_gpu:
            # CUDA kernels run asynchronously; wait for them before stopping the clock.
            torch.cuda.synchronize()

        elapsed = time.perf_counter() - start_time
        end_memory = process.memory_info().rss / 1024 / 1024

        metrics = {
            # Divide by the number of texts processed, which is smaller than
            # batch_size when the test split is short.
            "inference_time_per_sample": elapsed / len(texts),
            "cpu_memory_usage_mb": end_memory - start_memory,
        }

        if on_gpu:
            gpu_memory_peak = torch.cuda.max_memory_allocated() / 1024 / 1024
            metrics["gpu_memory_usage_mb"] = gpu_memory_peak - gpu_memory_before

        return metrics

    def measure_prediction_confidence(self, logits: np.ndarray, low_confidence_threshold: float = 0.7) -> Dict[str, Union[float, int]]:
        """Summarise how confident the softmax predictions are.

        Args:
            logits: Array of raw class scores; softmax is taken over axis 1.
            low_confidence_threshold: Predictions whose top probability is below
                this value are counted as low-confidence.

        Returns:
            Dict with the mean/std of the top softmax probability and of the
            per-sample entropy, the number of samples whose entropy exceeds
            ``mean + std``, and the number of low-confidence predictions.
        """
        logits_tensor = torch.tensor(logits).to(self.device)
        softmax_probs = torch.nn.functional.softmax(logits_tensor, dim=1)

        # Move to CPU for numpy operations
        softmax_probs_cpu = softmax_probs.cpu().numpy()
        prediction_entropy = entropy(softmax_probs_cpu, axis=1)
        confidence_scores = softmax_probs.max(dim=1).values.cpu().numpy()

        # A sample counts as "high uncertainty" when its entropy lies more than
        # one standard deviation above the mean entropy of this batch.
        uncertainty_threshold = np.mean(prediction_entropy) + np.std(prediction_entropy)

        return {
            "mean_confidence": float(np.mean(confidence_scores)),
            "mean_entropy": float(np.mean(prediction_entropy)),
            "high_uncertainty_samples": int(np.sum(prediction_entropy > uncertainty_threshold)),
            "low_confidence_predictions": int(np.sum(confidence_scores < low_confidence_threshold)),
            "confidence_std": float(np.std(confidence_scores)),
            "entropy_std": float(np.std(prediction_entropy))
        }

    def measure_faithfulness(self, n_samples=100, removal_fraction=0.2, seed: Union[int, None] = None):
        """Measure how stable predictions are when random words are deleted.

        For each of the first ``n_samples`` test texts, a fraction of the
        whitespace-separated words (at least one) is removed at random and the
        prediction on the shortened text is compared with the original one.
        Texts that contain no words cannot be perturbed and are skipped.

        Args:
            n_samples: Maximum number of test texts to perturb.
            removal_fraction: Share of words removed from each text.
            seed: Seed for a dedicated random generator; ``None`` keeps using
                the global ``np.random`` state.

        Returns:
            Dict with ``prediction_stability`` (share of unchanged predictions),
            ``stability_score`` (the same value in percent) and the mean/std of
            the absolute change in top softmax probability. All values are NaN
            when no text could be evaluated.
        """
        test_texts = self.dataset["test"]["text"][:n_samples]
        choose = (np.random.default_rng(seed) if seed is not None else np.random).choice

        original_predictions = []
        perturbed_predictions = []
        confidence_changes = []

        for text in test_texts:
            words = text.split()
            if not words:
                # Sampling indices from an empty word list would raise.
                continue

            original_pred, original_conf = self._predict_with_confidence(text)

            # Create perturbed text
            n_remove = min(len(words), max(1, int(len(words) * removal_fraction)))
            removed = set(choose(len(words), n_remove, replace=False).tolist())
            perturbed_text = " ".join(w for i, w in enumerate(words) if i not in removed)

            perturbed_pred, perturbed_conf = self._predict_with_confidence(perturbed_text)

            original_predictions.append(original_pred)
            perturbed_predictions.append(perturbed_pred)
            confidence_changes.append(abs(original_conf - perturbed_conf))

        if not original_predictions:
            nan = float("nan")
            return {
                "prediction_stability": nan,
                "stability_score": nan,
                "mean_confidence_change": nan,
                "confidence_change_std": nan
            }

        prediction_stability = np.mean(np.array(original_predictions) == np.array(perturbed_predictions))

        return {
            "prediction_stability": float(prediction_stability),
            "stability_score": float(prediction_stability * 100),
            "mean_confidence_change": float(np.mean(confidence_changes)),
            "confidence_change_std": float(np.std(confidence_changes))
        }

    def measure_answer_correctness(self, predictions: np.ndarray, labels: List[int]) -> Dict:
        """Compute accuracy plus weighted and per-class precision/recall/F1.

        Args:
            predictions: Array of class scores; the predicted class is the
                argmax over axis 1.
            labels: Ground-truth class indices.

        Returns:
            Dict with overall accuracy, weighted precision/recall/F1 and a
            ``per_class_metrics`` dict keyed by class name.
        """
        pred_labels = np.argmax(predictions, axis=1)
        true_labels = np.array(labels)

        # Calculate metrics using evaluate library
        accuracy = self.accuracy.compute(predictions=pred_labels, references=labels)
        precision = self.precision.compute(predictions=pred_labels, references=labels, average='weighted')
        recall = self.recall.compute(predictions=pred_labels, references=labels, average='weighted')
        f1 = self.f1.compute(predictions=pred_labels, references=labels, average='weighted')

        # Per-class metrics: treat each class as a one-vs-rest binary problem
        class_metrics = {}
        for idx, class_name in self.id2label.items():
            class_predictions = (pred_labels == idx).astype(int)
            class_labels = (true_labels == idx).astype(int)

            class_metrics[class_name] = {
                "precision": float(self.precision.compute(predictions=class_predictions, references=class_labels)['precision']),
                "recall": float(self.recall.compute(predictions=class_predictions, references=class_labels)['recall']),
                "f1": float(self.f1.compute(predictions=class_predictions, references=class_labels)['f1'])
            }

        return {
            "overall_accuracy": accuracy['accuracy'],
            "weighted_precision": precision['precision'],
            "weighted_recall": recall['recall'],
            "weighted_f1": f1['f1'],
            "per_class_metrics": class_metrics
        }

    def evaluate_all(self, batch_size=32):
        """Run all evaluation metrics and return comprehensive results.

        Args:
            batch_size: Number of test texts sent through the model at once.
                Batching keeps memory bounded instead of forwarding the whole
                test split in a single pass.

        Returns:
            ``self.metrics`` with the keys ``computational_cost``,
            ``confidence_analysis``, ``faithfulness`` and ``correctness``.
        """
        test_texts = list(self.dataset["test"]["text"])
        test_labels = self.dataset["test"]["label"]

        logit_chunks = [
            self._forward_logits(test_texts[start:start + batch_size]).cpu().numpy()
            for start in range(0, len(test_texts), batch_size)
        ]
        predictions = np.concatenate(logit_chunks, axis=0)

        self.metrics["computational_cost"] = self.measure_computational_cost()
        self.metrics["confidence_analysis"] = self.measure_prediction_confidence(predictions)
        self.metrics["faithfulness"] = self.measure_faithfulness()
        self.metrics["correctness"] = self.measure_answer_correctness(predictions, test_labels)

        return self.metrics

# Build the evaluator on the fine-tuned model and compute every metric group
evaluator = ModernEvaluationMetrics(
    model=model,
    tokenizer=tokenizer,
    dataset=tokenized_dataset,
    id2label=id2label,
)
evaluation_results = evaluator.evaluate_all()

# Send the nested results dict to wandb under a single key
wandb.log(dict(advanced_metrics=evaluation_results))

# Show the full results in the cell output
print(evaluation_results)

# Close this model's wandb run
wandb.finish()

[INFO] Creating directory for saving models: models_colab/distilbert_sustainability_news_five_epochs
[INFO] Downloading dataset from Hugging Face Hub, name: arad1367/sustainability_impact_crypto_data
[INFO] Tokenizing text for model training with tokenizer: distilbert/distilbert-base-uncased


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

[INFO] Loading model: distilbert/distilbert-base-uncased


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = MyTrainer(


[INFO] Model loading complete!
[INFO] Model tracking started ...
[INFO] Commencing model training...




Epoch,Training Loss,Validation Loss,Accuracy
1,1.0823,1.046498,0.716667
2,1.0156,0.978524,0.916667
3,0.9446,0.913298,0.933333
4,0.8818,0.861459,0.933333
5,0.8491,0.841487,0.933333


[INFO] Model training complete, saving model to local path: models_colab/distilbert_sustainability_news_five_epochs
[INFO] Performing evaluation on test dataset...


[INFO] Prediction metrics on the test data:
{'test_accuracy': 0.9333333333333333,
 'test_loss': 0.8414866328239441,
 'test_runtime': 0.0396,
 'test_samples_per_second': 1516.717,
 'test_steps_per_second': 50.557}
{'computational_cost': {'inference_time_per_sample': 0.0006392896175384521, 'cpu_memory_usage_mb': 0.0, 'gpu_memory_usage_mb': 0.01513671875}, 'confidence_analysis': {'mean_confidence': 0.4357568621635437, 'mean_entropy': 1.066909670829773, 'high_uncertainty_samples': 13, 'low_confidence_predictions': 60, 'confidence_std': 0.03926420584321022, 'entropy_std': 0.019179536029696465}, 'faithfulness': {'prediction_stability': 0.95, 'stability_score': 95.0, 'mean_confidence_change': 0.016715476910273235, 'confidence_change_std': 0.01642578028100703}, 'correctness': {'overall_accuracy': 0.9333333333333333, 'weighted_precision': 0.94, 'weighted_recall': 0.9333333333333333, 'weighted_f1': 0.9349567099567099, 'per_class_metrics': {'negative': {'precision': 0.8, 'recall': 0.9230769230769

0,1
epoch,▁▁▃▃▅▅▆▆███
eval/accuracy,▁▇███
eval/loss,█▆▃▂▁
eval/runtime,▁▁▂█▁
eval/samples_per_second,██▆▁▇
eval/steps_per_second,██▆▁▇
eval_accuracy,▁▇███
eval_loss,█▆▃▂▁
eval_runtime,▁▁▂█▁
eval_samples_per_second,██▆▁▇

0,1
epoch,5.0
eval/accuracy,0.93333
eval/loss,0.84149
eval/runtime,0.0324
eval/samples_per_second,1849.898
eval/steps_per_second,61.663
eval_accuracy,0.93333
eval_loss,0.84149
eval_runtime,0.0324
eval_samples_per_second,1849.898


### 5.3. twitter-roberta-base-sentiment-latest

In [None]:
# 1. Import necessary packages
import pprint
from pathlib import Path

import numpy as np
import torch

import datasets
import evaluate

from transformers import pipeline
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from scipy.stats import entropy
import time
import psutil
from typing import Dict, List, Union


# wandb.finish()
wandb.init(project="crypto-sustainability-news-5-comparison-main", reinit=True)

# 2. Names of the dataset, the checkpoint to fine-tune and the output folder
DATASET_NAME = "arad1367/sustainability_impact_crypto_data"
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"
MODEL_SAVE_DIR_NAME = "models_colab/twitter_sustainability_news_five_epochs"

# 3. Make sure the output folder exists
# Note: a model saved there earlier is overwritten when this run saves
model_save_dir = Path(MODEL_SAVE_DIR_NAME)
print(f"[INFO] Creating directory for saving models: {MODEL_SAVE_DIR_NAME}")
model_save_dir.mkdir(parents=True, exist_ok=True)

# 4. Fetch the dataset from the Hugging Face Hub
print(f"[INFO] Downloading dataset from Hugging Face Hub, name: {DATASET_NAME}")
dataset = datasets.load_dataset(DATASET_NAME)

# Class index <-> class name lookups (label2id is the inverse of id2label)
id2label = dict(enumerate(["negative", "neutral", "positive"]))
label2id = {name: idx for idx, name in id2label.items()}

# Replace each example's string label with its integer id
def map_labels_to_number(example):
    """Overwrite ``example["label"]`` with its id from ``label2id`` and return the example."""
    label_name = example["label"]
    example["label"] = label2id[label_name]
    return example

# Convert the string labels of the "train" split to ids, then hold out 20% of
# the rows as the test split (fixed seed keeps the split reproducible)
dataset = dataset["train"].map(map_labels_to_number).train_test_split(test_size=0.2, seed=42)

# 5. Load the tokenizer that belongs to the chosen checkpoint
print(f"[INFO] Tokenizing text for model training with tokenizer: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Create a preprocessing function to tokenize text
def tokenize_text(examples, max_length=512):
    """Tokenize the ``"text"`` field of a batch of examples.

    Args:
        examples: Batch whose ``"text"`` entry holds the raw strings.
        max_length: Upper bound on the token count per text. It is passed
            explicitly because this checkpoint's tokenizer declares no maximum
            length, in which case ``truncation=True`` alone truncates nothing.

    Returns:
        The tokenizer output, padded to the longest text in the batch.
    """
    return tokenizer(examples["text"],
                     padding=True,
                     truncation=True,
                     max_length=max_length)

# Tokenize both splits, handing the tokenizer up to 1000 rows per call
tokenized_dataset = dataset.map(tokenize_text, batched=True, batch_size=1000)

# 6. Load the accuracy metric used by the evaluation function below
accuracy_metric = evaluate.load("accuracy")

def compute_accuracy(predictions_and_labels):
    """Return the accuracy for a ``(predictions, labels)`` pair.

    Predictions with two or more dimensions are reduced to class indices with
    an argmax over axis 1 before scoring.
    """
    raw_predictions, references = predictions_and_labels

    if len(raw_predictions.shape) >= 2:
        predicted = np.argmax(raw_predictions, axis=1)
    else:
        predicted = raw_predictions

    # note: the metric takes the ground truth as "references", not "labels"
    return accuracy_metric.compute(predictions=predicted, references=references)


# 7. Load the checkpoint with a three-class classification head
print(f"[INFO] Loading model: {MODEL_NAME}")
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)
print("[INFO] Model loading complete!")

# Training configuration
training_args = TrainingArguments(
    output_dir=model_save_dir,
    seed=42,
    use_cpu=False,
    # optimisation
    learning_rate=1e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # evaluate, checkpoint and log once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
    # reporting
    report_to="wandb",
    hub_private_repo=False,
)

# Model tracking with wandb: store the run's hyperparameters in the run config
wandb.config.update(
    dict(
        dataset_name=DATASET_NAME,
        model_name=MODEL_NAME,
        learning_rate=training_args.learning_rate,
        weight_decay=training_args.weight_decay,
        batch_size=training_args.per_device_train_batch_size,
        num_epochs=training_args.num_train_epochs,
        seed=training_args.seed,
    )
)

print("[INFO] Model tracking started ...")

# Setup Trainer
class MyTrainer(Trainer):
    """Trainer that also sends every logged metrics dict straight to wandb."""

    def log(self, logs, *args, **kwargs):
        """Run the normal Trainer logging, then forward ``logs`` to wandb.

        Extra positional/keyword arguments are passed through unchanged so the
        override keeps working when ``Trainer.log`` is called with more than
        the ``logs`` dict (newer transformers releases pass ``start_time``).
        """
        super().log(logs, *args, **kwargs)
        # NOTE(review): with report_to="wandb" the built-in callback appears to
        # log the same metrics already (the run summary shows both
        # "eval/accuracy" and "eval_accuracy") - confirm the duplicate is wanted.
        wandb.log(logs)

# `processing_class` replaces the deprecated `tokenizer` argument of Trainer;
# passing `tokenizer=` triggers a deprecation warning pointing at this call.
# NOTE(review): the "test" split is also the per-epoch eval set, and
# load_best_model_at_end=True picks the checkpoint on it - the reported test
# metrics are therefore optimistic; consider a separate validation split.
trainer = MyTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    compute_metrics=compute_accuracy
)

# 8. Train the model on our text dataset
print("[INFO] Commencing model training...")
results = trainer.train()

# 9. Save the trained model (*** note: this will overwrite our previous model ***)
print(f"[INFO] Model training complete, saving model to local path: {model_save_dir}")
trainer.save_model(output_dir=model_save_dir)


# ------------------------ Push Model ------------------------------
# 10. Push the model to the Hugging Face Hub
# print(f"[INFO] Uploading model to Hugging Face Hub...")
# model_upload_url = trainer.push_to_hub(
#     commit_message="Uploading crypto sustainability news text classifier model",
#     # token="YOUR_HF_TOKEN_HERE" # requires a "write" HF token
# )
# print(f"[INFO] Model upload complete, model available at: {model_upload_url}")
# ------------------------ Push Model ------------------------------


# 11. Evaluate the model on the test data
print("[INFO] Performing evaluation on test dataset...")
predictions_all = trainer.predict(tokenized_dataset["test"])
prediction_values = predictions_all.predictions  # raw model outputs for the test split
prediction_metrics = predictions_all.metrics     # dict of "test_*" metrics

wandb.log(prediction_metrics)

print("[INFO] Prediction metrics on the test data:")
pprint.pprint(prediction_metrics)

# Metrics
class ModernEvaluationMetrics:
    """Post-training diagnostics for a text-classification model.

    Works on the ``"test"`` split of ``dataset`` and groups its measurements
    into four families: computational cost, prediction confidence, stability
    under word deletion, and classification correctness.
    """

    def __init__(self, model, tokenizer, dataset, id2label):
        """Store the evaluation inputs and load the ``evaluate`` metrics.

        Args:
            model: Model invoked as ``model(**inputs)``; its output must expose
                a ``logits`` attribute.
            tokenizer: Callable that turns text into a dict of tensors.
            dataset: Mapping with a ``"test"`` split that provides ``"text"``
                and ``"label"`` columns.
            id2label: Mapping from class index to class name.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.dataset = dataset
        self.id2label = id2label
        self.metrics = {}
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Move model to appropriate device and put it in inference mode, so
        # that dropout cannot make the measurements non-deterministic.
        self.model = self.model.to(self.device)
        self.model.eval()

        # Initialize modern evaluation metrics
        self.accuracy = evaluate.load("accuracy")
        self.precision = evaluate.load("precision")
        self.recall = evaluate.load("recall")
        self.f1 = evaluate.load("f1")

    def _forward_logits(self, texts):
        """Tokenize ``texts`` (a string or list of strings) and return the logits tensor."""
        inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            return self.model(**inputs).logits

    def _predict_with_confidence(self, text):
        """Return ``(predicted class index, highest softmax probability)`` for one text."""
        logits = self._forward_logits(text)
        confidence = torch.nn.functional.softmax(logits, dim=1).max().cpu().item()
        return torch.argmax(logits, dim=1).item(), confidence

    def measure_computational_cost(self, batch_size=32):
        """Time one tokenize + forward pass over the first ``batch_size`` test texts.

        Args:
            batch_size: Maximum number of test texts used for the measurement.

        Returns:
            Dict with ``inference_time_per_sample`` (seconds, averaged over the
            texts actually processed), ``cpu_memory_usage_mb`` (change in
            process RSS) and, on CUDA, ``gpu_memory_usage_mb`` (peak allocated
            memory above the level measured before the pass).

        Raises:
            ValueError: If there is no test text to measure.
        """
        texts = list(self.dataset["test"]["text"][:batch_size])
        if not texts:
            raise ValueError("measure_computational_cost needs at least one test text")

        on_gpu = self.device.type == "cuda"
        process = psutil.Process()

        if on_gpu:
            torch.cuda.empty_cache()
            torch.cuda.synchronize()
            # Track the peak so activations freed during the pass still count.
            torch.cuda.reset_peak_memory_stats()
            gpu_memory_before = torch.cuda.memory_allocated() / 1024 / 1024

        start_memory = process.memory_info().rss / 1024 / 1024
        start_time = time.perf_counter()

        self._forward_logits(texts)
        if on_gpu:
            # CUDA kernels run asynchronously; wait for them before stopping the clock.
            torch.cuda.synchronize()

        elapsed = time.perf_counter() - start_time
        end_memory = process.memory_info().rss / 1024 / 1024

        metrics = {
            # Divide by the number of texts processed, which is smaller than
            # batch_size when the test split is short.
            "inference_time_per_sample": elapsed / len(texts),
            "cpu_memory_usage_mb": end_memory - start_memory,
        }

        if on_gpu:
            gpu_memory_peak = torch.cuda.max_memory_allocated() / 1024 / 1024
            metrics["gpu_memory_usage_mb"] = gpu_memory_peak - gpu_memory_before

        return metrics

    def measure_prediction_confidence(self, logits: np.ndarray, low_confidence_threshold: float = 0.7) -> Dict[str, Union[float, int]]:
        """Summarise how confident the softmax predictions are.

        Args:
            logits: Array of raw class scores; softmax is taken over axis 1.
            low_confidence_threshold: Predictions whose top probability is below
                this value are counted as low-confidence.

        Returns:
            Dict with the mean/std of the top softmax probability and of the
            per-sample entropy, the number of samples whose entropy exceeds
            ``mean + std``, and the number of low-confidence predictions.
        """
        logits_tensor = torch.tensor(logits).to(self.device)
        softmax_probs = torch.nn.functional.softmax(logits_tensor, dim=1)

        # Move to CPU for numpy operations
        softmax_probs_cpu = softmax_probs.cpu().numpy()
        prediction_entropy = entropy(softmax_probs_cpu, axis=1)
        confidence_scores = softmax_probs.max(dim=1).values.cpu().numpy()

        # A sample counts as "high uncertainty" when its entropy lies more than
        # one standard deviation above the mean entropy of this batch.
        uncertainty_threshold = np.mean(prediction_entropy) + np.std(prediction_entropy)

        return {
            "mean_confidence": float(np.mean(confidence_scores)),
            "mean_entropy": float(np.mean(prediction_entropy)),
            "high_uncertainty_samples": int(np.sum(prediction_entropy > uncertainty_threshold)),
            "low_confidence_predictions": int(np.sum(confidence_scores < low_confidence_threshold)),
            "confidence_std": float(np.std(confidence_scores)),
            "entropy_std": float(np.std(prediction_entropy))
        }

    def measure_faithfulness(self, n_samples=100, removal_fraction=0.2, seed: Union[int, None] = None):
        """Measure how stable predictions are when random words are deleted.

        For each of the first ``n_samples`` test texts, a fraction of the
        whitespace-separated words (at least one) is removed at random and the
        prediction on the shortened text is compared with the original one.
        Texts that contain no words cannot be perturbed and are skipped.

        Args:
            n_samples: Maximum number of test texts to perturb.
            removal_fraction: Share of words removed from each text.
            seed: Seed for a dedicated random generator; ``None`` keeps using
                the global ``np.random`` state.

        Returns:
            Dict with ``prediction_stability`` (share of unchanged predictions),
            ``stability_score`` (the same value in percent) and the mean/std of
            the absolute change in top softmax probability. All values are NaN
            when no text could be evaluated.
        """
        test_texts = self.dataset["test"]["text"][:n_samples]
        choose = (np.random.default_rng(seed) if seed is not None else np.random).choice

        original_predictions = []
        perturbed_predictions = []
        confidence_changes = []

        for text in test_texts:
            words = text.split()
            if not words:
                # Sampling indices from an empty word list would raise.
                continue

            original_pred, original_conf = self._predict_with_confidence(text)

            # Create perturbed text
            n_remove = min(len(words), max(1, int(len(words) * removal_fraction)))
            removed = set(choose(len(words), n_remove, replace=False).tolist())
            perturbed_text = " ".join(w for i, w in enumerate(words) if i not in removed)

            perturbed_pred, perturbed_conf = self._predict_with_confidence(perturbed_text)

            original_predictions.append(original_pred)
            perturbed_predictions.append(perturbed_pred)
            confidence_changes.append(abs(original_conf - perturbed_conf))

        if not original_predictions:
            nan = float("nan")
            return {
                "prediction_stability": nan,
                "stability_score": nan,
                "mean_confidence_change": nan,
                "confidence_change_std": nan
            }

        prediction_stability = np.mean(np.array(original_predictions) == np.array(perturbed_predictions))

        return {
            "prediction_stability": float(prediction_stability),
            "stability_score": float(prediction_stability * 100),
            "mean_confidence_change": float(np.mean(confidence_changes)),
            "confidence_change_std": float(np.std(confidence_changes))
        }

    def measure_answer_correctness(self, predictions: np.ndarray, labels: List[int]) -> Dict:
        """Compute accuracy plus weighted and per-class precision/recall/F1.

        Args:
            predictions: Array of class scores; the predicted class is the
                argmax over axis 1.
            labels: Ground-truth class indices.

        Returns:
            Dict with overall accuracy, weighted precision/recall/F1 and a
            ``per_class_metrics`` dict keyed by class name.
        """
        pred_labels = np.argmax(predictions, axis=1)
        true_labels = np.array(labels)

        # Calculate metrics using evaluate library
        accuracy = self.accuracy.compute(predictions=pred_labels, references=labels)
        precision = self.precision.compute(predictions=pred_labels, references=labels, average='weighted')
        recall = self.recall.compute(predictions=pred_labels, references=labels, average='weighted')
        f1 = self.f1.compute(predictions=pred_labels, references=labels, average='weighted')

        # Per-class metrics: treat each class as a one-vs-rest binary problem
        class_metrics = {}
        for idx, class_name in self.id2label.items():
            class_predictions = (pred_labels == idx).astype(int)
            class_labels = (true_labels == idx).astype(int)

            class_metrics[class_name] = {
                "precision": float(self.precision.compute(predictions=class_predictions, references=class_labels)['precision']),
                "recall": float(self.recall.compute(predictions=class_predictions, references=class_labels)['recall']),
                "f1": float(self.f1.compute(predictions=class_predictions, references=class_labels)['f1'])
            }

        return {
            "overall_accuracy": accuracy['accuracy'],
            "weighted_precision": precision['precision'],
            "weighted_recall": recall['recall'],
            "weighted_f1": f1['f1'],
            "per_class_metrics": class_metrics
        }

    def evaluate_all(self, batch_size=32):
        """Run all evaluation metrics and return comprehensive results.

        Args:
            batch_size: Number of test texts sent through the model at once.
                Batching keeps memory bounded instead of forwarding the whole
                test split in a single pass.

        Returns:
            ``self.metrics`` with the keys ``computational_cost``,
            ``confidence_analysis``, ``faithfulness`` and ``correctness``.
        """
        test_texts = list(self.dataset["test"]["text"])
        test_labels = self.dataset["test"]["label"]

        logit_chunks = [
            self._forward_logits(test_texts[start:start + batch_size]).cpu().numpy()
            for start in range(0, len(test_texts), batch_size)
        ]
        predictions = np.concatenate(logit_chunks, axis=0)

        self.metrics["computational_cost"] = self.measure_computational_cost()
        self.metrics["confidence_analysis"] = self.measure_prediction_confidence(predictions)
        self.metrics["faithfulness"] = self.measure_faithfulness()
        self.metrics["correctness"] = self.measure_answer_correctness(predictions, test_labels)

        return self.metrics

# Build the evaluator on the fine-tuned model and compute every metric group
evaluator = ModernEvaluationMetrics(
    model=model,
    tokenizer=tokenizer,
    dataset=tokenized_dataset,
    id2label=id2label,
)
evaluation_results = evaluator.evaluate_all()

# Send the nested results dict to wandb under a single key
wandb.log(dict(advanced_metrics=evaluation_results))

# Show the full results in the cell output
print(evaluation_results)

# Close this model's wandb run
wandb.finish()

[INFO] Creating directory for saving models: models_colab/twitter_sustainability_news_five_epochs
[INFO] Downloading dataset from Hugging Face Hub, name: arad1367/sustainability_impact_crypto_data
[INFO] Tokenizing text for model training with tokenizer: cardiffnlp/twitter-roberta-base-sentiment-latest


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/60 [00:00<?, ? examples/s]

[INFO] Loading model: cardiffnlp/twitter-roberta-base-sentiment-latest


pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  trainer = MyTrainer(


[INFO] Model loading complete!
[INFO] Model tracking started ...
[INFO] Commencing model training...




Epoch,Training Loss,Validation Loss,Accuracy
1,0.5334,0.396069,0.833333
2,0.2383,0.180684,0.95
3,0.109,0.129601,0.966667
4,0.0532,0.100546,0.966667
5,0.0435,0.102951,0.966667


[INFO] Model training complete, saving model to local path: models_colab/twitter_sustainability_news_five_epochs
[INFO] Performing evaluation on test dataset...


[INFO] Prediction metrics on the test data:
{'test_accuracy': 0.9666666666666667,
 'test_loss': 0.10054624080657959,
 'test_runtime': 0.045,
 'test_samples_per_second': 1333.946,
 'test_steps_per_second': 44.465}
{'computational_cost': {'inference_time_per_sample': 0.0013212114572525024, 'cpu_memory_usage_mb': 0.0, 'gpu_memory_usage_mb': 0.01318359375}, 'confidence_analysis': {'mean_confidence': 0.9641101360321045, 'mean_entropy': 0.13243122398853302, 'high_uncertainty_samples': 5, 'low_confidence_predictions': 1, 'confidence_std': 0.06631326675415039, 'entropy_std': 0.13024866580963135}, 'faithfulness': {'prediction_stability': 0.95, 'stability_score': 95.0, 'mean_confidence_change': 0.023287826279799143, 'confidence_change_std': 0.07454493300383987}, 'correctness': {'overall_accuracy': 0.9666666666666667, 'weighted_precision': 0.9711111111111111, 'weighted_recall': 0.9666666666666667, 'weighted_f1': 0.9671325051759835, 'per_class_metrics': {'negative': {'precision': 0.866666666666666

0,1
epoch,▁▁▃▃▅▅▆▆███
eval/accuracy,▁▇███
eval/loss,█▃▂▁▁
eval/runtime,█▃▁▂▃
eval/samples_per_second,▁▆█▇▆
eval/steps_per_second,▁▆█▇▆
eval_accuracy,▁▇███
eval_loss,█▃▂▁▁
eval_runtime,█▃▁▂▃
eval_samples_per_second,▁▆█▇▆

0,1
epoch,5.0
eval/accuracy,0.96667
eval/loss,0.10295
eval/runtime,0.0453
eval/samples_per_second,1324.489
eval/steps_per_second,44.15
eval_accuracy,0.96667
eval_loss,0.10295
eval_runtime,0.0453
eval_samples_per_second,1324.489


### 5.4. Prompt-Guard-86M

In [None]:
# 1. Import necessary packages
import pprint
from pathlib import Path

import numpy as np
import torch

import datasets
import evaluate

from transformers import pipeline
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from scipy.stats import entropy
import time
import psutil
from typing import Dict, List, Union


# wandb.finish()
wandb.init(project="crypto-sustainability-news-5-comparison-main", reinit=True)

# 2. Names of the dataset, the checkpoint to fine-tune and the output folder
DATASET_NAME = "arad1367/sustainability_impact_crypto_data"
MODEL_NAME = "meta-llama/Prompt-Guard-86M"
MODEL_SAVE_DIR_NAME = "models_colab/Prompt_Guard_sustainability_news_five_epochs"

# 3. Make sure the output folder exists
# Note: a model saved there earlier is overwritten when this run saves
model_save_dir = Path(MODEL_SAVE_DIR_NAME)
print(f"[INFO] Creating directory for saving models: {MODEL_SAVE_DIR_NAME}")
model_save_dir.mkdir(parents=True, exist_ok=True)

# 4. Fetch the dataset from the Hugging Face Hub
print(f"[INFO] Downloading dataset from Hugging Face Hub, name: {DATASET_NAME}")
dataset = datasets.load_dataset(DATASET_NAME)

# Class index <-> class name lookups (label2id is the inverse of id2label)
id2label = dict(enumerate(["negative", "neutral", "positive"]))
label2id = {name: idx for idx, name in id2label.items()}

# Replace each example's string label with its integer id
def map_labels_to_number(example):
    """Overwrite ``example["label"]`` with its id from ``label2id`` and return the example."""
    label_name = example["label"]
    example["label"] = label2id[label_name]
    return example

# Convert the string labels of the "train" split to ids, then hold out 20% of
# the rows as the test split (fixed seed keeps the split reproducible)
dataset = dataset["train"].map(map_labels_to_number).train_test_split(test_size=0.2, seed=42)

# 5. Load the tokenizer that belongs to the chosen checkpoint
print(f"[INFO] Tokenizing text for model training with tokenizer: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Create a preprocessing function to tokenize text
def tokenize_text(examples, max_length=512):
    """Tokenize the ``"text"`` field of a batch of examples.

    Args:
        examples: Batch whose ``"text"`` entry holds the raw strings.
        max_length: Upper bound on the token count per text. Passing it
            explicitly keeps ``truncation=True`` effective even if the
            tokenizer itself declares no maximum length.
            NOTE(review): 512 is assumed to match this checkpoint's context
            window - confirm against the model card.

    Returns:
        The tokenizer output, padded to the longest text in the batch.
    """
    return tokenizer(examples["text"],
                     padding=True,
                     truncation=True,
                     max_length=max_length)

# Tokenize both splits, handing the tokenizer up to 1000 rows per call
tokenized_dataset = dataset.map(tokenize_text, batched=True, batch_size=1000)

# 6. Load the accuracy metric used by the evaluation function below
accuracy_metric = evaluate.load("accuracy")

def compute_accuracy(predictions_and_labels):
    """Return the accuracy for a ``(predictions, labels)`` pair.

    Predictions with two or more dimensions are reduced to class indices with
    an argmax over axis 1 before scoring.
    """
    raw_predictions, references = predictions_and_labels

    if len(raw_predictions.shape) >= 2:
        predicted = np.argmax(raw_predictions, axis=1)
    else:
        predicted = raw_predictions

    # note: the metric takes the ground truth as "references", not "labels"
    return accuracy_metric.compute(predictions=predicted, references=references)


# 7. Load the checkpoint with a three-class classification head
print(f"[INFO] Loading model: {MODEL_NAME}")
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)
print("[INFO] Model loading complete!")

# Training configuration
training_args = TrainingArguments(
    output_dir=model_save_dir,
    seed=42,
    use_cpu=False,
    # optimisation
    learning_rate=1e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # evaluate, checkpoint and log once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
    # reporting
    report_to="wandb",
    hub_private_repo=False,
)

# Model tracking with wandb: store the run's hyperparameters in the run config
wandb.config.update(
    dict(
        dataset_name=DATASET_NAME,
        model_name=MODEL_NAME,
        learning_rate=training_args.learning_rate,
        weight_decay=training_args.weight_decay,
        batch_size=training_args.per_device_train_batch_size,
        num_epochs=training_args.num_train_epochs,
        seed=training_args.seed,
    )
)

print("[INFO] Model tracking started ...")

# Setup Trainer
class MyTrainer(Trainer):
    """``Trainer`` subclass that also sends every log entry to ``wandb.log``."""

    def log(self, logs, *args, **kwargs):
        """Log ``logs`` through the base ``Trainer`` and then through wandb.

        Args:
            logs: Dictionary of values to log.
            *args: Extra positional arguments, forwarded to ``Trainer.log``.
            **kwargs: Extra keyword arguments, forwarded to ``Trainer.log``.

        Newer transformers releases call ``log(logs, start_time)``. Accepting
        and forwarding the extra arguments keeps this override working with
        both the one-argument and the two-argument signature.
        """
        super().log(logs, *args, **kwargs)
        # NOTE(review): the training arguments in this notebook also set
        # report_to="wandb", so the same values may reach wandb twice.
        wandb.log(logs)

# The tokenizer is passed as ``processing_class``: the ``tokenizer`` argument
# of ``Trainer`` is deprecated and triggers a FutureWarning.
trainer = MyTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    compute_metrics=compute_accuracy
)

# 8. Train the model on our text dataset
print("[INFO] Commencing model training...")
results = trainer.train()

# 9. Save the trained model (*** note: this will overwrite our previous model ***)
print(f"[INFO] Model training complete, saving model to local path: {model_save_dir}")
trainer.save_model(model_save_dir)


# ------------------------ Push Model ------------------------------
# 10. Push the model to the Hugging Face Hub
# print(f"[INFO] Uploading model to Hugging Face Hub...")
# model_upload_url = trainer.push_to_hub(
#     commit_message="Uploading crypto sustainability news text classifier model",
#     # token="YOUR_HF_TOKEN_HERE" # requires a "write" HF token
# )
# print(f"[INFO] Model upload complete, model available at: {model_upload_url}")
# ------------------------ Push Model ------------------------------


# 11. Evaluate the model on the test data
print("[INFO] Performing evaluation on test dataset...")
predictions_all = trainer.predict(tokenized_dataset["test"])
prediction_values, prediction_metrics = predictions_all.predictions, predictions_all.metrics

# Send the test-set metrics to the active wandb run
wandb.log(prediction_metrics)

print("[INFO] Prediction metrics on the test data:")
pprint.pprint(prediction_metrics)

# Metrics
class ModernEvaluationMetrics:
    """Evaluation suite for a fine-tuned text classifier.

    Runs four groups of measurements on the ``"test"`` split of ``dataset``:
    computational cost, prediction confidence, stability under word deletion
    ("faithfulness") and classification correctness.
    """

    def __init__(self, model, tokenizer, dataset, id2label):
        """Store the evaluation inputs, place the model and load the metrics.

        Args:
            model: Classifier called as ``model(**inputs)``; its output must
                expose ``.logits``.
            tokenizer: Callable turning text into ``return_tensors="pt"`` inputs.
            dataset: Mapping with a ``"test"`` split that has ``"text"`` and
                ``"label"`` columns.
            id2label: Mapping from class id to class name.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.dataset = dataset
        self.id2label = id2label
        self.metrics = {}
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Move model to appropriate device and put it in eval mode, so the
        # measurements below do not depend on the mode a caller left it in.
        self.model = self.model.to(self.device)
        self.model.eval()

        # Initialize modern evaluation metrics
        self.accuracy = evaluate.load("accuracy")
        self.precision = evaluate.load("precision")
        self.recall = evaluate.load("recall")
        self.f1 = evaluate.load("f1")

    def _encode(self, texts, padding=False):
        """Tokenize ``texts`` (with truncation) and move the tensors to ``self.device``."""
        encoded = self.tokenizer(texts, padding=padding, truncation=True, return_tensors="pt")
        return {key: tensor.to(self.device) for key, tensor in encoded.items()}

    def _forward_logits(self, inputs):
        """Run the model on encoded ``inputs`` without gradients and return its logits."""
        with torch.no_grad():
            return self.model(**inputs).logits

    def _predict_logits(self, texts, batch_size):
        """Return the logits for ``texts`` as one numpy array, computed ``batch_size`` texts at a time."""
        chunks = []
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            chunks.append(self._forward_logits(self._encode(batch, padding=True)).cpu().numpy())
        return np.concatenate(chunks, axis=0)

    def measure_computational_cost(self, batch_size=32):
        """Measure time and memory for one inference batch.

        The first ``batch_size`` test texts are tokenized and passed through
        the model once; the timed region covers tokenization plus the forward
        pass.

        Args:
            batch_size: Maximum number of test texts to use.

        Returns:
            Dict with ``inference_time_per_sample`` (seconds, averaged over the
            texts actually used), ``cpu_memory_usage_mb`` (change in process
            RSS) and, when CUDA is available, ``gpu_memory_usage_mb`` (peak
            allocated memory during the batch minus the allocation before it).

        Raises:
            ValueError: If no test texts are available to measure.
        """
        texts = list(self.dataset["test"]["text"][:batch_size])
        if not texts:
            raise ValueError("Cannot measure computational cost: no test samples available.")

        use_cuda = torch.cuda.is_available()
        process = psutil.Process()

        if use_cuda:
            torch.cuda.empty_cache()
            # Track the peak from here on, so transient activations are counted.
            torch.cuda.reset_peak_memory_stats()
            gpu_memory_before = torch.cuda.memory_allocated() / 1024 / 1024
            torch.cuda.synchronize()

        start_memory = process.memory_info().rss / 1024 / 1024
        start_time = time.perf_counter()

        self._forward_logits(self._encode(texts, padding=True))

        if use_cuda:
            # CUDA kernels run asynchronously; wait for them before stopping the clock.
            torch.cuda.synchronize()
        end_time = time.perf_counter()
        end_memory = process.memory_info().rss / 1024 / 1024

        metrics = {
            # Divide by the number of texts used, which may be below batch_size.
            "inference_time_per_sample": (end_time - start_time) / len(texts),
            "cpu_memory_usage_mb": end_memory - start_memory,
        }

        if use_cuda:
            gpu_memory_peak = torch.cuda.max_memory_allocated() / 1024 / 1024
            metrics["gpu_memory_usage_mb"] = gpu_memory_peak - gpu_memory_before

        return metrics

    def measure_prediction_confidence(
        self,
        logits: np.ndarray,
        low_confidence_threshold: float = 0.7,
    ) -> Dict[str, Union[float, int]]:
        """Summarize how confident the model's predictions are.

        Args:
            logits: Array of per-class scores with classes along axis 1.
            low_confidence_threshold: Predictions whose top softmax probability
                is below this value are counted as low-confidence.

        Returns:
            Dict with the mean and standard deviation of the top softmax
            probability and of the prediction entropy, the number of samples
            whose entropy exceeds mean + one standard deviation, and the number
            of low-confidence predictions.
        """
        logits_tensor = torch.tensor(logits).to(self.device)
        softmax_probs = torch.nn.functional.softmax(logits_tensor, dim=1)

        # Move to CPU for numpy operations
        softmax_probs_cpu = softmax_probs.cpu().numpy()
        prediction_entropy = entropy(softmax_probs_cpu, axis=1)
        confidence_scores = softmax_probs.max(dim=1).values.cpu().numpy()

        # Calculate uncertainty threshold using standard deviation
        uncertainty_threshold = np.mean(prediction_entropy) + np.std(prediction_entropy)

        return {
            "mean_confidence": float(np.mean(confidence_scores)),
            "mean_entropy": float(np.mean(prediction_entropy)),
            "high_uncertainty_samples": int(np.sum(prediction_entropy > uncertainty_threshold)),
            "low_confidence_predictions": int(np.sum(confidence_scores < low_confidence_threshold)),
            "confidence_std": float(np.std(confidence_scores)),
            "entropy_std": float(np.std(prediction_entropy))
        }

    def measure_faithfulness(self, n_samples=100, removal_fraction=0.2, seed=None):
        """Measure prediction stability when random words are removed.

        For each of the first ``n_samples`` test texts, a random subset of its
        whitespace-separated words is dropped and the prediction on the
        shortened text is compared with the prediction on the original.

        Args:
            n_samples: Maximum number of test texts to probe.
            removal_fraction: Share of words removed from each text (at least
                one word, and never all of them).
            seed: Optional seed for the word selection. When ``None`` the
                global ``np.random`` state is used.

        Returns:
            Dict with ``prediction_stability`` (share of unchanged predictions),
            ``stability_score`` (the same as a percentage) and the mean and
            standard deviation of the absolute change in top softmax
            probability. All values are NaN when no text could be perturbed.
        """
        rng = np.random if seed is None else np.random.RandomState(seed)
        test_texts = self.dataset["test"]["text"][:n_samples]
        original_predictions = []
        perturbed_predictions = []
        confidence_changes = []

        for text in test_texts:
            words = text.split()
            if len(words) < 2:
                # An empty text gives the random draw nothing to pick from, and
                # removing the only word of a text leaves no input to classify.
                continue

            # Original prediction
            original_logits = self._forward_logits(self._encode(text))
            original_pred = torch.argmax(original_logits, dim=1).item()
            original_conf = torch.nn.functional.softmax(original_logits, dim=1).max().cpu().item()

            # Create perturbed text (a set makes the per-word lookup O(1))
            n_remove = min(len(words) - 1, max(1, int(len(words) * removal_fraction)))
            removed = set(rng.choice(len(words), n_remove, replace=False).tolist())
            perturbed_text = " ".join(w for i, w in enumerate(words) if i not in removed)

            # Perturbed prediction
            perturbed_logits = self._forward_logits(self._encode(perturbed_text))
            perturbed_pred = torch.argmax(perturbed_logits, dim=1).item()
            perturbed_conf = torch.nn.functional.softmax(perturbed_logits, dim=1).max().cpu().item()

            original_predictions.append(original_pred)
            perturbed_predictions.append(perturbed_pred)
            confidence_changes.append(abs(original_conf - perturbed_conf))

        if not original_predictions:
            nan = float("nan")
            return {
                "prediction_stability": nan,
                "stability_score": nan,
                "mean_confidence_change": nan,
                "confidence_change_std": nan
            }

        prediction_stability = np.mean(np.array(original_predictions) == np.array(perturbed_predictions))

        return {
            "prediction_stability": float(prediction_stability),
            "stability_score": float(prediction_stability * 100),
            "mean_confidence_change": float(np.mean(confidence_changes)),
            "confidence_change_std": float(np.std(confidence_changes))
        }

    def measure_answer_correctness(self, predictions: np.ndarray, labels: List[int]) -> Dict:
        """Compute overall and per-class classification metrics.

        Args:
            predictions: Array of per-class scores; the predicted class is the
                argmax over axis 1.
            labels: Ground-truth class ids, one per row of ``predictions``.

        Returns:
            Dict with overall accuracy, weighted precision/recall/F1 and, under
            ``per_class_metrics``, one-vs-rest precision/recall/F1 keyed by the
            class names from ``id2label``.
        """
        pred_labels = np.argmax(predictions, axis=1)
        label_array = np.array(labels)

        # Calculate metrics using evaluate library
        accuracy = self.accuracy.compute(predictions=pred_labels, references=labels)
        precision = self.precision.compute(predictions=pred_labels, references=labels, average='weighted')
        recall = self.recall.compute(predictions=pred_labels, references=labels, average='weighted')
        f1 = self.f1.compute(predictions=pred_labels, references=labels, average='weighted')

        # Calculate per-class metrics by binarizing each class against the rest
        class_metrics = {}
        for idx, class_name in self.id2label.items():
            class_predictions = (pred_labels == idx).astype(int)
            class_labels = (label_array == idx).astype(int)

            class_metrics[class_name] = {
                "precision": float(self.precision.compute(predictions=class_predictions, references=class_labels)['precision']),
                "recall": float(self.recall.compute(predictions=class_predictions, references=class_labels)['recall']),
                "f1": float(self.f1.compute(predictions=class_predictions, references=class_labels)['f1'])
            }

        return {
            "overall_accuracy": accuracy['accuracy'],
            "weighted_precision": precision['precision'],
            "weighted_recall": recall['recall'],
            "weighted_f1": f1['f1'],
            "per_class_metrics": class_metrics
        }

    def evaluate_all(self, batch_size=32):
        """Run all evaluation metrics and return comprehensive results.

        Args:
            batch_size: Number of test texts per forward pass when computing
                the logits, so the test split is never sent through the model
                in one single batch.

        Returns:
            ``self.metrics``, filled with the ``computational_cost``,
            ``confidence_analysis``, ``faithfulness`` and ``correctness``
            result dicts.

        Raises:
            ValueError: If the test split contains no texts.
        """
        test_texts = list(self.dataset["test"]["text"])
        test_labels = self.dataset["test"]["label"]
        if not test_texts:
            raise ValueError("Cannot evaluate: the test split contains no samples.")

        predictions = self._predict_logits(test_texts, batch_size)

        self.metrics["computational_cost"] = self.measure_computational_cost()
        self.metrics["confidence_analysis"] = self.measure_prediction_confidence(predictions)
        self.metrics["faithfulness"] = self.measure_faithfulness()
        self.metrics["correctness"] = self.measure_answer_correctness(predictions, test_labels)

        return self.metrics

# Initialize and run comprehensive evaluation
evaluator = ModernEvaluationMetrics(
    model=model,
    tokenizer=tokenizer,
    dataset=tokenized_dataset,
    id2label=id2label,
)
evaluation_results = evaluator.evaluate_all()

# Log results to wandb under a single "advanced_metrics" key
wandb.log({"advanced_metrics": evaluation_results})

# Print detailed results
print(evaluation_results)

# Close the current wandb run
wandb.finish()

[INFO] Creating directory for saving models: models_colab/Prompt_Guard_sustainability_news_five_epochs
[INFO] Downloading dataset from Hugging Face Hub, name: arad1367/sustainability_impact_crypto_data
[INFO] Tokenizing text for model training with tokenizer: meta-llama/Prompt-Guard-86M


tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/60 [00:00<?, ? examples/s]

[INFO] Loading model: meta-llama/Prompt-Guard-86M


config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

[INFO] Model loading complete!
[INFO] Model tracking started ...


  trainer = MyTrainer(


[INFO] Commencing model training...




Epoch,Training Loss,Validation Loss,Accuracy
1,3.0569,2.356769,0.366667
2,0.9282,0.472144,0.766667
3,0.4446,0.45127,0.783333
4,0.2884,0.358443,0.833333
5,0.2428,0.307372,0.85


[INFO] Model training complete, saving model to local path: models_colab/Prompt_Guard_sustainability_news_five_epochs
[INFO] Performing evaluation on test dataset...


[INFO] Prediction metrics on the test data:
{'test_accuracy': 0.85,
 'test_loss': 0.3073720633983612,
 'test_runtime': 0.0882,
 'test_samples_per_second': 680.025,
 'test_steps_per_second': 22.668}
{'computational_cost': {'inference_time_per_sample': 0.0032536759972572327, 'cpu_memory_usage_mb': 0.0, 'gpu_memory_usage_mb': 0.0224609375}, 'confidence_analysis': {'mean_confidence': 0.884303629398346, 'mean_entropy': 0.29376164078712463, 'high_uncertainty_samples': 12, 'low_confidence_predictions': 11, 'confidence_std': 0.14989285171031952, 'entropy_std': 0.2825767397880554}, 'faithfulness': {'prediction_stability': 0.8833333333333333, 'stability_score': 88.33333333333333, 'mean_confidence_change': 0.06464515328407287, 'confidence_change_std': 0.09505472380403536}, 'correctness': {'overall_accuracy': 0.85, 'weighted_precision': 0.8841666666666668, 'weighted_recall': 0.85, 'weighted_f1': 0.8545689452666196, 'per_class_metrics': {'negative': {'precision': 0.65, 'recall': 1.0, 'f1': 0.787878

0,1
epoch,▁▁▃▃▅▅▆▆███
eval/accuracy,▁▇▇██
eval/loss,█▂▁▁▁
eval/runtime,▁█▇▄▁
eval/samples_per_second,█▁▂▅█
eval/steps_per_second,█▁▂▅█
eval_accuracy,▁▇▇██
eval_loss,█▂▁▁▁
eval_runtime,▁█▇▄▁
eval_samples_per_second,█▁▂▅█

0,1
epoch,5.0
eval/accuracy,0.85
eval/loss,0.30737
eval/runtime,0.0853
eval/samples_per_second,703.583
eval/steps_per_second,23.453
eval_accuracy,0.85
eval_loss,0.30737
eval_runtime,0.0853
eval_samples_per_second,703.583


### 5.5. finbert

In [None]:
# 1. Import necessary packages
import pprint
from pathlib import Path

import numpy as np
import torch

import datasets
import evaluate

from transformers import pipeline
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from scipy.stats import entropy
import time
import psutil
from typing import Dict, List, Union


# wandb.finish()
# Start a wandb run for this model in the shared comparison project
wandb.init(
    project="crypto-sustainability-news-5-comparison-main",
    reinit=True,
)

# 2. Setup variables for model training and saving pipeline
DATASET_NAME = "arad1367/sustainability_impact_crypto_data"
MODEL_NAME = "ProsusAI/finbert"
MODEL_SAVE_DIR_NAME = "models_colab/finbert_sustainability_news_five_epochs"

# 3. Create a directory for saving models
# Note: This will override our existing saved model (if there is one)
print(f"[INFO] Creating directory for saving models: {MODEL_SAVE_DIR_NAME}")
model_save_dir = Path(MODEL_SAVE_DIR_NAME)
model_save_dir.mkdir(exist_ok=True, parents=True)

# 4. Load and preprocess the dataset from Hugging Face Hub
print(f"[INFO] Downloading dataset from Hugging Face Hub, name: {DATASET_NAME}")
dataset = datasets.load_dataset(DATASET_NAME)

# Class id -> class name, and the inverse mapping derived from it
id2label = {0: "negative", 1: "neutral", 2: "positive"}
label2id = {name: idx for idx, name in id2label.items()}

# Create function to map string labels to their integer IDs in the dataset
def map_labels_to_number(example):
    """Replace the string label of ``example`` with its integer id.

    The example is updated in place (via ``label2id``) and returned.
    """
    label_name = example["label"]
    example["label"] = label2id[label_name]
    return example

# Convert the labels of the "train" split to ids, then split it 80/20 into
# train/test sets with a fixed seed
dataset = (
    dataset["train"]
    .map(map_labels_to_number)
    .train_test_split(seed=42, test_size=0.2)
)

# 5. Import a tokenizer and map it to our dataset
print(f"[INFO] Tokenizing text for model training with tokenizer: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Create a preprocessing function to tokenize text
def tokenize_text(examples):
    """Tokenize the ``"text"`` entry of a batch with padding and truncation enabled."""
    batch_texts = examples["text"]
    return tokenizer(batch_texts, truncation=True, padding=True)

# Apply the tokenizer to the dataset, up to 1000 examples per call
tokenized_dataset = dataset.map(
    tokenize_text,
    batch_size=1000,
    batched=True,
)

# 6. Load the accuracy metric used to evaluate our model
accuracy_metric = evaluate.load("accuracy")

def compute_accuracy(predictions_and_labels):
    """Compute accuracy from a ``(predictions, labels)`` pair.

    Predictions with two or more dimensions are treated as per-class scores
    and reduced to class ids with an argmax over axis 1 before scoring;
    anything else is passed to the metric unchanged.
    """
    raw_predictions, references = predictions_and_labels

    is_score_matrix = raw_predictions.ndim >= 2
    predicted_ids = np.argmax(raw_predictions, axis=1) if is_score_matrix else raw_predictions

    # The metric takes the ground truth under "references" (not "labels").
    return accuracy_metric.compute(predictions=predicted_ids, references=references)


# 7. Import a model and prepare it for training
print(f"[INFO] Loading model: {MODEL_NAME}")
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3,
    label2id=label2id,
    id2label=id2label,
)
print("[INFO] Model loading complete!")

# Create training arguments
training_args = TrainingArguments(
    output_dir=model_save_dir,
    # Optimisation settings
    learning_rate=1e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Evaluate, checkpoint and log once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
    # Runtime and reporting
    use_cpu=False,
    seed=42,
    report_to="wandb",
    hub_private_repo=False,
)

# Model tracking with wandb: record the run's key hyperparameters
wandb.config.update(dict(
    dataset_name=DATASET_NAME,
    model_name=MODEL_NAME,
    learning_rate=training_args.learning_rate,
    weight_decay=training_args.weight_decay,
    batch_size=training_args.per_device_train_batch_size,
    num_epochs=training_args.num_train_epochs,
    seed=training_args.seed,
))

print("[INFO] Model tracking started ...")

# Setup Trainer
class MyTrainer(Trainer):
    """``Trainer`` subclass that also sends every log entry to ``wandb.log``."""

    def log(self, logs, *args, **kwargs):
        """Log ``logs`` through the base ``Trainer`` and then through wandb.

        Args:
            logs: Dictionary of values to log.
            *args: Extra positional arguments, forwarded to ``Trainer.log``.
            **kwargs: Extra keyword arguments, forwarded to ``Trainer.log``.

        Newer transformers releases call ``log(logs, start_time)``. Accepting
        and forwarding the extra arguments keeps this override working with
        both the one-argument and the two-argument signature.
        """
        super().log(logs, *args, **kwargs)
        # NOTE(review): the training arguments in this notebook also set
        # report_to="wandb", so the same values may reach wandb twice.
        wandb.log(logs)

# The tokenizer is passed as ``processing_class``: the ``tokenizer`` argument
# of ``Trainer`` is deprecated and triggers a FutureWarning.
trainer = MyTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    compute_metrics=compute_accuracy
)

# 8. Train the model on our text dataset
print("[INFO] Commencing model training...")
results = trainer.train()

# 9. Save the trained model (*** note: this will overwrite our previous model ***)
print(f"[INFO] Model training complete, saving model to local path: {model_save_dir}")
trainer.save_model(model_save_dir)


# ------------------------ Push Model ------------------------------
# 10. Push the model to the Hugging Face Hub
# print(f"[INFO] Uploading model to Hugging Face Hub...")
# model_upload_url = trainer.push_to_hub(
#     commit_message="Uploading crypto sustainability news text classifier model",
#     # token="YOUR_HF_TOKEN_HERE" # requires a "write" HF token
# )
# print(f"[INFO] Model upload complete, model available at: {model_upload_url}")
# ------------------------ Push Model ------------------------------


# 11. Evaluate the model on the test data
print("[INFO] Performing evaluation on test dataset...")
predictions_all = trainer.predict(tokenized_dataset["test"])
prediction_values, prediction_metrics = predictions_all.predictions, predictions_all.metrics

# Send the test-set metrics to the active wandb run
wandb.log(prediction_metrics)

print("[INFO] Prediction metrics on the test data:")
pprint.pprint(prediction_metrics)

# Metrics
class ModernEvaluationMetrics:
    """Evaluation suite for a fine-tuned text classifier.

    Runs four groups of measurements on the ``"test"`` split of ``dataset``:
    computational cost, prediction confidence, stability under word deletion
    ("faithfulness") and classification correctness.
    """

    def __init__(self, model, tokenizer, dataset, id2label):
        """Store the evaluation inputs, place the model and load the metrics.

        Args:
            model: Classifier called as ``model(**inputs)``; its output must
                expose ``.logits``.
            tokenizer: Callable turning text into ``return_tensors="pt"`` inputs.
            dataset: Mapping with a ``"test"`` split that has ``"text"`` and
                ``"label"`` columns.
            id2label: Mapping from class id to class name.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.dataset = dataset
        self.id2label = id2label
        self.metrics = {}
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Move model to appropriate device and put it in eval mode, so the
        # measurements below do not depend on the mode a caller left it in.
        self.model = self.model.to(self.device)
        self.model.eval()

        # Initialize modern evaluation metrics
        self.accuracy = evaluate.load("accuracy")
        self.precision = evaluate.load("precision")
        self.recall = evaluate.load("recall")
        self.f1 = evaluate.load("f1")

    def _encode(self, texts, padding=False):
        """Tokenize ``texts`` (with truncation) and move the tensors to ``self.device``."""
        encoded = self.tokenizer(texts, padding=padding, truncation=True, return_tensors="pt")
        return {key: tensor.to(self.device) for key, tensor in encoded.items()}

    def _forward_logits(self, inputs):
        """Run the model on encoded ``inputs`` without gradients and return its logits."""
        with torch.no_grad():
            return self.model(**inputs).logits

    def _predict_logits(self, texts, batch_size):
        """Return the logits for ``texts`` as one numpy array, computed ``batch_size`` texts at a time."""
        chunks = []
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            chunks.append(self._forward_logits(self._encode(batch, padding=True)).cpu().numpy())
        return np.concatenate(chunks, axis=0)

    def measure_computational_cost(self, batch_size=32):
        """Measure time and memory for one inference batch.

        The first ``batch_size`` test texts are tokenized and passed through
        the model once; the timed region covers tokenization plus the forward
        pass.

        Args:
            batch_size: Maximum number of test texts to use.

        Returns:
            Dict with ``inference_time_per_sample`` (seconds, averaged over the
            texts actually used), ``cpu_memory_usage_mb`` (change in process
            RSS) and, when CUDA is available, ``gpu_memory_usage_mb`` (peak
            allocated memory during the batch minus the allocation before it).

        Raises:
            ValueError: If no test texts are available to measure.
        """
        texts = list(self.dataset["test"]["text"][:batch_size])
        if not texts:
            raise ValueError("Cannot measure computational cost: no test samples available.")

        use_cuda = torch.cuda.is_available()
        process = psutil.Process()

        if use_cuda:
            torch.cuda.empty_cache()
            # Track the peak from here on, so transient activations are counted.
            torch.cuda.reset_peak_memory_stats()
            gpu_memory_before = torch.cuda.memory_allocated() / 1024 / 1024
            torch.cuda.synchronize()

        start_memory = process.memory_info().rss / 1024 / 1024
        start_time = time.perf_counter()

        self._forward_logits(self._encode(texts, padding=True))

        if use_cuda:
            # CUDA kernels run asynchronously; wait for them before stopping the clock.
            torch.cuda.synchronize()
        end_time = time.perf_counter()
        end_memory = process.memory_info().rss / 1024 / 1024

        metrics = {
            # Divide by the number of texts used, which may be below batch_size.
            "inference_time_per_sample": (end_time - start_time) / len(texts),
            "cpu_memory_usage_mb": end_memory - start_memory,
        }

        if use_cuda:
            gpu_memory_peak = torch.cuda.max_memory_allocated() / 1024 / 1024
            metrics["gpu_memory_usage_mb"] = gpu_memory_peak - gpu_memory_before

        return metrics

    def measure_prediction_confidence(
        self,
        logits: np.ndarray,
        low_confidence_threshold: float = 0.7,
    ) -> Dict[str, Union[float, int]]:
        """Summarize how confident the model's predictions are.

        Args:
            logits: Array of per-class scores with classes along axis 1.
            low_confidence_threshold: Predictions whose top softmax probability
                is below this value are counted as low-confidence.

        Returns:
            Dict with the mean and standard deviation of the top softmax
            probability and of the prediction entropy, the number of samples
            whose entropy exceeds mean + one standard deviation, and the number
            of low-confidence predictions.
        """
        logits_tensor = torch.tensor(logits).to(self.device)
        softmax_probs = torch.nn.functional.softmax(logits_tensor, dim=1)

        # Move to CPU for numpy operations
        softmax_probs_cpu = softmax_probs.cpu().numpy()
        prediction_entropy = entropy(softmax_probs_cpu, axis=1)
        confidence_scores = softmax_probs.max(dim=1).values.cpu().numpy()

        # Calculate uncertainty threshold using standard deviation
        uncertainty_threshold = np.mean(prediction_entropy) + np.std(prediction_entropy)

        return {
            "mean_confidence": float(np.mean(confidence_scores)),
            "mean_entropy": float(np.mean(prediction_entropy)),
            "high_uncertainty_samples": int(np.sum(prediction_entropy > uncertainty_threshold)),
            "low_confidence_predictions": int(np.sum(confidence_scores < low_confidence_threshold)),
            "confidence_std": float(np.std(confidence_scores)),
            "entropy_std": float(np.std(prediction_entropy))
        }

    def measure_faithfulness(self, n_samples=100, removal_fraction=0.2, seed=None):
        """Measure prediction stability when random words are removed.

        For each of the first ``n_samples`` test texts, a random subset of its
        whitespace-separated words is dropped and the prediction on the
        shortened text is compared with the prediction on the original.

        Args:
            n_samples: Maximum number of test texts to probe.
            removal_fraction: Share of words removed from each text (at least
                one word, and never all of them).
            seed: Optional seed for the word selection. When ``None`` the
                global ``np.random`` state is used.

        Returns:
            Dict with ``prediction_stability`` (share of unchanged predictions),
            ``stability_score`` (the same as a percentage) and the mean and
            standard deviation of the absolute change in top softmax
            probability. All values are NaN when no text could be perturbed.
        """
        rng = np.random if seed is None else np.random.RandomState(seed)
        test_texts = self.dataset["test"]["text"][:n_samples]
        original_predictions = []
        perturbed_predictions = []
        confidence_changes = []

        for text in test_texts:
            words = text.split()
            if len(words) < 2:
                # An empty text gives the random draw nothing to pick from, and
                # removing the only word of a text leaves no input to classify.
                continue

            # Original prediction
            original_logits = self._forward_logits(self._encode(text))
            original_pred = torch.argmax(original_logits, dim=1).item()
            original_conf = torch.nn.functional.softmax(original_logits, dim=1).max().cpu().item()

            # Create perturbed text (a set makes the per-word lookup O(1))
            n_remove = min(len(words) - 1, max(1, int(len(words) * removal_fraction)))
            removed = set(rng.choice(len(words), n_remove, replace=False).tolist())
            perturbed_text = " ".join(w for i, w in enumerate(words) if i not in removed)

            # Perturbed prediction
            perturbed_logits = self._forward_logits(self._encode(perturbed_text))
            perturbed_pred = torch.argmax(perturbed_logits, dim=1).item()
            perturbed_conf = torch.nn.functional.softmax(perturbed_logits, dim=1).max().cpu().item()

            original_predictions.append(original_pred)
            perturbed_predictions.append(perturbed_pred)
            confidence_changes.append(abs(original_conf - perturbed_conf))

        if not original_predictions:
            nan = float("nan")
            return {
                "prediction_stability": nan,
                "stability_score": nan,
                "mean_confidence_change": nan,
                "confidence_change_std": nan
            }

        prediction_stability = np.mean(np.array(original_predictions) == np.array(perturbed_predictions))

        return {
            "prediction_stability": float(prediction_stability),
            "stability_score": float(prediction_stability * 100),
            "mean_confidence_change": float(np.mean(confidence_changes)),
            "confidence_change_std": float(np.std(confidence_changes))
        }

    def measure_answer_correctness(self, predictions: np.ndarray, labels: List[int]) -> Dict:
        """Compute overall and per-class classification metrics.

        Args:
            predictions: Array of per-class scores; the predicted class is the
                argmax over axis 1.
            labels: Ground-truth class ids, one per row of ``predictions``.

        Returns:
            Dict with overall accuracy, weighted precision/recall/F1 and, under
            ``per_class_metrics``, one-vs-rest precision/recall/F1 keyed by the
            class names from ``id2label``.
        """
        pred_labels = np.argmax(predictions, axis=1)
        label_array = np.array(labels)

        # Calculate metrics using evaluate library
        accuracy = self.accuracy.compute(predictions=pred_labels, references=labels)
        precision = self.precision.compute(predictions=pred_labels, references=labels, average='weighted')
        recall = self.recall.compute(predictions=pred_labels, references=labels, average='weighted')
        f1 = self.f1.compute(predictions=pred_labels, references=labels, average='weighted')

        # Calculate per-class metrics by binarizing each class against the rest
        class_metrics = {}
        for idx, class_name in self.id2label.items():
            class_predictions = (pred_labels == idx).astype(int)
            class_labels = (label_array == idx).astype(int)

            class_metrics[class_name] = {
                "precision": float(self.precision.compute(predictions=class_predictions, references=class_labels)['precision']),
                "recall": float(self.recall.compute(predictions=class_predictions, references=class_labels)['recall']),
                "f1": float(self.f1.compute(predictions=class_predictions, references=class_labels)['f1'])
            }

        return {
            "overall_accuracy": accuracy['accuracy'],
            "weighted_precision": precision['precision'],
            "weighted_recall": recall['recall'],
            "weighted_f1": f1['f1'],
            "per_class_metrics": class_metrics
        }

    def evaluate_all(self, batch_size=32):
        """Run all evaluation metrics and return comprehensive results.

        Args:
            batch_size: Number of test texts per forward pass when computing
                the logits, so the test split is never sent through the model
                in one single batch.

        Returns:
            ``self.metrics``, filled with the ``computational_cost``,
            ``confidence_analysis``, ``faithfulness`` and ``correctness``
            result dicts.

        Raises:
            ValueError: If the test split contains no texts.
        """
        test_texts = list(self.dataset["test"]["text"])
        test_labels = self.dataset["test"]["label"]
        if not test_texts:
            raise ValueError("Cannot evaluate: the test split contains no samples.")

        predictions = self._predict_logits(test_texts, batch_size)

        self.metrics["computational_cost"] = self.measure_computational_cost()
        self.metrics["confidence_analysis"] = self.measure_prediction_confidence(predictions)
        self.metrics["faithfulness"] = self.measure_faithfulness()
        self.metrics["correctness"] = self.measure_answer_correctness(predictions, test_labels)

        return self.metrics

# Initialize and run comprehensive evaluation
evaluator = ModernEvaluationMetrics(
    model=model,
    tokenizer=tokenizer,
    dataset=tokenized_dataset,
    id2label=id2label,
)
evaluation_results = evaluator.evaluate_all()

# Log results to wandb under a single "advanced_metrics" key
wandb.log({"advanced_metrics": evaluation_results})

# Print detailed results
print(evaluation_results)

# Close the current wandb run
wandb.finish()

[INFO] Creating directory for saving models: models_colab/finbert_sustainability_news_five_epochs
[INFO] Downloading dataset from Hugging Face Hub, name: arad1367/sustainability_impact_crypto_data
[INFO] Tokenizing text for model training with tokenizer: ProsusAI/finbert


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

[INFO] Loading model: ProsusAI/finbert


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

[INFO] Model loading complete!
[INFO] Model tracking started ...


  trainer = MyTrainer(


[INFO] Commencing model training...




Epoch,Training Loss,Validation Loss,Accuracy
1,2.1951,1.718659,0.533333
2,1.3074,1.182755,0.6
3,0.8095,0.626185,0.7
4,0.5042,0.436891,0.9
5,0.4301,0.395119,0.95


[INFO] Model training complete, saving model to local path: models_colab/finbert_sustainability_news_five_epochs
[INFO] Performing evaluation on test dataset...


[INFO] Prediction metrics on the test data:
{'test_accuracy': 0.95,
 'test_loss': 0.39511939883232117,
 'test_runtime': 0.0528,
 'test_samples_per_second': 1135.903,
 'test_steps_per_second': 37.863}
{'computational_cost': {'inference_time_per_sample': 0.0012336894869804382, 'cpu_memory_usage_mb': 0.0, 'gpu_memory_usage_mb': 0.0224609375}, 'confidence_analysis': {'mean_confidence': 0.710712730884552, 'mean_entropy': 0.7141669988632202, 'high_uncertainty_samples': 11, 'low_confidence_predictions': 26, 'confidence_std': 0.1421707719564438, 'entropy_std': 0.19915933907032013}, 'faithfulness': {'prediction_stability': 0.9, 'stability_score': 90.0, 'mean_confidence_change': 0.06709410697221756, 'confidence_change_std': 0.0723764298758498}, 'correctness': {'overall_accuracy': 0.95, 'weighted_precision': 0.959375, 'weighted_recall': 0.95, 'weighted_f1': 0.950919540229885, 'per_class_metrics': {'negative': {'precision': 0.8125, 'recall': 1.0, 'f1': 0.896551724137931}, 'neutral': {'precision': 

0,1
epoch,▁▁▃▃▅▅▆▆███
eval/accuracy,▁▂▄▇█
eval/loss,█▅▂▁▁
eval/runtime,█▅▆▁▇
eval/samples_per_second,▁▅▃█▂
eval/steps_per_second,▁▅▃█▂
eval_accuracy,▁▂▄▇█
eval_loss,█▅▂▁▁
eval_runtime,█▅▆▁▇
eval_samples_per_second,▁▅▃█▂

0,1
epoch,5.0
eval/accuracy,0.95
eval/loss,0.39512
eval/runtime,0.0509
eval/samples_per_second,1178.208
eval/steps_per_second,39.274
eval_accuracy,0.95
eval_loss,0.39512
eval_runtime,0.0509
eval_samples_per_second,1178.208
