## CSC-696-001.2025F Final Project (2/3)
**Name: Anna Hyunjung Kim**

**Collaborators: Prof. Patrick Wu**





---





**Title:** Measuring Ethical Risks in AI-Generated News Using NLP with the UNESCO Ethics of AI Framework

**Research Question:** To what extent do ethically problematic errors occur in AI-generated news articles? And to which category of the AI ethics principles proposed by UNESCO does each of these issues correspond most closely?

In [None]:
import json
import requests
from collections import Counter
import pandas as pd
from datasets import Dataset, DatasetDict

# Raw files of the HateXplain repository (tracks the `master` branch, so the
# content can change between runs).
BASE_URL = "https://raw.githubusercontent.com/punyajoy/HateXplain/master/Data/"

# Upper bound (seconds) for connecting and for each read on the socket, so a
# stalled connection fails instead of hanging the notebook indefinitely.
REQUEST_TIMEOUT_SECONDS = 30


def _fetch_json(filename):
    """Download ``BASE_URL + filename`` and return the decoded JSON payload.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so a
            failed download is reported as such rather than as a confusing
            JSON decoding error on an HTML error page.
        requests.Timeout: if the server does not respond within
            ``REQUEST_TIMEOUT_SECONDS``.
    """
    response = requests.get(BASE_URL + filename, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    return response.json()


# Posts keyed by post id, and the lists of post ids that make up each split.
dataset_json = _fetch_json("dataset.json")
split_ids    = _fetch_json("post_id_divisions.json")


In [None]:

# Class id -> class name, and the inverse lookup derived from it.
id2label_str = {0: "hatespeech", 1: "normal", 2: "offensive"}
label_str2id = {name: idx for idx, name in id2label_str.items()}


def normalize_label(lab):
    """Return the string form of an annotator label.

    Integer labels are translated through ``id2label_str`` (an unknown id
    raises ``KeyError``); any non-integer value is returned unchanged.
    """
    return id2label_str[lab] if isinstance(lab, int) else lab

def build_split(split_key):
    """Assemble one split of the corpus as a Hugging Face ``Dataset``.

    For every post id listed under ``split_ids[split_key]`` the post's tokens
    are joined with single spaces into ``text`` and the annotators' labels are
    reduced to the most frequent one (on a tie, the label seen first wins).

    Args:
        split_key: key into ``split_ids`` (this notebook uses ``"train"``,
            ``"val"`` and ``"test"``).

    Returns:
        A ``Dataset`` with the columns ``id``, ``text`` and ``label``, where
        ``label`` is the integer id from ``label_str2id``.
    """
    records = []
    for post_id in split_ids[split_key]:
        entry = dataset_json[post_id]
        joined_text = " ".join(entry["post_tokens"])

        # Tally the annotators' votes after mapping every label to its name.
        votes = Counter(
            normalize_label(annotation["label"])
            for annotation in entry["annotators"]
        )
        winning_label, _ = votes.most_common(1)[0]

        records.append(
            {
                "id": post_id,
                "text": joined_text,
                "label": label_str2id[winning_label],
            }
        )

    return Dataset.from_pandas(pd.DataFrame(records), preserve_index=False)

# Build the three splits in the order train -> val -> test.
train_ds, val_ds, test_ds = (
    build_split(split_name) for split_name in ("train", "val", "test")
)

# The source file names the dev split "val"; it is exposed here as "validation".
dataset = DatasetDict(
    {"train": train_ds, "validation": val_ds, "test": test_ds}
)

# Show the split sizes and one sample row as a sanity check.
print(dataset)
print(dataset["train"][0])
# 0: "hatespeech", 1: "normal", 2: "offensive"

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 15383
    })
    validation: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 1922
    })
    test: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 1924
    })
})
{'id': '23107796_gab', 'text': 'u really think i would not have been raped by feral hindu or muslim back in india or bangladesh and a neo nazi would rape me as well just to see me cry', 'label': 2}


In [None]:
#!pip install -q evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from datasets import DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
import evaluate
import numpy as np

In [None]:
# 0: "hatespeech", 1: "normal", 2: "offensive"
# Class names listed in id order; the position in the list is the class id.
id2label = dict(enumerate(["hatespeech", "normal", "offensive"]))
# Inverse lookup: class name -> class id.
label2id = dict(zip(id2label.values(), id2label.keys()))
print(id2label)


{0: 'hatespeech', 1: 'normal', 2: 'offensive'}


In [None]:
# Checkpoint shared by the tokenizer and the classifier.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_batch(batch):
    """Tokenize the ``text`` column of a batch with the module-level tokenizer.

    Sequences are truncated and padded to a fixed length of 256 tokens.

    Args:
        batch: mapping with a ``"text"`` entry holding the texts to encode.

    Returns:
        Whatever the tokenizer returns for these texts.
    """
    encode_options = dict(padding="max_length", truncation=True, max_length=256)
    return tokenizer(batch["text"], **encode_options)

# Columns kept for training; every other column is dropped below.
keep_columns = ("input_ids", "attention_mask", "label")

# Tokenize all splits in batches.
tokenized_dataset = dataset.map(tokenize_batch, batched=True)

# Use the train split's schema to decide which columns to drop.
cols_to_remove = [
    column_name
    for column_name in tokenized_dataset["train"].column_names
    if column_name not in keep_columns
]
tokenized_dataset = tokenized_dataset.remove_columns(cols_to_remove)

# Return PyTorch tensors when rows are accessed.
tokenized_dataset.set_format("torch")

tokenized_dataset


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/15383 [00:00<?, ? examples/s]

Map:   0%|          | 0/1922 [00:00<?, ? examples/s]

Map:   0%|          | 0/1924 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 15383
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 1922
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 1924
    })
})

In [None]:
# Load the pretrained checkpoint with a sequence-classification head sized to
# the label set; the label maps are stored in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def compute_metrics(eval_pred):
    """Score a batch of predictions with accuracy and macro-averaged P/R/F1.

    Args:
        eval_pred: a ``(logits, labels)`` pair; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        Dict with the keys ``accuracy``, ``f1_macro``, ``precision_macro``
        and ``recall_macro``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    # Metrics that share the same macro-averaged call signature.
    macro_scorers = {
        "f1_macro": f1_score,
        "precision_macro": precision_score,
        "recall_macro": recall_score,
    }

    metrics = {"accuracy": accuracy_score(labels, predicted)}
    for metric_name, scorer in macro_scorers.items():
        metrics[metric_name] = scorer(labels, predicted, average="macro")
    return metrics

In [None]:
# Fine-tuning hyper-parameters, grouped by concern.
training_args = TrainingArguments(
    # Output location; "none" turns off reporting to experiment trackers.
    output_dir="./hatexplain_distilbert",
    report_to="none",
    # Training length and batch sizes.
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # Optimisation and regularisation.
    learning_rate=2e-5,
    weight_decay=0.01,
    label_smoothing_factor=0.1,
)

In [None]:
# Wire the model, hyper-parameters, data splits and metric callback together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    # `tokenizer=` is deprecated on Trainer and triggers a FutureWarning;
    # `processing_class=` is its replacement and takes the same object.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [None]:
# Fine-tune on the training split configured on the trainer.
trainer.train()

# validation: with no argument, evaluate() uses the trainer's eval_dataset
eval_results = trainer.evaluate()
print(eval_results)

# test: the held-out split is passed explicitly
test_results = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
print(test_results)


Step,Training Loss
500,0.8913
1000,0.8075
1500,0.7385
2000,0.7221
2500,0.6495


{'eval_loss': 0.8223803043365479, 'eval_accuracy': 0.6945889698231009, 'eval_f1_macro': 0.6818566126545894, 'eval_precision_macro': 0.6831365476743532, 'eval_recall_macro': 0.683054591797116, 'eval_runtime': 6.5496, 'eval_samples_per_second': 293.453, 'eval_steps_per_second': 9.314, 'epoch': 3.0}
{'eval_loss': 0.8156391978263855, 'eval_accuracy': 0.6923076923076923, 'eval_f1_macro': 0.6762298186866601, 'eval_precision_macro': 0.6757851449322296, 'eval_recall_macro': 0.6792344994720835, 'eval_runtime': 6.7088, 'eval_samples_per_second': 286.786, 'eval_steps_per_second': 9.093, 'epoch': 3.0}


In [None]:
from google.colab import drive
import os
# Mount Google Drive at /content/drive (Colab-only API).
drive.mount('/content/drive')

# Destination folder on the mounted Drive for the exported artefacts.
save_dir = "/content/drive/MyDrive/models/hatexplain_distilbert"

# exist_ok=True keeps this cell re-runnable when the folder already exists.
os.makedirs(save_dir, exist_ok=True)

# Export the trainer's model into save_dir.
trainer.save_model(save_dir)
# Export the tokenizer files into the same folder; as the cell's last
# expression, the returned tuple of written file paths is displayed.
tokenizer.save_pretrained(save_dir)

Mounted at /content/drive


('/content/drive/MyDrive/models/hatexplain_distilbert/tokenizer_config.json',
 '/content/drive/MyDrive/models/hatexplain_distilbert/special_tokens_map.json',
 '/content/drive/MyDrive/models/hatexplain_distilbert/vocab.txt',
 '/content/drive/MyDrive/models/hatexplain_distilbert/added_tokens.json',
 '/content/drive/MyDrive/models/hatexplain_distilbert/tokenizer.json')