In [None]:
# Move the kernel's working directory up one level; the relative `data/...`
# and `models/...` paths used in later cells are resolved from there.
# NOTE: this is not idempotent — re-running the cell moves up another level.
%cd ..

In [None]:
from sklearn.metrics import cohen_kappa_score
import numpy as np
import pandas as pd
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm.auto import tqdm
from sklearn.metrics import recall_score, precision_score, f1_score
import nltk
# Download NLTK's 'punkt' resource.
# NOTE(review): no visible cell in this notebook uses nltk — confirm this is still needed.
nltk.download('punkt')
# Use fully qualified option names. Abbreviated keys are resolved by pattern
# matching, and 'max_rows' is ambiguous on newer pandas versions (it also
# matches 'styler.render.max_rows'), which raises an OptionError.
pd.set_option('display.max_colwidth', None)  # never truncate long text cells
pd.set_option('display.max_rows', 200)       # show up to 200 rows before eliding

## Measure annotation agreement

In [None]:
# Directories holding intermediate and final artefacts (relative to the
# current working directory)
processed_dir = Path("data", "processed")
final_dir = Path("data", "final")

# One tab-separated annotation file per annotator
dan_path = processed_dir.joinpath("dr_offensive_annotated_dan.csv")
anders_path = processed_dir.joinpath("dr_offensive_annotated_anders.csv")

In [None]:
# Both annotation files are tab-separated; load them the same way
dan_df, anders_df = (
    pd.read_csv(annotation_path, sep="\t")
    for annotation_path in (dan_path, anders_path)
)
dan_df.head()

In [None]:
# Cohen's kappa between the two annotators' label columns
cohen_kappa_score(dan_df["label"], anders_df["label"])

## Visualise annotation agreement

In [None]:
# Put the two annotators' labels side by side, one column per annotator
label_df = pd.concat(
    [dan_df["label"].rename("Dan"), anders_df["label"].rename("Anders")],
    axis=1,
)
label_df.head()

In [None]:
def change_label_names(label: str) -> str:
    """Collapse every label mentioning "context" into "Missing context".

    Args:
        label: The raw annotation label.

    Returns:
        "Missing context" if the substring "context" occurs in `label`,
        otherwise `label` unchanged.
    """
    return "Missing context" if "context" in label else label
# Rename labels element-wise. `DataFrame.applymap` is deprecated in recent
# pandas releases, so map each column instead; this behaves the same and
# works on both old and new pandas versions.
label_df = label_df.apply(lambda column: column.map(change_label_names))
label_df.head()

In [None]:
# Count every (Dan, Anders) label pair, then pivot Anders' labels into
# columns; pairs that never occur are shown as 0
pair_counts = label_df.groupby(["Dan", "Anders"]).size()
pair_counts.unstack(fill_value=0)

## Extract dataframe with agreed labels

In [None]:
# Rows where both annotators gave the same label and that label is not
# "Missing context"
both_agree = label_df["Dan"] == label_df["Anders"]
has_context = label_df["Dan"] != "Missing context"
indices_with_agreement = label_df.index[both_agree & has_context].tolist()

# Keep text and label for those rows; the original row index is preserved
# in an `idx` column
agreement_df = (
    dan_df.loc[indices_with_agreement, ["text", "label"]]
    .reset_index()
    .rename(columns={"index": "idx"})
)
agreement_df.head()

In [None]:
# Class balance among the agreed-upon samples
agreement_df["label"].value_counts()

In [None]:
# Seed shared by every random operation in the split, so that both the
# membership AND the row order of the validation set are reproducible
SPLIT_SEED = 4242

# Stratified split: sample half of each class for validation
val_df_pos = agreement_df.query("label == 'Offensive'").sample(frac=0.5, random_state=SPLIT_SEED)
val_df_neg = agreement_df.query("label == 'Not offensive'").sample(frac=0.5, random_state=SPLIT_SEED)

# Shuffle the combined frame. The shuffle is seeded as well; without a seed
# the saved validation file would have a different row order on every run.
val_df = (
    pd.concat((val_df_pos, val_df_neg), axis=0)
    .sample(frac=1., random_state=SPLIT_SEED)
    .reset_index(drop=True)
)
val_df.label.value_counts()

In [None]:
# Everything not selected for validation (matched on `idx`) becomes the test set
in_validation = agreement_df["idx"].isin(val_df["idx"])
test_df = agreement_df.loc[~in_validation]
test_df["label"].value_counts()

In [None]:
# Make sure the output directories exist; `to_parquet` does not create
# missing parent folders
processed_dir.mkdir(parents=True, exist_ok=True)
final_dir.mkdir(parents=True, exist_ok=True)

# Persist the agreed-upon samples and the validation/test splits
agreement_df.to_parquet(processed_dir / "dr_offensive_annotated_agreement.parquet")
val_df.to_parquet(final_dir / "dr_offensive_val.parquet")
test_df.to_parquet(final_dir / "dr_offensive_test.parquet")

## Evaluate models on the agreed labels

In [None]:
# Load the splits from the same files that the cell above writes, so the
# notebook works end-to-end after Restart & Run All.
# NOTE(review): this cell previously read "data/final/val-off.parquet" and
# "data/final/test-off.parquet", which no cell in this notebook produces —
# confirm those were just older names for the same files.
val_df = pd.read_parquet(final_dir / "dr_offensive_val.parquet")
test_df = pd.read_parquet(final_dir / "dr_offensive_test.parquet")
val_df.head()

In [None]:
def get_logits(text: str, tok, model) -> torch.Tensor:
    """Run `model` on a single document and return one scalar logit.

    Args:
        text: The document to classify.
        tok: Tokenizer; called as
            `tok(text, return_tensors='pt', truncation=True, padding=True)`.
            Its `model_max_length` is capped to 512 in place if it exceeds
            100,000 (presumably a "no limit configured" sentinel — TODO confirm).
        model: Callable taking the tokenizer output as keyword arguments; the
            first element of its return value is used as the logits.

    Returns:
        The last entry of the first row of the logits, as a 0-dim tensor
        detached from the autograd graph.

    NOTE(review): callers in this notebook threshold this value at 0. For a
    head with several output logits this is not the same as taking the
    arg-max — confirm this is intended for every evaluated model.
    """
    # Truncation needs a usable maximum length
    if tok.model_max_length > 100_000:
        tok.model_max_length = 512

    toks = tok(text, return_tensors='pt', truncation=True, padding=True)

    # Inference only: skip autograd bookkeeping so that the returned logits do
    # not keep a computation graph alive for every processed document
    with torch.no_grad():
        logits = model(**toks)[0]

    return logits[0][-1]

In [None]:
# Encode the gold labels as integers: 1 for "Offensive", 0 for everything else
val_labels = val_df["label"].eq("Offensive").astype(int).tolist()
test_labels = test_df["label"].eq("Offensive").astype(int).tolist()

In [None]:
# (display name, model path or hub identifier) for every model to evaluate
models = [
    ("our old XLMR-base model", 'models/xlmr-base1'),
    ("our new XLMR-base model", 'models/xlmr-base2'),
    ("our XLMR-large model", 'models/xlmr-large'),
    ("our AELAECTRA model", 'models/aelaectra'),
    ("Guscode", 'Guscode/DKbert-hatespeech-detection'),
    ("DaNLP BERT", 'DaNLP/da-bert-hatespeech-classification'),
    ("DaNLP ELECTRA", 'DaNLP/da-electra-hatespeech-detection'),
]

with tqdm(models) as pbar:
    for name, model_id in pbar:

        # Update progress bar description
        pbar.set_description(f"Evaluating {name}")

        # Load tokenizer and model
        tok = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForSequenceClassification.from_pretrained(model_id)

        # Get predictions: a document is predicted offensive when its logit is
        # positive. Inference only, so no autograd graph is needed.
        with torch.no_grad():
            preds = torch.stack(
                [get_logits(doc, tok, model) for doc in tqdm(test_df.text, leave=False)]
            ) > 0

        # Compute scores
        recall = recall_score(test_labels, preds)
        precision = precision_score(test_labels, preds)
        f1 = f1_score(test_labels, preds)
        macro_f1 = f1_score(test_labels, preds, average='macro')

        # Print scores (the macro-average F1 was computed but never reported)
        print(f'Scores for {name}:')
        print(f'\tRecall: {100 * recall:.2f}%')
        print(f'\tPrecision: {100 * precision:.2f}%')
        print(f'\tF1-score: {100 * f1:.2f}%')
        print(f'\tMacro-average F1-score: {100 * macro_f1:.2f}%')

    pbar.set_description("Evaluating")

## Error analysis

In [None]:
# The error analysis is carried out on a single checkpoint
model_id = 'models/xlmr-base1'
tok = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id)

# Score every validation document and stack the scalars into one tensor
val_doc_logits = []
for doc in tqdm(val_df.text, leave=False):
    val_doc_logits.append(get_logits(doc, tok, model))
logits = torch.stack(val_doc_logits)

In [None]:
# Attach the raw logits and their sigmoid (probabilities) to the validation
# dataframe as two new columns
val_df = val_df.assign(
    model_logits=logits.tolist(),
    model_probs=torch.sigmoid(logits).tolist(),
)
val_df.head()

In [None]:
# Get the sample positions on which the model was wrong, i.e. where the
# thresholded logit disagrees with the gold label
wrong_idxs = (
    torch.nonzero((logits > 0) != torch.tensor(val_labels)).squeeze(1).tolist()
)

# Get the samples on which the model was wrong. `torch.nonzero` yields
# positions, so select positionally with `.iloc`; label-based `.loc` would
# only be correct when the dataframe index happens to be 0..n-1.
wrong_df = val_df.iloc[wrong_idxs]

# Sort by absolute value of the logits, so the most confident mistakes come first
wrong_df = wrong_df.sort_values(by='model_logits', key=lambda x: abs(x), ascending=False)

wrong_df