In [None]:
# Step up to the parent directory so that the relative "data/..." paths and the
# `src.` package import used below resolve (presumably the repository root — confirm).
# NOTE: not idempotent — every re-run of this cell moves the working directory up again.
%cd ..

In [None]:
from sklearn.metrics import cohen_kappa_score
import numpy as np
import pandas as pd
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm.auto import tqdm
from sklearn.metrics import recall_score, precision_score, f1_score
import nltk
from statsmodels.stats import inter_rater as irr
from src.hatespeech.attack import load_attack
# Fetch NLTK's 'punkt' resource (may hit the network on first run).
# NOTE(review): nothing visible in this notebook calls nltk directly — presumably
# the imported project code needs it; confirm before removing.
nltk.download('punkt')
# Display options: never truncate cell text, and show up to 200 rows.
# Fully-qualified option names are required: option keys are matched as
# patterns, and since pandas 1.4 the bare pattern "max_rows" also matches
# "styler.render.max_rows", which makes set_option raise OptionError.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 200)

## Measure annotation agreement

In [None]:
# Data locations, all relative to the current working directory.
processed_dir = Path("data", "processed")
final_dir = Path("data", "final")

# One annotation file per annotator.
dan_path = processed_dir.joinpath("annotated-off-dan.csv")
anders_path = processed_dir.joinpath("annotated-off-anders.csv")
oliver_path = processed_dir.joinpath("annotated-off-oliver.csv")

In [None]:
# Read the three tab-separated annotation files, one frame per annotator.
dan_df, anders_df, oliver_df = (
    pd.read_csv(csv_path, sep="\t")
    for csv_path in (dan_path, anders_path, oliver_path)
)

# Oliver's file uses "Not sure"; fold it into the longer label name so that
# all three annotators share the same label vocabulary.
oliver_df["label"] = oliver_df["label"].replace(
    {"Not sure": "Could be offensive, depending on context"}
)
oliver_df.head()

In [None]:
# Encode the string labels as integer codes using ONE category set shared by
# all annotators. Encoding each column independently (a separate
# `astype('category')` per frame) derives the codes from only the labels that
# annotator happened to use, so the same code could denote different labels
# for different annotators and silently distort the agreement statistic.
annotator_labels = [dan_df.label, anders_df.label, oliver_df.label]
label_dtype = pd.CategoricalDtype(
    sorted(pd.concat(annotator_labels).dropna().unique())
)

# Result has one row per item and one column per annotator; missing labels
# are encoded as -1 by `.cat.codes`.
labels = np.stack(
    [series.astype(label_dtype).cat.codes for series in annotator_labels],
    axis=-1,
)
labels

In [None]:
# aggregate_raters turns the (items x raters) code matrix into per-item
# category counts, which is the table fleiss_kappa consumes. Only that table
# is needed; indexing the result (instead of unpacking into `_`) avoids
# overwriting IPython's `_` last-output variable.
agg = irr.aggregate_raters(labels)[0]
irr.fleiss_kappa(agg)

## Visualise annotation agreement

In [None]:
# Put the three annotators' labels side by side, one column per annotator.
label_df = pd.concat(
    [dan_df.label, anders_df.label, oliver_df.label],
    axis=1,
    keys=["Dan", "Anders", "Oliver"],
)
label_df.head()

In [None]:
def change_label_names(label: str) -> str:
    """Collapse any label mentioning "context" into the short name "Missing context".

    Args:
        label: An annotation label. Non-string values (e.g. NaN or None for a
            missing annotation) are passed through untouched instead of
            raising TypeError on the substring test.

    Returns:
        "Missing context" if the label contains the substring "context",
        otherwise the label unchanged.
    """
    if isinstance(label, str) and "context" in label:
        return "Missing context"
    return label
# Relabel every cell, column by column, via Series.map. DataFrame.applymap is
# deprecated since pandas 2.1 (renamed to DataFrame.map); this spelling gives
# the same result and works on both older and newer pandas versions.
label_df = label_df.apply(lambda column: column.map(change_label_names))
label_df.head()

In [None]:
# Annotator groupings to cross-tabulate: every pair, then all three together.
comparisons = [
    ["Dan", "Anders"],
    ["Dan", "Oliver"],
    ["Anders", "Oliver"],
    ["Dan", "Anders", "Oliver"],
]

# For each grouping, count how often every combination of labels occurs and
# pivot the last annotator's labels into columns (absent combinations -> 0).
for annotator_group in comparisons:
    combination_counts = label_df.groupby(annotator_group).size()
    display(combination_counts.unstack(fill_value=0))

## Extract dataframe with majority labels

In [None]:
# Majority label per item = row-wise mode over the annotators' labels.
# mode(axis=1) returns one column per tied mode, NaN-padded; column 0 always
# holds the first mode in sorted order, so it is selected explicitly.
# The earlier `.dropna(axis=1)` reached the same column only by accident: it
# would keep several columns if every row were tied, and drop every column if
# any row had no labels at all.
# NOTE: for a row where all annotators disagree there is no true majority;
# column 0 is then simply the alphabetically first of the tied labels.
# Building the frame with `assign` (instead of copying, slicing, then setting
# a column) also avoids pandas' SettingWithCopyWarning.
majority_vote_df = (
    dan_df[["text"]]
    .assign(label=label_df.mode(axis=1)[0])
    .reset_index()  # keep the original row position as an explicit column
    .rename(columns={"index": "idx"})
)
majority_vote_df.head()

In [None]:
# Boolean frame: True where an annotator's label equals the majority label.
# DataFrame.eq(..., axis=0) compares every annotator column against the
# majority Series row by row in one vectorised call and keeps the annotator
# names as column headers (concatenating anonymous comparison Series yields
# columns 0, 1, 2 instead).
agreed_df = label_df.eq(majority_vote_df.label, axis=0)
agreed_df.head()

In [None]:
# Per item: how many annotators picked the majority label.
num_agreed = agreed_df.sum(axis=1)
majority_vote_df = majority_vote_df.assign(num_agreed=num_agreed)

# Discard the items whose majority label is "Missing context".
has_usable_label = majority_vote_df["label"] != "Missing context"
majority_vote_df = majority_vote_df[has_usable_label]
majority_vote_df

In [None]:
# Distribution of agreement levels among the retained items.
majority_vote_df["num_agreed"].value_counts()

In [None]:
# Validation split: a seeded 50% sample of each class, so both classes keep
# their proportions. The final shuffle is seeded as well — previously
# `.sample(frac=1.)` had no random_state, so the row order of the saved
# validation set changed on every run.
SPLIT_SEED = 4242
val_df_pos = majority_vote_df.query("label == 'Offensive'").sample(
    frac=0.5, random_state=SPLIT_SEED
)
val_df_neg = majority_vote_df.query("label == 'Not offensive'").sample(
    frac=0.5, random_state=SPLIT_SEED
)
val_df = (
    pd.concat((val_df_pos, val_df_neg), axis=0)
    .sample(frac=1.0, random_state=SPLIT_SEED)  # reproducible shuffle
    .reset_index(drop=True)
)
val_df.label.value_counts()

In [None]:
# Every item that was not drawn into the validation split forms the test split;
# membership is decided through the `idx` column.
in_validation = majority_vote_df["idx"].isin(val_df["idx"])
test_df = majority_vote_df.loc[~in_validation]
test_df["label"].value_counts()

In [None]:
# Make sure the output directories exist first: to_parquet does not create
# missing parent directories, so a fresh checkout without them would fail here.
for output_dir in (processed_dir, final_dir):
    output_dir.mkdir(parents=True, exist_ok=True)

# Persist the full majority-vote table and the two evaluation splits.
majority_vote_df.to_parquet(processed_dir / "annotated-off.parquet")
val_df.to_parquet(final_dir / "val-off.parquet")
test_df.to_parquet(final_dir / "test-off.parquet")