In [1]:
from collections import defaultdict
import json

import numpy as np
import pandas as pd
from scipy.stats import pearsonr, spearmanr
from statsmodels.stats.inter_rater import fleiss_kappa

In [2]:
def load_json(inpath):
    """Read the JSON document stored at `inpath` and return the parsed object.

    The file is opened explicitly as UTF-8 instead of relying on the platform's
    locale default, so non-ASCII content decodes the same way on every machine.

    Parameters
    ----------
    inpath : str or path-like
        Location of the JSON file.

    Returns
    -------
    object
        Whatever `json.load` produces for the file's top-level value.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the content is not valid JSON.
    """
    with open(inpath, encoding="utf-8") as infile:
        return json.load(infile)

In [3]:
# Parsed results file for each dataset, keyed by dataset name.
# Later cells iterate each value as a sequence of per-topic records.
results = dict([
    ("news", load_json("news.json")),
    ("leg", load_json("leg.json")),
    ("covid", load_json("covid.json")),
])

In [4]:
# Flatten every individual rating into one record:
# (dataset, rating group, topic index, annotator index, rating).
# Crowdwork ratings are stored directly under their key, while the MTurk
# ratings sit one level deeper under the 'answers' key.
likerts = [
    {
        'dataset': ds_name,
        'rating_group': group,
        'topic': topic_idx,
        'annotator': rater_idx,
        'rating': score,
    }
    for ds_name, ds_topics in results.items()
    for topic_idx, topic in enumerate(ds_topics)
    for group in ['crowdwork_ratings', 'mturk_ratings_data']
    for rater_idx, score in enumerate(
        topic[group]['answers'] if group != 'crowdwork_ratings' else topic[group]
    )
]

In [12]:
# Turn the list of rating records into a table with one row per rating.
likerts = pd.DataFrame(data=likerts)

In [10]:
# Mean and standard deviation of the individual ratings within each rating group.
(
    likerts
    .groupby("rating_group")
    .agg({"rating": ["mean", "std"]})
)

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,mean,std
rating_group,Unnamed: 1_level_2,Unnamed: 2_level_2
crowdwork_ratings,2.49,0.640448
mturk_ratings_data,2.346667,0.676069


In [15]:
# Largest rating value present in the table (upper end of the observed scale).
likerts["rating"].max()

3

In [17]:
# Write the per-rating table to likerts.csv without the row index.
# pandas is imported with the other dependencies in the first cell, so `pd`
# is available here and in the earlier cells that already use it.
pd.DataFrame(likerts).to_csv("likerts.csv", index=False)

# Within-topic MTurk-Amazon agreement

In [5]:
# Per-dataset, per-topic views of the ratings:
#   internal_npmi      - the topic's stored 'internal_npmi' value
#   crowdwork_ratings  - mean crowdwork rating of the topic
#   mturk_ratings      - mean MTurk rating of the topic
#   cw / mturk         - the individual ratings of the topic as arrays
scores = {
    dataset_name: {
        "internal_npmi": [entry['internal_npmi'] for entry in topic_list],
        "crowdwork_ratings": [
            np.mean(entry["crowdwork_ratings"]) for entry in topic_list
        ],
        "mturk_ratings": [
            np.mean(entry["mturk_ratings_data"]["answers"]) for entry in topic_list
        ],
        "cw": [np.array(entry["crowdwork_ratings"]) for entry in topic_list],
        "mturk": [
            np.array(entry["mturk_ratings_data"]["answers"]) for entry in topic_list
        ],
    }
    for dataset_name, topic_list in results.items()
}

In [6]:

n_runs = 20
# Seeded generator: re-running this cell (or the whole notebook) reproduces
# the bootstrap estimates instead of printing different numbers each time.
rng = np.random.default_rng(42)

for ds, ratings in scores.items():
    print()
    # Correlation between the per-topic mean ratings of the two rater pools.
    p_r, p_p = pearsonr(ratings['crowdwork_ratings'], ratings['mturk_ratings'])

    # Mark correlations that are significant at the 0.01 level.
    p_sig = "*" if p_p < 0.01 else ""
    print(f"{ds:5}: Pearson: {p_r:0.3f}{p_sig}")

    # Bootstrapped correlations: in every run, resample 7 ratings per topic
    # twice from each pool, average them per topic, and correlate the means
    # within a pool ("internal") and across the two pools ("mt cw").
    bootstrapped = defaultdict(list)
    for run in range(n_runs):
        cw1_boot = np.array([rng.choice(topic, size=7) for topic in ratings["cw"]])
        cw2_boot = np.array([rng.choice(topic, size=7) for topic in ratings["cw"]])
        mt1_boot = np.array([rng.choice(topic, size=7) for topic in ratings["mturk"]])
        mt2_boot = np.array([rng.choice(topic, size=7) for topic in ratings["mturk"]])

        bootstrapped["cw internal"].append(pearsonr(cw1_boot.mean(1), cw2_boot.mean(1))[0])
        bootstrapped["mt internal"].append(pearsonr(mt1_boot.mean(1), mt2_boot.mean(1))[0])
        bootstrapped["mt cw"].append(pearsonr(cw1_boot.mean(1), mt1_boot.mean(1))[0])

    # Report mean (standard deviation) of each correlation over the runs.
    for k, v in bootstrapped.items():
        print(f"{k:11}: {np.mean(v):0.3f} ({np.std(v):0.3f})")



news : Pearson: 0.833*
cw internal: 0.860 (0.055)
mt internal: 0.662 (0.106)
mt cw      : 0.651 (0.141)

leg  : Pearson: 0.835*
cw internal: 0.671 (0.086)
mt internal: 0.504 (0.112)
mt cw      : 0.498 (0.109)

covid: Pearson: 0.789*
cw internal: 0.708 (0.067)
mt internal: 0.498 (0.064)
mt cw      : 0.491 (0.116)


## Fleiss' kappa

In [53]:
# Worked example of the rater -> category pivot used for Fleiss' kappa:
# seven ratings on the 1-3 scale become counts per category (1s, 2s, 3s).
# NOTE(review): this cell was saved as an unfinished `for row in` statement;
# the example row below is taken from the cell's saved output.
example_row = np.array([2, 2, 1, 3, 2, 2, 1])
example_row, np.bincount(example_row - 1, minlength=3)

(array([2, 2, 1, 3, 2, 2, 1]), array([2, 4, 1]))

In [72]:
def to_fleiss(ratings, min_raters=15, n_categories=3):
    """Pivot a subjects-by-rater table into a subjects-by-category count table.

    Parameters
    ----------
    ratings : iterable of array-like
        One sequence of integer ratings per subject. Ratings are shifted by -1
        before counting, so they are expected to start at 1. Rows may be NumPy
        arrays or plain Python lists.
    min_raters : int, default 15
        Only the first `min_raters` ratings of each row are counted; rows with
        fewer ratings are used in full.
    n_categories : int, default 3
        Minimum number of category columns per row (passed to `np.bincount`
        as `minlength`). Ratings above `n_categories` would widen that row.

    Returns
    -------
    numpy.ndarray
        Array whose entry [i, k] is how many of the counted ratings for
        subject i equal k + 1.
    """
    return np.array([
        # np.asarray lets list rows support the slice-and-shift below.
        np.bincount(np.asarray(row)[:min_raters] - 1, minlength=n_categories)
        for row in ratings
    ])

# Seeded generator so the resampled "combin" column is reproducible on re-run.
rng = np.random.default_rng(42)

for ds, ratings in scores.items():
    # Agreement within each rater pool on its own.
    fleiss_cw = fleiss_kappa(to_fleiss(ratings["cw"]))
    fleiss_mt = fleiss_kappa(to_fleiss(ratings["mturk"]))

    # Agreement across pools: for every topic, resample 7 ratings from each
    # pool and treat the 14 resulting ratings as one set of raters.
    combined = [
        np.concatenate([
            rng.choice(row_i, size=7),
            rng.choice(row_j, size=7)
        ])
        for row_i, row_j in zip(ratings["cw"], ratings["mturk"])
    ]
    fleiss_combined = fleiss_kappa(to_fleiss(combined))

    print(f"{ds:5}, fleiss CW: {fleiss_cw:0.4f}, fleiss MTurk: {fleiss_mt:0.4f}, combin: {fleiss_combined:0.4f}")

news , fleiss CW: 0.2189, fleiss MTurk: 0.1014, combin: 0.1532
leg  , fleiss CW: 0.1133, fleiss MTurk: 0.0665, combin: 0.1347
covid, fleiss CW: 0.1838, fleiss MTurk: 0.0390, combin: 0.1531


## Correlations

In [7]:
# NOTE(review): n_runs is not used in this cell; the assignment is kept so the
# notebook's global state stays the same as before.
n_runs = 20

# Correlate each rater pool's per-topic mean rating with the topic's
# internal NPMI score, per dataset.
for ds, ratings in scores.items():
    print()
    # `rating_source` replaces a loop variable that shadowed the builtin `type`.
    for rating_source in ["crowdwork", "mturk"]:
        p_r, p_p = pearsonr(ratings[f'{rating_source}_ratings'], ratings['internal_npmi'])
        s_r, s_p = spearmanr(ratings[f'{rating_source}_ratings'], ratings['internal_npmi'])

        # Mark correlations that are significant at the 0.01 level.
        p_sig = "*" if p_p < 0.01 else ""
        s_sig = "*" if s_p < 0.01 else ""
        print(f"{ds:5} {rating_source:9} Pearson: {p_r:0.3f}{p_sig} Spearman: {s_r:0.3f}{s_sig}")


news  crowdwork Pearson: 0.601* Spearman: 0.585*
news  mturk     Pearson: 0.584* Spearman: 0.555

leg   crowdwork Pearson: 0.337 Spearman: 0.410*
leg   mturk     Pearson: 0.270 Spearman: 0.243

covid crowdwork Pearson: -0.163 Spearman: 0.011
covid mturk     Pearson: -0.054 Spearman: -0.004


In [None]:
# NOTE(review): the three commands below are shell invocations, not Python, and
# this cell has no execution count. Run them from a shell (or a `%%bash` cell);
# they do not use anything computed earlier in this notebook.

# Run 1: run_mlm.py starting from roberta-base, with data/20ng/train.txt as the
# training file and data/20ng/test.txt as the validation file, 10 epochs,
# saving every 1000 steps to ./models/20ng.
# Presumably masked-language-model fine-tuning, judging by the script name - TODO confirm.
python run_mlm.py \
--model_name_or_path roberta-base \
--train_file data/20ng/train.txt \
--validation_file data/20ng/test.txt \
--line_by_line \
--do_train \
--do_eval \
--num_train_epochs 10 \
--save_steps 1000 \
--output_dir ./models/20ng

# Run 2: same data and schedule, starting from bert-base-uncased with --wwm
# (presumably whole-word masking - TODO confirm against run_mlm.py).
# NOTE(review): unlike the other two runs, --output_dir is an absolute path
# under /workspace; confirm it is intended before reusing on another machine.
python run_mlm.py \
--model_name_or_path bert-base-uncased \
--train_file data/20ng/train.txt \
--validation_file data/20ng/test.txt \
--line_by_line \
--wwm \
--do_train \
--do_eval \
--num_train_epochs 10 \
--save_steps 1000 \
--output_dir /workspace/transformers/examples/language-modeling/models/20ng-bert-base-uncased-wwm


# Run 3: bert-base-uncased again, with --mlm_prob 0.1 and
# --random_word_masking_only, writing to ./models/20ng-random-word-only.
# NOTE(review): the meaning of these two flags is defined by run_mlm.py, which
# is not part of this notebook - verify there.
python run_mlm.py \
--model_name_or_path bert-base-uncased \
--train_file data/20ng/train.txt \
--validation_file data/20ng/test.txt \
--line_by_line \
--do_train \
--do_eval \
--mlm_prob 0.1 \
--random_word_masking_only \
--num_train_epochs 10 \
--save_steps 1000 \
--output_dir ./models/20ng-random-word-only