In [28]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from crowdkit.aggregation import Wawa
from sklearn.model_selection import train_test_split

In [29]:
def get_mv_or_median(x):
    """Collapse several scores into one integer.

    Returns the most frequent value of ``x`` (the first entry of
    ``x.value_counts()``) when at least one value occurs more than once.
    When every value occurs exactly once there is no majority, so the median
    of ``x`` is returned instead, truncated to ``int``.
    """
    counts = x.value_counts()
    top_frequency = counts.values[0]
    if top_frequency != 1:
        # value_counts() sorts by frequency, so the first index entry is the mode.
        return int(counts.index[0])
    # All values are distinct: fall back to the (truncated) median.
    return int(np.median(x))

In [30]:
def get_wawa_score(toloka_markup):
    """Aggregate crowd labels with crowd-kit's ``Wawa`` aggregator.

    Selects the label, task-id and worker-id columns of ``toloka_markup``,
    renames them to ``label`` / ``task`` / ``worker`` and returns the result
    of ``Wawa().fit_predict`` on that frame.
    """
    source_columns = ['OUTPUT:is_subjective', 'ASSIGNMENT:task_id', 'ASSIGNMENT:worker_id']
    column_map = {
        'OUTPUT:is_subjective': 'label',
        'ASSIGNMENT:task_id': 'task',
        'ASSIGNMENT:worker_id': 'worker',
    }
    annotations = toloka_markup[source_columns].rename(columns=column_map)
    aggregator = Wawa()
    return aggregator.fit_predict(annotations)

In [31]:
def print_unaligned_workers(scores_path, threshold=0.35):
    """Return the workers whose labels agree poorly with the aggregated labels.

    For every worker in the tab-separated export at ``scores_path`` the
    macro-averaged F1 between the Wawa-aggregated label of each task (used as
    the reference) and the worker's own label is computed.

    Parameters
    ----------
    scores_path : str
        Path to the tab-separated markup export.
    threshold : float, default 0.35
        Workers with a macro-F1 strictly below this value are returned.

    Returns
    -------
    pandas.Series
        Macro-F1 per worker id, restricted to workers below ``threshold``.
        Despite the function name nothing is printed; the Series is returned.
    """
    toloka_markup = pd.read_csv(scores_path, sep='\t').drop(
        ["GOLDEN:is_subjective", "HINT:text", "HINT:default_language"], axis=1
    )
    # Assign the remapped column back instead of calling replace(inplace=True)
    # on a column selection: the chained in-place form does not update the
    # parent frame under pandas Copy-on-Write.
    toloka_markup['OUTPUT:is_subjective'] = toloka_markup['OUTPUT:is_subjective'].replace(
        {2: 1, 3: 2, 4: 3, 5: 3}
    )

    wawa_score = get_wawa_score(toloka_markup)

    worker_ids = toloka_markup.value_counts("ASSIGNMENT:worker_id").index
    worker_f1s = []
    for worker_id in worker_ids:
        worker_scores = toloka_markup.loc[
            toloka_markup['ASSIGNMENT:worker_id'] == worker_id,
            ['ASSIGNMENT:task_id', 'OUTPUT:is_subjective'],
        ].set_index('ASSIGNMENT:task_id')
        # Column assignment aligns on the task-id index, pairing each of the
        # worker's answers with the aggregated label of the same task.
        worker_scores['wawa'] = wawa_score[wawa_score.index.isin(worker_scores.index)]
        worker_f1s.append(
            f1_score(worker_scores['wawa'], worker_scores['OUTPUT:is_subjective'], average='macro')
        )
    wawa_worker_ratings = pd.Series(data=worker_f1s, index=worker_ids)
    # Honour the caller-supplied threshold (previously a hard-coded 0.35).
    return wawa_worker_ratings[wawa_worker_ratings < threshold]

In [32]:
def print_label_distribution(scores_path):
    """Print how many tasks received each aggregated label.

    Loads the tab-separated markup export at ``scores_path``, remaps the raw
    labels, aggregates them with ``get_wawa_score`` and prints one line per
    label (0 'Non-Applicable', 1 'Objective', 2 'Neutral', 3 'Subjective').
    A label that never occurs is reported with a count of 0.
    """
    toloka_markup = pd.read_csv(scores_path, sep='\t').drop(
        ["GOLDEN:is_subjective", "HINT:text", "HINT:default_language"], axis=1
    )
    # Assign the remapped column back instead of calling replace(inplace=True)
    # on a column selection: the chained in-place form does not update the
    # parent frame under pandas Copy-on-Write.
    toloka_markup['OUTPUT:is_subjective'] = toloka_markup['OUTPUT:is_subjective'].replace(
        {2: 1, 3: 2, 4: 3, 5: 3}
    )

    wawa_score = get_wawa_score(toloka_markup)

    vc = wawa_score.value_counts()
    label_names = {0: 'Non-Applicable', 1: 'Objective', 2: 'Neutral', 3: 'Subjective'}
    for label, name in label_names.items():
        # .get avoids a KeyError when a label is absent from the aggregation.
        print(f"Sentences with the '{name}' label: {vc.get(label, 0)}")

In [33]:
def get_score_by_sentence_id(scores_path, sentence_id_retrieval_path):
    """Aggregate crowd labels and key them by ``sentence_id``.

    Parameters
    ----------
    scores_path : str
        Tab-separated markup export with the workers' answers.
    sentence_id_retrieval_path : str
        Tab-separated file with an ``INPUT:text`` column, used as the lookup
        key; it is expected to also provide the ``sentence_id`` column that
        the result is grouped by.

    Returns
    -------
    pandas.DataFrame
        Single ``score`` column indexed by ``sentence_id``. When several
        tasks map to the same sentence their aggregated labels are combined
        with ``get_mv_or_median``.

    Raises
    ------
    AssertionError
        If any resulting score is missing.
    """
    toloka_markup = pd.read_csv(scores_path, sep='\t').drop(
        ["GOLDEN:is_subjective", "HINT:text", "HINT:default_language"], axis=1
    )
    # Assign the remapped column back instead of calling replace(inplace=True)
    # on a column selection: the chained in-place form does not update the
    # parent frame under pandas Copy-on-Write.
    toloka_markup['OUTPUT:is_subjective'] = toloka_markup['OUTPUT:is_subjective'].replace(
        {2: 1, 3: 2, 4: 3, 5: 3}
    )

    sentence_ids_by_text = pd.read_csv(sentence_id_retrieval_path, sep='\t').set_index("INPUT:text")

    wawa_score = get_wawa_score(toloka_markup)

    # One row per task: aggregated label plus the task's text.
    wawa_score_df = pd.DataFrame(wawa_score).rename(columns={"agg_label": "score"})
    task_texts = toloka_markup[["INPUT:text", "ASSIGNMENT:task_id"]].drop_duplicates().set_index("ASSIGNMENT:task_id")
    wawa_score_df = wawa_score_df.join(task_texts)
    # Strip carriage returns before using the text as a join key.
    wawa_score_df["INPUT:text"] = wawa_score_df["INPUT:text"].str.replace("\r", "")
    # NOTE(review): texts without a match in the retrieval file get a NaN
    # sentence_id here and are then dropped by groupby - confirm this is intended.
    wawa_score_df = wawa_score_df.set_index("INPUT:text").join(sentence_ids_by_text).reset_index().drop("INPUT:text", axis=1)
    wawa_score_df = wawa_score_df.groupby(by="sentence_id")['score'].apply(get_mv_or_median).to_frame()

    # Direct NaN check; the previous value_counts()[False] lookup raised a
    # KeyError on an empty frame instead of evaluating the assertion.
    assert not wawa_score_df['score'].isna().any(), "aggregated scores contain missing values"

    return wawa_score_df

In [34]:
def join_with_data_and_split(aggregated_scores, data_path, test_size=0.2, val_size=0.1, random_state=None):
    """Attach aggregated scores to the dataset and split it into train/val/test.

    Parameters
    ----------
    aggregated_scores : pandas.DataFrame
        Frame with a ``score`` column, indexed by sentence id.
    data_path : str
        CSV file with a ``sentence_id`` column to join on.
    test_size : float, default 0.2
        Share of the full data held out as the test set.
    val_size : float, default 0.1
        Share of the remaining (non-test) data held out as the validation set.
    random_state : int or None, default None
        Seed forwarded to both ``train_test_split`` calls. The default keeps
        the original unseeded behaviour; pass an integer for a reproducible split.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, val, test)``, each stratified on ``is_subjective``.
    """
    data = pd.read_csv(data_path).join(aggregated_scores, on="sentence_id").rename(columns={"score": "is_subjective"})
    train_val, test = train_test_split(
        data, test_size=test_size, stratify=data['is_subjective'], random_state=random_state
    )
    train, val = train_test_split(
        train_val, test_size=val_size, stratify=train_val['is_subjective'], random_state=random_state
    )
    return train, val, test

In [156]:
# Workers in testing_scores.tsv whose macro-F1 against the aggregated labels is below the default threshold.
print_unaligned_workers("testing_scores.tsv")

Series([], dtype: float64)

In [157]:
# Same check on full_scores.tsv; an empty Series means no worker fell below the threshold.
print_unaligned_workers("full_scores.tsv")

Series([], dtype: float64)

In [159]:
# Count of aggregated labels per class for testing_scores.tsv.
print_label_distribution("testing_scores.tsv")

Sentences with the 'Non-Applicable' label: 15
Sentences with the 'Objective' label: 52
Sentences with the 'Neutral' label: 1
Sentences with the 'Subjective' label: 89


In [160]:
# Count of aggregated labels per class for full_scores.tsv.
print_label_distribution("full_scores.tsv")

Sentences with the 'Non-Applicable' label: 1236
Sentences with the 'Objective' label: 14222
Sentences with the 'Neutral' label: 388
Sentences with the 'Subjective' label: 3861


In [35]:
# sample_scores.csv is already keyed by sentence_id; apply the same label remapping
# ({2: 1, 3: 2, 4: 3, 5: 3}) that the Toloka loaders use.
sample_scores = pd.read_csv("sample_scores.csv").set_index("sentence_id").replace({2: 1, 3: 2, 4: 3, 5: 3})
# Aggregate the crowd labels of each export into one score per sentence_id.
testing_aggregated_scores = get_score_by_sentence_id("testing_scores.tsv", "testing_dataset_for_retrieval.tsv")
full_aggregated_scores = get_score_by_sentence_id("full_scores.tsv", "full_dataset_for_retrieval.tsv")

# Stack the three sources row-wise.
# NOTE(review): pd.concat does not deduplicate, so a sentence_id present in more than
# one source would appear twice - confirm the sources are disjoint.
aggregated_scores = pd.concat([sample_scores, testing_aggregated_scores, full_aggregated_scores])
aggregated_scores

Unnamed: 0_level_0,score
sentence_id,Unnamed: 1_level_1
0.0,1
1.0,1
2.0,1
3.0,1
4.0,1
...,...
19948.0,1
19949.0,1
19950.0,1
19951.0,1


In [36]:
# Join the scores onto the sentence-level dataset and split it into train/val/test.
# No random_state is passed, so the split differs between runs.
train, val, test = join_with_data_and_split(aggregated_scores, "sent_tokenized_dataset.csv")

# Persist each split without the pandas index column.
train.to_csv("train.csv", index=False)
val.to_csv("val.csv", index=False)
test.to_csv("test.csv", index=False)

In [37]:
# Reload the saved training split (this rebinds `train`) and show its class balance.
train = pd.read_csv("train.csv")
train['is_subjective'].value_counts()

1    10263
3     2819
0      911
2      372
Name: is_subjective, dtype: int64