In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from crowdkit.aggregation import Wawa
from sklearn.model_selection import train_test_split

In [6]:
def get_mv_or_median(x):
    vc = x.value_counts()
    if vc.values[0] == 1:
        return int(np.median(x))
    else:
        return int(vc.index[0])

In [62]:
def get_wawa_score(scores):
    return Wawa().fit_predict(scores)

In [8]:
def print_unaligned_workers(scores_path, threshold=0.35):
    toloka_markup = pd.read_csv(scores_path, sep='\t').drop(["GOLDEN:is_subjective", "HINT:text", "HINT:default_language"], axis=1)
    toloka_markup['OUTPUT:is_subjective'].replace({2: 1, 3: 2, 4: 3, 5: 3}, inplace=True)

    wawa_score = get_wawa_score(toloka_markup)

    worker_ids = toloka_markup.value_counts("ASSIGNMENT:worker_id").index
    worker_f1s = []
    for worker_id in worker_ids:
        worker_scores = toloka_markup[toloka_markup['ASSIGNMENT:worker_id'] == worker_id][['ASSIGNMENT:task_id', 'OUTPUT:is_subjective']]
        worker_scores = worker_scores.set_index('ASSIGNMENT:task_id')
        worker_scores['wawa'] = wawa_score[wawa_score.index.isin(worker_scores.index)]
        worker_f1s.append(f1_score(worker_scores['wawa'], worker_scores['OUTPUT:is_subjective'], average='macro'))
    wawa_worker_ratings = pd.Series(data=worker_f1s, index=worker_ids)
    return wawa_worker_ratings[wawa_worker_ratings < 0.35]

In [9]:
def print_label_distribution(scores_path):
    toloka_markup = pd.read_csv(scores_path, sep='\t').drop(["GOLDEN:is_subjective", "HINT:text", "HINT:default_language"], axis=1)
    toloka_markup['OUTPUT:is_subjective'].replace({2: 1, 3: 2, 4: 3, 5: 3}, inplace=True)

    wawa_score = get_wawa_score(toloka_markup)

    vc = wawa_score.value_counts()
    print(f"Sentences with the 'Non-Applicable' label: {vc[0]}")
    print(f"Sentences with the 'Objective' label: {vc[1]}")
    print(f"Sentences with the 'Neutral' label: {vc[2]}")
    print(f"Sentences with the 'Subjective' label: {vc[3]}")

In [32]:
def aggregate_scores(scores_path, sentence_id_retrieval_path):
    toloka_markup = pd.read_csv(scores_path, sep='\t').drop(["GOLDEN:is_subjective", "HINT:text", "HINT:default_language"], axis=1)
    toloka_markup['OUTPUT:is_subjective'].replace({2: 1, 3: 2, 4: 3, 5: 3}, inplace=True)

    sentence_ids_by_text = pd.read_csv(sentence_id_retrieval_path, sep='\t').set_index("INPUT:text")

    wawa_score = get_wawa_score(toloka_markup)
    
    wawa_score_df = pd.DataFrame(wawa_score).rename(columns={"agg_label": "score"})
    wawa_score_df = wawa_score_df.join(toloka_markup[["INPUT:text", "ASSIGNMENT:task_id"]].drop_duplicates().set_index("ASSIGNMENT:task_id"))
    wawa_score_df["INPUT:text"] = wawa_score_df["INPUT:text"].str.replace("\r", "")
    wawa_score_df = wawa_score_df.set_index("INPUT:text").join(sentence_ids_by_text).reset_index().drop("INPUT:text", axis=1)
    wawa_score_df = wawa_score_df.groupby(by="sentence_id")['score'].apply(list).reset_index().explode('score')

    print(wawa_score_df.isna().value_counts()[False].item())
    assert wawa_score_df.isna().value_counts()[False] == len(wawa_score_df)

    return wawa_score_df

In [65]:
def map_task_id_into_sentence_id(scores_path, sentence_id_retrieval_path):
    toloka_markup = pd.read_csv(scores_path, sep='\t')[['INPUT:text', 'OUTPUT:is_subjective', 'ASSIGNMENT:worker_id']]
    toloka_markup["INPUT:text"] = toloka_markup["INPUT:text"].str.replace("\r", "")
    toloka_markup = toloka_markup.rename(columns={
        'INPUT:text': 'text',
        'OUTPUT:is_subjective': 'label',
        'ASSIGNMENT:worker_id' : 'worker'
    }).set_index('text')
    toloka_markup['label'].replace({2: 1, 3: 2, 4: 3, 5: 3}, inplace=True)
    sentence_ids_by_text = pd.read_csv(sentence_id_retrieval_path, sep='\t').set_index("INPUT:text")
    return toloka_markup.join(sentence_ids_by_text).reset_index(drop=True).rename(columns={'sentence_id': 'task'})

In [117]:
def join_with_data_and_split(aggregated_scores, data_path):
    data = aggregated_scores.join(pd.read_csv(data_path).set_index('sentence_id')).reset_index(drop=False).rename(columns={'index': 'sentence_id'})
    print(data)
    train_val, test = train_test_split(data, test_size=0.2, stratify=data['score'])
    train, val = train_test_split(train_val, test_size=0.1, stratify=train_val['score'])
    return train, val, test

In [19]:
print_unaligned_workers("testing_scores.tsv")

Series([], dtype: float64)

In [20]:
print_unaligned_workers("full_scores.tsv")

Series([], dtype: float64)

In [21]:
print_label_distribution("testing_scores.tsv")

Sentences with the 'Non-Applicable' label: 15
Sentences with the 'Objective' label: 52
Sentences with the 'Neutral' label: 1
Sentences with the 'Subjective' label: 89


In [22]:
print_label_distribution("full_scores.tsv")

Sentences with the 'Non-Applicable' label: 1236
Sentences with the 'Objective' label: 14222
Sentences with the 'Neutral' label: 388
Sentences with the 'Subjective' label: 3861


In [110]:
sample_scores = pd.read_csv("sample_scores.csv").replace({2: 1, 3: 2, 4: 3, 5: 3}).rename(columns={
    'sentence_id': 'task',
    'score': 'label'
})
sample_scores['worker'] = [1000] * len(sample_scores)
testing_scores = map_task_id_into_sentence_id("testing_scores.tsv", "testing_dataset_for_retrieval.tsv")
full_scores = map_task_id_into_sentence_id("full_scores.tsv", "full_dataset_for_retrieval.tsv")
scores = pd.concat([sample_scores, testing_scores, full_scores]).set_index('task')
scores = scores.join(pd.read_csv("sent_tokenized_dataset.csv").set_index('sentence_id')).reset_index(drop=False)
labels = scores.groupby("text")['label'].apply(list)
workers = scores.groupby("text")['worker'].apply(list)
labels_and_workers = list(map(lambda x : list(zip(x[0], x[1])), zip(labels, workers)))
tasks = scores.groupby("text")['index'].apply(lambda x : x.values[0]).to_frame().rename(columns={'index': 'task'})
tasks['labels_and_workers'] = labels_and_workers
tasks = tasks.explode('labels_and_workers')
tasks['label'] = tasks['labels_and_workers'].apply(lambda x : x[0])
tasks['worker'] = tasks['labels_and_workers'].apply(lambda x : x[1])
tasks = tasks.drop('labels_and_workers', axis=1).reset_index(drop=False)

In [112]:
tasks.sort_values('task')

Unnamed: 0,text,task,label,worker
49523,"The leaders of Greece, Bulgaria, Romania and S...",0.0,1,1000
50831,The prospects of the Western Balkans joining t...,1.0,1,1000
50832,The prospects of the Western Balkans joining t...,1.0,1,1000
38269,"Prime ministers Alexis Tsipras of Greece, Boyk...",2.0,1,1000
50974,The refugee crisis is also expected to be a to...,3.0,1,1000
...,...,...,...,...
18773,EPA-EFE/Pepo Herrera\nCameroonian-born French ...,19951.0,1,1bf730b89d0a37c9bcf743dfa102f1e3
18774,EPA-EFE/Pepo Herrera\nCameroonian-born French ...,19951.0,1,450fbacd15dba70760ea02ead77757c0
44055,The 25-year-old has signed a four-season deal ...,19952.0,1,450fbacd15dba70760ea02ead77757c0
44053,The 25-year-old has signed a four-season deal ...,19952.0,1,78c7a03ba686f83f7ee501772d05b75d


In [115]:
aggregated_scores = get_wawa_score(tasks).to_frame().rename(columns={"agg_label": "score"})
aggregated_scores.value_counts()

score
1        13309
3         3714
0          953
2          358
dtype: int64

In [118]:
train, val, test = join_with_data_and_split(aggregated_scores, 'sent_tokenized_dataset.csv')
train.to_csv("../datasets/nsdc/train.csv", index=False)
val.to_csv("../datasets/nsdc/val.csv", index=False)
test.to_csv("../datasets/nsdc/test.csv", index=False)

          task  score                domain                 date  \
0          0.0      1  www.ekathimerini.com  2018-07-25 00:00:00   
1          1.0      1  www.ekathimerini.com  2018-07-25 00:00:00   
2          2.0      1  www.ekathimerini.com  2018-07-25 00:00:00   
3          3.0      1  www.ekathimerini.com  2018-07-25 00:00:00   
4          6.0      1  www.ekathimerini.com  2018-07-25 00:00:00   
...        ...    ...                   ...                  ...   
18329  19947.0      1           www.efe.com  2018-07-04 22:46:37   
18330  19948.0      1           www.efe.com  2018-07-04 22:46:37   
18331  19949.0      1           www.efe.com  2018-07-04 22:46:37   
18332  19951.0      1           www.efe.com  2018-07-04 22:46:37   
18333  19952.0      1           www.efe.com  2018-07-04 22:46:37   

                                                    text  paper_id  
0      The leaders of Greece, Bulgaria, Romania and S...    421969  
1      The prospects of the Western Balkans j

In [108]:
data = pd.concat([train, val, test])
assert len(data.drop_duplicates(['text']) == len(data))

In [121]:
test.isna().value_counts()

task   score  domain  date   text   paper_id
False  False  False   False  False  False       3667
dtype: int64