In [27]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from crowdkit.aggregation import Wawa
from sklearn.model_selection import train_test_split

In [153]:
def get_mv_or_median(x):
    """Collapse several scores into one: majority vote, else median.

    Parameters
    ----------
    x : pandas.Series
        Numeric scores to be reduced to a single value.

    Returns
    -------
    int
        The most frequent value when some value occurs more than once;
        otherwise (every value is unique) the median, truncated to ``int``.
    """
    counts = x.value_counts()
    # value_counts() sorts by descending frequency, so the first entry is the
    # most frequent value and its count.
    top_count = counts.values[0]
    if top_count != 1:
        return int(counts.index[0])
    # No value repeats, so there is no majority; fall back to the median.
    return int(np.median(x))

In [154]:
def get_wawa_score(toloka_markup):
    """Aggregate per-worker answers into one label per task with Crowd-Kit's Wawa.

    Parameters
    ----------
    toloka_markup : pandas.DataFrame
        Must contain the columns ``OUTPUT:is_subjective``,
        ``ASSIGNMENT:task_id`` and ``ASSIGNMENT:worker_id``.

    Returns
    -------
    The result of ``Wawa().fit_predict`` on the answers; the rest of this
    notebook treats it as a Series of aggregated labels indexed by task id.
    """
    # Wawa is fed a frame with the columns `label`, `task` and `worker`.
    column_map = {
        'OUTPUT:is_subjective': 'label',
        'ASSIGNMENT:task_id': 'task',
        'ASSIGNMENT:worker_id': 'worker',
    }
    answers = toloka_markup[list(column_map)].rename(columns=column_map)
    aggregator = Wawa()
    return aggregator.fit_predict(answers)

In [155]:
def print_unaligned_workers(scores_path, threshold=0.35):
    """Find workers whose answers agree poorly with the aggregated labels.

    Despite the name, nothing is printed: the result is returned so that the
    notebook displays it as the cell output.

    Parameters
    ----------
    scores_path : str or path-like
        Tab-separated Toloka export with (at least) the columns
        ``GOLDEN:is_subjective``, ``HINT:text``, ``HINT:default_language``,
        ``OUTPUT:is_subjective``, ``ASSIGNMENT:task_id`` and
        ``ASSIGNMENT:worker_id``.
    threshold : float, default 0.35
        Workers with a macro-F1 strictly below this value are reported.

    Returns
    -------
    pandas.Series
        Macro-F1 per worker id, restricted to workers below ``threshold``.
    """
    toloka_markup = pd.read_csv(scores_path, sep='\t').drop(
        ["GOLDEN:is_subjective", "HINT:text", "HINT:default_language"], axis=1
    )
    # Collapse the raw 0-5 answers into four classes (0, 1, 2, 3).
    # Assign the result back instead of calling replace(inplace=True) on the
    # column selection: the chained in-place form does not update the frame
    # when pandas Copy-on-Write is active.
    toloka_markup['OUTPUT:is_subjective'] = toloka_markup['OUTPUT:is_subjective'].replace(
        {2: 1, 3: 2, 4: 3, 5: 3}
    )

    wawa_score = get_wawa_score(toloka_markup)

    worker_ids = toloka_markup.value_counts("ASSIGNMENT:worker_id").index
    worker_f1s = []
    for worker_id in worker_ids:
        is_this_worker = toloka_markup['ASSIGNMENT:worker_id'] == worker_id
        worker_scores = toloka_markup.loc[
            is_this_worker, ['ASSIGNMENT:task_id', 'OUTPUT:is_subjective']
        ].set_index('ASSIGNMENT:task_id')
        # Column assignment aligns on the task-id index, pairing every answer
        # of this worker with the aggregated label of the same task.
        worker_scores['wawa'] = wawa_score[wawa_score.index.isin(worker_scores.index)]
        # The aggregated label plays the role of ground truth.
        worker_f1s.append(
            f1_score(worker_scores['wawa'], worker_scores['OUTPUT:is_subjective'], average='macro')
        )

    wawa_worker_ratings = pd.Series(data=worker_f1s, index=worker_ids, dtype=float)
    # Honour the caller-supplied threshold (it used to be hard-coded to 0.35).
    return wawa_worker_ratings[wawa_worker_ratings < threshold]

In [158]:
def print_label_distribution(scores_path):
    """Print how many tasks received each aggregated label.

    Parameters
    ----------
    scores_path : str or path-like
        Tab-separated Toloka export with (at least) the columns
        ``GOLDEN:is_subjective``, ``HINT:text``, ``HINT:default_language``,
        ``OUTPUT:is_subjective``, ``ASSIGNMENT:task_id`` and
        ``ASSIGNMENT:worker_id``.

    Returns
    -------
    None
        One line per label is written to stdout.
    """
    toloka_markup = pd.read_csv(scores_path, sep='\t').drop(
        ["GOLDEN:is_subjective", "HINT:text", "HINT:default_language"], axis=1
    )
    # Collapse the raw 0-5 answers into four classes (0, 1, 2, 3).
    # Assign the result back instead of calling replace(inplace=True) on the
    # column selection: the chained in-place form does not update the frame
    # when pandas Copy-on-Write is active.
    toloka_markup['OUTPUT:is_subjective'] = toloka_markup['OUTPUT:is_subjective'].replace(
        {2: 1, 3: 2, 4: 3, 5: 3}
    )

    wawa_score = get_wawa_score(toloka_markup)

    vc = wawa_score.value_counts()
    label_names = {0: 'Non-Applicable', 1: 'Objective', 2: 'Neutral', 3: 'Subjective'}
    for label, name in label_names.items():
        # .get(..., 0) reports a label that never occurs as 0 instead of
        # raising KeyError.
        print(f"Sentences with the '{name}' label: {vc.get(label, 0)}")

In [203]:
def get_score_by_sentence_id(scores_path, sentence_id_retrieval_path):
    """Aggregate crowd answers and key the result by ``sentence_id``.

    Parameters
    ----------
    scores_path : str or path-like
        Tab-separated Toloka export with (at least) the columns
        ``GOLDEN:is_subjective``, ``HINT:text``, ``HINT:default_language``,
        ``INPUT:text``, ``OUTPUT:is_subjective``, ``ASSIGNMENT:task_id`` and
        ``ASSIGNMENT:worker_id``.
    sentence_id_retrieval_path : str or path-like
        Tab-separated file with an ``INPUT:text`` column and a
        ``sentence_id`` column; used to map task text back to sentence ids.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``sentence_id`` with a single integer column ``score``.

    Raises
    ------
    AssertionError
        If any aggregated score is missing.
    """
    toloka_markup = pd.read_csv(scores_path, sep='\t').drop(
        ["GOLDEN:is_subjective", "HINT:text", "HINT:default_language"], axis=1
    )
    # Collapse the raw 0-5 answers into four classes (0, 1, 2, 3).
    # Assign the result back instead of calling replace(inplace=True) on the
    # column selection: the chained in-place form does not update the frame
    # when pandas Copy-on-Write is active.
    toloka_markup['OUTPUT:is_subjective'] = toloka_markup['OUTPUT:is_subjective'].replace(
        {2: 1, 3: 2, 4: 3, 5: 3}
    )

    sentence_ids_by_text = pd.read_csv(sentence_id_retrieval_path, sep='\t').set_index("INPUT:text")

    wawa_score = get_wawa_score(toloka_markup)

    wawa_score_df = pd.DataFrame(wawa_score).rename(columns={"agg_label": "score"})

    # Attach the task text to every aggregated label (one row per task id).
    text_by_task = (
        toloka_markup[["INPUT:text", "ASSIGNMENT:task_id"]]
        .drop_duplicates()
        .set_index("ASSIGNMENT:task_id")
    )
    wawa_score_df = wawa_score_df.join(text_by_task)

    # Strip carriage returns before the text is used as the join key.
    wawa_score_df["INPUT:text"] = wawa_score_df["INPUT:text"].str.replace("\r", "")
    wawa_score_df = (
        wawa_score_df.set_index("INPUT:text")
        .join(sentence_ids_by_text)
        .reset_index()
        .drop("INPUT:text", axis=1)
    )

    # Several tasks can map to the same sentence; reduce them to one score.
    # NOTE(review): groupby drops rows whose sentence_id is NaN by default, so
    # texts that found no match in the retrieval file vanish here without
    # tripping the check below - confirm this is acceptable.
    wawa_score_df = wawa_score_df.groupby(by="sentence_id")['score'].apply(get_mv_or_median).to_frame()

    print(wawa_score_df.isna().value_counts())

    # Direct null check: fails with a clear AssertionError rather than a
    # KeyError when value_counts() has no `False` entry.
    assert not wawa_score_df["score"].isna().any(), "aggregated scores contain missing values"

    return wawa_score_df

In [None]:
def join_with_data_and_split(aggregated_scores, data_path, random_state=None):
    """Attach aggregated labels to the dataset and split it into train/val/test.

    Parameters
    ----------
    aggregated_scores : pandas.DataFrame
        Indexed by ``sentence_id`` with a ``score`` column.
    data_path : str or path-like
        CSV file containing a ``sentence_id`` column.
    random_state : int or None, default None
        Passed to both ``train_test_split`` calls; set it to make the split
        reproducible. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, val, test)``. 20% of the rows go to ``test`` and 10% of the
        remainder to ``val``, i.e. roughly 72% / 8% / 20%. Both splits are
        stratified on ``is_subjective``.
    """
    data = (
        pd.read_csv(data_path)
        .join(aggregated_scores, on="sentence_id")
        .rename(columns={"score": "is_subjective"})
    )
    # NOTE(review): rows without a match in `aggregated_scores` get a NaN
    # label here, which then feeds into the stratification - confirm that
    # every sentence_id is covered.
    train_val, test = train_test_split(
        data, test_size=0.2, stratify=data['is_subjective'], random_state=random_state
    )
    train, val = train_test_split(
        train_val, test_size=0.1, stratify=train_val['is_subjective'], random_state=random_state
    )
    # The splits used to be computed and then discarded; hand them back.
    return train, val, test


In [156]:
# Workers in the testing batch whose macro-F1 against the aggregated labels
# falls below the default threshold (0.35).
print_unaligned_workers("testing_scores.tsv")

Series([], dtype: float64)

In [157]:
# Same low-agreement check (default threshold 0.35) on the full batch.
print_unaligned_workers("full_scores.tsv")

Series([], dtype: float64)

In [159]:
# Number of tasks per aggregated label in the testing batch.
print_label_distribution("testing_scores.tsv")

Sentences with the 'Non-Applicable' label: 15
Sentences with the 'Objective' label: 52
Sentences with the 'Neutral' label: 1
Sentences with the 'Subjective' label: 89


In [160]:
# Number of tasks per aggregated label in the full batch.
print_label_distribution("full_scores.tsv")

Sentences with the 'Non-Applicable' label: 1236
Sentences with the 'Objective' label: 14222
Sentences with the 'Neutral' label: 388
Sentences with the 'Subjective' label: 3861


In [208]:
# Build one score table keyed by sentence_id from three sources: the scores
# stored in sample_scores.csv plus the aggregated testing and full batches.
sample_scores = pd.read_csv("sample_scores.csv").set_index("sentence_id")
testing_aggregated_scores = get_score_by_sentence_id("testing_scores.tsv", "testing_dataset_for_retrieval.tsv")
full_aggregated_scores = get_score_by_sentence_id("full_scores.tsv", "full_dataset_for_retrieval.tsv")

# NOTE(review): concat does not check that sentence_ids are unique across the
# three sources - presumably they cover disjoint id ranges; confirm.
aggregated_scores = pd.concat([sample_scores, testing_aggregated_scores, full_aggregated_scores])
aggregated_scores

score
False    157
dtype: int64
score
False    19707
dtype: int64


Unnamed: 0_level_0,score
sentence_id,Unnamed: 1_level_1
0.0,1
1.0,2
2.0,2
3.0,2
4.0,2
...,...
19948.0,1
19949.0,1
19950.0,1
19951.0,1


In [207]:
# Display the aggregated per-sentence scores of the full batch.
full_aggregated_scores

Unnamed: 0_level_0,score
sentence_id,Unnamed: 1_level_1
246.0,3
247.0,3
248.0,2
249.0,3
250.0,3
...,...
19948.0,1
19949.0,1
19950.0,1
19951.0,1


In [150]:
# NOTE(review): scratch cell relying on hidden kernel state. `wawa_score`,
# `toloka_markup` and `agg` are not defined at notebook level in the visible
# cells, so this will not survive Restart & Run All. The steps mirror the body
# of get_score_by_sentence_id; `agg` is presumably an earlier name for
# get_mv_or_median - confirm, or delete this cell.
sentence_ids_by_text = pd.read_csv("testing_dataset_for_retrieval.tsv", sep='\t').set_index("INPUT:text")

wawa_score_df = pd.DataFrame(wawa_score)
# Attach the task text to each aggregated label.
wawa_score_df = wawa_score_df.join(toloka_markup[["INPUT:text", "ASSIGNMENT:task_id"]].drop_duplicates().set_index("ASSIGNMENT:task_id"))
# Strip carriage returns before the text is used as the join key.
wawa_score_df["INPUT:text"] = wawa_score_df["INPUT:text"].str.replace("\r", "")
wawa_score_df = wawa_score_df.set_index("INPUT:text").join(sentence_ids_by_text).reset_index().drop("INPUT:text", axis=1)
# Reduce to one score per sentence and write the result to disk.
wawa_score_df.groupby(by="sentence_id")['agg_label'].apply(agg).to_frame().to_csv("testing_aggregated_scores.csv")


In [138]:
# Count rows by missing-value pattern in the joined frame.
# NOTE(review): depends on the notebook-level `wawa_score_df` left behind by a
# scratch cell, not on a function result.
wawa_score_df.isna().value_counts()

agg_label  sentence_id
False      False          20795
dtype: int64

In [19]:
# NOTE(review): `vc` exists only as a local inside print_label_distribution;
# no notebook-level `vc` is defined in the visible cells, so this cell raises
# NameError (see the recorded output). Remove it or compute `vc` explicitly.
vc[1] + vc[2]

NameError: name 'vc' is not defined

In [104]:
# Display the text -> sentence_id lookup table.
# NOTE(review): relies on the notebook-level `sentence_ids_by_text` created by
# a scratch cell.
sentence_ids_by_text


Unnamed: 0_level_0,sentence_id
INPUT:text,Unnamed: 1_level_1
"**Illegal Immigrants, Palestinian Refugees, and Their Liberal Cheerleaders: The Startling Similarities\nThe Democrat, left-wing media, and progressive activists’ current freak-out over what is at least a decade-long crisis on our southern border is not just hypocritical.** Yes, these same individuals taking to the streets in ugly mobs and threatening Trump administration officials and their families were silent when children entering the country illegally were detained under Obama. In fact, in 2014, after visiting a detention center housing migrant children, Nancy Pelosi asserted that the issue should not be politicized, clearly fearing the poor optics. But the progressive reaction to the migrant issue reflects much more than liberal hypocrisy. It mirrors the liberal response to, and furthering of, inflammatory and intentionally misleading narratives regarding Palestinian refugees.",246.0
"Illegal Immigrants, Palestinian Refugees, and Their Liberal Cheerleaders: The Startling Similarities\nThe Democrat, left-wing media, and progressive activists’ current freak-out over what is at least a decade-long crisis on our southern border is not just hypocritical. **Yes, these same individuals taking to the streets in ugly mobs and threatening Trump administration officials and their families were silent when children entering the country illegally were detained under Obama.** In fact, in 2014, after visiting a detention center housing migrant children, Nancy Pelosi asserted that the issue should not be politicized, clearly fearing the poor optics. But the progressive reaction to the migrant issue reflects much more than liberal hypocrisy. It mirrors the liberal response to, and furthering of, inflammatory and intentionally misleading narratives regarding Palestinian refugees. Both crises represent the politicization of cultural and ethnic tragedies that seek to blame a strong horse for the wrongs of the true perpetrators of crimes against humanity.",247.0
"Illegal Immigrants, Palestinian Refugees, and Their Liberal Cheerleaders: The Startling Similarities\nThe Democrat, left-wing media, and progressive activists’ current freak-out over what is at least a decade-long crisis on our southern border is not just hypocritical. Yes, these same individuals taking to the streets in ugly mobs and threatening Trump administration officials and their families were silent when children entering the country illegally were detained under Obama. **In fact, in 2014, after visiting a detention center housing migrant children, Nancy Pelosi asserted that the issue should not be politicized, clearly fearing the poor optics.** But the progressive reaction to the migrant issue reflects much more than liberal hypocrisy. It mirrors the liberal response to, and furthering of, inflammatory and intentionally misleading narratives regarding Palestinian refugees. Both crises represent the politicization of cultural and ethnic tragedies that seek to blame a strong horse for the wrongs of the true perpetrators of crimes against humanity. And both crises serve as winning strategies for leftist political operatives seeking absolute power.",248.0
"Yes, these same individuals taking to the streets in ugly mobs and threatening Trump administration officials and their families were silent when children entering the country illegally were detained under Obama. In fact, in 2014, after visiting a detention center housing migrant children, Nancy Pelosi asserted that the issue should not be politicized, clearly fearing the poor optics. **But the progressive reaction to the migrant issue reflects much more than liberal hypocrisy.** It mirrors the liberal response to, and furthering of, inflammatory and intentionally misleading narratives regarding Palestinian refugees. Both crises represent the politicization of cultural and ethnic tragedies that seek to blame a strong horse for the wrongs of the true perpetrators of crimes against humanity. And both crises serve as winning strategies for leftist political operatives seeking absolute power. Palestinians are portrayed as victims of an all-powerful government infringing on their human rights, notwithstanding their culpability and that of their “elected” leaders.",249.0
"In fact, in 2014, after visiting a detention center housing migrant children, Nancy Pelosi asserted that the issue should not be politicized, clearly fearing the poor optics. But the progressive reaction to the migrant issue reflects much more than liberal hypocrisy. **It mirrors the liberal response to, and furthering of, inflammatory and intentionally misleading narratives regarding Palestinian refugees.** Both crises represent the politicization of cultural and ethnic tragedies that seek to blame a strong horse for the wrongs of the true perpetrators of crimes against humanity. And both crises serve as winning strategies for leftist political operatives seeking absolute power. Palestinians are portrayed as victims of an all-powerful government infringing on their human rights, notwithstanding their culpability and that of their “elected” leaders. Similarly, Democrats are using illegal immigrants as tools to demonize Republicans, intentionally fueling their ongoing war against conservatives with the specific goal of influencing the midterm elections.",250.0
...,...
"Cameroonian-born French defensive midfielder Ibrahim Amadou attends a press conference during his presentation as a new Sevilla FC player in Seville, southern Spain, 04 July 2018. **Amadou has signed a contract for the next four seasons.**",19948.0
"Cameroonian-born French defensive midfielder Ibrahim Amadou attends a press conference during his presentation as a new Sevilla FC player in Seville, southern Spain, 04 July 2018. Amadou has signed a contract for the next four seasons. **EPA-EFE/Pepo Herrera\nSevilla FC President Jose Castro (L) and the club's director of soccer, Joaquin Caparros (R), pose with Cameroonian-born French midfielder Ibrahim Amadou (C) during his presentation as a new Sevilla FC player in Seville, southern Spain, 04 July 2018.**",19949.0
"Amadou has signed a contract for the next four seasons. EPA-EFE/Pepo Herrera\nSevilla FC President Jose Castro (L) and the club's director of soccer, Joaquin Caparros (R), pose with Cameroonian-born French midfielder Ibrahim Amadou (C) during his presentation as a new Sevilla FC player in Seville, southern Spain, 04 July 2018. **Amadou has signed a contract for the next four seasons.**",19950.0
"EPA-EFE/Pepo Herrera\nSevilla FC President Jose Castro (L) and the club's director of soccer, Joaquin Caparros (R), pose with Cameroonian-born French midfielder Ibrahim Amadou (C) during his presentation as a new Sevilla FC player in Seville, southern Spain, 04 July 2018. Amadou has signed a contract for the next four seasons. **EPA-EFE/Pepo Herrera\nCameroonian-born French defensive midfielder Ibrahim Amadou said Wednesday upon being presented as a new player for Sevilla FC that he was thrilled to have the chance to play in the Spanish league.**",19951.0


In [105]:
# NOTE(review): scratch cell relying on hidden kernel state. `toloka_markup`
# and `wawa_score` are not defined at notebook level in the visible cells.
# Attach the aggregated label to every answer row and re-key the result by
# task text.
wawa_score_by_text = toloka_markup.set_index("ASSIGNMENT:task_id").join(wawa_score).set_index("INPUT:text")['agg_label']
# Keep only the first row for each distinct text.
wawa_score_by_text = wawa_score_by_text[~wawa_score_by_text.index.duplicated(keep='first')]
wawa_score_by_text

INPUT:text
Justice Davis told the court Ms Caputo's partial admission that Ms Davoren had "no right to the money" had raised "real questions" about Ms Davoren's "criminal responsibility for the misappropriation of the funds". Ms Davoren spent $61,250 on a Range Rover, and $354,902 of the money remained in her bank account, the court heard. **Repayment of the money will put her account into overdraft.** The couple were in a relationship for at least 14 years, since 2001, but the exact date of their separation is in dispute, the court heard. Ms Davoren was also bookkeeper for at least one of Mr Raymond's companies, and withdrew the money via electronic banking. The court heard Ms Davoren was set to be paid $2,475,000 by Mr Raymond in a separation agreement, but this agreement was "not enforceable", and had been changed. Justice Davis ruled she took the $520,186 from Mr Raymond's company as "a system of self help" and because it was "convenient", when she should have left it to the court 