# Annotated dataset

Import and merge files from all annotators:

In [2]:
import pandas as pd

# Load one annotation spreadsheet per annotator (same order as the annotator
# numbering 1, 2, 3 used for the merged columns below).
df_karina, df_yana, df_polina = map(
    pd.read_excel,
    ("Annotation_Karina.xlsx", "Annotation_Yana.xlsx", "Annotation_Polina.xlsx"),
)

In [3]:
# Columns shared by every annotator's sheet; they identify one QA pair.
common_cols = ['dataset', 'id_a', 'id_q', 'meta.pair_idx', 'text_q', 'text_a']

# Left-join the second and third annotators onto the first one, pair by pair.
annotated = (
    df_karina
    .merge(df_yana, how='left', on=common_cols)
    .merge(df_polina, how='left', on=common_cols)
)

# Relabel positionally: the shared columns first, then one
# (if_q, avoid_rate, avoid_type) triple per annotator in merge order.
# NOTE(review): assumes each sheet lists the shared columns first and in this order - confirm.
annotated.columns = [
    *common_cols,
    'if_q_1', 'avoid_rate_1', 'avoid_type_1',
    'if_q_2', 'avoid_rate_2', 'avoid_type_2',
    'if_q_3', 'avoid_rate_3', 'avoid_type_3',
]
print(annotated.shape)
annotated.head(1)

(500, 15)


Unnamed: 0,dataset,id_a,id_q,meta.pair_idx,text_q,text_a,if_q_1,avoid_rate_1,avoid_type_1,if_q_2,avoid_rate_2,avoid_type_2,if_q_3,avoid_rate_3,avoid_type_3
0,TI,1511_8.a,1511_8.q,1511_8,Which part of the match do you think was the r...,"I don't know. I think, obviously, the second s...",Q,0.0,,Q,1.0,,Q,0.0,


Remove non-questions by majority vote (a pair is dropped when at least two of the three annotators marked it `NQ`), then compute the average avoidance rate and the majority avoidance type:

In [4]:
# Keep only the pairs that fewer than two annotators flagged as a non-question ('NQ').
# reset_index() without drop=True keeps the previous row labels as an 'index' column.
ann = (
    annotated
    .loc[annotated[['if_q_1', 'if_q_2', 'if_q_3']].eq('NQ').sum(axis=1).lt(2)]
    .reset_index()
)
len(ann)

424

In [5]:
# Row-wise mean of the three annotators' avoidance rates
# (pandas' mean skips missing values by default).
ann['avoid_rate_avg'] = ann.loc[
    :, ['avoid_rate_1', 'avoid_rate_2', 'avoid_rate_3']
].mean(axis='columns')

In [6]:
# Majority vote over the annotators' avoidance-type labels
def conditions(s):
    """Return the majority avoidance type among the labels in ``s``.

    ``s`` is one row of annotator labels (anything supporting an element-wise
    ``== str`` comparison followed by ``.sum()``, e.g. a pandas Series).

    Returns:
        'Flight' or 'Fight' when that label has strictly more votes,
        'Undetermined' when both labels have the same non-zero number of votes,
        and the string 'NaN' when neither label appears at all.
    """
    n_flight = (s == 'Flight').sum()
    n_fight = (s == 'Fight').sum()

    # Equal vote counts: either a real tie or no votes at all.
    if n_flight == n_fight:
        return 'Undetermined' if n_flight > 0 else 'NaN'

    return 'Flight' if n_flight > n_fight else 'Fight'


# Apply the majority vote to each row's three avoidance-type labels.
ann['avoid_type_avg'] = ann.loc[
    :, ['avoid_type_1', 'avoid_type_2', 'avoid_type_3']
].apply(conditions, axis='columns')

## Statistics

There are 424 QA pairs in total after the non-questions have been removed.

In [7]:
# Number of retained pairs that at least one annotator marked as a non-question
len(ann.loc[ann[['if_q_1', 'if_q_2', 'if_q_3']].eq('NQ').any(axis='columns')])

83

In [8]:
# Count pairs above, below and exactly at an average rate of 2.0.
print("Rather avoidance:", int(ann['avoid_rate_avg'].gt(2.0).sum()), sep="\n")
print("Rather non-avoidance:", int(ann['avoid_rate_avg'].lt(2.0).sum()), sep="\n")
print("Undetermined:", int(ann['avoid_rate_avg'].eq(2.0).sum()), sep="\n")

Rather avoidance:
174
Rather non-avoidance:
228
Undetermined:
22


In [None]:
# Tails of the distribution: average rate above 3.5 and below 0.5
print("Definitely avoidance:", int(ann['avoid_rate_avg'].gt(3.5).sum()), sep="\n")
print("Definitely non-avoidance:", int(ann['avoid_rate_avg'].lt(0.5).sum()), sep="\n")

Definitely avoidance:
55
Definitely non-avoidance:
135


In [9]:
def type_stats(df):
    """Print the number of rows of ``df`` in each ``avoid_type_avg`` category.

    For every category a heading line is printed, followed by a line with the
    row count. The string 'NaN' is the label for rows where no annotator gave
    an avoidance type. Returns None.
    """
    report = (
        ("Fight cases:", 'Fight'),
        ("Flight cases:", 'Flight'),
        ("Undetermined cases:", 'Undetermined'),
        ("Not rated by any annotator:", 'NaN'),
    )
    for heading, label in report:
        print(heading)
        print(len(df[df['avoid_type_avg'] == label]))
    
# Avoidance-type breakdown over every row of `ann`
type_stats(ann)

Fight cases:
30
Flight cases:
204
Undetermined cases:
23
Not rated by any annotator:
167


In [10]:
# Same breakdown, restricted to the pairs whose average rate is above 2.0
ann_avoided = ann.loc[ann['avoid_rate_avg'].gt(2.0)]
type_stats(ann_avoided)

Fight cases:
21
Flight cases:
131
Undetermined cases:
22
Not rated by any annotator:
0


In [11]:
# Avoided pairs that at least one annotator marked as a non-question
ann_nq = ann_avoided.loc[
    ann_avoided[['if_q_1', 'if_q_2', 'if_q_3']].eq('NQ').any(axis='columns')
]
# Row count followed by a blank separator line
print(len(ann_nq), end="\n\n")
type_stats(ann_nq)

51

Fight cases:
5
Flight cases:
38
Undetermined cases:
8
Not rated by any annotator:
0


## Create files

In [None]:
# Full filtered dataset
ann.to_csv('Avoidance_annotated.csv', index=False)

# Pairs whose average rate is above 2.0 ...
ann_avoided.to_csv('Avoidance_annotated_avoided.csv', index=False)

# ... and that subset split by majority avoidance type
ann_avoided.loc[ann_avoided['avoid_type_avg'].eq('Fight')].to_csv(
    'Avoidance_annotated_avoided_fight.csv', index=False
)
ann_avoided.loc[ann_avoided['avoid_type_avg'].eq('Flight')].to_csv(
    'Avoidance_annotated_avoided_flight.csv', index=False
)
ann_avoided.loc[ann_avoided['avoid_type_avg'].eq('Undetermined')].to_csv(
    'Avoidance_annotated_avoided_undetermined.csv', index=False
)

# Pairs whose average rate is below 2.0
ann_notavoided = ann.loc[ann['avoid_rate_avg'].lt(2.0)]
ann_notavoided.to_csv('Avoidance_annotated_notavoided.csv', index=False)