In [1]:
import os
import pandas as pd

## Preparing Dataset

### Notes:
- results of usable datasets are stored in
  - 'data/train_all_annotators.csv'
  - 'data/test_all_annotators.csv'
- had to remove some ambiguous data
- if annotator didn't label the text: label = -1

In [57]:
path_to_data = '../data/'

In [58]:
path_to_GHC = path_to_data + "GHC_data/"
os.listdir(path_to_GHC)

['.ipynb_checkpoints',
 'AnnotatorIAT_and_Attitudes.csv',
 'ghc_train.tsv',
 'ghc_test.tsv',
 'GabHateCorpus_annotations.tsv']

In [73]:
with open(path_to_GHC+'AnnotatorIAT_and_Attitudes.csv', 'r') as f:
    annotator_df = pd.read_csv(f)

with open(path_to_GHC+'ghc_train.tsv', 'r') as f:
    train_df = pd.read_csv(f, sep='\t')

with open(path_to_GHC+'ghc_test.tsv', 'r') as f:
    test_df = pd.read_csv(f, sep='\t')

with open(path_to_GHC+'GabHateCorpus_annotations.tsv', 'r') as f:
    ghc_annotations_df = pd.read_csv(f, sep='\t')

In [74]:
annotator_df[:2]

Unnamed: 0,Annotator,PostsAnnotated,IAT-RACE,IAT-GenderCareer,IAT-Sexuality,IAT-Religion,HCBS-NegativeBelief,HCBS-OffenderPunishment,HCBS-Deterrence,HCBS-VictimHarm
0,0,7715,0.4,0.0,1.0,0.333333,0.0,1.0,1.0,1.0
1,1,3987,0.8,1.0,0.5,0.666667,0.1875,0.733333,0.555556,0.833333


In [75]:
ghc_annotations_df[:2]

Unnamed: 0,ID,Annotator,Text,Hate,HD,CV,VO,REL,RAE,SXO,GEN,IDL,NAT,POL,MPH,EX,IM
0,27044,4,Ah the PSYOPS antifa crew is back. That’s how ...,0,0,0,0,,,,,,,,,,
1,27044,15,Ah the PSYOPS antifa crew is back. That’s how ...,0,0,0,0,,,,,,,,,,


In [76]:
train_df[:2]

Unnamed: 0,text,hd,cv,vo
0,He most likely converted to islam due to his n...,0,0,0
1,So Ford lied about being a psychologist. Recor...,0,0,0


In [77]:
# some annotators did label the same text more that once.

print(len(ghc_annotations_df[ghc_annotations_df.Annotator==0]))
print(len(set(list(ghc_annotations_df[ghc_annotations_df.Annotator==0].Text))))

7715
7710


In [78]:
# pivot the annotations table so we have one row per text with all annotators labels
df = ghc_annotations_df[['Annotator','Text','Hate']]
df = df.pivot_table(index='Text', columns='Annotator', fill_value=-1)
df.reset_index(inplace=True)
df.columns = df.columns.droplevel(0)
df = df.rename(columns={'':'Text'})
df.columns.name = None

In [79]:
# as it can be seen here, some of the multi-label of one annotator are ambiguous.
for i in range(17):
    print(set(list(df[i].dropna())))

{0.0, 1.0, -1.0, 0.5}
{0, 1, -1}
{0, 1, -1}
{0, 1, -1}
{0, 1, -1}
{0.0, 1.0, -1.0, 0.5}
{0, 1, -1}
{0.0, 1.0, 0.3333333333333333, 0.5, -1.0}
{0.0, 1.0, 0.6666666666666666, -1.0}
{0, 1, -1}
{0, 1, -1}
{0.0, 1.0, -1.0, 0.5}
{0, 1, -1}
{0.0, 1.0, -1.0, 0.5}
{0, 1, -1}
{0, 1, -1}
{0, 1, -1}


In [80]:
# thus we replace this values by -1, meaning no value given
df = df.replace([0.5, 1/3, 2/3],-1)

# convert dtypes to int now
df = df.convert_dtypes()

In [81]:
# result
df.head()

Unnamed: 0,Text,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,Funkytown ‏ @hotfunkytown Jan 7 Weins...,-1,-1,-1,-1,-1,-1,-1,0,-1,-1,-1,0,0,-1,-1,-1,-1,-1
1,"Officers are speaking with another man, 26, ...",0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,-1,0,-1,-1,-1,-1
2,"Primer ministro húngaro feliz de que la ""Ope...",-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,0,-1,-1,0,-1
3,The hand of Our Lord & Savior Jesus Christ i...,-1,-1,0,-1,-1,0,-1,-1,-1,-1,-1,-1,0,-1,-1,-1,-1,-1
4,Who Controls the News ... controls your Mind...,-1,-1,-1,-1,-1,-1,-1,0,-1,-1,-1,1,0,-1,-1,-1,-1,-1


In [105]:
df['majority_label'] = df.loc[:,0:17].apply(
    lambda row: 
        1 if (len(row[row==0])==0 and len(row[row==1])>0) or (len(row[row==1])/len(row[row!=-1]) >= 0.5) 
        else 0, 
    axis=1)

In [107]:
#store dataframes

df.to_csv(path_to_data+'all_annotators.csv', index=False)

train_all_annotations_df = df[df.Text.isin(train_df.text)]
train_all_annotations_df.to_csv(path_to_data+'train_all_annotators.csv', index=False)

test_all_annotations_df = df[df.Text.isin(test_df.text)]
test_all_annotations_df.to_csv(path_to_data+'test_all_annotators.csv', index=False)