In [2]:
import pandas as pd
import datasets
from collections import defaultdict, Counter

TODO:
- An introduction
- lang detection

In [36]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    """Replace user mentions and links in ``text`` with fixed placeholders.

    The text is split on single spaces and each token is inspected:

    * tokens starting with ``@`` (and longer than one character), ``MENTION``,
      ``<user>`` or ``@USER`` become ``@user``;
    * tokens starting with ``http`` become ``http``;
    * every other token (including a bare ``@``) is kept unchanged.

    The tokens are joined back with single spaces, so the original spacing
    between tokens is preserved.
    """
    mention_prefixes = ("MENTION", "<user>", "@USER")

    def _normalise(token):
        # a lone "@" is not a mention, hence the length check
        if token.startswith('@') and len(token) > 1:
            return '@user'
        if token.startswith(mention_prefixes):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_normalise(token) for token in text.split(" "))


# Measuring-hate-speech (a)

In [91]:
# Load the 'binary' configuration of the Measuring Hate Speech dataset through
# the `datasets` library (a cached copy is reused when one is available).
dataset = datasets.load_dataset(
    'ucberkeley-dlab/measuring-hate-speech', 'binary')
# Continue with the 'train' split as a pandas DataFrame.
df = dataset['train'].to_pandas()


Using custom data configuration ucberkeley-dlab--measuring-hate-speech-c32713cabe528196
Reusing dataset parquet (/home/dimosthenis/.cache/huggingface/datasets/ucberkeley-dlab___parquet/ucberkeley-dlab--measuring-hate-speech-c32713cabe528196/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 1/1 [00:00<00:00, 186.08it/s]


In [92]:
# hate_speech_score - continuous hate speech measure, where higher = more hateful and lower = less hateful.
#  > 0.5 is approximately hate speech,
#  < -1 is counter or supportive speech,
#  and -1 to +0.5 is neutral or ambiguous.
def map_label(x):
    """Bucket a continuous hate-speech score into a three-way label.

    Parameters
    ----------
    x : float
        The ``hate_speech_score`` of a comment (higher = more hateful).

    Returns
    -------
    int
        ``1``  if ``x > 0.5``          (hate),
        ``0``  if ``-1 <= x <= 0.5``   (neutral/ambiguous),
        ``-1`` if ``x < -1``           (not hate: counter or supportive speech).

    Raises
    ------
    ValueError
        If ``x`` is NaN. NaN compares false against every threshold, so no
        bucket applies; previously this surfaced as an ``UnboundLocalError``.
    """
    if x > 0.5:
        return 1  # hate
    if x < -1:
        return -1  # not hate
    if -1 <= x <= 0.5:
        return 0  # neutral/ambiguous
    raise ValueError(f"cannot map hate_speech_score {x!r} to a label")


# derive the discrete label (-1 / 0 / 1) from the continuous score of each row
df['label'] = df['hate_speech_score'].apply(map_label)


In [93]:
# keep only entries from Twitter (rows whose platform code equals 2)
df = df.loc[df['platform'].eq(2)]

In [94]:
# check for duplicates
# True only when the number of distinct comment_id values equals the number of
# rows; False means at least one comment_id is repeated (or missing)
df['comment_id'].nunique() == len(df)

False

In [95]:
# keep each tweet only once
# drop_duplicates keeps the first row per comment_id on every pandas version.
# groupby(...).nth(0) is version dependent: it used to return one row per
# group indexed by the key, but acts as a row filter in pandas >= 2.0, which
# leaves a stray 'index' column after reset_index() and a different row order.
# Sorting by comment_id reproduces the ordering of the former groupby result.
df = (
    df.drop_duplicates(subset='comment_id', keep='first')
    .sort_values('comment_id')
    .reset_index(drop=True)
)

In [96]:
# clean text: swap user mentions and links for placeholders, tweet by tweet
df['text'] = df['text'].map(preprocess)

In [97]:
# save dataset
# this corpus is tagged "a" and has no fine-grained (multilabel) annotation
df['multilabel'] = None
df['dataset'] = "a"
df.to_csv(
    './datasets/measuring_hate_speech.csv',
    columns=['dataset', 'text', 'label', 'multilabel'],
    index=False,
)

# Call me sexist, but (b)

In [114]:
# per-annotator judgements (loaded here; not used by the cells of this section)
annotations = pd.read_csv('./new_datasets/Call me sexist, but/sexism_annotations.csv')
# one row per text, including the boolean `sexist` column used below
data = pd.read_csv('./new_datasets/Call me sexist, but/sexism_data.csv')

In [115]:
# list the distinct values of the original `dataset` (source) column
data['dataset'].unique()

array(['other', 'callme', 'benevolent', 'scales', 'hostile'], dtype=object)

In [116]:
# class balance of the boolean `sexist` annotation
data['sexist'].value_counts()

False    11822
True      1809
Name: sexist, dtype: int64

In [117]:
# map classes
# binary label: sexist -> 1, not sexist -> -1
data['label'] = data['sexist'].replace({True: 1, False: -1})
# fine-grained label: 'sexist' for sexist rows, nothing otherwise
data['multilabel'] = data['sexist'].replace({True: 'sexist', False: None})


In [118]:
# clean text: swap user mentions and links for placeholders, row by row
data['text'] = data['text'].map(preprocess)


In [119]:
 # save dataset
# tag the corpus as "b" (this overwrites the original source column) and
# write only the four shared columns
data['dataset'] = "b"
data.to_csv(
    './datasets/call_me_sexist.csv',
    columns=['dataset', 'text', 'label', 'multilabel'],
    index=False,
)


In [120]:
# distribution of the fine-grained label among the saved rows (missing values are not counted)
data['multilabel'].value_counts()


sexist    1809
Name: multilabel, dtype: int64

# Hate Towards the Political Opponent (c)

In [122]:
# both splits are tab-separated files
test = pd.read_csv('./new_datasets/Hate Towards Political Oponent/test.tsv',sep='\t')
train = pd.read_csv('./new_datasets/Hate Towards Political Oponent/train.tsv', sep='\t')


In [125]:
# distinct values of the hateful / non-hateful annotation in the test split
test['HOF'].unique()

array(['Non-Hateful', 'Hateful'], dtype=object)

In [126]:
# stack both splits into one frame (test rows first); the index is not reset,
# so each split keeps its own row labels
df = pd.concat([test,train])

In [127]:
# inspect the combined frame
df

Unnamed: 0,text,Trump,Biden,West,HOF
0,There are people I follow who 18 months ago we...,Neither,Neutral mentions,Neither,Non-Hateful
1,@realDonaldTrump Why would the government stop...,Against,Favor,Neither,Non-Hateful
2,Preach brotha! 🙌 #BidenHarris2020 https://t.co...,Neither,Favor,Neither,Non-Hateful
3,@jrgaillot @JoeBiden class act #GOTV #genz #Mi...,Neither,Favor,Neither,Non-Hateful
4,"Okay white women, to the 53% of you who made a...",Against,Favor,Neither,Hateful
...,...,...,...,...,...
2395,Just lost a ton of followers again. Looks like...,Favor,Neither,Neither,Non-Hateful
2396,@NovumQuid @OpenMothersMale @MikeSington I hav...,Neither,Mixed,Neither,Non-Hateful
2397,@TheLeoTerrell @SenatorLoeffler @realDonaldTru...,Favor,Neither,Neither,Non-Hateful
2398,It’s too bad that at a time when we’re unemplo...,Neither,Favor,Neither,Non-Hateful


In [128]:
# clean text: swap user mentions and links for placeholders, tweet by tweet
df['text'] = df['text'].map(preprocess)

In [129]:
# map labels
# this corpus is tagged 'c' and has no fine-grained (multilabel) annotation
df['dataset'] = 'c'
df['multilabel'] = None
# binary label taken from the HOF column: Hateful -> 1, Non-Hateful -> 0
df['label'] = df['HOF'].replace({'Non-Hateful': 0, 'Hateful': 1})

In [130]:
# save dataset
# This is dataset (c). The identifier was previously overwritten with "b"
# (copied from the "Call me sexist, but" cell), which mislabelled every row.
df['dataset'] = "c"
df[['dataset', 'text', 'label', 'multilabel']].to_csv(
    './datasets/hate_towards_political.csv', index=False)


# HateXplain (d)

In [25]:
# orient='index': each top-level key of the JSON object becomes one row
df = pd.read_json('./new_datasets/HateXplain/dataset.json', orient='index')


In [26]:
# True when the number of distinct post_id values equals the number of rows
df['post_id'].nunique() == len(df)


True

In [27]:
# rebuild each post as a single string from its token list
df['text'] = df['post_tokens'].map(" ".join)

In [28]:
# normalise text: swap user mentions and links for placeholders, post by post
df['text'] = df['text'].map(preprocess)

In [29]:
def majority_rule_class(x):
    """Return the class label chosen by the most annotators.

    ``x`` is an iterable of annotator records, each a mapping with a
    ``'label'`` entry that is one of ``"normal"``, ``"offensive"`` or
    ``"hatespeech"`` (any other value raises ``KeyError``).

    Ties are resolved in the fixed order normal, offensive, hatespeech, so an
    empty ``x`` yields ``"normal"``.
    """
    votes = dict.fromkeys(("normal", "offensive", "hatespeech"), 0)
    for annotation in x:
        votes[annotation['label']] += 1

    # max() returns the first maximal item, which gives the tie order above
    winner, _ = max(votes.items(), key=lambda item: item[1])
    return winner


def majority_rule_target(x):
    """Return the target group named most often by the annotators.

    ``x`` is an iterable of annotator records, each a mapping whose
    ``'target'`` entry is an iterable of target names. Votes are summed over
    all annotators; the placeholder target ``"None"`` is ignored.

    Returns the target with the most votes, or ``None`` when no real target
    was named. Ties go to the target that was encountered first.

    Previously the per-annotator counts overwrote each other instead of being
    summed, so the first target seen was returned regardless of the votes.
    """
    target_votes = Counter()
    for annotator in x:
        # Counter.update adds to the existing counts (dict.update would replace them)
        target_votes.update(annotator['target'])

    target_votes.pop("None", None)

    if not target_votes:
        return None
    # max() returns the first maximal key, i.e. the earliest-seen target on ties
    return max(target_votes, key=target_votes.get)

In [30]:
# get labels/multilabels from the per-post list of annotator records
df['multilabel'] = df['annotators'].map(majority_rule_target)
df['label'] = df['annotators'].map(majority_rule_class)


In [31]:
# map label
# we map offensive to 1 as the majority of them can be considered hatespeech
df['dataset'] = 'd'
df['label'] = df['label'].replace({'normal': 0, 'hatespeech': 1, 'offensive': 1})

# set multilabel to None if not hatespeech (label 0)
df.loc[df['label'] == 0, 'multilabel'] = None


In [32]:
# save dataset: write only the four shared columns, without the index
df.to_csv(
    './datasets/hateXplain.csv',
    columns=['dataset', 'text', 'label', 'multilabel'],
    index=False,
)


#  Offense Eval (e)
- Alt title: Predicting the Type and Target of Offensive Posts in Social Media
- only training available

In [106]:
# tab-separated training file (the only split available, see the note above)
df = pd.read_csv('./new_datasets/Predicting the Type and Target of Offensive Posts in Social Media/offenseval-training-v1.tsv', sep='\t')

In [68]:
# clean text: the raw text is in `tweet`; the cleaned version goes to `text`
df['text'] = df['tweet'].map(preprocess)

In [69]:
# consider hate speech only cases where subtask_c = GRP, i.e. offensive targeted to group
# (1 when subtask_a is OFF and subtask_c is GRP, 0 otherwise)
df['label'] = ((df['subtask_a'] == 'OFF') & (df['subtask_c'] == 'GRP')).astype('int64')

df['dataset'] = 'e'
df['multilabel'] = None

In [70]:
# save dataset: write only the four shared columns, without the index
df.to_csv(
    './datasets/offense_eval.csv',
    columns=['dataset', 'text', 'label', 'multilabel'],
    index=False,
)


# Automated Hate Speech Detection and the Problem of Offensive Language (f)

In [107]:
# labelled tweets; the `class` and `tweet` columns are used below
df = pd.read_csv('./new_datasets/Automated Hate Speech Detection and the Problem of Offensive Language/labeled_data.csv')

In [108]:
# inspect the loaded frame
df

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...,...,...,...,...,...
24778,25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, an..."
24780,25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,25295,6,0,6,0,1,youu got wild bitches tellin you lies


In [75]:
# clean text: the raw text is in `tweet`; the cleaned version goes to `text`
df['text'] = df['tweet'].map(preprocess)

# map labels: class 0 becomes 1, classes 1 and 2 become 0
# (presumably class 0 = hate speech, 1 = offensive language, 2 = neither -- TODO confirm)
df['dataset'] = 'f'
df['multilabel'] = None
df['label'] = df['class'].replace({2: 0, 1: 0, 0: 1})

In [76]:
# class balance after the mapping
df['label'].value_counts()

0    23353
1     1430
Name: label, dtype: int64

In [77]:
# save dataset: write only the four shared columns, without the index
df.to_csv(
    './datasets/automated_hate_speech.csv',
    columns=['dataset', 'text', 'label', 'multilabel'],
    index=False,
)


#  Hateful Symbols or Hateful People? Predictive Features for Hate Speech Detection on Twitter (g)

- HAVE TO SCRAPE TWEETS
- overlap with h

In [110]:
# the file has no header row; name its two columns while reading
df = pd.read_csv(
    './new_datasets/Hateful Symbols or Hateful People? Predictive Features for Hate Speech Detection on Twitter/NAACL_SRW_2016.csv',
    header=None,
    names=['id', 'label'],
)


In [111]:
# inspect the tweet ids and their labels
df

Unnamed: 0,id,label
0,572342978255048705,racism
1,572341498827522049,racism
2,572340476503724032,racism
3,572334712804384768,racism
4,572332655397629952,racism
...,...,...
16902,576359685843861505,none
16903,576612926838046720,none
16904,576771329975664640,none
16905,560595245814267905,none


# Are You a Racist or Am I Seeing Things? (h)
- Have to scrape tweets
- overlap with (g)

In [112]:
# tab-separated file of dataset (h); it is stored in the directory of dataset (g)
df2 = pd.read_csv(
    './new_datasets/Hateful Symbols or Hateful People? Predictive Features for Hate Speech Detection on Twitter/NLP+CSS_2016.csv', sep='\t')
# (unused) column names as assigned for dataset (g):
#df.columns = ['id', 'label']


  df2 = pd.read_csv(


In [113]:
# keep only the tweet id column and the expert / amateur annotation columns
cols = ['TweetID', 'Expert', 'Amateur_0', 'Amateur_1']
df2 = df2[cols]


In [119]:
# move the current index into a regular column
# NOTE(review): presumably the index holds the tweet ids here -- confirm
df2 = df2.reset_index()

In [120]:
# assign five column names; assumes the first column is the one created by
# reset_index() (it becomes 'id') followed by the four selected columns
df2.columns = ['id', 'TweetID', 'Expert', 'Amateur_0', 'Amateur_1']


In [128]:
# number of rows in df (g) and df2 (h)
len(df), len(df2)

(16907, 6909)

In [131]:
# rows of df (g) whose id also occurs in df2 (h), i.e. the overlap of the two
df[df['id'].isin(df2['id'])]

Unnamed: 0,id,label
0,572342978255048705,racism
1,572341498827522049,racism
2,572340476503724032,racism
1970,572346080911736832,sexism
1971,572348198062170112,sexism
...,...,...
16823,576180808504848384,none
16824,576255143101882368,none
16825,576470303964065792,none
16826,576481461424934912,none


In [126]:
# number of rows in df
len(df)

16907

# When Does a Compliment Become Sexist? Analysis and Classification of Ambivalent Sexism Using Twitter Data (i)
- included in Call me sexist but (b)

In [None]:
# row 41 in gsheets

# 