In [1]:
import pandas as pd
import datasets
from collections import defaultdict, Counter
#from twarc import Twarc2, expansions
import configparser
import json
import numpy as np

TODO:
- An introduction
- lang detection -> https://fasttext.cc/docs/en/language-identification.html

In [2]:
# Get Twitter api credentials
config = configparser.RawConfigParser()
config.read("/home/dimosthenis/.twitter_credentials.ini")

BEARER_TOKEN = config['twitter_api']['bearer_token']

CONSUMER_KEY = config['twitter_api']['consumer_key']
CONSUMER_SECRET = config['twitter_api']['consumer_secret']

ACCESS_TOKEN = config['twitter_api']['access_token']
ACCESS_SECRET = config['twitter_api']['access_secret']

In [3]:
# SET LABELS
HATE = 1
NOT_HATE = 0

In [4]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = "@user" if t.startswith("MENTION") else t
        t = "@user" if t.startswith("<user>") else t
        t = "@user" if t.startswith("@USER") else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)



def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]


# Measuring-hate-speech (a)

In [5]:
dataset = datasets.load_dataset(
    'ucberkeley-dlab/measuring-hate-speech', 'binary')
df = dataset['train'].to_pandas()


Using custom data configuration ucberkeley-dlab--measuring-hate-speech-c32713cabe528196
Reusing dataset parquet (/home/dimosthenis/.cache/huggingface/datasets/ucberkeley-dlab___parquet/ucberkeley-dlab--measuring-hate-speech-c32713cabe528196/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 1/1 [00:00<00:00, 159.19it/s]


In [6]:
# check for duplicates
df['comment_id'].nunique() == len(df)

False

In [7]:
# hate_speech_score - continuous hate speech measure, where higher = more hateful and lower = less hateful.
#  > 0.5 is approximately hate speech,
#  < -1 is counter or supportive speech,
#  and -1 to +0.5 is neutral or ambiguous.
def map_label(x):
    if x >= -1 and x <= 0.5:
        label = 999  # neutral/ambiguous
    elif x > 0.5:
        label = HATE  # hate
    elif x < -1:
        label = NOT_HATE  # not hate
    return label


# get label
df['label'] = df['hate_speech_score'].apply(map_label)

# keep only entries from Twitter
df = df[df['platform'] == 2]

# ignore ambiguous
df = df[df['label'].isin([0,1])]


In [8]:
# not all tweets are annotated by the same amount of coders
df.groupby('comment_id')['label'].count()

comment_id
20071    1
20072    3
20073    1
20075    3
20076    3
        ..
40063    2
40065    3
40066    2
40069    2
40070    4
Name: label, Length: 10685, dtype: int64

In [9]:
# we consider tweets with at least 2 coders and where there is no tie (i.e. Hate, Not_hate)
def majority_class(x):
    label_count = Counter(x)
    # sort voting
    label_count = {k: v for k, v in sorted(label_count.items(), key=lambda item: item[1])}
    label_count_sorted = list(label_count.items())

    majority_label = label_count_sorted[0][0]
    # get second label if it exists
    if len(label_count) > 1:
        second_label = label_count_sorted[1][0]
    else:
        second_label = None
    
    # at least two coders agree
    if label_count[majority_label] >= 2:
        # if second label there is no tie
        if second_label and majority_label > second_label:
            return majority_label
        elif not second_label:
            return majority_label
        else:
            return None # there is a tie between major and second
    else:
        return None

# map aggregated labels
df = df.set_index('comment_id')
df['label_aggregated'] = df.groupby('comment_id')['label'].apply(majority_class)
df = df.reset_index()

In [10]:
# remove tweets with no agreement
print(f"Total tweets before: {len(df)}")
df = df[df['label_aggregated'].notnull()]
print(f"Total tweets after: {len(df)}")

Total tweets before: 24073
Total tweets after: 20996


In [11]:
# now aggregated targets

# gender here includes transgender too
targets = ['target_race', 'target_religion', 'target_origin', 'target_gender',
           'target_sexuality', 'target_age', 'target_disability']

df = df.set_index('comment_id')

for t in targets:
    df[f"{t}_aggregated"] = df.groupby('comment_id')[t].apply(majority_class)

df = df.reset_index()

In [12]:
# no need of all annotators now -> keep each tweet only once
df = df.groupby('comment_id').nth(0)

df = df.reset_index()

In [13]:
# consider target only if it is unique (i.e not race & religion)
targets_aggr = [f"{x}_aggregated" for x in targets]

# how many targets in each tweet
df['targets#'] = df[targets_aggr].sum(axis=1)

In [14]:
# clean label
df = df.drop('label', axis=1) # to make sure
df['label'] = df['label_aggregated']

# clean multilabel
# only tweets with 1 target
idx_multilabel = df[df['targets#'] == 1].index

# initialize column
df['multilabel'] = None


def get_target(col):
    # check which targ_aggr is True
    for x in targets_aggr:
        if col[x]:
            return x

# consider only the idx_multilabel            
df.loc[idx_multilabel, 'multilabel'] = df.loc[idx_multilabel].apply(get_target, axis=1)

In [15]:
# clean text
df['text'] = df['text'].apply(preprocess)

In [16]:
df['label'].unique()

array([0., 1.])

In [17]:
df['multilabel'].unique()

array(['target_gender_aggregated', None, 'target_sexuality_aggregated',
       'target_religion_aggregated', 'target_race_aggregated',
       'target_origin_aggregated', 'target_age_aggregated',
       'target_disability_aggregated'], dtype=object)

In [18]:
# save dataset
df['dataset'] = "a"
df['multilabel'] = None
df[['dataset', 'text', 'label', 'multilabel']].to_csv(
    './datasets/measuring_hate_speech.csv', index=False)

# Call me sexist, but (b)

In [19]:
annotations = pd.read_csv('./new_datasets/Call me sexist, but/sexism_annotations.csv')
data = pd.read_csv('./new_datasets/Call me sexist, but/sexism_data.csv')

In [20]:
data['dataset'].unique()

array(['other', 'callme', 'benevolent', 'scales', 'hostile'], dtype=object)

In [21]:
data['sexist'].value_counts()

False    11822
True      1809
Name: sexist, dtype: int64

In [22]:
# check for duplicates
data['id'].nunique() == len(data)

True

In [23]:
# map classes
data['multilabel'] = data['sexist']
data['multilabel'] = data['multilabel'].replace({True:'sexist', False: None})
data['label'] = data['sexist'].replace({True: HATE, False: NOT_HATE})


In [24]:
# clean text
data['text'] = data['text'].apply(preprocess)


In [25]:
data['label'].unique()

array([0, 1])

In [26]:
data['multilabel'].unique()

array([None, 'sexist'], dtype=object)

In [27]:
 # save dataset
data['dataset'] = "b"
data[['dataset', 'text', 'label', 'multilabel']].to_csv(
    './datasets/call_me_sexist.csv', index=False)


In [28]:
data['label'].value_counts()


0    11822
1     1809
Name: label, dtype: int64

# Hate Towards the Political Opponent (c)

In [29]:
test = pd.read_csv('./new_datasets/Hate Towards Political Oponent/test.tsv',sep='\t')
test['split'] = 'test'
train = pd.read_csv('./new_datasets/Hate Towards Political Oponent/train.tsv', sep='\t')
train['split'] = 'train'

In [30]:
test['HOF'].unique()

array(['Non-Hateful', 'Hateful'], dtype=object)

In [31]:
df = pd.concat([test,train])

In [32]:
# clean text
df['text'] = df['text'].apply(preprocess)

In [33]:
# map labels
df['label'] = df['HOF'].replace({'Non-Hateful': NOT_HATE, 'Hateful':HATE})
df['multilabel'] = None
df['dataset'] = 'c'

In [34]:
df['label'].unique()

array([0, 1])

In [35]:
df['multilabel'].unique()

array([None], dtype=object)

In [36]:
# save dataset
df['dataset'] = "b"
df[['dataset', 'split', 'text', 'label', 'multilabel']].to_csv(
    './datasets/hate_towards_political.csv', index=False)


In [37]:
df['label'].value_counts()

0    2648
1     352
Name: label, dtype: int64

# HateXplain (d)


- target: at least two annotators
- target: instead of random if tie on the majority and not subclasses (e.g. jew & muslim)-> ignore

In [385]:
df = pd.read_json('./new_datasets/HateXplain/dataset.json', orient='index')

In [386]:
# sanity check
df['post_id'].nunique() == len(df)

True

In [387]:
# join tokens to test
df['text'] = df['post_tokens'].apply(lambda x: " ".join(x))

In [388]:
# normalise text
df['text'] = df['text'].apply(preprocess)

In [389]:
len(df)

20148

In [390]:
def majority_rule_class(x):
    label_count = defaultdict(lambda : 0)
    for annotator in x:
        annon_label = annotator['label']
        label_count[annon_label] += 1

    # sort voting
    label_count = {k: v for k, v in sorted(label_count.items(), key=lambda item: item[1])}
    label_count_sorted = list(label_count.items())

    majority_label = label_count_sorted[0][0]
    # get second label if it exists
    if len(label_count) > 1:
        second_label = label_count_sorted[1][0]
    else:
        second_label = None
    
    # at least two coders agree
    if label_count[majority_label] >= 2:
        # if second label there is no tie
        if second_label and majority_label > second_label:
            return majority_label
        elif not second_label:
            return majority_label
        else:
            return None # there is a tie between major and second
    else:
        return None 

def count_targets(x):
    target_count = Counter()
    
    for annotator in x:
        annon_targets = annotator['target']
        annon_targets = Counter(annon_targets)
        
        target_count += annon_targets
    return target_count

# combine classes (i.e christian with jeweish)
def majority_rule_target(x, target_map):
    target_count = Counter()
    
    
    for annotator in x:
        annon_targets = annotator['target']
        annon_targets = Counter(annon_targets)
        # combine targets (i.e christian + islam -> religion)
        annon_targets_updated = {}
        for k,v in annon_targets.items():
            annon_targets_updated[target_map[k]]  = v

        target_count += Counter(annon_targets_updated)
    
    # sort target_count
    target_count = {k: v for k, v in sorted(target_count.items(), key=lambda item: item[1], reverse=True)}
    target_count_sorted = list(target_count.items())

    majority_label = target_count_sorted[0][0]
    # get second label if it exists
    if len(target_count) > 1:
        second_label = target_count_sorted[1][0]
    else:
        second_label = None

    # at least two coders agree
    if target_count[majority_label] >= 2:
        # if second label there is no tie or there is no second label
        if (second_label and majority_label > second_label) or (not second_label):
            return majority_label
        else:
            return None # there is a tie between major and second
    else:
        return None


In [391]:
# check available targets
all_targets = Counter()
for entry in df['annotators'].apply(count_targets):
    all_targets += entry

print(all_targets.keys())


dict_keys(['None', 'African', 'Asian', 'Caucasian', 'Women', 'Jewish', 'Homosexual', 'Islam', 'Other', 'Hispanic', 'Refugee', 'Men', 'Arab', 'Disability', 'Minority', 'Nonreligious', 'Indigenous', 'Indian', 'Economic', 'Christian', 'Heterosexual', 'Bisexual', 'Hindu', 'Buddhism', 'Asexual'])


In [392]:
# combine different targets
target_map = {
    'None': 'none',
    'African': 'racism',
    'Asian': 'racism',
    'Caucasian': 'racism',
    'Women': 'gender',
    'Jewish': 'religion',
    'Homosexual': 'sexuality',
    'Islam': 'religion',
    'Other': 'other',
    'Hispanic': 'racism',
    'Refugee': 'refugee', # not sure about this
    'Men': 'gender',
    'Arab': 'racism',
    'Disability': 'disability',
    'Minority': 'minority', # not sure
    'Nonreligious': 'religious',
    'Indigenous': 'racism',
    'Indian': 'racism',
    'Economic': 'economic',
    'Christian': 'religion',
    'Heterosexual': 'sexuality',
    'Bisexual': 'sexuality',
    'Hindu': 'religion',
    'Buddhism': 'religion',
    'Asexual': 'sexuality'
    }

In [393]:
# get labels/mutlilabels
df['label'] = df['annotators'].apply(majority_rule_class)
df['multilabel'] = df['annotators'].apply(lambda x: majority_rule_target(x, target_map))
# ensure consistency
df['multilabel'] = df['multilabel'].replace({'None':None})

In [394]:
df['multilabel'].value_counts()

none          4472
racism        3609
religion      3404
sexuality     1655
refugee        755
other          463
gender         365
disability       9
Name: multilabel, dtype: int64

In [171]:
# drop tweets with no agreement 
df = df[df['label'].notnull()]
print(len(df))

9845


In [172]:
# map label
df['label'] = df['label'].replace({'normal': NOT_HATE, 'hatespeech': HATE})
# keep only hate and not-hate
df = df[df['label'].isin([HATE, NOT_HATE])]
# we map offensive to 1 only if it is targeted to a group (i.e. multilabel.notnull())
df.loc[df[df['multilabel'].notnull()].index, 'label'] = df.loc[df[df['multilabel'].notnull()].index, 'label'].replace({'offensive': HATE})
df['dataset'] = 'd'

# set multilabel to None if not hatespeech
df.loc[df[df['label'] == NOT_HATE].index, 'multilabel'] = None


In [173]:
df['multilabel'].value_counts()

racism       1076
religion     1031
sexuality     193
refugee        24
other           7
gender          4
Name: multilabel, dtype: int64

In [174]:
df['label'].value_counts()

0    5124
1    2960
Name: label, dtype: int64

In [175]:
# save dataset
df[['dataset', 'text', 'label', 'multilabel']].to_csv(
    './datasets/hateXplain.csv', index=False)


#  Offense Eval (e)
- Alt title: Predicting the Type and Target of Offensive Posts in Social Media
- only training available

In [53]:
df = pd.read_csv('./new_datasets/Predicting the Type and Target of Offensive Posts in Social Media/offenseval-training-v1.tsv', sep='\t')

In [54]:
# clean text
df['text'] = df['tweet'].apply(preprocess)

In [55]:
# consider hatesppech only cases where subtask_c = GRP, i.e. offensive targeted to group
df['label'] = df.apply(lambda x: HATE if x['subtask_a'] == 'OFF' and x['subtask_c'] == 'GRP' else NOT_HATE, axis=1)

df['multilabel'] = None
df['dataset'] = 'e'

In [56]:
df['label'].unique(), df['multilabel'].unique()

(array([0, 1]), array([None], dtype=object))

In [57]:
# save dataset 
df[['dataset', 'text', 'label', 'multilabel']].to_csv(
    './datasets/offense_eval.csv', index=False)


# Automated Hate Speech Detection and the Problem of Offensive Language (f)

  - 0 - hate speech
  - 1 - offensive  language
  - 2 - neither

In [58]:
df = pd.read_csv('./new_datasets/Automated Hate Speech Detection and the Problem of Offensive Language/labeled_data.csv')

In [59]:
# clean text
df['text'] = df[f'tweet'].apply(preprocess)

# we ingore offensive
df = df[df['class'].isin([0,2])]

# map labels
df['label'] = df['class'].replace({2:NOT_HATE, 0:HATE})
df['multilabel'] = None
df['dataset'] = 'f'


In [60]:
len(df)

5593

In [61]:
df['label'].unique(), df['multilabel'].unique()


(array([0, 1]), array([None], dtype=object))

In [62]:
# save dataset
df[['dataset', 'text', 'label', 'multilabel']].to_csv(
    './datasets/automated_hate_speech.csv', index=False)


#  Hateful Symbols or Hateful People? Predictive Features for Hate Speech Detection on Twitter (g)

- HAVE TO SCRAP TWEETS
- overlap with h

In [302]:
df = pd.read_csv('./new_datasets/Hateful Symbols or Hateful People? Predictive Features for Hate Speech Detection on Twitter/NAACL_SRW_2016.csv', header=None)
df.columns = ['id', 'label']


In [303]:
len(df)

16907

In [304]:
df = df.drop_duplicates(subset='id')
len(df)

16849

## Get tweets for (g)

In [305]:
tweet_ids = df['id'].values
len(tweet_ids)

16849

In [306]:
# client = Twarc2(bearer_token=BEARER_TOKEN)

# # we loop because we get a 401 error on some ids
# error_ids = []
# step = 100
# for idx in range(0, len(tweet_ids), step):
#     step_ids = tweet_ids[idx:idx+step]
#     #print(idx, step+idx)

#     try:
#         search_results = client.tweet_lookup(step_ids) 
#         file_name = "./new_datasets/Hateful Symbols or Hateful People? Predictive Features for Hate Speech Detection on Twitter/NAACL_SRW_2016_raw-tweets.json"

#         # Twarc returns all Tweets for the criteria set above, so we page through the results
#         for page in search_results:
#             # The Twitter API v2 returns the Tweet information and the user, media etc.  separately
#             # so we use expansions.flatten to get all the information in a single JSON
#             result = expansions.flatten(page)
#             # We will open the file and append one JSON object per new line
#             with open(file_name, 'a+') as filehandle:
#                 for tweet in result:
#                     filehandle.write('%s\n' % json.dumps(tweet))
#     except:
#         error_ids.append(step_ids)                    


In [307]:
# len(error_ids)

In [308]:
# read raw tweets
tweets = pd.read_json('./new_datasets/Hateful Symbols or Hateful People? Predictive Features for Hate Speech Detection on Twitter/NAACL_SRW_2016_raw-tweets.json', lines=True)
tweets = tweets[['id', 'text']]

In [309]:
len(tweets)

10014

In [310]:
# map text
tweets = tweets.set_index('id')
df = df.set_index('id')

df['text'] = tweets['text']
df = df.reset_index()

In [311]:
df = df[df['text'].notnull()]
len(df)

7133

In [312]:
# read tweets from (h) and remove overlap
df_h = pd.read_csv(
    './new_datasets/Hateful Symbols or Hateful People? Predictive Features for Hate Speech Detection on Twitter/NLP+CSS_2016.csv', sep='\t')
df_h = df_h.index

  df_h = pd.read_csv(


In [313]:
# remove ids present in (h)
df = df[~df['id'].isin(df_h)]
len(df)

4492

In [314]:
# clean text
df['text'] = df['text'].apply(preprocess)

In [315]:
# map labels
df['multilabel'] = df['label']
df['label'] = df['label'].replace({'racism':HATE, 'sexism':HATE, 'none':NOT_HATE})

In [316]:
# fix for not saving properly
df['text'] = df['text'].apply(lambda x: x.replace("\n", " "))

In [317]:
df['label'].unique(), df['multilabel'].unique(), len(df)


(array([1, 0]), array(['racism', 'sexism', 'none'], dtype=object), 4492)

In [318]:
# TODO: problem with saving correctly (at lines 709-710). have to manuall fix
# save dataset
df['dataset'] = 'g'
df[['dataset', 'text', 'label', 'multilabel']].to_csv(
    './datasets/hateful-symbols_or_hateful-people.csv', index=False)


# Are You a Racist or Am I Seeing Things? (h)
- Have to scrap tweets
- overlap with (g)

In [79]:
df = pd.read_csv(
    './new_datasets/Hateful Symbols or Hateful People? Predictive Features for Hate Speech Detection on Twitter/NLP+CSS_2016.csv', sep='\t')


  df = pd.read_csv(


In [80]:
cols = ['TweetID', 'Expert', 'Amateur_0', 'Amateur_1']
df = df[cols]
df = df.reset_index()

df.columns = ['id', 'Expert', 'Amateur_0', 'Amateur_1', 'Amateur_2']

## Getting tweets for (h)

In [81]:
tweet_ids = df['id'].values

In [82]:
# client = Twarc2(bearer_token=BEARER_TOKEN)

# search_results = client.tweet_lookup(tweet_ids) 
# file_name = "./new_datasets/Hateful Symbols or Hateful People? Predictive Features for Hate Speech Detection on Twitter/NLP+CSS_2016_raw-tweets.json"

# # Twarc returns all Tweets for the criteria set above, so we page through the results
# for page in search_results:
#     # The Twitter API v2 returns the Tweet information and the user, media etc.  separately
#     # so we use expansions.flatten to get all the information in a single JSON
#     result = expansions.flatten(page)
#     # We will open the file and append one JSON object per new line
#     with open(file_name, 'a+') as filehandle:
#         for tweet in result:
#             filehandle.write('%s\n' % json.dumps(tweet))
            


In [83]:
# read tweets scrapped
tweets = pd.read_json('./new_datasets/Hateful Symbols or Hateful People? Predictive Features for Hate Speech Detection on Twitter/NLP+CSS_2016_raw-tweets.json', lines=True)
tweets = tweets[['id','text']]

In [84]:
# map text to tweets
df = df.set_index('id')
tweets = tweets.set_index('id')

df['text'] = tweets['text']
df = df.reset_index()

In [85]:
# ignore tweets with no text
df = df[df['text'].notnull()]

In [86]:
# OLD WAY: CONSIDERING ONLY EXPERT
# # we consider the label provided by the Expert
# df['label'] = df['Expert']
# df['label'] = df['label'].replace({'neither':NOT_HATE, 'sexism':HATE,
#                                     'both':HATE, 'racism':HATE}
#                                 )
# # arbitary map "both" to "racism"
# df['multilabel'] = df['Expert'].replace({"both":"racism"})

# NEW Way: at least two coders
def majority_rule_class(col):
    results_class = defaultdict(lambda: 0)
    for annotator in ["Expert", "Amateur_0", "Amateur_1", "Amateur_2"]:
        results_class[col[annotator]] += 1

    majority_class = max(results_class, key=results_class.get)

    # at least two coders agree
    if results_class[majority_class] >= 2:
        return majority_class
    else:
        return None


df['label'] = df.apply(majority_rule_class, axis=1)
df['multilabel'] = df['label']
df['label'] = df['label'].replace({'neither': NOT_HATE, 'sexism': HATE, 'both':HATE, 'racism':HATE, 'link':None})


In [87]:
# clean text
df['text'] = df['text'].apply(preprocess)

In [88]:
# ignore nan
df = df[df['label'].notnull()]
len(df)

4114

In [89]:
df['label'].unique(), df['multilabel'].unique()


(array([0., 1.]), array(['neither', 'sexism', 'both', 'racism'], dtype=object))

In [90]:
# save dataset
df['dataset'] = 'h'
df[['dataset', 'text', 'label', 'multilabel']].to_csv(
    './datasets/are_you_racist_or.csv', index=False)


# When Does a Compliment Become Sexist? Analysis and Classification of Ambivalent Sexism Using Twitter Data (i)
- included in Call me sexist but (b)
- possible different annotations?
- can be found in either dataset='benevolent' or dataset='other'

In [91]:
# df_i = pd.read_csv('/home/dimos/Desktop/phd/hate_speech/new_datasets/When Does a Compliment Become Sexist/hostile_sexist.tsv')

In [92]:
# df_b[df_b['text'].str.contains('These two are revolting')]

# Overview of the Task on Automatic Misogyny Identification at IberEval 2018 (j)

- datasets are password protected

# Multilingual and Multi-Aspect Hate Speech Analysis (English) (k)

In [374]:
df = pd.read_csv('new_datasets/Multilingual and Multi-Aspect Hate Speech Analysis/en_dataset.csv')

In [375]:
len(df)

5647

In [376]:
df['target'].unique()

array(['origin', 'disability', 'gender', 'sexual_orientation', 'other',
       'religion'], dtype=object)

In [377]:
df['group'].unique()

array(['gay', 'special_needs', 'other', 'women', 'left_wing_people',
       'individual', 'immigrants', 'jews', 'muslims', 'refugees',
       'african_descent', 'indian/hindu', 'hispanics', 'asians',
       'christian', 'arabs'], dtype=object)

In [378]:
# map origin to group as it is more clear
df.loc[df[df['target'] == 'origin'].index, 'target'] = df.loc[df[df['target'] == 'origin'].index]['group']


In [379]:
# map labels
def get_label(sentiment):
    if "hate" in sentiment:
        label = "hate"
    elif "normal" == sentiment:
        label = "normal"
    else:
        label = None
    return label

df['label'] = df['sentiment'].apply(get_label)

In [380]:
df = df[df['label'].notnull()]
df['multilabel'] = df['target']

# set multilabel to None if not hatespeech
df.loc[df[df['label'] == "normal"].index, 'multilabel'] = None

In [381]:
# normalise text
df['text'] = df['tweet'].apply(preprocess)

In [382]:
# map labels
df['label']  = df['label'].replace({'normal':NOT_HATE, 'hate':HATE})

df['dataset'] = 'k'

df['label'].value_counts()

1    1278
0     661
Name: label, dtype: int64

In [383]:
df['label'].unique(), df['multilabel'].unique()


(array([1, 0]),
 array(['disability', 'other', None, 'gender', 'individual',
        'sexual_orientation', 'african_descent', 'immigrants', 'gay',
        'special_needs', 'refugees', 'jews', 'hispanics', 'women',
        'left_wing_people', 'indian/hindu', 'muslims', 'religion',
        'asians', 'arabs', 'christian'], dtype=object))

In [384]:
# save dataset
df[['dataset', 'text', 'label', 'multilabel']].to_csv(
    './datasets/multilingual_and_multi_aspect.csv', index=False)


# Exploring Hate Speech Detection in Multimodal Publications (l)

- multimodal both text & image


# hatEval, SemEval-2019 Task 5: Multilingual Detection of Hate Speech Against Immigrants and Women in Twitter (English) (m)

In [347]:
def format_file(text_f, label_f):
    with open(text_f) as f:
        text = f.readlines()
        text = [x.strip('\n') for x in text]

    with open(label_f) as f:
        labels = f.readlines()
        labels = [int(x.strip('\n')) for x in labels]

    data = [(i,z) for i,z in zip(text,labels)]

    data = pd.DataFrame(data, columns=['text', 'label'])
    return data

In [348]:
df_train = format_file('./new_datasets/hatEval semEval-2019/train_text.txt', 
            './new_datasets/hatEval semEval-2019/train_labels.txt')
df_train['split'] = "train"
df_test = format_file('./new_datasets/hatEval semEval-2019/test_text.txt', 
            './new_datasets/hatEval semEval-2019/test_labels.txt')
df_test['split'] = "test"
df_val = format_file('./new_datasets/hatEval semEval-2019/val_text.txt', 
            './new_datasets/hatEval semEval-2019/val_labels.txt')
df_val['split'] = "val"

In [349]:
df = pd.concat([df_train, df_test, df_val])
len(df)

12970

In [351]:
# we have entries with no text. remove them
df = df[df['text'].str.len() > 0]

In [352]:
# clean text
df['text'] = df['text'].apply(preprocess)

# map labels
df['label'] = df['label'].replace({0:NOT_HATE, 1:HATE})
df['multilabel'] = None
df['dataset'] = "m"

# save dataset 
df[['dataset', 'split', 'text', 'label', 'multilabel']].to_csv(
    './datasets/hatEval2019.csv', index=False)

In [353]:
df['label'].unique(), df['multilabel'].unique()


(array([0, 1]), array([None], dtype=object))

# Overview of the HASOC track at FIRE 2019:Hate Speech and Offensive Content Identification in Indo-European Languages (n)


In [108]:
df  = pd.read_csv('new_datasets/Overview of the HASOC track at FIRE 2019:Hate Speech and Offensive Content Identification in Indo-European Languages/english_dataset.tsv', sep='\t')

In [109]:
# clean text
df['text'] = df['text'].apply(preprocess)

In [110]:
# keep only NONE and HATE
df = df[df['task_2'].isin(['NONE', 'HATE'])]
df['label'] = df['task_2'].replace({'NONE':NOT_HATE, 'HATE':HATE})
df['split'] = 'train'
df['dataset'] = 'n'

In [111]:
# read test data
test = pd.read_csv('new_datasets/Overview of the HASOC track at FIRE 2019:Hate Speech and Offensive Content Identification in Indo-European Languages/hasoc2019_en_test-2919.tsv',sep='\t')

In [112]:
# specify test data
test_ids = test['text_id'].values

df.loc[df[df['text_id'].isin(test_ids)].index, 'split'] = 'test'

df['multilabel'] = None


In [113]:
df['label'].unique(), df['multilabel'].unique()


(array([0, 1]), array([None], dtype=object))

In [114]:
# save dataset
df[['dataset', 'split', 'text', 'label', 'multilabel']].to_csv(
    './datasets/hasoc_fire2019.csv', index=False)

# Detecting East Asian Prejudice on Social Media (o)

In [115]:
df = pd.read_csv('./new_datasets/Detecting East Asian Prejudice on Social Media/hs_AsianPrejudice_20kdataset_cleaned_anonymized.tsv', sep='\t')
len(df)

20000

In [116]:
len(df) == df['id'].nunique()

True

In [117]:
#  majority rule
def majority_rule_class(col):
    results_class = defaultdict(lambda : 0)
    for annotator in ['annot1', 'annot2', 'expert']:
        results_class[col[annotator]] += 1

    
    majority_class = max(results_class, key=results_class.get)
    
    # at least two coders agree
    if results_class[majority_class] >= 2:
        return majority_class
    else:
        return None

df['label'] = df.apply(majority_rule_class, axis=1)


In [118]:
# consider only none_of_the_above & entity_directed_hostility
df = df[df['label'].isin(['none_of_the_above', 'entity_directed_hostility'])]

In [119]:
# clean text
df['text'] = df['text'].apply(preprocess)

# map labels
df['label'] = df['label'].replace({'none_of_the_above':NOT_HATE, 'entity_directed_hostility':HATE})


# mutlilabel 
df['multilabel'] = None
df.loc[df[df['label'] == HATE].index, 'multilabel'] = "racism"



In [120]:
df['label'].unique(), df['multilabel'].unique()

(array([0, 1]), array([None, 'racism'], dtype=object))

In [121]:
# save dataset
df['dataset'] = 'o'
df['split'] = None
df[['dataset', 'split', 'text', 'label', 'multilabel']].to_csv(
    './datasets/east_asian_prejudice.csv', index=False)


# Large Scale Crowdsourcing and Characterization of Twitter Abusive Behavior (p)

In [122]:
df = pd.read_csv('new_datasets/Large Scale Crowdsourcing and Characterization of Twitter Abusive BehaviorLarge Scale Crowdsourcing and Characterization of Twitter Abusive Behavior/hatespeechtwitter.tab', '\t')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [123]:
df['maj_label'].unique()

array(['abusive', 'normal', 'hateful', 'spam', nan], dtype=object)

In [124]:
# consider only "hateful" and "normal"
df = df[df['maj_label'].isin(['hateful', 'normal'])]

# take all "hateful" and sample the same size of "normal"
hate = df[df['maj_label'] == 'hateful']
normal = df[df['maj_label'] == 'normal']
normal = normal.sample(n=len(hate),random_state=2)

# group new sets
df = pd.concat([hate,normal])

## Get tweets for (p)

In [125]:
tweet_ids = df['tweet_id'].values
len(tweet_ids)

7270

In [126]:
# client = Twarc2(bearer_token=BEARER_TOKEN)

# search_results = client.tweet_lookup(tweet_ids) 
# file_name = "new_datasets/Large Scale Crowdsourcing and Characterization of Twitter Abusive BehaviorLarge Scale Crowdsourcing and Characterization of Twitter Abusive Behavior/hatespeechtwitter_raw-tweets.json"

# # Twarc returns all Tweets for the criteria set above, so we page through the results
# for page in search_results:
#     # The Twitter API v2 returns the Tweet information and the user, media etc.  separately
#     # so we use expansions.flatten to get all the information in a single JSON
#     result = expansions.flatten(page)
#     # We will open the file and append one JSON object per new line
#     with open(file_name, 'a+') as filehandle:
#         for tweet in result:
#             filehandle.write('%s\n' % json.dumps(tweet))
            


In [127]:
# read tweets
tweets = pd.read_json("new_datasets/Large Scale Crowdsourcing and Characterization of Twitter Abusive BehaviorLarge Scale Crowdsourcing and Characterization of Twitter Abusive Behavior/hatespeechtwitter_raw-tweets.json", lines=True)
# drop tweets scrapped twice
tweets = tweets.drop_duplicates(subset='id')
tweets = tweets[['id', 'text']]

In [128]:
# map text to tweets
df = df.set_index('tweet_id')
tweets = tweets.set_index('id')

df['text'] = tweets['text']
df = df.reset_index()

In [129]:
# drop tweets with no texxt
df = df[df['text'].notnull()]
len(df)

2209

In [130]:
# clean text
df['text'] = df['text'].apply(preprocess)

# map labels
df['label'] = df['maj_label'].replace({'hateful':HATE, 'normal':NOT_HATE})
df['multilabel'] = None
df['split'] = None
df['dataset'] = "p"

In [131]:
df['label'].unique(), df['multilabel'].unique()

(array([1, 0]), array([None], dtype=object))

In [132]:
df[['dataset', 'split', 'text', 'label', 'multilabel']].to_csv(
    './datasets/large_scale_crowdsourcing.csv', index=False)

# Twitter Sentiment Analysis (q)

Formally, given a training sample of tweets and labels, where label '1' denotes the tweet is racist/sexist and label '0' denotes the tweet is not racist/sexist, your objective is to predict the labels on the test dataset.

In [133]:
df = pd.read_csv('./new_datasets/Twitter Sentiment Analysis/train.csv')

In [134]:
df['text'] = df['tweet'].apply(preprocess)
df['label'] = df['label'].replace({0: NOT_HATE, 1:HATE})
df['multilabel'] = None
df['split'] = 'train'
df['dataset'] = 'q'

In [135]:
df['label'].unique(), df['multilabel'].unique()

(array([0, 1]), array([None], dtype=object))

In [136]:
# save dataset
df[['dataset', 'split', 'text', 'label', 'multilabel']].to_csv(
    './datasets/twitter_sentiment_analysis.csv', index=False)