In [206]:
import pandas as pd
import datasets
from collections import defaultdict, Counter
#from twarc import Twarc2, expansions
import configparser
import json
import numpy as np

TODO:
- An introduction
- lang detection -> https://fasttext.cc/docs/en/language-identification.html

In [207]:
# Get Twitter api credentials
config = configparser.RawConfigParser()
config.read("/home/dimos/.twitter_credentials.ini")

BEARER_TOKEN = config['twitter_api']['bearer_token']

CONSUMER_KEY = config['twitter_api']['consumer_key']
CONSUMER_SECRET = config['twitter_api']['consumer_secret']

ACCESS_TOKEN = config['twitter_api']['access_token']
ACCESS_SECRET = config['twitter_api']['access_secret']

In [208]:
# SET LABELS
HATE = 1
NOT_HATE = 0

In [209]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = "@user" if t.startswith("MENTION") else t
        t = "@user" if t.startswith("<user>") else t
        t = "@user" if t.startswith("@USER") else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)



def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]


# Measuring-hate-speech (a)

In [210]:
dataset = datasets.load_dataset(
    'ucberkeley-dlab/measuring-hate-speech', 'binary')
df = dataset['train'].to_pandas()


Using custom data configuration ucberkeley-dlab--measuring-hate-speech-c32713cabe528196
Found cached dataset parquet (/home/dimos/.cache/huggingface/datasets/ucberkeley-dlab___parquet/ucberkeley-dlab--measuring-hate-speech-c32713cabe528196/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 1/1 [00:00<00:00, 23.24it/s]


In [211]:
# check for duplicates
df['comment_id'].nunique() == len(df)

False

In [212]:
# hate_speech_score - continuous hate speech measure, where higher = more hateful and lower = less hateful.
#  > 0.5 is approximately hate speech,
#  < -1 is counter or supportive speech,
#  and -1 to +0.5 is neutral or ambiguous.
def map_label(x):
    if x >= -1 and x <= 0.5:
        label = 999  # neutral/ambiguous
    elif x > 0.5:
        label = HATE  # hate
    elif x < -1:
        label = NOT_HATE  # not hate
    return label


# get label
df['label'] = df['hate_speech_score'].apply(map_label)

# keep only entries from Twitter
df = df[df['platform'] == 2]

# ignore ambiguous
df = df[df['label'].isin([0,1])]


In [213]:
# not all tweets are annotated by the same amount of coders
df.groupby('comment_id')['label'].count()

comment_id
20071    1
20072    3
20073    1
20075    3
20076    3
        ..
40063    2
40065    3
40066    2
40069    2
40070    4
Name: label, Length: 10685, dtype: int64

In [214]:
# we consider tweets with at least 2 coders and where there is no tie (i.e. Hate, Not_hate)
def majority_class(x):
    label_count = Counter(x)
    # sort voting
    label_count = {k: v for k, v in sorted(label_count.items(), key=lambda item: item[1])}
    label_count_sorted = list(label_count.items())

    majority_label = label_count_sorted[0][0]
    # get second label if it exists
    if len(label_count) > 1:
        second_label = label_count_sorted[1][0]
    else:
        second_label = None
    
    # at least two coders agree
    if label_count[majority_label] >= 2:
        # if second label there is no tie
        if second_label and majority_label > second_label:
            return majority_label
        elif not second_label:
            return majority_label
        else:
            return None # there is a tie between major and second
    else:
        return None

# map aggregated labels
df = df.set_index('comment_id')
df['label_aggregated'] = df.groupby('comment_id')['label'].apply(majority_class)
df = df.reset_index()

In [215]:
# remove tweets with no agreement
print(f"Total tweets before: {len(df)}")
df = df[df['label_aggregated'].notnull()]
print(f"Total tweets after: {len(df)}")

Total tweets before: 24073
Total tweets after: 20996


In [216]:
# now aggregated targets

# gender here includes transgender too
targets = ['target_race', 'target_religion', 'target_origin', 'target_gender',
           'target_sexuality', 'target_age', 'target_disability']

df = df.set_index('comment_id')

for t in targets:
    df[f"{t}_aggregated"] = df.groupby('comment_id')[t].apply(majority_class)

df = df.reset_index()

In [217]:
# no need of all annotators now -> keep each tweet only once
df = df.groupby('comment_id').nth(0)

df = df.reset_index()

In [218]:
# consider target only if it is unique (i.e not race & religion)
targets_aggr = [f"{x}_aggregated" for x in targets]

# how many targets in each tweet
df['targets#'] = df[targets_aggr].sum(axis=1)

In [220]:
# clean label
df = df.drop('label', axis=1) # to make sure
df['label'] = df['label_aggregated']

# clean multilabel
# only tweets with 1 target
idx_multilabel = df[df['targets#'] == 1].index

# initialize column
df['multilabel'] = None


def get_target(col):
    # check which targ_aggr is True
    for x in targets_aggr:
        if col[x]:
            return x

# consider only the idx_multilabel            
df.loc[idx_multilabel, 'multilabel'] = df.loc[idx_multilabel].apply(get_target, axis=1)

In [221]:
# clean text
df['text'] = df['text'].apply(preprocess)

In [222]:
# save dataset
df['dataset'] = "a"
df['multilabel'] = None
df[['dataset', 'text', 'label', 'multilabel']].to_csv(
    './datasets/measuring_hate_speech.csv', index=False)

# Call me sexist, but (b)

In [232]:
annotations = pd.read_csv('./new_datasets/Call me sexist, but/sexism_annotations.csv')
data = pd.read_csv('./new_datasets/Call me sexist, but/sexism_data.csv')

In [233]:
data['dataset'].unique()

array(['other', 'callme', 'benevolent', 'scales', 'hostile'], dtype=object)

In [234]:
data['sexist'].value_counts()

False    11822
True      1809
Name: sexist, dtype: int64

In [235]:
# check for duplicates
data['id'].nunique() == len(data)

True

In [236]:
# map classes
data['multilabel'] = data['sexist']
data['multilabel'] = data['multilabel'].replace({True:'sexist', False: None})
data['label'] = data['sexist'].replace({True: HATE, False: NOT_HATE})


In [237]:
# clean text
data['text'] = data['text'].apply(preprocess)


In [238]:
 # save dataset
data['dataset'] = "b"
data[['dataset', 'text', 'label', 'multilabel']].to_csv(
    './datasets/call_me_sexist.csv', index=False)


In [242]:
data['label'].value_counts()


0    11822
1     1809
Name: label, dtype: int64

# Hate Towards the Political Opponent (c)

In [243]:
test = pd.read_csv('./new_datasets/Hate Towards Political Oponent/test.tsv',sep='\t')
test['split'] = 'test'
train = pd.read_csv('./new_datasets/Hate Towards Political Oponent/train.tsv', sep='\t')
train['split'] = 'train'

In [244]:
test['HOF'].unique()

array(['Non-Hateful', 'Hateful'], dtype=object)

In [245]:
df = pd.concat([test,train])

In [246]:
# clean text
df['text'] = df['text'].apply(preprocess)

In [248]:
# map labels
df['label'] = df['HOF'].replace({'Non-Hateful': NOT_HATE, 'Hateful':HATE})
df['multilabel'] = None
df['dataset'] = 'c'

In [249]:
# save dataset
df['dataset'] = "b"
df[['dataset', 'split', 'text', 'label', 'multilabel']].to_csv(
    './datasets/hate_towards_political.csv', index=False)


In [252]:
df['label'].value_counts()

0    2648
1     352
Name: label, dtype: int64

# HateXplain (d)


- target: at least two annotators
- target: instead of random if tie on the majority and not subclasses (e.g. jew & muslim)-> ignore

In [325]:
df = pd.read_json('./new_datasets/HateXplain/dataset.json', orient='index')

In [326]:
# sanity check
df['post_id'].nunique() == len(df)

True

In [327]:
# join tokens to test
df['text'] = df['post_tokens'].apply(lambda x: " ".join(x))

In [328]:
# normalise text
df['text'] = df['text'].apply(preprocess)

In [361]:
def majority_rule_class(x):
    label_count = defaultdict(lambda : 0)
    for annotator in x:
        annon_label = annotator['label']
        label_count[annon_label] += 1

    # sort voting
    label_count = {k: v for k, v in sorted(label_count.items(), key=lambda item: item[1])}
    label_count_sorted = list(label_count.items())

    majority_label = label_count_sorted[0][0]
    # get second label if it exists
    if len(label_count) > 1:
        second_label = label_count_sorted[1][0]
    else:
        second_label = None
    
    # at least two coders agree
    if label_count[majority_label] >= 2:
        # if second label there is no tie
        if second_label and majority_label > second_label:
            return majority_label
        elif not second_label:
            return majority_label
        else:
            return None # there is a tie between major and second
    else:
        return None 


# combine classes (i.e christian with jeweish)
def majority_rule_target(x):
    target_count = defaultdict(lambda x: 0)
    
    for annotator in x:
        annon_targets = annotator['target']
        annon_targets = Counter(annon_targets)
        
        target_count.update(annon_targets)
    return target_count
    # sort voting
    target_count = {k: v for k, v in sorted(target_count.items(), key=lambda item: item[1])}
    target_count_sorted = list(target_count.items())

    majority_label = target_count_sorted[0][0]
    # get second label if it exists
    if len(target_count) > 1:
        second_label = target_count_sorted[1][0]
    else:
        second_label = None

    # at least two coders agree
    if target_count[majority_label] >= 2:
        # if second label there is no tie or there is no second label
        if (second_label and majority_label > second_label) or (not second_label):
            return majority_label
        else:
            return None # there is a tie between major and second
    else:
        return None


In [364]:
df['multilabel']

1179055004553900032_twitter                                  {'None': 1}
1179063826874032128_twitter                                  {'None': 1}
13851720_gab                                               {'Jewish': 1}
1089312238102609921_twitter                    {'None': 1, 'African': 1}
1159278048069464065_twitter                                  {'None': 1}
                                                 ...                    
9872639_gab                                      {'Other': 1, 'None': 1}
9878150_gab                           {'Arab': 1, 'Islam': 1, 'None': 1}
9976594_gab                    {'African': 1, 'Caucasian': 1, 'None': 1}
9981407_gab                                                {'Jewish': 1}
9988840_gab                                      {'None': 1, 'Women': 1}
Name: multilabel, Length: 9845, dtype: object

In [363]:
# get labels/mutlilabels
df['label'] = df['annotators'].apply(majority_rule_class)
df['multilabel'] = df['annotators'].apply(majority_rule_target)
# ensure consistency
df['multilabel'] = df['multilabel'].replace({'None':None})

In [336]:
# drop tweets with no agreement 
df = df[df['label'].notnull()]
print(len(df))

9845


In [337]:
# map label
df['label'] = df['label'].replace({'normal': NOT_HATE, 'hatespeech': HATE})
# we map offensive to 1 only if it is targeted to a group (i.e. multilabel.notnull())
df.loc[df[df['multilabel'].notnull()].index, 'label'] = df.loc[df[df['multilabel'].notnull()].index, 'label'].replace({'offensive': HATE})
df['dataset'] = 'd'

# set multilabel to None if not hatespeech
df.loc[df[df['label'] == NOT_HATE].index, 'multilabel'] = None


In [339]:
df['multilabel'].value_counts()

Series([], Name: multilabel, dtype: int64)

In [338]:
df['label'].value_counts()

0            5124
1            2960
offensive    1761
Name: label, dtype: int64

In [90]:
# save dataset
df[['dataset', 'text', 'label', 'multilabel']].to_csv(
    './datasets/hateXplain.csv', index=False)


#  Offense Eval (e)
- Alt title: Predicting the Type and Target of Offensive Posts in Social Media
- only training available

In [91]:
df = pd.read_csv('./new_datasets/Predicting the Type and Target of Offensive Posts in Social Media/offenseval-training-v1.tsv', sep='\t')

In [92]:
# clean text
df['text'] = df['tweet'].apply(preprocess)

In [93]:
# consider hatesppech only cases where subtask_c = GRP, i.e. offensive targeted to group
df['label'] = df.apply(lambda x: HATE if x['subtask_a'] == 'OFF' and x['subtask_c'] == 'GRP' else NOT_HATE, axis=1)

df['multilabel'] = None
df['dataset'] = 'e'

In [94]:
# save dataset 
df[['dataset', 'text', 'label', 'multilabel']].to_csv(
    './datasets/offense_eval.csv', index=False)


# Automated Hate Speech Detection and the Problem of Offensive Language (f)

  - 0 - hate speech
  - 1 - offensive  language
  - 2 - neither

In [95]:
df = pd.read_csv('./new_datasets/Automated Hate Speech Detection and the Problem of Offensive Language/labeled_data.csv')

In [96]:
# clean text
df['text'] = df[f'tweet'].apply(preprocess)

# map labels
# we map offensive to hate too.
df['label'] = df['class'].replace({2:NOT_HATE, 1:HATE, 0:HATE})
df['multilabel'] = None
df['dataset'] = 'f'

In [97]:
# save dataset
df[['dataset', 'text', 'label', 'multilabel']].to_csv(
    './datasets/automated_hate_speech.csv', index=False)


#  Hateful Symbols or Hateful People? Predictive Features for Hate Speech Detection on Twitter (g)

- HAVE TO SCRAP TWEETS
- overlap with h

In [98]:
df = pd.read_csv('./new_datasets/Hateful Symbols or Hateful People? Predictive Features for Hate Speech Detection on Twitter/NAACL_SRW_2016.csv', header=None)
df.columns = ['id', 'label']


In [99]:
len(df)

16907

In [100]:
df = df.drop_duplicates(subset='id')
len(df)

16849

## Get tweets for (g)

In [101]:
tweet_ids = df['id'].values
len(tweet_ids)

16849

In [102]:
# client = Twarc2(bearer_token=BEARER_TOKEN)

# # we loop because we get a 401 error on some ids
# error_ids = []
# step = 100
# for idx in range(0, len(tweet_ids), step):
#     step_ids = tweet_ids[idx:idx+step]
#     #print(idx, step+idx)

#     try:
#         search_results = client.tweet_lookup(step_ids) 
#         file_name = "./new_datasets/Hateful Symbols or Hateful People? Predictive Features for Hate Speech Detection on Twitter/NAACL_SRW_2016_raw-tweets.json"

#         # Twarc returns all Tweets for the criteria set above, so we page through the results
#         for page in search_results:
#             # The Twitter API v2 returns the Tweet information and the user, media etc.  separately
#             # so we use expansions.flatten to get all the information in a single JSON
#             result = expansions.flatten(page)
#             # We will open the file and append one JSON object per new line
#             with open(file_name, 'a+') as filehandle:
#                 for tweet in result:
#                     filehandle.write('%s\n' % json.dumps(tweet))
#     except:
#         error_ids.append(step_ids)                    


In [103]:
# len(error_ids)

In [104]:
# read raw tweets
tweets = pd.read_json('/home/dimos/Desktop/phd/hate_speech/new_datasets/Hateful Symbols or Hateful People? Predictive Features for Hate Speech Detection on Twitter/NAACL_SRW_2016_raw-tweets.json', lines=True)
tweets = tweets[['id', 'text']]

In [105]:
len(tweets)

10014

In [106]:
# map text
tweets = tweets.set_index('id')
df = df.set_index('id')

df['text'] = tweets['text']
df = df.reset_index()

In [107]:
df = df[df['text'].notnull()]

In [108]:
# read tweets from (h) and remove overlap
df_h = pd.read_csv(
    './new_datasets/Hateful Symbols or Hateful People? Predictive Features for Hate Speech Detection on Twitter/NLP+CSS_2016.csv', sep='\t')
df_h = df_h.index

  df_h = pd.read_csv(


In [109]:
# remove ids present in (h)
df = df[~df['id'].isin(df_h)]
len(df)

4492

In [110]:
# clean text
df['text'] = df['text'].apply(preprocess)

In [111]:
# map labels
df['multilabel'] = df['label']
df['label'] = df['label'].replace({'racism':HATE, 'sexism':HATE, 'none':NOT_HATE})

In [112]:
df['label'].unique()

array([2, 1])

In [113]:
# save dataset
df['dataset'] = 'g'
df[['dataset', 'text', 'label', 'multilabel']].to_csv(
    './datasets/hateful-symbols_or_hateful-people.csv', index=False)


# Are You a Racist or Am I Seeing Things? (h)
- Have to scrap tweets
- overlap with (g)

In [114]:
df = pd.read_csv(
    './new_datasets/Hateful Symbols or Hateful People? Predictive Features for Hate Speech Detection on Twitter/NLP+CSS_2016.csv', sep='\t')


  df = pd.read_csv(


In [115]:
cols = ['TweetID', 'Expert', 'Amateur_0', 'Amateur_1']
df = df[cols]
df = df.reset_index()

df.columns = ['id', 'Expert', 'Amateur_0', 'Amateur_1', 'Amateur_2']

## Getting tweets for (h)

In [116]:
tweet_ids = df['id'].values

In [117]:
# client = Twarc2(bearer_token=BEARER_TOKEN)

# search_results = client.tweet_lookup(tweet_ids) 
# file_name = "./new_datasets/Hateful Symbols or Hateful People? Predictive Features for Hate Speech Detection on Twitter/NLP+CSS_2016_raw-tweets.json"

# # Twarc returns all Tweets for the criteria set above, so we page through the results
# for page in search_results:
#     # The Twitter API v2 returns the Tweet information and the user, media etc.  separately
#     # so we use expansions.flatten to get all the information in a single JSON
#     result = expansions.flatten(page)
#     # We will open the file and append one JSON object per new line
#     with open(file_name, 'a+') as filehandle:
#         for tweet in result:
#             filehandle.write('%s\n' % json.dumps(tweet))
            


In [118]:
# read tweets scrapped
tweets = pd.read_json('/home/dimos/Desktop/phd/hate_speech/new_datasets/Hateful Symbols or Hateful People? Predictive Features for Hate Speech Detection on Twitter/NLP+CSS_2016_raw-tweets.json', lines=True)
tweets = tweets[['id','text']]

In [119]:
# map text to tweets
df = df.set_index('id')
tweets = tweets.set_index('id')

df['text'] = tweets['text']
df = df.reset_index()

In [120]:
# ignore tweets with no text
df = df[df['text'].notnull()]

In [121]:
# we consider the label provided by the Expert
df['label'] = df['Expert']
df['label'] = df['label'].replace({'neither':NOT_HATE, 'sexism':HATE,
                                    'both':HATE, 'racism':HATE}
                                )
# arbitary map "both" to "racism"
df['multilabel'] = df['Expert'].replace({"both":"racism"})
df['dataset'] = 'h'

In [122]:
# clean text
df['text'] = df['text'].apply(preprocess)

In [123]:
# save dataset
df[['dataset', 'text', 'label', 'multilabel']].to_csv(
    './datasets/are_you_racist_or.csv', index=False)


# When Does a Compliment Become Sexist? Analysis and Classification of Ambivalent Sexism Using Twitter Data (i)
- included in Call me sexist but (b)
- possible different annotations?
- can be found in either dataset='benevolent' or dataset='other'

In [126]:
# df_i = pd.read_csv('/home/dimos/Desktop/phd/hate_speech/new_datasets/When Does a Compliment Become Sexist/hostile_sexist.tsv')

In [127]:
# df_b[df_b['text'].str.contains('These two are revolting')]

# Overview of the Task on Automatic Misogyny Identification at IberEval 2018 (j)

- datasets are password protected

# Multilingual and Multi-Aspect Hate Speech Analysis (English) (k)

In [128]:
df = pd.read_csv('new_datasets/Multilingual and Multi-Aspect Hate Speech Analysis/en_dataset.csv')

In [129]:
len(df)

5647

In [130]:
df['target'].unique()

array(['origin', 'disability', 'gender', 'sexual_orientation', 'other',
       'religion'], dtype=object)

In [131]:
df['group'].unique()

array(['gay', 'special_needs', 'other', 'women', 'left_wing_people',
       'individual', 'immigrants', 'jews', 'muslims', 'refugees',
       'african_descent', 'indian/hindu', 'hispanics', 'asians',
       'christian', 'arabs'], dtype=object)

In [132]:
# map labels
def get_label(sentiment):
    if "hate" in sentiment:
        label = "hate"
    elif "normal" == sentiment:
        label = "normal"
    else:
        label = None
    return label

df['label'] = df['sentiment'].apply(get_label)

In [133]:
df = df[df['label'].notnull()]
df['multilabel'] = df['target']

# set multilabel to None if not hatespeech
df.loc[df[df['label'] == "normal"].index, 'multilabel'] = None

In [134]:
# normalise text
df['text'] = df['tweet'].apply(preprocess)

In [135]:
# map label to 0,1,-1
df['label']  = df['label'].replace({'normal':NOT_HATE, 'hate':HATE})

df['dataset'] = 'k'

df['label'].value_counts()

2    1278
1     661
Name: label, dtype: int64

In [136]:
# save dataset
df[['dataset', 'text', 'label', 'multilabel']].to_csv(
    './datasets/multilingual_and_multi_aspect.csv', index=False)


# Exploring Hate Speech Detection in Multimodal Publications (l)

- multimodal both text & image


# hatEval, SemEval-2019 Task 5: Multilingual Detection of Hate Speech Against Immigrants and Women in Twitter (English) (m)

In [137]:
def format_file(text_f, label_f):
    with open(text_f) as f:
        text = f.readlines()
        text = [x.strip('\n') for x in text]

    with open(label_f) as f:
        labels = f.readlines()
        labels = [int(x.strip('\n')) for x in labels]

    data = [(i,z) for i,z in zip(text,labels)]

    data = pd.DataFrame(data, columns=['text', 'label'])
    return data

In [138]:
df_train = format_file('./new_datasets/hatEval semEval-2019/train_text.txt', 
            './new_datasets/hatEval semEval-2019/train_labels.txt')
df_train['split'] = "train"
df_test = format_file('./new_datasets/hatEval semEval-2019/test_text.txt', 
            './new_datasets/hatEval semEval-2019/test_labels.txt')
df_test['split'] = "test"
df_val = format_file('./new_datasets/hatEval semEval-2019/val_text.txt', 
            './new_datasets/hatEval semEval-2019/val_labels.txt')
df_val['split'] = "val"

In [139]:
df = pd.concat([df_train, df_test, df_val])

In [140]:
# clean text
df['text'] = df['text'].apply(preprocess)

# map labels
df['label'] = df['label'].replace({0:NOT_HATE, 1:HATE})
df['multilabel'] = None
df['dataset'] = "m"

# save dataset 
df[['dataset', 'split', 'text', 'label', 'multilabel']].to_csv(
    './datasets/hatEval2019.csv', index=False)

# Overview of the HASOC track at FIRE 2019:Hate Speech and Offensive Content Identification in Indo-European Languages (n)


In [141]:
df  = pd.read_csv('new_datasets/Overview of the HASOC track at FIRE 2019:Hate Speech and Offensive Content Identification in Indo-European Languages/english_dataset.tsv', sep='\t')

In [142]:
# clean text
df['text'] = df['text'].apply(preprocess)

In [143]:
# keep only NONE and HATE
df = df[df['task_2'].isin(['NONE', 'HATE'])]
df['label'] = df['task_2'].replace({'NONE':NOT_HATE, 'HATE':HATE})
df['split'] = 'train'
df['dataset'] = 'n'

In [144]:
# read test data
test = pd.read_csv('new_datasets/Overview of the HASOC track at FIRE 2019:Hate Speech and Offensive Content Identification in Indo-European Languages/hasoc2019_en_test-2919.tsv',sep='\t')

In [145]:
# specify test data
test_ids = test['text_id'].values

df.loc[df[df['text_id'].isin(test_ids)].index, 'split'] = 'test'

In [146]:
df['multilabel'] = None

# save dataset
df[['dataset', 'split', 'text', 'label', 'multilabel']].to_csv(
    './datasets/hasoc_fire2019.csv', index=False)

# Detecting East Asian Prejudice on Social Media (o)

In [147]:
df = pd.read_csv('./new_datasets/Detecting East Asian Prejudice on Social Media/hs_AsianPrejudice_20kdataset_cleaned_anonymized.tsv', sep='\t')

In [148]:
#  majority rule
def majority_rule_class(col):
    results_class = defaultdict(lambda : 0)
    for annotator in ['annot1', 'annot2', 'expert']:
        results_class[col[annotator]] += 1

    majority_class = max(results_class, key=results_class.get)

    return majority_class

df['label'] = df.apply(majority_rule_class, axis=1)

In [149]:
# consider only none_of_the_above & entity_directed_hostility
df = df[df['label'].isin(['none_of_the_above', 'entity_directed_hostility'])]

In [150]:
# clean text
df['text'] = df['text'].apply(preprocess)

# map labels
df['label'] = df['label'].replace({'none_of_the_above':NOT_HATE, 'entity_directed_hostility':HATE})

# save dataset 
df['dataset'] = 'o'
df['split'] = None
df['multilabel'] = None
df[['dataset', 'split', 'text', 'label', 'multilabel']].to_csv(
    './datasets/east_asian_prejudice.csv', index=False)

# Large Scale Crowdsourcing and Characterization of Twitter Abusive Behavior (p)

In [151]:
df = pd.read_csv('new_datasets/Large Scale Crowdsourcing and Characterization of Twitter Abusive BehaviorLarge Scale Crowdsourcing and Characterization of Twitter Abusive Behavior/hatespeechtwitter.tab', '\t')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [152]:
df['maj_label'].unique()

array(['abusive', 'normal', 'hateful', 'spam', nan], dtype=object)

In [153]:
# consider only "hateful" and "normal"
df = df[df['maj_label'].isin(['hateful', 'normal'])]

# take all "hateful" and sample the same size of "normal"
hate = df[df['maj_label'] == 'hateful']
normal = df[df['maj_label'] == 'normal']
normal = normal.sample(n=len(hate),random_state=2)

# group new sets
df = pd.concat([hate,normal])

## Get tweets for (p)

In [154]:
tweet_ids = df['tweet_id'].values
len(tweet_ids)

7270

In [155]:
# client = Twarc2(bearer_token=BEARER_TOKEN)

# search_results = client.tweet_lookup(tweet_ids) 
# file_name = "new_datasets/Large Scale Crowdsourcing and Characterization of Twitter Abusive BehaviorLarge Scale Crowdsourcing and Characterization of Twitter Abusive Behavior/hatespeechtwitter_raw-tweets.json"

# # Twarc returns all Tweets for the criteria set above, so we page through the results
# for page in search_results:
#     # The Twitter API v2 returns the Tweet information and the user, media etc.  separately
#     # so we use expansions.flatten to get all the information in a single JSON
#     result = expansions.flatten(page)
#     # We will open the file and append one JSON object per new line
#     with open(file_name, 'a+') as filehandle:
#         for tweet in result:
#             filehandle.write('%s\n' % json.dumps(tweet))
            


In [156]:
# read tweets
tweets = pd.read_json("new_datasets/Large Scale Crowdsourcing and Characterization of Twitter Abusive BehaviorLarge Scale Crowdsourcing and Characterization of Twitter Abusive Behavior/hatespeechtwitter_raw-tweets.json", lines=True)
# drop tweets scrapped twice
tweets = tweets.drop_duplicates(subset='id')
tweets = tweets[['id', 'text']]

In [157]:
# map text to tweets
df = df.set_index('tweet_id')
tweets = tweets.set_index('id')

df['text'] = tweets['text']
df = df.reset_index()

In [158]:
# drop tweets with no texxt
df = df[df['text'].notnull()]
len(df)

2209

In [159]:
# clean text
df['text'] = df['text'].apply(preprocess)

# map labels
df['label'] = df['maj_label'].replace({'hateful':HATE, 'normal':NOT_HATE})
df['multilabel'] = None
df['split'] = None
df['dataset'] = "p"

In [160]:
df['label'].value_counts()

1    1278
2     931
Name: label, dtype: int64

In [161]:
df[['dataset', 'split', 'text', 'label', 'multilabel']].to_csv(
    './datasets/large_scale_crowdsourcing.csv', index=False)

# Twitter Sentiment Analysis (q)

Formally, given a training sample of tweets and labels, where label '1' denotes the tweet is racist/sexist and label '0' denotes the tweet is not racist/sexist, your objective is to predict the labels on the test dataset.

In [162]:
df = pd.read_csv('./new_datasets/Twitter Sentiment Analysis/train.csv')

In [163]:
df['text'] = df['tweet'].apply(preprocess)
df['label'] = df['label'].replace({0: NOT_HATE, 1:HATE})
df['multilabel'] = None
df['split'] = 'train'
df['dataset'] = 'q'

In [164]:
# save dataset
df[['dataset', 'split', 'text', 'label', 'multilabel']].to_csv(
    './datasets/twitter_sentiment_analysis.csv', index=False)