In [1]:
# %pip install scrapy
# %pip install html_text
# %pip install Selector
# %pip install scikit-learn

In [2]:
import pandas as pd
import numpy as np

# Import ordinal encoder
from sklearn.preprocessing import OrdinalEncoder

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Using the collection of English-language hate speech datasets compiled by <a href="https://github.com/leondz/hatespeechdata/tree/master?tab=readme-ov-file#English-header">Leondz</a>, I selected several that piqued my interest.

# Hate Speech / Offensive Speech in the 2020 US Elections

The first dataset is the <a href="https://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/stance-hof/">Hate Speech / Offensive Speech in the 2020 US Elections dataset</a>, authored by Lara Grimminger and Roman Klinger.

In [4]:
us_elections_2020_hate_speech_train = pd.read_csv("/content/drive/MyDrive/Online MSDS/MOD C2/Political Polarization/data/hate_speech_2020_us_elections_train.tsv", sep="\t")
us_elections_2020_hate_speech_test = pd.read_csv("/content/drive/MyDrive/Online MSDS/MOD C2/Political Polarization/data/hate_speech_2020_us_elections_test.tsv", sep="\t")
us_elections_2020_hate_speech = pd.concat([us_elections_2020_hate_speech_train, us_elections_2020_hate_speech_test])
us_elections_2020_hate_speech

Unnamed: 0,text,Trump,Biden,West,HOF
0,@SukiRavan @ProgressPotato @MarkZuckerb0rg @JS...,Neither,Favor,Neither,Non-Hateful
1,@Newsweek Are you freaking crazy????[NEWLINE]I...,Neither,Favor,Neither,Non-Hateful
2,Undecided voters (and MAGATs alike);[NEWLINE]I...,Against,Neutral mentions,Neither,Non-Hateful
3,@cheaterwins @Hungry_For_More @DAYSORSHAY So a...,Favor,Neutral mentions,Neither,Non-Hateful
4,@CNN Nancy Pelosi and the Dems wont do a deal ...,Neutral mentions,Neither,Neither,Non-Hateful
...,...,...,...,...,...
595,@JoeBiden Stay healthy Mr Vice President! You ...,Neither,Favor,Neither,Non-Hateful
596,@TheRock @JoeBiden @KamalaHarris Bye bye Rock!...,Favor,Against,Neither,Non-Hateful
597,Countless and effortless. [NEWLINE][NEWLINE]Th...,Favor,Neither,Neither,Non-Hateful
598,The only thing Trump ran well was his business...,Against,Favor,Neither,Non-Hateful


In [5]:
# Use a loop to view the number of unique entries
for column in us_elections_2020_hate_speech.columns:
    print(f"Unique entries in {column}: {us_elections_2020_hate_speech_train[column].nunique()}")

Unique entries in text: 2400
Unique entries in Trump: 5
Unique entries in Biden: 5
Unique entries in West: 4
Unique entries in HOF: 2


In [6]:
# View the unique values in the "Trump" and "Biden" columns
us_elections_2020_hate_speech["Trump"].unique()

array(['Neither', 'Against', 'Favor', 'Neutral mentions', 'Mixed'],
      dtype=object)

In [7]:
us_elections_2020_hate_speech["Biden"].unique()

array(['Favor', 'Neutral mentions', 'Neither', 'Against', 'Mixed'],
      dtype=object)

In [8]:
# View the unique values in the "West" column
us_elections_2020_hate_speech["West"].unique()

array(['Neither', 'Favor', 'Neutral mentions', 'Against'], dtype=object)

In [9]:
# Assign integer values for the "Trump", "Biden", and "West" columns where "Neither": 0, "Against": 1, "Neutral mentions": 2, "Mixed": 3, "Favor": 4
us_elections_2020_hate_speech["Trump"] = us_elections_2020_hate_speech["Trump"].replace({"Neither": 0, "Against": 1, "Neutral mentions": 2, "Mixed": 3, "Favor": 4})
us_elections_2020_hate_speech["Biden"] = us_elections_2020_hate_speech["Biden"].replace({"Neither": 0, "Against": 1, "Neutral mentions": 2, "Mixed": 3, "Favor": 4})
us_elections_2020_hate_speech["West"] = us_elections_2020_hate_speech["West"].replace({"Neither": 0, "Against": 1, "Neutral mentions": 2, "Favor": 3})

  us_elections_2020_hate_speech["Trump"] = us_elections_2020_hate_speech["Trump"].replace({"Neither": 0, "Against": 1, "Neutral mentions": 2, "Mixed": 3, "Favor": 4})
  us_elections_2020_hate_speech["Biden"] = us_elections_2020_hate_speech["Biden"].replace({"Neither": 0, "Against": 1, "Neutral mentions": 2, "Mixed": 3, "Favor": 4})
  us_elections_2020_hate_speech["West"] = us_elections_2020_hate_speech["West"].replace({"Neither": 0, "Against": 1, "Neutral mentions": 2, "Favor": 3})


In [10]:
us_elections_2020_hate_speech.head()

Unnamed: 0,text,Trump,Biden,West,HOF
0,@SukiRavan @ProgressPotato @MarkZuckerb0rg @JS...,0,4,0,Non-Hateful
1,@Newsweek Are you freaking crazy????[NEWLINE]I...,0,4,0,Non-Hateful
2,Undecided voters (and MAGATs alike);[NEWLINE]I...,1,2,0,Non-Hateful
3,@cheaterwins @Hungry_For_More @DAYSORSHAY So a...,4,2,0,Non-Hateful
4,@CNN Nancy Pelosi and the Dems wont do a deal ...,2,0,0,Non-Hateful


In [11]:
# Use ordinal encoder on the Trump, Biden, and West columns to properly format the data
ordinal_encoder = OrdinalEncoder()
us_elections_2020_hate_speech[["Trump", "Biden", "West"]] = ordinal_encoder.fit_transform(us_elections_2020_hate_speech[["Trump", "Biden", "West"]])
us_elections_2020_hate_speech

Unnamed: 0,text,Trump,Biden,West,HOF
0,@SukiRavan @ProgressPotato @MarkZuckerb0rg @JS...,0.0,4.0,0.0,Non-Hateful
1,@Newsweek Are you freaking crazy????[NEWLINE]I...,0.0,4.0,0.0,Non-Hateful
2,Undecided voters (and MAGATs alike);[NEWLINE]I...,1.0,2.0,0.0,Non-Hateful
3,@cheaterwins @Hungry_For_More @DAYSORSHAY So a...,4.0,2.0,0.0,Non-Hateful
4,@CNN Nancy Pelosi and the Dems wont do a deal ...,2.0,0.0,0.0,Non-Hateful
...,...,...,...,...,...
595,@JoeBiden Stay healthy Mr Vice President! You ...,0.0,4.0,0.0,Non-Hateful
596,@TheRock @JoeBiden @KamalaHarris Bye bye Rock!...,4.0,1.0,0.0,Non-Hateful
597,Countless and effortless. [NEWLINE][NEWLINE]Th...,4.0,0.0,0.0,Non-Hateful
598,The only thing Trump ran well was his business...,1.0,4.0,0.0,Non-Hateful


In [12]:
us_elections_2020_hate_speech["Trump"].unique()

array([0., 1., 4., 2., 3.])

In [13]:
us_elections_2020_hate_speech["Biden"].unique()

array([4., 2., 0., 1., 3.])

In [14]:
us_elections_2020_hate_speech["West"].unique()

array([0., 3., 2., 1.])

In [15]:
# Use a for loop to loop through all nunique values
for column in us_elections_2020_hate_speech.columns:
    print(f"Unique entries in {column}: {us_elections_2020_hate_speech[column].nunique()}")

Unique entries in text: 2999
Unique entries in Trump: 5
Unique entries in Biden: 5
Unique entries in West: 4
Unique entries in HOF: 2


In [16]:
# Use a for loop to loop through all NaN values
for column in us_elections_2020_hate_speech.columns:
    print(f"NaN values in {column}: {us_elections_2020_hate_speech[column].isna().sum()}")

NaN values in text: 0
NaN values in Trump: 0
NaN values in Biden: 0
NaN values in West: 0
NaN values in HOF: 0


In [17]:
# Output the dataset as us_elections_2020_hate_speech_train_cleaned.csv
us_elections_2020_hate_speech.to_csv("/content/drive/MyDrive/Online MSDS/MOD C2/Political Polarization/data/us_elections_2020_hate_speech_cleaned.csv", index=False)

# Dynamically Generated Hate Speech Dataset

The following is the <a href="https://github.com/bvidgen/Dynamically-Generated-Hate-Speech-Dataset">Dynamically Generated Hate Speech Dataset</a> from Vidgen et al. (2021) available on <a href="https://arxiv.org/abs/2012.15761">arXIV</a>.

In [18]:
dynamically_generated_hate_speech_dataset = pd.read_csv("https://github.com/bvidgen/Dynamically-Generated-Hate-Speech-Dataset/raw/refs/heads/main/Dynamically%20Generated%20Hate%20Dataset%20v0.2.3.csv")
dynamically_generated_hate_speech_dataset.head()

Unnamed: 0.1,Unnamed: 0,acl.id,X1,text,label,type,target,level,split,round.base,annotator,round,acl.id.matched
0,1,acl1,1,dalits are lowlives,hate,notgiven,notgiven,original,train,1,GNZuCtwed3,1,
1,2,acl10,2,gay people are a burden to our society,hate,notgiven,notgiven,original,train,1,GNZuCtwed3,1,
2,3,acl100,3,Arabs are not welcome,hate,notgiven,notgiven,original,train,1,vDe7GN0NrL,1,
3,4,acl1000,4,I'm not saying we should actually eliminate he...,hate,notgiven,notgiven,original,train,1,oemYWm1Tjg,1,
4,5,acl10000,5,bananas are for black people,hate,notgiven,notgiven,original,test,1,QiOKkCi7F8,1,


In [19]:
# Remove the "acl" text from acl.id
dynamically_generated_hate_speech_dataset['acl.id'] = dynamically_generated_hate_speech_dataset['acl.id'].str.replace('acl', '')
dynamically_generated_hate_speech_dataset.head()

Unnamed: 0.1,Unnamed: 0,acl.id,X1,text,label,type,target,level,split,round.base,annotator,round,acl.id.matched
0,1,1,1,dalits are lowlives,hate,notgiven,notgiven,original,train,1,GNZuCtwed3,1,
1,2,10,2,gay people are a burden to our society,hate,notgiven,notgiven,original,train,1,GNZuCtwed3,1,
2,3,100,3,Arabs are not welcome,hate,notgiven,notgiven,original,train,1,vDe7GN0NrL,1,
3,4,1000,4,I'm not saying we should actually eliminate he...,hate,notgiven,notgiven,original,train,1,oemYWm1Tjg,1,
4,5,10000,5,bananas are for black people,hate,notgiven,notgiven,original,test,1,QiOKkCi7F8,1,


In [20]:
dynamically_generated_hate_speech_dataset[dynamically_generated_hate_speech_dataset["round.base"] == 1].head()

Unnamed: 0.1,Unnamed: 0,acl.id,X1,text,label,type,target,level,split,round.base,annotator,round,acl.id.matched
0,1,1,1,dalits are lowlives,hate,notgiven,notgiven,original,train,1,GNZuCtwed3,1,
1,2,10,2,gay people are a burden to our society,hate,notgiven,notgiven,original,train,1,GNZuCtwed3,1,
2,3,100,3,Arabs are not welcome,hate,notgiven,notgiven,original,train,1,vDe7GN0NrL,1,
3,4,1000,4,I'm not saying we should actually eliminate he...,hate,notgiven,notgiven,original,train,1,oemYWm1Tjg,1,
4,5,10000,5,bananas are for black people,hate,notgiven,notgiven,original,test,1,QiOKkCi7F8,1,


In [21]:
dynamically_generated_hate_speech_dataset[dynamically_generated_hate_speech_dataset["round.base"] == 2].head()

Unnamed: 0.1,Unnamed: 0,acl.id,X1,text,label,type,target,level,split,round.base,annotator,round,acl.id.matched
1255,1290,11158,1290,There are too many costas in this country,nothate,none,none,perturbation,train,2,GNZuCtwed3,2b,acl20309
1256,1291,11159,1291,There is way too many m0ng0l0id kids at my sch...,hate,derogation,dis,original,train,2,TrRF46JWfP,2a,acl19858
1258,1293,11160,1293,I called my brother a twat and my mom got so m...,nothate,none,none,perturbation,train,2,TrRF46JWfP,2b,acl19997
1259,1294,11161,1294,the worst of them are the taaaaaaaaaards becau...,hate,derogation,dis,original,train,2,TrRF46JWfP,2a,acl11162
1260,1295,11162,1295,the worst of them are the viruses because they...,nothate,none,none,perturbation,train,2,LqLKTtrOmx,2b,acl11161


In [22]:
# See the number of annotators
dynamically_generated_hate_speech_dataset["annotator"].nunique()

20

In [23]:
# Arbitrarily sort the annotators and change the annotator value to a permanent integer value starting from 1
dynamically_generated_hate_speech_dataset["annotator"] = dynamically_generated_hate_speech_dataset["annotator"].astype("category").cat.codes + 1
dynamically_generated_hate_speech_dataset["annotator"].unique()

array([ 3, 18, 16,  7,  6,  1,  8,  9, 17,  2, 13, 19, 20, 10, 14, 11,  4,
       12,  5, 15], dtype=int8)

In [24]:
dynamically_generated_hate_speech_dataset["annotator"].nunique()

20

In [25]:
dynamically_generated_hate_speech_dataset["level"].unique()

array(['original', 'perturbation'], dtype=object)

In [26]:
dynamically_generated_hate_speech_dataset["type"].unique()

array(['notgiven', 'none', 'derogation', 'animosity', 'dehumanization',
       'threatening', 'support'], dtype=object)

In [27]:
# Replace all values of type "notgiven" with "unclear"
dynamically_generated_hate_speech_dataset["type"] = dynamically_generated_hate_speech_dataset["type"].replace("notgiven", "unclear")
dynamically_generated_hate_speech_dataset["type"].unique()

array(['unclear', 'none', 'derogation', 'animosity', 'dehumanization',
       'threatening', 'support'], dtype=object)

In [28]:
len(dynamically_generated_hate_speech_dataset)

41144

In [29]:
# Keep the acl.id, X1, text, label, type, target, level, and annotator columns
dynamically_generated_hate_speech_dataset = dynamically_generated_hate_speech_dataset[["text", "label", "type", "target", "level", "annotator"]]
dynamically_generated_hate_speech_dataset

Unnamed: 0,text,label,type,target,level,annotator
0,dalits are lowlives,hate,unclear,notgiven,original,3
1,gay people are a burden to our society,hate,unclear,notgiven,original,3
2,Arabs are not welcome,hate,unclear,notgiven,original,18
3,I'm not saying we should actually eliminate he...,hate,unclear,notgiven,original,16
4,bananas are for black people,hate,unclear,notgiven,original,7
...,...,...,...,...,...,...
41139,Indian culture is so backwards,hate,unclear,notgiven,original,3
41140,"Women don't want equality, they just want to b...",hate,derogation,wom,original,1
41141,fuck covid,nothate,none,none,original,1
41142,This computer is garbage,nothate,none,none,original,13


In [30]:
# View NaN values across the dataset
dynamically_generated_hate_speech_dataset.isna().sum()

Unnamed: 0,0
text,0
label,0
type,0
target,8
level,0
annotator,0


In [31]:
# View the NaN values in the "target" feature
dynamically_generated_hate_speech_dataset[dynamically_generated_hate_speech_dataset["target"].isna()]

Unnamed: 0,text,label,type,target,level,annotator
2220,I swear I will piss on this gross pikey until ...,hate,derogation,,perturbation,18
2741,Really? There are a lot of gypsy travellers th...,hate,derogation,,perturbation,1
2971,alot ogf people make fun of my personality in ...,hate,support,,perturbation,2
4175,"I have worse teeth than a camel humper, my mum...",hate,derogation,,perturbation,7
9093,Please stop inviting this squaw at the workplace,hate,derogation,,perturbation,2
9185,A girl I'm working with hails from Egypt and t...,hate,derogation,,perturbation,1
11812,fucking hell I've just seen a pikey,hate,derogation,,perturbation,3
23651,The purpose of the Aryan Hall is not just to h...,hate,support,,original,15


In [32]:
# Use a loop to view the NaN values across each feature in the dataset
for column in dynamically_generated_hate_speech_dataset.columns:
    print(f"NaN values in {column}: {dynamically_generated_hate_speech_dataset[column].isna().sum()}")

NaN values in text: 0
NaN values in label: 0
NaN values in type: 0
NaN values in target: 8
NaN values in level: 0
NaN values in annotator: 0


In [33]:
# Remove the NaN values from the "target" feature
dynamically_generated_hate_speech_dataset = dynamically_generated_hate_speech_dataset["target"].dropna()

In [34]:
# Output the dataset as dynamically_generated_hate_speech_dataset_cleaned.csv
dynamically_generated_hate_speech_dataset.to_csv("/content/drive/MyDrive/Online MSDS/MOD C2/Political Polarization/data/dynamically_generated_hate_speech_dataset_cleaned.csv", index=False)

# Convabuse

The following is the <a href="https://github.com/amandacurry/convabuse/tree/main">Convabuse dataset</a> meant to gauge outputs by conversational artificial intelligence agents, linked to <a href="https://aclanthology.org/2021.emnlp-main.587/">this paper</a>.

In [35]:
# Import the ConvAbuse dataset
convabuse = pd.read_csv("https://github.com/amandacurry/convabuse/raw/refs/heads/main/1_full/ConvAbuseEMNLPfull.csv")
convabuse.head()

Unnamed: 0,example_no,annotator_id,conv_id,prev_agent,prev_user,agent,user,bot,is_abuse.1,is_abuse.0,...,type.intellectual,type.racist,type.sexist,type.sex_harassment,type.transphobic,target.generalised,target.individual,target.system,direction.explicit,direction.implicit
0,0,Annotator7,245846.0,Does that question interest you?,because my finger is happy,Why do you say your finger is happy?,do you rember japan,E.L.I.Z.A.,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,Annotator7,159183.0,Does it please you to believe I am a nobe jocky?,bye bye,Please go on.,do you like it hard,E.L.I.Z.A.,0,1,...,0,0,0,0,0,0,0,0,0,0
2,2,Annotator7,340147.0,Have you tried?,dick,Please go on.,oh what a pickle dear monastery,E.L.I.Z.A.,1,0,...,0,0,0,0,0,0,0,0,0,0
3,3,Annotator7,95cbaab0f8cb4cb59a3393e7fc61cafe,Will you be travelling in Economy Class?,yes,What city will you be flying from?,Sydney,CarbonBot,1,0,...,0,0,0,0,0,0,0,0,0,0
4,4,Annotator7,270732.0,Please go on.,Hello.,How do you do. Please state your problem.,how old are you?,E.L.I.Z.A.,1,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
# Remove the term "Annotator" from each entry in the annotator_id column
convabuse["annotator_id"] = convabuse["annotator_id"].str.replace('Annotator', '')
convabuse["annotator_id"].unique()


array(['7', '4', '8', '6', '1', '5', '3', '2'], dtype=object)

In [37]:
# Use a for loop to loop for NaN values in convabuse