In [1]:
# %pip install scrapy
# %pip install html_text
# %pip install Selector
# %pip install scikit-learn

In [2]:
import pandas as pd
import numpy as np

# Import ordinal encoder
from sklearn.preprocessing import OrdinalEncoder

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Using the collection of English-language hate speech datasets compiled by <a href="https://github.com/leondz/hatespeechdata/tree/master?tab=readme-ov-file#English-header">Leondz</a>, I selected several that piqued my interest.

# Hate Speech / Offensive Speech in the 2020 US Elections

The first dataset is the <a href="https://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/stance-hof/">Hate Speech / Offensive Speech in the 2020 US Elections dataset</a>, authored by Lara Grimminger and Roman Klinger.

In [4]:
# us_elections_2020_hate_speech_train = pd.read_csv("/content/drive/MyDrive/Online MSDS/MOD C2/Political Polarization/data/hate_speech_2020_us_elections_train.tsv", sep="\t")
# us_elections_2020_hate_speech_test = pd.read_csv("/content/drive/MyDrive/Online MSDS/MOD C2/Political Polarization/data/hate_speech_2020_us_elections_test.tsv", sep="\t")
us_elections_2020_hate_speech_train = pd.read_csv("/content/drive/MyDrive/Online MSDS/MOD C2/Political Polarization/data/hate_speech_2020_us_elections_train.tsv", sep="\t")
us_elections_2020_hate_speech_test = pd.read_csv("/content/drive/MyDrive/Online MSDS/MOD C2/Political Polarization/data/hate_speech_2020_us_elections_test.tsv", sep="\t")
us_elections_2020_hate_speech = pd.concat([us_elections_2020_hate_speech_train, us_elections_2020_hate_speech_test])

# Remove the text column
us_elections_2020_hate_speech = us_elections_2020_hate_speech.drop(columns=["text"])

us_elections_2020_hate_speech

Unnamed: 0,Trump,Biden,West,HOF
0,Neither,Favor,Neither,Non-Hateful
1,Neither,Favor,Neither,Non-Hateful
2,Against,Neutral mentions,Neither,Non-Hateful
3,Favor,Neutral mentions,Neither,Non-Hateful
4,Neutral mentions,Neither,Neither,Non-Hateful
...,...,...,...,...
595,Neither,Favor,Neither,Non-Hateful
596,Favor,Against,Neither,Non-Hateful
597,Favor,Neither,Neither,Non-Hateful
598,Against,Favor,Neither,Non-Hateful


In [5]:
# Use a loop to view the number of unique entries
for column in us_elections_2020_hate_speech.columns:
    print(f"Unique entries in {column}: {us_elections_2020_hate_speech_train[column].nunique()}")

Unique entries in Trump: 5
Unique entries in Biden: 5
Unique entries in West: 4
Unique entries in HOF: 2


In [6]:
# View the unique values in the "Trump" and "Biden" columns
us_elections_2020_hate_speech["Trump"].unique()

array(['Neither', 'Against', 'Favor', 'Neutral mentions', 'Mixed'],
      dtype=object)

In [7]:
us_elections_2020_hate_speech["Biden"].unique()

array(['Favor', 'Neutral mentions', 'Neither', 'Against', 'Mixed'],
      dtype=object)

In [8]:
# View the unique values in the "West" column
us_elections_2020_hate_speech["West"].unique()

array(['Neither', 'Favor', 'Neutral mentions', 'Against'], dtype=object)

In [9]:
# Assign integer values for the "Trump", "Biden", and "West" columns where "Neither": 0, "Against": 1, "Neutral mentions": 2, "Mixed": 3, "Favor": 4
us_elections_2020_hate_speech["Trump"] = us_elections_2020_hate_speech["Trump"].replace({"Neither": 0, "Against": 1, "Neutral mentions": 2, "Mixed": 3, "Favor": 4})
us_elections_2020_hate_speech["Biden"] = us_elections_2020_hate_speech["Biden"].replace({"Neither": 0, "Against": 1, "Neutral mentions": 2, "Mixed": 3, "Favor": 4})
us_elections_2020_hate_speech["West"] = us_elections_2020_hate_speech["West"].replace({"Neither": 0, "Against": 1, "Neutral mentions": 2, "Favor": 3})

  us_elections_2020_hate_speech["Trump"] = us_elections_2020_hate_speech["Trump"].replace({"Neither": 0, "Against": 1, "Neutral mentions": 2, "Mixed": 3, "Favor": 4})
  us_elections_2020_hate_speech["Biden"] = us_elections_2020_hate_speech["Biden"].replace({"Neither": 0, "Against": 1, "Neutral mentions": 2, "Mixed": 3, "Favor": 4})
  us_elections_2020_hate_speech["West"] = us_elections_2020_hate_speech["West"].replace({"Neither": 0, "Against": 1, "Neutral mentions": 2, "Favor": 3})


In [10]:
us_elections_2020_hate_speech.head()

Unnamed: 0,Trump,Biden,West,HOF
0,0,4,0,Non-Hateful
1,0,4,0,Non-Hateful
2,1,2,0,Non-Hateful
3,4,2,0,Non-Hateful
4,2,0,0,Non-Hateful


In [11]:
# Use ordinal encoder on the Trump, Biden, and West columns to properly format the data
ordinal_encoder = OrdinalEncoder()
us_elections_2020_hate_speech[["Trump", "Biden", "West"]] = ordinal_encoder.fit_transform(us_elections_2020_hate_speech[["Trump", "Biden", "West"]])
us_elections_2020_hate_speech

Unnamed: 0,Trump,Biden,West,HOF
0,0.0,4.0,0.0,Non-Hateful
1,0.0,4.0,0.0,Non-Hateful
2,1.0,2.0,0.0,Non-Hateful
3,4.0,2.0,0.0,Non-Hateful
4,2.0,0.0,0.0,Non-Hateful
...,...,...,...,...
595,0.0,4.0,0.0,Non-Hateful
596,4.0,1.0,0.0,Non-Hateful
597,4.0,0.0,0.0,Non-Hateful
598,1.0,4.0,0.0,Non-Hateful


In [12]:
us_elections_2020_hate_speech["Trump"].unique()

array([0., 1., 4., 2., 3.])

In [13]:
us_elections_2020_hate_speech["Biden"].unique()

array([4., 2., 0., 1., 3.])

In [14]:
us_elections_2020_hate_speech["West"].unique()

array([0., 3., 2., 1.])

In [15]:
# Use a for loop to loop through all nunique values
for column in us_elections_2020_hate_speech.columns:
    print(f"Unique entries in {column}: {us_elections_2020_hate_speech[column].nunique()}")

Unique entries in Trump: 5
Unique entries in Biden: 5
Unique entries in West: 4
Unique entries in HOF: 2


In [16]:
# Use a for loop to loop through all NaN values
for column in us_elections_2020_hate_speech.columns:
    print(f"NaN values in {column}: {us_elections_2020_hate_speech[column].isna().sum()}")

NaN values in Trump: 0
NaN values in Biden: 0
NaN values in West: 0
NaN values in HOF: 0


In [17]:
# Output the dataset as us_elections_2020_hate_speech_train_cleaned.csv
us_elections_2020_hate_speech.to_csv("/content/drive/MyDrive/Online MSDS/MOD C2/Political Polarization/data/us_elections_2020_hate_speech_cleaned.csv", index=False)
# us_elections_2020_hate_speech.to_csv("data/us_elections_2020_hate_speech_cleaned.csv", index=False)

# Dynamically Generated Hate Speech Dataset

The following is the <a href="https://github.com/bvidgen/Dynamically-Generated-Hate-Speech-Dataset">Dynamically Generated Hate Speech Dataset</a> from Vidgen et al. (2021) available on <a href="https://arxiv.org/abs/2012.15761">arXIV</a>.

In [18]:
import requests
import io

url = "https://github.com/bvidgen/Dynamically-Generated-Hate-Speech-Dataset/raw/refs/heads/main/Dynamically%20Generated%20Hate%20Dataset%20v0.2.3.csv"

r = requests.get(url, timeout=60)
r.raise_for_status()  # will throw if 404/403 etc.
dynamically_generated_hate_speech_dataset = pd.read_csv(io.BytesIO(r.content))

# Remove the text column
dynamically_generated_hate_speech_dataset = dynamically_generated_hate_speech_dataset.drop(columns=["text"])

dynamically_generated_hate_speech_dataset.head()

Unnamed: 0.1,Unnamed: 0,acl.id,X1,label,type,target,level,split,round.base,annotator,round,acl.id.matched
0,1,acl1,1,hate,notgiven,notgiven,original,train,1,GNZuCtwed3,1,
1,2,acl10,2,hate,notgiven,notgiven,original,train,1,GNZuCtwed3,1,
2,3,acl100,3,hate,notgiven,notgiven,original,train,1,vDe7GN0NrL,1,
3,4,acl1000,4,hate,notgiven,notgiven,original,train,1,oemYWm1Tjg,1,
4,5,acl10000,5,hate,notgiven,notgiven,original,test,1,QiOKkCi7F8,1,


In [19]:
# Remove the "acl" text from acl.id
dynamically_generated_hate_speech_dataset['acl.id'] = dynamically_generated_hate_speech_dataset['acl.id'].str.replace('acl', '')
dynamically_generated_hate_speech_dataset.head()

Unnamed: 0.1,Unnamed: 0,acl.id,X1,label,type,target,level,split,round.base,annotator,round,acl.id.matched
0,1,1,1,hate,notgiven,notgiven,original,train,1,GNZuCtwed3,1,
1,2,10,2,hate,notgiven,notgiven,original,train,1,GNZuCtwed3,1,
2,3,100,3,hate,notgiven,notgiven,original,train,1,vDe7GN0NrL,1,
3,4,1000,4,hate,notgiven,notgiven,original,train,1,oemYWm1Tjg,1,
4,5,10000,5,hate,notgiven,notgiven,original,test,1,QiOKkCi7F8,1,


In [20]:
dynamically_generated_hate_speech_dataset[dynamically_generated_hate_speech_dataset["round.base"] == 1].head()

Unnamed: 0.1,Unnamed: 0,acl.id,X1,label,type,target,level,split,round.base,annotator,round,acl.id.matched
0,1,1,1,hate,notgiven,notgiven,original,train,1,GNZuCtwed3,1,
1,2,10,2,hate,notgiven,notgiven,original,train,1,GNZuCtwed3,1,
2,3,100,3,hate,notgiven,notgiven,original,train,1,vDe7GN0NrL,1,
3,4,1000,4,hate,notgiven,notgiven,original,train,1,oemYWm1Tjg,1,
4,5,10000,5,hate,notgiven,notgiven,original,test,1,QiOKkCi7F8,1,


In [21]:
dynamically_generated_hate_speech_dataset[dynamically_generated_hate_speech_dataset["round.base"] == 2].head()

Unnamed: 0.1,Unnamed: 0,acl.id,X1,label,type,target,level,split,round.base,annotator,round,acl.id.matched
1255,1290,11158,1290,nothate,none,none,perturbation,train,2,GNZuCtwed3,2b,acl20309
1256,1291,11159,1291,hate,derogation,dis,original,train,2,TrRF46JWfP,2a,acl19858
1258,1293,11160,1293,nothate,none,none,perturbation,train,2,TrRF46JWfP,2b,acl19997
1259,1294,11161,1294,hate,derogation,dis,original,train,2,TrRF46JWfP,2a,acl11162
1260,1295,11162,1295,nothate,none,none,perturbation,train,2,LqLKTtrOmx,2b,acl11161


In [22]:
# See the number of annotators
dynamically_generated_hate_speech_dataset["annotator"].nunique()

20

In [23]:
# Arbitrarily sort the annotators and change the annotator value to a permanent integer value starting from 1
dynamically_generated_hate_speech_dataset["annotator"] = dynamically_generated_hate_speech_dataset["annotator"].astype("category").cat.codes + 1
dynamically_generated_hate_speech_dataset["annotator"].unique()

array([ 3, 18, 16,  7,  6,  1,  8,  9, 17,  2, 13, 19, 20, 10, 14, 11,  4,
       12,  5, 15], dtype=int8)

In [24]:
dynamically_generated_hate_speech_dataset["annotator"].nunique()

20

In [25]:
dynamically_generated_hate_speech_dataset["level"].unique()

array(['original', 'perturbation'], dtype=object)

In [26]:
dynamically_generated_hate_speech_dataset["type"].unique()

array(['notgiven', 'none', 'derogation', 'animosity', 'dehumanization',
       'threatening', 'support'], dtype=object)

In [27]:
# Replace all values of type "notgiven" with "unclear"
dynamically_generated_hate_speech_dataset["type"] = dynamically_generated_hate_speech_dataset["type"].replace("notgiven", "unclear")
dynamically_generated_hate_speech_dataset["type"].unique()

array(['unclear', 'none', 'derogation', 'animosity', 'dehumanization',
       'threatening', 'support'], dtype=object)

In [28]:
len(dynamically_generated_hate_speech_dataset)

41144

In [29]:
# Keep the acl.id, X1, text, label, type, target, level, and annotator columns
dynamically_generated_hate_speech_dataset = dynamically_generated_hate_speech_dataset[["label", "type", "target", "level", "annotator"]]
dynamically_generated_hate_speech_dataset

Unnamed: 0,label,type,target,level,annotator
0,hate,unclear,notgiven,original,3
1,hate,unclear,notgiven,original,3
2,hate,unclear,notgiven,original,18
3,hate,unclear,notgiven,original,16
4,hate,unclear,notgiven,original,7
...,...,...,...,...,...
41139,hate,unclear,notgiven,original,3
41140,hate,derogation,wom,original,1
41141,nothate,none,none,original,1
41142,nothate,none,none,original,13


In [30]:
# View NaN values across the dataset
dynamically_generated_hate_speech_dataset.isna().sum()

Unnamed: 0,0
label,0
type,0
target,8
level,0
annotator,0


In [31]:
# View the NaN values in the "target" feature
dynamically_generated_hate_speech_dataset[dynamically_generated_hate_speech_dataset["target"].isna()]

Unnamed: 0,label,type,target,level,annotator
2220,hate,derogation,,perturbation,18
2741,hate,derogation,,perturbation,1
2971,hate,support,,perturbation,2
4175,hate,derogation,,perturbation,7
9093,hate,derogation,,perturbation,2
9185,hate,derogation,,perturbation,1
11812,hate,derogation,,perturbation,3
23651,hate,support,,original,15


In [32]:
# Use a loop to view the NaN values across each feature in the dataset
for column in dynamically_generated_hate_speech_dataset.columns:
    print(f"NaN values in {column}: {dynamically_generated_hate_speech_dataset[column].isna().sum()}")

NaN values in label: 0
NaN values in type: 0
NaN values in target: 8
NaN values in level: 0
NaN values in annotator: 0


In [33]:
# Remove the NaN values from the "target" feature
dynamically_generated_hate_speech_dataset["target"] = dynamically_generated_hate_speech_dataset["target"].dropna()

In [34]:
dynamically_generated_hate_speech_dataset.head()

Unnamed: 0,label,type,target,level,annotator
0,hate,unclear,notgiven,original,3
1,hate,unclear,notgiven,original,3
2,hate,unclear,notgiven,original,18
3,hate,unclear,notgiven,original,16
4,hate,unclear,notgiven,original,7


In [35]:
len(dynamically_generated_hate_speech_dataset)

41144

In [36]:
# Output the dataset as dynamically_generated_hate_speech_dataset_cleaned.csv
dynamically_generated_hate_speech_dataset.to_csv("/content/drive/MyDrive/Online MSDS/MOD C2/Political Polarization/data/dynamically_generated_hate_speech_dataset_cleaned.csv", index=False)

# Convabuse

The following is the <a href="https://github.com/amandacurry/convabuse/tree/main">Convabuse dataset</a> meant to gauge outputs by conversational artificial intelligence agents, linked to <a href="https://aclanthology.org/2021.emnlp-main.587/">this paper</a>.

In [37]:
# Import the ConvAbuse dataset

convabuse_url = "https://github.com/amandacurry/convabuse/raw/refs/heads/main/1_full/ConvAbuseEMNLPfull.csv"

convabuse_r = requests.get(convabuse_url, timeout=60)
convabuse_r.raise_for_status()  # will throw if 404/403 etc.
convabuse = pd.read_csv(io.BytesIO(convabuse_r.content))
convabuse.head()

Unnamed: 0,example_no,annotator_id,conv_id,prev_agent,prev_user,agent,user,bot,is_abuse.1,is_abuse.0,...,type.intellectual,type.racist,type.sexist,type.sex_harassment,type.transphobic,target.generalised,target.individual,target.system,direction.explicit,direction.implicit
0,0,Annotator7,245846.0,Does that question interest you?,because my finger is happy,Why do you say your finger is happy?,do you rember japan,E.L.I.Z.A.,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,Annotator7,159183.0,Does it please you to believe I am a nobe jocky?,bye bye,Please go on.,do you like it hard,E.L.I.Z.A.,0,1,...,0,0,0,0,0,0,0,0,0,0
2,2,Annotator7,340147.0,Have you tried?,dick,Please go on.,oh what a pickle dear monastery,E.L.I.Z.A.,1,0,...,0,0,0,0,0,0,0,0,0,0
3,3,Annotator7,95cbaab0f8cb4cb59a3393e7fc61cafe,Will you be travelling in Economy Class?,yes,What city will you be flying from?,Sydney,CarbonBot,1,0,...,0,0,0,0,0,0,0,0,0,0
4,4,Annotator7,270732.0,Please go on.,Hello.,How do you do. Please state your problem.,how old are you?,E.L.I.Z.A.,1,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# Remove the term "Annotator" from each entry in the annotator_id column
convabuse["annotator_id"] = convabuse["annotator_id"].str.replace('Annotator', '')
convabuse["annotator_id"].unique()


array(['7', '4', '8', '6', '1', '5', '3', '2'], dtype=object)

In [39]:
# Use a for loop to loop for NaN values in convabuse
for column in convabuse.columns:
    print(f"NaN values in {column}: {convabuse[column].isna().sum()}")

NaN values in example_no: 0
NaN values in annotator_id: 0
NaN values in conv_id: 0
NaN values in prev_agent: 0
NaN values in prev_user: 0
NaN values in agent: 0
NaN values in user: 0
NaN values in bot: 0
NaN values in is_abuse.1: 0
NaN values in is_abuse.0: 0
NaN values in is_abuse.-1: 0
NaN values in is_abuse.-2: 0
NaN values in is_abuse.-3: 0
NaN values in type.ableism: 0
NaN values in type.homophobic: 0
NaN values in type.intellectual: 0
NaN values in type.racist: 0
NaN values in type.sexist: 0
NaN values in type.sex_harassment: 0
NaN values in type.transphobic: 0
NaN values in target.generalised: 0
NaN values in target.individual: 0
NaN values in target.system: 0
NaN values in direction.explicit: 0
NaN values in direction.implicit: 0


In [40]:
# Use a for loop to loop for nunique values in convabuse
for column in convabuse.columns:
    print(f"Unique values in {column}: {convabuse[column].nunique()}")
    if convabuse[column].nunique() < 10:
        print(f"Values in {column}:")
        print(convabuse[column].unique())

Unique values in example_no: 12768
Unique values in annotator_id: 8
Values in annotator_id:
['7' '4' '8' '6' '1' '5' '3' '2']
Unique values in conv_id: 2894
Unique values in prev_agent: 673
Unique values in prev_user: 2728
Unique values in agent: 721
Unique values in user: 2913
Unique values in bot: 2
Values in bot:
['E.L.I.Z.A.' 'CarbonBot']
Unique values in is_abuse.1: 2
Values in is_abuse.1:
[1 0]
Unique values in is_abuse.0: 2
Values in is_abuse.0:
[0 1]
Unique values in is_abuse.-1: 2
Values in is_abuse.-1:
[0 1]
Unique values in is_abuse.-2: 2
Values in is_abuse.-2:
[0 1]
Unique values in is_abuse.-3: 2
Values in is_abuse.-3:
[0 1]
Unique values in type.ableism: 2
Values in type.ableism:
[0 1]
Unique values in type.homophobic: 2
Values in type.homophobic:
[0 1]
Unique values in type.intellectual: 2
Values in type.intellectual:
[0 1]
Unique values in type.racist: 2
Values in type.racist:
[0 1]
Unique values in type.sexist: 2
Values in type.sexist:
[0 1]
Unique values in type.sex_h

In [41]:
len(convabuse)

12768

In [42]:
# Output the dataset as convabuse_cleaned.csv
convabuse.to_csv("/content/drive/MyDrive/Online MSDS/MOD C2/Political Polarization/data/convabuse_cleaned.csv", index=False)

# Online Abusive Attacks Dataset

The <a href="https://github.com/RaneemAlharthi/Online-Abusive-Attacks-OAA-Dataset/raw/refs/heads/main/RaneemAlharthi%20-%20Online-Abusive-Attacks-OAA-Dataset.csv">Online Abusive Attacks dataset</a>  



In [43]:
online_abusive_attacks_url = "https://github.com/RaneemAlharthi/Online-Abusive-Attacks-OAA-Dataset/raw/refs/heads/main/RaneemAlharthi%20-%20Online-Abusive-Attacks-OAA-Dataset.csv"

online_abusive_attacks_r = requests.get(online_abusive_attacks_url, timeout=60)
online_abusive_attacks_r.raise_for_status()  # will throw if 404/403 etc.
online_abusive_attacks = pd.read_csv(io.BytesIO(online_abusive_attacks_r.content))
online_abusive_attacks.head()

  online_abusive_attacks = pd.read_csv(io.BytesIO(online_abusive_attacks_r.content))


Unnamed: 0.1,Unnamed: 0,Content based features,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 298,Unnamed: 299,Unnamed: 300,Unnamed: 301,Unnamed: 302,Unnamed: 303,Unnamed: 304,Unnamed: 305,Unnamed: 306,Unnamed: 307
0,Filename,Num Parent Tweets,Num Replies to Parents,Avg Replies per Parent,Num Parent Tweets with Replies,Avg Replies per Parent with Replies,Num Toxic Parent Tweets,Num Non-Toxic Parent Tweets,Num Toxic Replies,Num Non-Toxic Replies,...,,,,,,,,,,
1,1000200192,377,696,1.846153846,102,6.823529412,0,377,0,696,...,,,,,,,,,,
2,1000485576771080000,1171,252,0.215200683,87,2.896551724,67,1104,13,239,...,,,,,,,,,,
3,1000485576771080000,1009,1505,1.491575818,444,3.38963964,28,981,45,1460,...,,,,,,,,,,
4,1000485576771080000,1087,1303,1.198712052,411,3.170316302,47,1040,49,1254,...,,,,,,,,,,


In [44]:
# Change the index column to start from 1 instead of 0 and use the first row as the header
online_abusive_attacks.index = online_abusive_attacks.index + 1
online_abusive_attacks.columns = online_abusive_attacks.iloc[0]
online_abusive_attacks = online_abusive_attacks.drop(1)
online_abusive_attacks.head()

1,Filename,Num Parent Tweets,Num Replies to Parents,Avg Replies per Parent,Num Parent Tweets with Replies,Avg Replies per Parent with Replies,Num Toxic Parent Tweets,Num Non-Toxic Parent Tweets,Num Toxic Replies,Num Non-Toxic Replies,...,NaN,NaN.1,NaN.2,NaN.3,NaN.4,NaN.5,NaN.6,NaN.7,NaN.8,NaN.9
2,1000200192,377,696,1.846153846,102,6.823529412,0,377,0,696,...,,,,,,,,,,
3,1000485576771080000,1171,252,0.215200683,87,2.896551724,67,1104,13,239,...,,,,,,,,,,
4,1000485576771080000,1009,1505,1.491575818,444,3.38963964,28,981,45,1460,...,,,,,,,,,,
5,1000485576771080000,1087,1303,1.198712052,411,3.170316302,47,1040,49,1254,...,,,,,,,,,,
6,1000485576771080000,1185,147,0.124050633,58,2.534482759,97,1088,7,140,...,,,,,,,,,,


In [45]:
# Loop through each column name in the dataset
for column in online_abusive_attacks.columns:
    print(column)

Filename
Num Parent Tweets
Num Replies to Parents
Avg Replies per Parent
Num Parent Tweets with Replies
Avg Replies per Parent with Replies
Num Toxic Parent Tweets
Num Non-Toxic Parent Tweets
Num Toxic Replies
Num Non-Toxic Replies
Num Toxic Replies to Non-Toxic Parents
Num Non-Toxic Replies to Non-Toxic Parents
Num Toxic Replies to Toxic Parents
Num Non-Toxic Replies to Toxic Parents
nan
followers_count
friends_count
listed_count
created_at
favourites_count
utc_offset
time_zone
geo_enabled
verified
statuses_count
contributors_enabled
is_translator
is_translation_enabled
has_extended_profile
default_profile
default_profile_image
following
follow_request_sent
notifications
geo
place
truncated
hashtags
symbols
user_mentions
urls
contributors
is_quote_status
retweet_count min
retweet_count avg
retweet_count max
favorite_count min
favorite_count avg
favorite_count max
favorited
retweeted
possibly_sensitive
Num Repliers With Min 1 Toxic Reply
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan


In [46]:
# Use a ternary operator to filter for only the relevant columns that are not "nan" columns
online_abusive_attacks = online_abusive_attacks.loc[:, online_abusive_attacks.columns.notna()]
len(online_abusive_attacks)

2368

In [47]:
# Loop through the column names again
for column in online_abusive_attacks.columns:
    print(column)

Filename
Num Parent Tweets
Num Replies to Parents
Avg Replies per Parent
Num Parent Tweets with Replies
Avg Replies per Parent with Replies
Num Toxic Parent Tweets
Num Non-Toxic Parent Tweets
Num Toxic Replies
Num Non-Toxic Replies
Num Toxic Replies to Non-Toxic Parents
Num Non-Toxic Replies to Non-Toxic Parents
Num Toxic Replies to Toxic Parents
Num Non-Toxic Replies to Toxic Parents
followers_count
friends_count
listed_count
created_at
favourites_count
utc_offset
time_zone
geo_enabled
verified
statuses_count
contributors_enabled
is_translator
is_translation_enabled
has_extended_profile
default_profile
default_profile_image
following
follow_request_sent
notifications
geo
place
truncated
hashtags
symbols
user_mentions
urls
contributors
is_quote_status
retweet_count min
retweet_count avg
retweet_count max
favorite_count min
favorite_count avg
favorite_count max
favorited
retweeted
possibly_sensitive
Num Repliers With Min 1 Toxic Reply
Num parents with high Toxicity
Num parents with high

In [48]:
# Get the number of NaN values in each column
for column in online_abusive_attacks.columns:
    print(f"NaN values in {column}: {online_abusive_attacks[column].isna().sum()}")

NaN values in Filename: 1
NaN values in Num Parent Tweets: 1
NaN values in Num Replies to Parents: 1
NaN values in Avg Replies per Parent: 1
NaN values in Num Parent Tweets with Replies: 1
NaN values in Avg Replies per Parent with Replies: 1
NaN values in Num Toxic Parent Tweets: 1
NaN values in Num Non-Toxic Parent Tweets: 1
NaN values in Num Toxic Replies: 1
NaN values in Num Non-Toxic Replies: 1
NaN values in Num Toxic Replies to Non-Toxic Parents: 1
NaN values in Num Non-Toxic Replies to Non-Toxic Parents: 1
NaN values in Num Toxic Replies to Toxic Parents: 1
NaN values in Num Non-Toxic Replies to Toxic Parents: 1
NaN values in followers_count: 1
NaN values in friends_count: 1
NaN values in listed_count: 1
NaN values in created_at: 3
NaN values in favourites_count: 1
NaN values in utc_offset: 2368
NaN values in time_zone: 2368
NaN values in geo_enabled: 1
NaN values in verified: 1
NaN values in statuses_count: 1
NaN values in contributors_enabled: 1
NaN values in is_translator: 1
N

In [49]:
# Show all rows with NaN values in any column
online_abusive_attacks[online_abusive_attacks.isna().any(axis=1)]

1,Filename,Num Parent Tweets,Num Replies to Parents,Avg Replies per Parent,Num Parent Tweets with Replies,Avg Replies per Parent with Replies,Num Toxic Parent Tweets,Num Non-Toxic Parent Tweets,Num Toxic Replies,Num Non-Toxic Replies,...,Top Desc Word #1: women,Top Desc Word #2: love,Top Desc Word #3: news,Top Desc Word #4: account,Top Desc Word #5: feminist,Top Desc Word #6: official,Top Desc Word #7: follow,Top Desc Word #8: products,Top Desc Word #9: tweets,Top Desc Word #10: womens
2,1000200192,377,696,1.846153846,102,6.823529412,0,377,0,696,...,No,No,No,No,No,No,No,No,No,No
3,1000485576771080000,1171,252,0.215200683,87,2.896551724,67,1104,13,239,...,No,No,No,No,No,No,No,No,No,No
4,1000485576771080000,1009,1505,1.491575818,444,3.38963964,28,981,45,1460,...,No,No,No,No,No,No,No,No,No,No
5,1000485576771080000,1087,1303,1.198712052,411,3.170316302,47,1040,49,1254,...,No,No,No,No,No,No,No,No,No,No
6,1000485576771080000,1185,147,0.124050633,58,2.534482759,97,1088,7,140,...,No,No,No,No,No,No,No,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2365,993913688342999040.0,3200.0,1.0,0.000313,1.0,1.0,1.0,3199.0,0.0,1.0,...,No,No,No,No,No,No,No,Yes,No,No
2366,998530966393163008.0,1444.0,61.0,0.042244,51.0,1.196078,41.0,1403.0,1.0,60.0,...,No,No,No,No,No,No,No,No,No,No
2367,99891536.0,11.0,0.0,0.0,0.0,0.0,1.0,10.0,0.0,0.0,...,No,No,No,No,No,No,No,No,No,No
2368,998925233288563968.0,221.0,0.0,0.0,0.0,0.0,11.0,210.0,0.0,0.0,...,No,Yes,No,No,No,No,No,No,No,No


In [50]:
# Remove the row with NaN value in the "Filename" column
online_abusive_attacks = online_abusive_attacks.dropna(subset=["Filename"])

# Loop through each column to confirm no NaN values
for column in online_abusive_attacks.columns:
    print(f"NaN values in {column}: {online_abusive_attacks[column].isna().sum()}")

NaN values in Filename: 0
NaN values in Num Parent Tweets: 0
NaN values in Num Replies to Parents: 0
NaN values in Avg Replies per Parent: 0
NaN values in Num Parent Tweets with Replies: 0
NaN values in Avg Replies per Parent with Replies: 0
NaN values in Num Toxic Parent Tweets: 0
NaN values in Num Non-Toxic Parent Tweets: 0
NaN values in Num Toxic Replies: 0
NaN values in Num Non-Toxic Replies: 0
NaN values in Num Toxic Replies to Non-Toxic Parents: 0
NaN values in Num Non-Toxic Replies to Non-Toxic Parents: 0
NaN values in Num Toxic Replies to Toxic Parents: 0
NaN values in Num Non-Toxic Replies to Toxic Parents: 0
NaN values in followers_count: 0
NaN values in friends_count: 0
NaN values in listed_count: 0
NaN values in created_at: 2
NaN values in favourites_count: 0
NaN values in utc_offset: 2367
NaN values in time_zone: 2367
NaN values in geo_enabled: 0
NaN values in verified: 0
NaN values in statuses_count: 0
NaN values in contributors_enabled: 0
NaN values in is_translator: 0
N

In [51]:
len(online_abusive_attacks)

2367

In [52]:
# Remove rows with NaN values in the "love" column
online_abusive_attacks = online_abusive_attacks.dropna(subset=["Top Desc Word #2: love"])
len(online_abusive_attacks)

2365

In [53]:
# Loop through each column to confirm no NaN values
for column in online_abusive_attacks.columns:
    print(f"NaN values in {column}: {online_abusive_attacks[column].isna().sum()}")

NaN values in Filename: 0
NaN values in Num Parent Tweets: 0
NaN values in Num Replies to Parents: 0
NaN values in Avg Replies per Parent: 0
NaN values in Num Parent Tweets with Replies: 0
NaN values in Avg Replies per Parent with Replies: 0
NaN values in Num Toxic Parent Tweets: 0
NaN values in Num Non-Toxic Parent Tweets: 0
NaN values in Num Toxic Replies: 0
NaN values in Num Non-Toxic Replies: 0
NaN values in Num Toxic Replies to Non-Toxic Parents: 0
NaN values in Num Non-Toxic Replies to Non-Toxic Parents: 0
NaN values in Num Toxic Replies to Toxic Parents: 0
NaN values in Num Non-Toxic Replies to Toxic Parents: 0
NaN values in followers_count: 0
NaN values in friends_count: 0
NaN values in listed_count: 0
NaN values in created_at: 2
NaN values in favourites_count: 0
NaN values in utc_offset: 2365
NaN values in time_zone: 2365
NaN values in geo_enabled: 0
NaN values in verified: 0
NaN values in statuses_count: 0
NaN values in contributors_enabled: 0
NaN values in is_translator: 0
N

In [54]:
# Drop the utc_offset and time_zone columns
online_abusive_attacks = online_abusive_attacks.drop(columns=["utc_offset", "time_zone"])

# Loop through each column in the dataset for unique values
for column in online_abusive_attacks.columns:
    print(f"Unique entries in {column}: {online_abusive_attacks[column].nunique()}")

Unique entries in Filename: 1778
Unique entries in Num Parent Tweets: 1389
Unique entries in Num Replies to Parents: 569
Unique entries in Avg Replies per Parent: 1299
Unique entries in Num Parent Tweets with Replies: 348
Unique entries in Avg Replies per Parent with Replies: 812
Unique entries in Num Toxic Parent Tweets: 232
Unique entries in Num Non-Toxic Parent Tweets: 1442
Unique entries in Num Toxic Replies: 132
Unique entries in Num Non-Toxic Replies: 583
Unique entries in Num Toxic Replies to Non-Toxic Parents: 131
Unique entries in Num Non-Toxic Replies to Non-Toxic Parents: 564
Unique entries in Num Toxic Replies to Toxic Parents: 33
Unique entries in Num Non-Toxic Replies to Toxic Parents: 97
Unique entries in followers_count: 1822
Unique entries in friends_count: 1538
Unique entries in listed_count: 537
Unique entries in created_at: 1776
Unique entries in favourites_count: 1730
Unique entries in geo_enabled: 2
Unique entries in verified: 2
Unique entries in statuses_count: 2

In [55]:
online_abusive_attacks.head()

1,Filename,Num Parent Tweets,Num Replies to Parents,Avg Replies per Parent,Num Parent Tweets with Replies,Avg Replies per Parent with Replies,Num Toxic Parent Tweets,Num Non-Toxic Parent Tweets,Num Toxic Replies,Num Non-Toxic Replies,...,Top Desc Word #1: women,Top Desc Word #2: love,Top Desc Word #3: news,Top Desc Word #4: account,Top Desc Word #5: feminist,Top Desc Word #6: official,Top Desc Word #7: follow,Top Desc Word #8: products,Top Desc Word #9: tweets,Top Desc Word #10: womens
2,1000200192,377,696,1.846153846,102,6.823529412,0,377,0,696,...,No,No,No,No,No,No,No,No,No,No
3,1000485576771080000,1171,252,0.215200683,87,2.896551724,67,1104,13,239,...,No,No,No,No,No,No,No,No,No,No
4,1000485576771080000,1009,1505,1.491575818,444,3.38963964,28,981,45,1460,...,No,No,No,No,No,No,No,No,No,No
5,1000485576771080000,1087,1303,1.198712052,411,3.170316302,47,1040,49,1254,...,No,No,No,No,No,No,No,No,No,No
6,1000485576771080000,1185,147,0.124050633,58,2.534482759,97,1088,7,140,...,No,No,No,No,No,No,No,No,No,No


In [56]:
len(online_abusive_attacks)

2365

In [57]:
# List out each column name
for column in online_abusive_attacks.columns:
    print(column)

Filename
Num Parent Tweets
Num Replies to Parents
Avg Replies per Parent
Num Parent Tweets with Replies
Avg Replies per Parent with Replies
Num Toxic Parent Tweets
Num Non-Toxic Parent Tweets
Num Toxic Replies
Num Non-Toxic Replies
Num Toxic Replies to Non-Toxic Parents
Num Non-Toxic Replies to Non-Toxic Parents
Num Toxic Replies to Toxic Parents
Num Non-Toxic Replies to Toxic Parents
followers_count
friends_count
listed_count
created_at
favourites_count
geo_enabled
verified
statuses_count
contributors_enabled
is_translator
is_translation_enabled
has_extended_profile
default_profile
default_profile_image
following
follow_request_sent
notifications
geo
place
truncated
hashtags
symbols
user_mentions
urls
contributors
is_quote_status
retweet_count min
retweet_count avg
retweet_count max
favorite_count min
favorite_count avg
favorite_count max
favorited
retweeted
possibly_sensitive
Num Repliers With Min 1 Toxic Reply
Num parents with high Toxicity
Num parents with high Severe_Toxicity
Num 

In [58]:
# Keep the relevant columns only
relevant_columns = ["Filename", "followers_count", "friends_count", "verified", "statuses_count", "following", "user_mentions",
                    "Num replies with high Toxicity", "Num replies with high Severe_Toxicity", "Num replies with high Identity_Attack",
                    "Num replies with high Insult", "Num replies with high Profanity", "Num replies with high Threat", "Num replies with high other attr",
                    "Top Desc Word #1: women", "Top Desc Word #2: love", "Top Desc Word #3: news",
                    "Top Desc Word #4: account", "Top Desc Word #5: feminist",
                    "Top Desc Word #6: official", "Top Desc Word #7: follow", "Top Desc Word #8: products", "Top Desc Word #9: tweets",
                    "Top Desc Word #10: womens"]
online_abusive_attacks = online_abusive_attacks[relevant_columns]
online_abusive_attacks.head()

1,Filename,followers_count,friends_count,verified,statuses_count,following,user_mentions,Num replies with high Toxicity,Num replies with high Severe_Toxicity,Num replies with high Identity_Attack,...,Top Desc Word #1: women,Top Desc Word #2: love,Top Desc Word #3: news,Top Desc Word #4: account,Top Desc Word #5: feminist,Top Desc Word #6: official,Top Desc Word #7: follow,Top Desc Word #8: products,Top Desc Word #9: tweets,Top Desc Word #10: womens
2,1000200192,39632,15271,No,71848,No,Yes,0,0,0,...,No,No,No,No,No,No,No,No,No,No
3,1000485576771080000,20606,9472,No,167382,No,Yes,1,0,2,...,No,No,No,No,No,No,No,No,No,No
4,1000485576771080000,20445,9396,No,165314,No,Yes,1,0,17,...,No,No,No,No,No,No,No,No,No,No
5,1000485576771080000,20506,9430,No,166164,No,Yes,3,0,9,...,No,No,No,No,No,No,No,No,No,No
6,1000485576771080000,20583,9469,No,167488,No,Yes,0,0,0,...,No,No,No,No,No,No,No,No,No,No


In [59]:
len(online_abusive_attacks)

2365

In [60]:
# Output the dataset as convabuse_cleaned.csv
online_abusive_attacks.to_csv("/content/drive/MyDrive/Online MSDS/MOD C2/Political Polarization/data/online_abusive_attacks_cleaned.csv", index=False)