In [1]:
# Import pandas, numpy, sklearn, matplotlib, and seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.utils import resample

# Import Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
convabuse = pd.read_csv("/content/drive/MyDrive/Online MSDS/MOD C2/Political Polarization/data/convabuse_cleaned1.csv")
dynamically_generated_hate_speech = pd.read_csv("/content/drive/MyDrive/Online MSDS/MOD C2/Political Polarization/data/dynamically_generated_hate_speech_dataset_cleaned1.csv")
online_abusive_attacks = pd.read_csv("/content/drive/MyDrive/Online MSDS/MOD C2/Political Polarization/data/online_abusive_attacks_cleaned1.csv")
us_elections_2020_hate_speech = pd.read_csv("/content/drive/MyDrive/Online MSDS/MOD C2/Political Polarization/data/us_elections_2020_hate_speech_cleaned1.csv")

# Convabuse Dataset

In [3]:
# Preview convabuse
convabuse.head()

Unnamed: 0,example_no,annotator_id,bot,Not Abusive,Ambigious,Abusive,Very Abusive,Very Strongly Abusive,ableism,homophobic,intellectual,racist,sexist,sex_harassment,transphobic,generalised,individual,system,explicit,implicit
0,0,7,E.L.I.Z.A.,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,7,E.L.I.Z.A.,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,7,E.L.I.Z.A.,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3,7,CarbonBot,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,4,7,E.L.I.Z.A.,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [4]:
# Create an "abuse_level" column with default values of 0
convabuse['abuse_level'] = 0

In [5]:
convabuse.head()

Unnamed: 0,example_no,annotator_id,bot,Not Abusive,Ambigious,Abusive,Very Abusive,Very Strongly Abusive,ableism,homophobic,...,racist,sexist,sex_harassment,transphobic,generalised,individual,system,explicit,implicit,abuse_level
0,0,7,E.L.I.Z.A.,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,7,E.L.I.Z.A.,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,7,E.L.I.Z.A.,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,7,CarbonBot,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,7,E.L.I.Z.A.,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Reassign the abuse_level column
not_abusive = convabuse[convabuse['Not Abusive'] == 1]
not_abusive['abuse_level'] = 0
ambigious = convabuse[convabuse['Ambigious'] == 1]
ambigious['abuse_level'] = 1
abusive = convabuse[convabuse['Abusive'] == 1]
abusive['abuse_level'] = 2
very_abusive = convabuse[convabuse['Very Abusive'] == 1]
very_abusive['abuse_level'] = 3
very_strongly_abusive = convabuse[convabuse['Very Strongly Abusive'] == 1]
very_strongly_abusive['abuse_level'] = 4

# Merge all rows and sort by example_no
convabuse = pd.concat([not_abusive, ambigious, abusive, very_abusive, very_strongly_abusive])
convabuse = convabuse.sort_values(by='example_no')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  not_abusive['abuse_level'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ambigious['abuse_level'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abusive['abuse_level'] = 2
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value ins

In [7]:
convabuse.tail(20)

Unnamed: 0,example_no,annotator_id,bot,Not Abusive,Ambigious,Abusive,Very Abusive,Very Strongly Abusive,ableism,homophobic,...,racist,sexist,sex_harassment,transphobic,generalised,individual,system,explicit,implicit,abuse_level
12748,12748,7,E.L.I.Z.A.,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12749,12749,4,E.L.I.Z.A.,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12750,12750,4,E.L.I.Z.A.,0,0,0,0,1,0,0,...,0,1,1,0,0,0,1,1,0,4
12751,12751,4,E.L.I.Z.A.,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12752,12752,7,E.L.I.Z.A.,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12753,12753,4,CarbonBot,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12754,12754,4,CarbonBot,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12755,12755,7,E.L.I.Z.A.,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12756,12756,5,E.L.I.Z.A.,0,0,0,1,0,0,0,...,0,0,1,0,0,0,1,0,1,3
12757,12757,4,E.L.I.Z.A.,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,1,0,3


In [8]:
# Drop the abuse level columns as they are no longer needed
convabuse = convabuse.drop(columns=['Not Abusive', 'Ambigious', 'Abusive', 'Very Abusive', 'Very Strongly Abusive'])
convabuse.head()

Unnamed: 0,example_no,annotator_id,bot,ableism,homophobic,intellectual,racist,sexist,sex_harassment,transphobic,generalised,individual,system,explicit,implicit,abuse_level
0,0,7,E.L.I.Z.A.,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,7,E.L.I.Z.A.,0,0,0,0,0,0,0,0,0,0,0,0,1
2,2,7,E.L.I.Z.A.,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3,7,CarbonBot,0,0,0,0,0,0,0,0,0,0,0,0,0
4,4,7,E.L.I.Z.A.,0,0,0,0,0,0,0,0,0,0,0,0,0


In [9]:
convabuse["abuse_level"].value_counts()

Unnamed: 0_level_0,count
abuse_level,Unnamed: 1_level_1
0,10068
3,934
2,813
1,671
4,282


In [10]:
convabuse_imbalanced = convabuse

In [11]:
# Resample 0 as the majority class and 1, 2, 3, and 4 as the minority classes in the abuse_level column

convabuse_majority_y = convabuse[convabuse["abuse_level"] == 0]
convabuse_minority_y1 = convabuse[convabuse["abuse_level"] == 1]
convabuse_minority_y2 = convabuse[convabuse["abuse_level"] == 2]
convabuse_minority_y3 = convabuse[convabuse["abuse_level"] == 3]
convabuse_minority_y4 = convabuse[convabuse["abuse_level"] == 4]

# Upsample the minority classes
convabuse_minority_y1_upsampled = resample(convabuse_minority_y1,random_state=42,n_samples=(len(convabuse_majority_y)),replace=True)
convabuse_minority_y2_upsampled = resample(convabuse_minority_y2,random_state=42,n_samples=(len(convabuse_majority_y)),replace=True)
convabuse_minority_y3_upsampled = resample(convabuse_minority_y3,random_state=42,n_samples=(len(convabuse_majority_y)),replace=True)
convabuse_minority_y4_upsampled = resample(convabuse_minority_y4,random_state=42,n_samples=(len(convabuse_majority_y)),replace=True)

In [12]:
#concatenate the upsampled dataframe
convabuse_upsampled = pd.concat([convabuse_majority_y, convabuse_minority_y1_upsampled, convabuse_minority_y2_upsampled, convabuse_minority_y3_upsampled, convabuse_minority_y4_upsampled]).reset_index()
convabuse_upsampled

Unnamed: 0,index,example_no,annotator_id,bot,ableism,homophobic,intellectual,racist,sexist,sex_harassment,transphobic,generalised,individual,system,explicit,implicit,abuse_level
0,0,0,7,E.L.I.Z.A.,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,2,7,E.L.I.Z.A.,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3,3,7,CarbonBot,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4,4,7,E.L.I.Z.A.,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5,5,7,E.L.I.Z.A.,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50335,11737,11737,4,E.L.I.Z.A.,0,0,0,0,0,1,0,0,0,1,1,0,4
50336,11031,11031,6,E.L.I.Z.A.,0,0,0,0,0,0,0,0,0,1,1,0,4
50337,9928,9928,5,CarbonBot,0,0,0,0,0,1,0,0,0,1,1,0,4
50338,6770,6770,5,E.L.I.Z.A.,0,0,0,0,1,0,0,0,0,1,1,0,4


In [13]:
convabuse_upsampled["abuse_level"].value_counts()

Unnamed: 0_level_0,count
abuse_level,Unnamed: 1_level_1
0,10068
1,10068
2,10068
3,10068
4,10068


In [14]:
convabuse_upsampled = convabuse_upsampled.drop(columns=['index'])

In [15]:
convabuse_upsampled["abuse_level"].unique()

array([0, 1, 2, 3, 4])

In [16]:
convabuse_upsampled.head()

Unnamed: 0,example_no,annotator_id,bot,ableism,homophobic,intellectual,racist,sexist,sex_harassment,transphobic,generalised,individual,system,explicit,implicit,abuse_level
0,0,7,E.L.I.Z.A.,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,7,E.L.I.Z.A.,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3,7,CarbonBot,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4,7,E.L.I.Z.A.,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5,7,E.L.I.Z.A.,0,0,0,0,0,0,0,0,0,0,0,0,0


In [17]:
convabuse_upsampled.shape

(50340, 16)

In [18]:
convabuse = convabuse_upsampled

In [19]:
# Output the convabuse dataframe as a csv file
convabuse.to_csv("/content/drive/MyDrive/Online MSDS/MOD C2/Political Polarization/data/convabuse_final.csv", index=False)
convabuse_imbalanced.to_csv("/content/drive/MyDrive/Online MSDS/MOD C2/Political Polarization/data/convabuse_imbalanced.csv", index=False)

# Dynamically Generated Hate Speech Dataset

In [20]:
# Preview dynamically_generated_hate_speech
dynamically_generated_hate_speech.head()

Unnamed: 0,label,type,level,annotator,race_or_ethnicity,gender,religion,lgbtq,nationality,age,disability,class,none
0,1.0,0,original,3,0,0,0,0,0,0,0,0,1
1,1.0,0,original,3,0,0,0,0,0,0,0,0,1
2,1.0,0,original,18,0,0,0,0,0,0,0,0,1
3,1.0,0,original,16,0,0,0,0,0,0,0,0,1
4,1.0,0,original,7,0,0,0,0,0,0,0,0,1


In [21]:
dynamically_generated_hate_speech["type"].value_counts()

Unnamed: 0_level_0,count
type,Unnamed: 1_level_1
0,26079
-1,14858
1,89


In [22]:
dynamically_generated_hate_speech["label"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1.0,22057
0.0,18969


In [23]:
dynamically_generated_hate_speech_imbalanced = dynamically_generated_hate_speech

In [24]:
# Resample 0 as the majority class and 1 as the minority class in the label column

#set the minority class to a separate dataframe
dynamically_generated_hate_speech_minority_y = dynamically_generated_hate_speech[dynamically_generated_hate_speech['label'] == 1]
#set other classes to another dataframe
dynamically_generated_hate_speech_majority_y = dynamically_generated_hate_speech[dynamically_generated_hate_speech['label'] == 0]
#upsample the minority class
dynamically_generated_hate_speech_minority_y_upsampled = resample(dynamically_generated_hate_speech_minority_y,random_state=42,n_samples=(len(dynamically_generated_hate_speech_majority_y)),replace=True)

In [25]:
#concatenate the upsampled dataframe
dynamically_generated_hate_speech_upsampled = pd.concat([dynamically_generated_hate_speech_minority_y_upsampled, dynamically_generated_hate_speech_majority_y]).reset_index()
dynamically_generated_hate_speech_upsampled

Unnamed: 0,index,label,type,level,annotator,race_or_ethnicity,gender,religion,lgbtq,nationality,age,disability,class,none
0,30605,1.0,0,original,16,0,0,0,0,0,0,0,0,1
1,1350,1.0,-1,original,9,0,0,0,0,0,0,1,0,0
2,10434,1.0,-1,original,16,1,0,0,0,0,0,0,0,0
3,40283,1.0,0,original,2,0,0,0,0,0,0,0,0,1
4,23179,1.0,-1,original,17,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37933,41000,0.0,0,original,1,0,0,0,0,0,0,0,0,1
37934,41016,0.0,0,original,13,0,0,0,0,0,0,0,0,1
37935,41020,0.0,0,original,1,0,0,0,0,0,0,0,0,1
37936,41023,0.0,0,original,1,0,0,0,0,0,0,0,0,1


In [26]:
dynamically_generated_hate_speech_upsampled['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1.0,18969
0.0,18969


In [27]:
dynamically_generated_hate_speech_upsampled = dynamically_generated_hate_speech_upsampled.drop(columns=['index'])

In [28]:
dynamically_generated_hate_speech_upsampled.head()

Unnamed: 0,label,type,level,annotator,race_or_ethnicity,gender,religion,lgbtq,nationality,age,disability,class,none
0,1.0,0,original,16,0,0,0,0,0,0,0,0,1
1,1.0,-1,original,9,0,0,0,0,0,0,1,0,0
2,1.0,-1,original,16,1,0,0,0,0,0,0,0,0
3,1.0,0,original,2,0,0,0,0,0,0,0,0,1
4,1.0,-1,original,17,1,0,0,0,0,0,0,0,0


In [29]:
dynamically_generated_hate_speech_upsampled["level"].unique()

array(['original', 'perturbation'], dtype=object)

In [30]:
# Rename level to original, then reassign 1 to "original" values and 0 to "perturbation"
dynamically_generated_hate_speech_upsampled["original"] = dynamically_generated_hate_speech_upsampled["level"]
dynamically_generated_hate_speech_imbalanced["original"] = dynamically_generated_hate_speech_imbalanced["level"]

In [31]:
# Replace "original" with 1 and "perturbation" with 0
dynamically_generated_hate_speech_upsampled["original"] = dynamically_generated_hate_speech_upsampled["original"].replace({"original": 1, "perturbation": 0})
dynamically_generated_hate_speech_imbalanced["original"] = dynamically_generated_hate_speech_imbalanced["original"].replace({"original": 1, "perturbation": 0})

  dynamically_generated_hate_speech_upsampled["original"] = dynamically_generated_hate_speech_upsampled["original"].replace({"original": 1, "perturbation": 0})
  dynamically_generated_hate_speech_imbalanced["original"] = dynamically_generated_hate_speech_imbalanced["original"].replace({"original": 1, "perturbation": 0})


In [32]:
dynamically_generated_hate_speech_upsampled["original"].unique()

array([1, 0])

In [33]:
# Drop the level column
dynamically_generated_hate_speech_upsampled = dynamically_generated_hate_speech_upsampled.drop(columns=['level'])
dynamically_generated_hate_speech_imbalanced = dynamically_generated_hate_speech_imbalanced.drop(columns=['level'])

In [34]:
dynamically_generated_hate_speech_upsampled.head()

Unnamed: 0,label,type,annotator,race_or_ethnicity,gender,religion,lgbtq,nationality,age,disability,class,none,original
0,1.0,0,16,0,0,0,0,0,0,0,0,1,1
1,1.0,-1,9,0,0,0,0,0,0,1,0,0,1
2,1.0,-1,16,1,0,0,0,0,0,0,0,0,1
3,1.0,0,2,0,0,0,0,0,0,0,0,1,1
4,1.0,-1,17,1,0,0,0,0,0,0,0,0,1


In [35]:
dynamically_generated_hate_speech = dynamically_generated_hate_speech_upsampled

In [36]:
# Output dynamically_generated_hate_speech to a csv file
dynamically_generated_hate_speech.to_csv("/content/drive/MyDrive/Online MSDS/MOD C2/Political Polarization/data/dynamically_generated_hate_speech_dataset_final.csv", index=False)
dynamically_generated_hate_speech_imbalanced.to_csv("/content/drive/MyDrive/Online MSDS/MOD C2/Political Polarization/data/dynamically_generated_hate_speech_dataset_imbalanced.csv", index=False)

# Online Abusive Attacks

In [37]:
online_abusive_attacks.head()

Unnamed: 0,verified,high Identity_Attack,high Insult,high Profanity,high Threat,high other attr,Toxicity
0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,2.0,0.0,10.0,0.0,0.0,1.0
2,0,17.0,1.0,25.0,1.0,0.0,1.0
3,0,9.0,2.0,33.0,2.0,0.0,3.0
4,0,0.0,0.0,7.0,0.0,0.0,0.0


In [38]:
# Output online_abusive_attacks_hate_speech to a csv file
online_abusive_attacks.to_csv("/content/drive/MyDrive/Online MSDS/MOD C2/Political Polarization/data/online_abusive_attacks_final.csv", index=False)

# Hate Speech 2020 US Elections Dataset

In [39]:
us_elections_2020_hate_speech.head()

Unnamed: 0,Trump,Biden,HOF
0,0.0,4.0,Non-Hateful
1,0.0,4.0,Non-Hateful
2,1.0,2.0,Non-Hateful
3,4.0,2.0,Non-Hateful
4,2.0,0.0,Non-Hateful


In [40]:
us_elections_2020_hate_speech['HOF'].unique()

array(['Non-Hateful', 'Hateful'], dtype=object)

In [41]:
# Replace values of "Non-Hateful" with 0 and values of "Hateful" with 1
us_elections_2020_hate_speech['HOF'] = us_elections_2020_hate_speech['HOF'].replace({'Non-Hateful': 0, 'Hateful': 1})

  us_elections_2020_hate_speech['HOF'] = us_elections_2020_hate_speech['HOF'].replace({'Non-Hateful': 0, 'Hateful': 1})


In [42]:
# View the distribution of HOF values
us_elections_2020_hate_speech['HOF'].value_counts()

Unnamed: 0_level_0,count
HOF,Unnamed: 1_level_1
0,2648
1,352


In [43]:
us_elections_2020_hate_speech_imbalanced = us_elections_2020_hate_speech

In [44]:
# Resample 0 as the majority class and 1 as the minority class in the HOF column

#set the minority class to a seperate dataframe
us_elections_2020_hate_speech_minority_y = us_elections_2020_hate_speech[us_elections_2020_hate_speech['HOF'] == 1]
#set other classes to another dataframe
us_elections_2020_hate_speech_majority_y = us_elections_2020_hate_speech[us_elections_2020_hate_speech['HOF'] == 0]
#upsample the minority class
us_elections_2020_hate_speech_minority_y_upsampled = resample(us_elections_2020_hate_speech_minority_y,random_state=42,n_samples=(len(us_elections_2020_hate_speech_majority_y)),replace=True)

In [45]:
#concatenate the upsampled dataframe
us_elections_2020_hate_speech_upsampled = pd.concat([us_elections_2020_hate_speech_minority_y_upsampled, us_elections_2020_hate_speech_majority_y]).reset_index()
us_elections_2020_hate_speech_upsampled

Unnamed: 0,index,Trump,Biden,HOF
0,873,1.0,4.0,1
1,2947,0.0,3.0,1
2,2274,0.0,4.0,1
3,902,1.0,4.0,1
4,554,4.0,1.0,1
...,...,...,...,...
5291,2995,0.0,4.0,0
5292,2996,4.0,1.0,0
5293,2997,4.0,0.0,0
5294,2998,1.0,4.0,0


In [46]:
us_elections_2020_hate_speech_upsampled['HOF'].value_counts()

Unnamed: 0_level_0,count
HOF,Unnamed: 1_level_1
1,2648
0,2648


In [47]:
us_elections_2020_hate_speech = us_elections_2020_hate_speech_upsampled

In [48]:
# Output the us_elections_2020_hate_speech dataset
us_elections_2020_hate_speech.to_csv("/content/drive/MyDrive/Online MSDS/MOD C2/Political Polarization/data/us_elections_2020_hate_speech_final.csv", index=False)
us_elections_2020_hate_speech_imbalanced.to_csv("/content/drive/MyDrive/Online MSDS/MOD C2/Political Polarization/data/us_elections_2020_hate_speech_imbalanced.csv", index=False)