# Install and Import Dependencies

In [8]:
% cd "/content/drive/MyDrive/Colab Notebooks/Rewire"

/content/drive/MyDrive/Colab Notebooks/Rewire


In [28]:
# Try unstaging and using .gitignore for models
# Try committing just one file

b977ed27462f669b88e2b1e21851baa94fadef12 refs/heads/master


In [36]:
# Shell escape: report the working-tree status of the git repository in the
# current directory (modified / staged / untracked files).
!git status

On branch master
Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git checkout -- <file>..." to discard changes in working directory)

	[31mmodified:   1.load_and_clean_data.ipynb[m

no changes added to commit (use "git add" and/or "git commit -a")


In [None]:
!pip install datasets transformers[sentencepiece]
!pip install emoji

Collecting datasets
  Downloading datasets-2.2.2-py3-none-any.whl (346 kB)
[K     |████████████████████████████████| 346 kB 6.3 MB/s 
[?25hCollecting transformers[sentencepiece]
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 60.6 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 54.5 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 56.9 MB/s 
[?25hCollecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.5.0-py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 54.9 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface

In [None]:
# Standard libraries
import os
import pandas as pd
import re

# For Twitter data
import emoji
from html import unescape

# Loading data
from datasets import load_dataset

# Splitting data
from sklearn.model_selection import train_test_split

# Load Data

+ Looked at [Connected papers](https://www.connectedpapers.com/main/8dd6a2c9c88c9b3465484228c93f4dcc11cfeab9/Automated-Hate-Speech-Detection-and-the-Problem-of-Offensive-Language/graph); [Davidson et al. (2017)](https://arxiv.org/abs/1703.04009) was the most popular paper for hate speech.
+ Data set allowed for simple binary classification of hate speech / no hate speech.
+ Downloaded from [here](https://huggingface.co/datasets/hate_speech_offensive) through HuggingFace's datasets library.

__How was dataset compiled?__

+ Used Twitter API to search for words and phrases identified as hate speech from Hatebase.org.
+ 25k tweets randomly selected from corresponding users, but had to contain hate speech terms.
+ Labels were generated through CrowdFlower. Workers were asked to label each tweet as: hate speech, offensive but not hate speech, and neither offensive nor hate speech.
+ Workers were provided with hate speech definition.
+ Each tweet was labelled by three or more workers; the majority decision was used as the ground truth (tweets with no majority were excluded).
+ __Definition:__ _language that is used to expresses hatred towards a targeted group or is intended to be derogatory, to humiliate, or to insult the members of the group_

In [None]:
# Pull the corpus from the HuggingFace hub. It is published with a single
# 'train' split, so that split is converted to a pandas DataFrame.
dataset = load_dataset("hate_speech_offensive")
train_split = dataset['train']
df = train_split.to_pandas()

# Show the first rows to confirm the data loaded as expected
df.head()

Downloading builder script:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/823 [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset hate_speech_offensive/default (download: 2.43 MiB, generated: 3.06 MiB, post-processed: Unknown size, total: 5.49 MiB) to /root/.cache/huggingface/datasets/hate_speech_offensive/default/1.0.0/5f5dfc7b42b5c650fe30a8c49df90b7dbb9c7a4b3fe43ae2e66fabfea35113f5...


Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/24783 [00:00<?, ? examples/s]

Dataset hate_speech_offensive downloaded and prepared to /root/.cache/huggingface/datasets/hate_speech_offensive/default/1.0.0/5f5dfc7b42b5c650fe30a8c49df90b7dbb9c7a4b3fe43ae2e66fabfea35113f5. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,count,hate_speech_count,offensive_language_count,neither_count,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [None]:
# Drop the per-annotator vote counts; only the label and the text are used in
# the rest of this notebook.
drop_cols = ['count', 'hate_speech_count', 'offensive_language_count',
             'neither_count']
# drop() raises KeyError if this cell is re-run on the already-trimmed frame.
# That is deliberate: it also stops the relabelling below from running a second
# time, which would invert the labels.
df = df.drop(columns=drop_cols)

# Binarise labels: original class 0 (hate speech) -> 1, classes 1 and 2 -> 0.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df['class'], because chained in-place mutation
# does not update df under pandas Copy-on-Write.
df['class'] = df['class'].eq(0).astype('int64')

# Check classes
df['class'].value_counts()

0    23353
1     1430
Name: class, dtype: int64

# Clean Data

In [None]:
def clean_tweet(tweet):
  """Normalise a raw tweet into lower-cased text with placeholder tokens.

  Steps, in order:
    1. Decode HTML entities (e.g. ``&amp;`` -> ``&``, ``&#39;`` -> ``'``).
    2. Lower-case the text.
    3. Delete apostrophes so contractions collapse (``don't`` -> ``dont``).
    4. Replace each newline or tab with a space.
    5. Replace hashtags, URLs, mentions and emojis with the tokens
       ``[HASH]``, ``[URL]``, ``[MENTION]`` and ``[EMOJI]``.
    6. Strip the run of ``!`` characters at the start of the tweet.

  Args:
    tweet: Raw tweet text (str).

  Returns:
    The cleaned tweet (str). Whitespace that followed the leading ``!`` run
    is kept, so the result may start with a space.
  """
  # Decode HTML entities first. If apostrophes were removed before decoding,
  # entity-encoded ones (&#39;) would reappear afterwards and stay in the text.
  ret = unescape(tweet)  # e.g. &amp; -> &

  # Convert to lower case
  ret = ret.lower()

  # Remove apostrophes so contractions collapse, e.g. don't -> dont
  ret = ret.replace("'", "")

  # Replace new lines and tabs with spaces
  ret = re.sub(r"[\n\t]", " ", ret)

  # Replace hashtags, URLs, mentions and emojis with special tokens.
  # The hashtag text and the specific emoji are discarded, which loses
  # information.
  ret = re.sub(r"#[A-Za-z0-9_]+", "[HASH]", ret)
  ret = re.sub(r"http\S+", "[URL]", ret)  # raw string: \S is a regex escape
  ret = re.sub(r"@[A-Za-z0-9_]+", "[MENTION]", ret)
  ret = emoji.replace_emoji(ret, "[EMOJI]")

  # Data has !s at start of tweets - strip these
  return ret.lstrip("!")


# Spot-check the cleaner on the tweets labelled 0, 1 and 2 in the index
for row_label in (0, 1, 2):
  raw_tweet = df.at[row_label, "tweet"]
  cleaned_tweet = clean_tweet(raw_tweet)
  print("Before: ", raw_tweet)
  print("After: ", cleaned_tweet)

Before:  !!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out...
After:   rt [MENTION]: as a woman you shouldnt complain about cleaning up your house. & as a man you should always take the trash out...
Before:  !!!!! RT @mleew17: boy dats cold...tyga dwn bad for cuffin dat hoe in the 1st place!!
After:   rt [MENTION]: boy dats cold...tyga dwn bad for cuffin dat hoe in the 1st place!!
Before:  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby4life: You ever fuck a bitch and she start to cry? You be confused as shit
After:   rt [MENTION] dawg!!!! rt [MENTION]: you ever fuck a bitch and she start to cry? you be confused as shit


In [None]:
# Clean every tweet, overwriting the raw text column with the cleaned version
df['tweet'] = df['tweet'].map(clean_tweet)

# Save Data

In [None]:
# Create directory to hold data.
# makedirs(..., exist_ok=True) keeps this cell safe to re-run: os.mkdir raises
# FileExistsError once the folder exists, and fails if a parent is missing.
fpath = "/content/drive/MyDrive/Colab Notebooks/Rewire/data"
os.makedirs(fpath, exist_ok=True)

In [None]:
# Two-stage stratified split giving roughly 80% train / 10% val / 10% test.
# The same seed is used for both stages so the partition is reproducible.
SEED = 1305

# Stage 1: keep 80% for training and hold the remainder out
train, res = train_test_split(
    df,
    train_size=0.8,
    stratify=df['class'],
    random_state=SEED,
)

# Stage 2: halve the held-out part into validation and test sets
val, test = train_test_split(
    res,
    train_size=0.5,
    stratify=res['class'],
    random_state=SEED,
)

In [None]:
# Confirm the class balance was preserved in each split (train, val, test)
for subset in (train, val, test):
  print(subset['class'].value_counts())

0    18682
1     1144
Name: class, dtype: int64
0    2335
1     143
Name: class, dtype: int64
0    2336
1     143
Name: class, dtype: int64


In [None]:
# Write each split to the data directory as CSV, without the index column
for frame, suffix in ((train, "/train.csv"), (val, "/val.csv"), (test, "/test.csv")):
  frame.to_csv(fpath + suffix, index=False)