<a href="https://colab.research.google.com/github/andresvBSE/NLP-Quick-Notebooks/blob/main/hate_speech_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Proof of Concept: Hate Speech identification using NLP Classification Techniques

This notebook demonstrates how well BERT performs compared to Doc2Vec for text classification. An in-depth evaluation of classifier performance is omitted in this version.

### doc2vec

In [None]:
pip install gensim



In [None]:
import re
import pandas as pd
import numpy as np

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

import nltk
# Fetch the 'punkt_tab' tokenizer data (unpacked under tokenizers/ per the log
# below); presumably required by word_tokenize, which is used later to split
# tweets into tokens. The cell's `True` output is the call's return value.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Labelled tweets hosted on the Hugging Face Hub (tdavidson/hate_speech_offensive)
DATASET_URI = "hf://datasets/tdavidson/hate_speech_offensive/data/train-00000-of-00001.parquet"

df = pd.read_parquet(DATASET_URI)
print(df.shape)
df.head()

(24783, 6)


Unnamed: 0,count,hate_speech_count,offensive_language_count,neither_count,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [None]:
# Meaning of the values in the `class` column:
#   0 = hate speech
#   1 = offensive language
#   2 = neither

In [None]:
# Class distribution: per the counts displayed below, class 1 dominates and
# class 0 is the rarest, i.e. the labels are strongly imbalanced.
df['class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
1,19190
2,4163
0,1430


Preprocessing

In [None]:
import re

def clean_text(text):
    """Normalize a raw tweet before tokenization.

    Steps, in order: lowercase the text; drop URLs, @mentions and #hashtags;
    drop every character that is not a-z or whitespace; drop the standalone
    retweet marker "rt"; collapse whitespace runs and trim both ends.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned, lowercase string (may be empty).
    """
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|@\w+|#\w+", "", text)  # Remove links, mentions, hashtags
    text = re.sub(r"[^a-z\s]", "", text)  # Remove special characters
    # Match "rt" only as a whole word. The plain pattern "rt " also matched the
    # tail of words such as "smart " or "start " and glued them to the next word.
    text = re.sub(r"\brt\b", "", text)
    # The removals above leave runs of spaces behind; normalize them.
    return re.sub(r"\s+", " ", text).strip()

# Overwrites the raw tweets in place: every later cell that reads df["tweet"]
# (Doc2Vec tagging, BERT embedding) therefore works on the cleaned text.
df["tweet"] = df["tweet"].apply(clean_text)

In [None]:
# Spot-check a random handful of cleaned tweets next to their class labels.
df[["tweet", "class"]].sample(10)

Unnamed: 0,tweet,class
5703,bitch yessssssss omg i jus saw one of em yest...,1
19064,me lets try a nd attempt of my first day of w...,1
6408,naw really bitch i mean yo address,1
6434,bitch cause u wanted to fight and you didnt s...,1
18980,when the pussy so good you cant pull out ever,1
24425,sick and watching the yankees game,2
3207,aids fuck smdh u punk bitch,1
10328,i get more nudes than all you bitches,1
1701,sydneys my favorite shes a nigger but i lov...,1
14795,uhh bitch dont tell him bless you when he sne...,1


Feature extraction using doc2vec

In [None]:
# Tokenize every tweet and tag it with its position in the column,
# producing the TaggedDocument corpus that Doc2Vec trains on
tagged_docs = [
    TaggedDocument(words=word_tokenize(tweet.lower()), tags=[position])
    for position, tweet in enumerate(df["tweet"])
]

In [None]:
# Train the Doc2Vec embedding in explicit steps:
# configure the model, build its vocabulary, then run the training epochs
doc2vec_params = dict(vector_size=50, window=2, min_count=1, workers=4, epochs=40)

model = Doc2Vec(**doc2vec_params)
model.build_vocab(tagged_docs)
model.train(
    tagged_docs,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

In [None]:
# Inspect the learned document vectors (float32 matrix; the repr shown below
# is abbreviated). Presumably one row per tagged tweet — see the tags above.
model.dv.vectors

array([[ 0.41172102,  0.92762756,  0.17197743, ..., -0.15746708,
        -0.43023977, -0.19311401],
       [-0.35000595, -0.173231  , -0.01797272, ..., -0.37771216,
        -0.04407789,  0.11313123],
       [ 0.38363904,  0.13514496,  0.15318188, ...,  0.2450623 ,
         0.23286927, -0.06440625],
       ...,
       [-0.06679089,  0.19040409, -0.3556444 , ...,  0.20945835,
        -0.3676932 ,  0.12771498],
       [-0.11768636,  0.08253502, -0.0735552 , ..., -0.03843921,
        -0.1033216 , -0.07112855],
       [-0.22610714,  0.13842842,  0.19075379, ..., -0.9870363 ,
         0.254402  ,  0.21366833]], dtype=float32)

In [None]:
# Assemble the modelling inputs: the Doc2Vec document vectors as a feature
# frame, and the `class` column as the target
doc_vectors = model.dv.vectors
X = pd.DataFrame(doc_vectors)
y = df["class"]
X.sample(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
22247,-0.473599,0.033474,0.244045,-0.209016,-0.116598,-0.663499,-0.072867,0.296895,-0.327417,-0.114478,...,0.188741,-0.004067,0.375998,0.163245,-0.034789,-0.074026,0.059702,-0.066587,0.646589,0.045828
4208,-0.10392,-0.283527,-0.244243,-0.047588,-0.203303,0.221033,0.323213,0.102835,-0.212207,-0.100447,...,0.382796,0.02997,-0.216016,-0.134006,0.164935,-0.103302,0.211117,-0.365369,0.206383,0.16932
19521,-0.122393,-0.042349,-0.133349,-0.348638,0.025199,-0.504163,0.372282,0.158527,0.008545,-0.305996,...,0.078423,0.200234,-0.191509,0.218953,0.073608,0.040094,0.080846,-0.634363,0.15305,0.162972
24113,0.069476,-0.317302,0.108719,0.299673,0.347303,0.254282,0.076732,0.162294,0.185098,0.48029,...,0.007286,-0.473727,-0.226825,0.417789,0.68409,-0.343245,-0.204817,0.013574,-0.053173,0.107508
10648,-0.134278,-0.223787,-0.388677,-0.017014,-0.32794,-0.111646,0.243314,0.463184,-0.623772,-0.478876,...,0.237086,-0.328002,0.096677,0.134624,0.085553,-0.306034,0.067471,0.222491,-0.169762,-0.099979


Creation of a classification model (random forest)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Shallow random forest; class_weight="balanced" reweights the classes and
# random_state fixes the forest's randomness for repeatable fits
rfC = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    class_weight="balanced",
    random_state=434,
)
rfC.fit(X, y)

Use the model to predict hate or offensive speech

In [None]:
# Infer Doc2Vec vectors for new documents and classify them
new_text = ["I have issues to gain weight", "fuck the police", "Mr Sleepy should go to bed early", "Rock and Roll and Movies forever", "The children are the future of the world"]

# Run the new text through the same clean_text preprocessing as the training
# tweets. The Doc2Vec vocabulary was built from cleaned, lowercased tokens, so
# raw tokens such as "I", "Mr" or "Rock" would not match any known word and
# would contribute nothing to the inferred vector.
new_vectors_list = [
    model.infer_vector(word_tokenize(clean_text(text)))
    for text in new_text
]

# One row per new document, same column layout as the training features X
X_n = pd.DataFrame(np.vstack(new_vectors_list))

rfC.predict(X_n)

array([0, 0, 1, 1, 0])

In [None]:
# How to read the predicted labels above:
#   0 = hate speech
#   1 = offensive language
#   2 = neither

## BERT



Feature extraction using BERT

In [None]:
import torch
from transformers import BertTokenizer, BertModel
from tqdm import tqdm

In [None]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


Using device: cuda


In [None]:
# Load the pretrained 'bert-base-uncased' tokenizer and encoder.
# NOTE: this rebinds `model`, the name the Doc2Vec cells above use for the
# Doc2Vec model; re-running the Doc2Vec inference cell after this one would
# call infer_vector on the BERT model instead.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Switch to evaluation mode (affects layers such as the Dropout modules listed
# in the repr below). This does not by itself disable gradient tracking; that
# is done with torch.no_grad() inside get_bert_embeddings.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [None]:
# prompt: how to run the bert fine tuning with the use of the GPU

import pandas as pd
import numpy as np
# Resolve the compute device (GPU when CUDA is usable, CPU otherwise)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Place the BERT weights on that device so inputs and model live together
model.to(device)

# Function to get BERT embeddings in batches to manage memory on GPU
def get_bert_embeddings(texts, tokenizer, model, device, batch_size=32):
    """Encode texts into fixed-size vectors using the model's first-token state.

    Texts are tokenized and run through the model in batches of `batch_size`
    with gradient tracking disabled; for each text the hidden state at
    position 0 of `last_hidden_state` (the [CLS] token for BERT) is kept.

    Args:
        texts: A sequence of strings (list, tuple, pandas Series, ...). A
            single string is treated as one document.
        tokenizer: Callable invoked as
            `tokenizer(batch, padding=True, truncation=True, return_tensors='pt')`
            whose result has a `.to(device)` method and can be unpacked with `**`.
        model: Callable model with an `eval()` method whose output exposes
            `last_hidden_state`.
        device: Device the tokenized batch is moved to; the model is expected
            to already be on this device.
        batch_size: Number of texts per forward pass.

    Returns:
        A 2-D NumPy array with one row per text. For empty input the result
        has shape (0, hidden_size), with hidden_size read from
        `model.config.hidden_size` when available (0 otherwise).

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # A bare string would otherwise be sliced character-wise below.
    if isinstance(texts, str):
        texts = [texts]
    # Materialize as a plain list so every slice handed to the tokenizer is a
    # list of str regardless of the container type passed in.
    texts = list(texts)

    model.eval()  # Set model to evaluation mode

    # np.vstack raises on an empty list, so return an empty matrix explicitly.
    if not texts:
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    embeddings = []
    with torch.no_grad():  # Disable gradient calculation
        for i in tqdm(range(0, len(texts), batch_size)):
            batch_texts = texts[i:i+batch_size]
            # Tokenize the batch
            encoded_input = tokenizer(batch_texts, padding=True, truncation=True, return_tensors='pt').to(device)
            # Get embeddings
            output = model(**encoded_input)
            # Use the representation of the first token (CLS) as the sentence embedding
            batch_embeddings = output.last_hidden_state[:, 0, :].cpu().numpy()
            embeddings.append(batch_embeddings)
    return np.vstack(embeddings)

# Embed every tweet in the dataset with BERT
# (batch_size trades throughput against GPU memory; tune it to your hardware)
tweet_texts = df["tweet"].tolist()
X_bert = get_bert_embeddings(tweet_texts, tokenizer, model, device, batch_size=64)

# Wrap the embedding matrix in a DataFrame for the classifier
X_bert_df = pd.DataFrame(X_bert)




Using device: cuda


100%|██████████| 388/388 [00:53<00:00,  7.30it/s]


In [None]:
# Use the BERT embeddings for classification
from sklearn.ensemble import RandomForestClassifier
rfC_bert = RandomForestClassifier(n_estimators=100, max_depth=5, class_weight="balanced", random_state=434)
rfC_bert.fit(X_bert_df, y)

# New documents to classify
new_text = ["I have issues to gain weight", "fuck the police", "Mr Sleepy should go to bed early", "Rock and Roll and Movies forever", "The children are the future of the world"]

# Apply the same clean_text preprocessing the training tweets went through, so
# the classifier sees embeddings of identically normalized text at fit and
# predict time (links, mentions, hashtags and punctuation removed).
new_text_clean = [clean_text(text) for text in new_text]

# Get BERT embeddings for the new text
X_n_bert = get_bert_embeddings(new_text_clean, tokenizer, model, device, batch_size=5) # Use a batch size suitable for the number of new texts

# Predict using the model trained on BERT embeddings
predictions_bert = rfC_bert.predict(X_n_bert)

print("Predictions using BERT embeddings:")
predictions_bert

100%|██████████| 1/1 [00:00<00:00, 63.86it/s]

Predictions using BERT embeddings:





array([2, 0, 2, 2, 2])

In [None]:
# How to read the predicted labels above:
#   0 = hate speech
#   1 = offensive language
#   2 = neither