# Project 2 NLP: Hatespeech Classifier

## Authors:

Adrian Obermühlner & Freja Rasmussen

## Resarch Question:

How do different preprocessing methods (nothing, stop word removal, lemming, stemming,…) affect the result of a hate speech classifier?

## Imports

In [1]:
# Imports
import pandas as pd
import numpy as np
import torch

# Preprocessing imports
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Tokenizing
import gensim.downloader as api
from sklearn.feature_extraction.text import TfidfVectorizer

import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader


In [2]:
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: NVIDIA GeForce GTX 1650 Ti


## Data Import


In [3]:
RANDOM_SEED = 42
BINARY_LABEL = "is_hate"
CATEGORIES = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

np.random.seed(RANDOM_SEED)  # set random seed for reproducibility
# Make the labels into hate and no hate as 1 and 0

def binarize_labels(df):
    return (df[CATEGORIES].sum(axis=1) > 0).astype(int)

data_train = pd.read_csv("./data/train/train.csv", index_col=0)
data_train[BINARY_LABEL] = binarize_labels(data_train)

data_test = pd.read_csv("./data/test/test.csv", index_col=0).join(
    pd.read_csv("./data/test_labels/test_labels.csv", index_col=0)
)
data_test.drop(data_test[data_test["toxic"] == -1].index, inplace=True)
data_test[BINARY_LABEL] = binarize_labels(data_test)

In [4]:
data_train['comment_text'].head(10)

id
0000997932d777bf    Explanation\nWhy the edits made under my usern...
000103f0d9cfb60f    D'aww! He matches this background colour I'm s...
000113f07ec002fd    Hey man, I'm really not trying to edit war. It...
0001b41b1c6bb37e    "\nMore\nI can't make any real suggestions on ...
0001d958c54c6e35    You, sir, are my hero. Any chance you remember...
00025465d4725e87    "\n\nCongratulations from me as well, use the ...
0002bcb3da6cb337         COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK
00031b1e95af7921    Your vandalism to the Matt Shirvington article...
00037261f536c51d    Sorry if the word 'nonsense' was offensive to ...
00040093b2687caa    alignment on this subject and which are contra...
Name: comment_text, dtype: object

In [5]:
data_test.head(10)

Unnamed: 0_level_0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,is_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0001ea8717f6de06,Thank you for understanding. I think very high...,0,0,0,0,0,0,0
000247e83dcc1211,:Dear god this site is horrible.,0,0,0,0,0,0,0
0002f87b16116a7f,"""::: Somebody will invariably try to add Relig...",0,0,0,0,0,0,0
0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ...",0,0,0,0,0,0,0
00059ace3e3e9a53,""" \n\n == Before adding a new product to the l...",0,0,0,0,0,0,0
000663aff0fffc80,this other one from 1897,0,0,0,0,0,0,0
000689dd34e20979,== Reason for banning throwing == \n\n This ar...,0,0,0,0,0,0,0
000844b52dee5f3f,|blocked]] from editing Wikipedia. |,0,0,0,0,0,0,0
00091c35fa9d0465,"== Arabs are committing genocide in Iraq, but ...",1,0,0,0,0,0,1
000968ce11f5ee34,Please stop. If you continue to vandalize Wiki...,0,0,0,0,0,0,0


In [6]:
# get the distribution of the labels to see if roughly similar for both

is_hate_count_train = data_train['is_hate'].value_counts()
ratio_train = is_hate_count_train/ len(data_train)

is_hate_count_test = data_test['is_hate'].value_counts()
ratio_test = is_hate_count_test/ len(data_test)

print('Ratio of no/is hate for train set: ', ratio_train)
print('Ratio of no/is hate for test set: ', ratio_test)

Ratio of no/is hate for train set:  0    0.898321
1    0.101679
Name: is_hate, dtype: float64
Ratio of no/is hate for test set:  0    0.90242
1    0.09758
Name: is_hate, dtype: float64


## Representation

## Data Preprocessing

**Note**: We would need to make a loop for the different combinations of 
preprocessing (none, only stemming, only lemming, only stop word removal and every combination of this)
Either as coloumns that can be used to iterate over for the model training and validation, or make the preprocessing
and then go further and repeat from beginning.


In [16]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    
    # Lowercase all text    
    text = text.lower()
    
    # Tokenize the text into words
    tokens = word_tokenize(text)
    
    # Remove stopwords and apply stemming
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    
    # Lemming of words
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens if word.isalpha()]
    
    # Join the stemmed words back into a sentence
    return ' '.join(lemmatized_tokens)

data_train['comment_text_clean'] = data_train['comment_text'].apply(preprocess_text)
data_test['comment_text_clean'] = data_test['comment_text'].apply(preprocess_text)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\flras\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\flras\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\flras\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [17]:
# Train and Test set
X_train = data_train["comment_text_clean"]
y_train = data_train["is_hate"]

X_test = data_test["comment_text_clean"]
y_test = data_test["is_hate"]

In [18]:
maxLen = 0

for comment in data_train['comment_text_clean']:
    length = len(comment.split())
    if(length > maxLen):
        maxLen = length

print(maxLen)

1250


## Word Embedding



**Notes**: Tokenizing with TF-IDF

In [19]:
tfidf_vectorizer = TfidfVectorizer(max_features=1500)

In [20]:
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform testing data
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Convert TF-IDF matrices to PyTorch tensors
X_train_tensor = torch.tensor(X_train_tfidf.toarray(), dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_tfidf.toarray(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

## Model Implementation & Test with Testset

**Note**: Does a CNN makes sense for sentiment analysis? or a simpler model?

**Answers and additional Notes**:
Make a CNN with PyTorch using skorch as wrapper to make it possible to use sklearn.pipeline with the model
This way gridsearch for hyper parameters is possible and tfidfVectorizer can be used for tf-idf
CNN: vector size 300, conv. layer of some size, flatten, relu, end with softmax or something
Example: https://www.kaggle.com/code/raviusz/jigsaw-toxic-comment
example look very good to get basics and then change some of architecture
hyperparameter tuning for each model? only if time permits, alt. tune on best model and use for rest

**Note**: We will use the given test set to compare the different approaches. Make a dataframe with all the results
in accuracy, f1, recall, etc. 

In [21]:

# CNN: The basic model

class CNN(nn.Module):
    def __init__(self, input_size):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=5)
        self.conv2 = nn.Conv1d(in_channels=64, out_channels=32, kernel_size=5)

        self.conv_output_size = self._get_conv_output_size(input_size)
        self.fc1 = nn.Linear(self.conv_output_size, 128)  # Adjust input size based on output of convolutions
        self.fc2 = nn.Linear(128, 2)  # Assuming binary classification

    def forward(self, x):
        x = x.unsqueeze(1)  # Add channel dimension
        x = nn.functional.relu(self.conv1(x))
        x = nn.functional.relu(self.conv2(x))
        x = nn.functional.max_pool1d(x, 2)
        x = x.view(x.size(0), -1)
        x = nn.functional.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        return x

    def _get_conv_output_size(self, input_size):
        x = torch.randn(1, 1, input_size)
        x = nn.functional.relu(self.conv1(x))
        x = nn.functional.relu(self.conv2(x))
        x = nn.functional.max_pool1d(x, 2)
        return x.view(1, -1).size(1)


batch_size = 25
# Step 4: Train the model
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)  # Assuming X_train_tensor and y_train_tensor are tensors
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Move model to GPU
model = CNN(1500).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
epochs = 7

for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item() * inputs.size(0)
    
    epoch_loss = total_loss / len(train_dataset)
    print(f'Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss}')

# Step 5: Evaluate the model
# Assuming X_test_tensor and y_test_tensor are tensors
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f'Test Accuracy: {accuracy}')


Epoch [1/7], Loss: 0.41500277149124526
Epoch [2/7], Loss: 0.414940755776564
Epoch [3/7], Loss: 0.4149407556786989
Epoch [4/7], Loss: 0.414940755716052
Epoch [5/7], Loss: 0.41494075609798703
Epoch [6/7], Loss: 0.4149407555104234
Epoch [7/7], Loss: 0.4149407559700528
Test Accuracy: 0.90241958173122
