<a href="https://colab.research.google.com/github/antoinebossan1/Toxic_comment_classification/blob/main/Toxic_comment_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Package Installations

In [None]:
!pip install torchmetrics --quiet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/840.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m286.7/840.2 kB[0m [31m8.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m839.7/840.2 kB[0m [31m13.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m840.2/840.2 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
import csv
import pickle
import string
import random

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
from torchtext.data.utils import get_tokenizer
from nltk.probability import FreqDist
from tqdm.notebook import tqdm
from google.colab import drive
from torchvision import datasets, transforms
from torchmetrics import AUROC, F1Score
from transformers import BertModel, BertTokenizer

In [None]:
# Fetch the NLTK corpora used below: WordNet (synonym lookup in get_synonyms)
# and the stop-word lists (used to build `stop_words`)
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# English stop words as a set; synonym_replacement skips words found in it
stop_words = set(stopwords.words('english'))

## Setting GPU acceleration

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
# Set manual seeds for reproducibility of results.
# Seeding torch alone is not enough: the synonym augmentation draws from the
# standard `random` module (shuffle/choice), so that generator is seeded too,
# together with NumPy's global generator.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7d8edf83bef0>

## Data Exploration

In [None]:
# Folder holding the '<split>_x.csv' / '<split>_y.csv' files, relative to the working directory
DATA_DIR = 'kaggle_data/'

In [None]:
def load_data(file_name, index_col=0):
    """
    Load data from a CSV file located in DATA_DIR
    arguments:
        file_name [str]: name of the CSV file inside DATA_DIR
        index_col [int or None]: column to use as the row index (default 0,
            the previous hard-coded behaviour). Pass None for files without
            an index column: with the default, their first data column is
            consumed as the index and disappears from the columns.
    returns:
        data [pandas.DataFrame]: content of the file
    """
    return pd.read_csv(os.path.join(DATA_DIR, file_name), index_col=index_col)

In [None]:
def print_data_summary(data, label):
    """
    Print a titled overview of a dataframe: its first rows, then its describe() table
    arguments:
        data [pandas.DataFrame]: dataframe to summarise
        label [str]: name shown in the title line
    """
    title = f"{label} Data:"
    print(title)
    # Each view is built only when it is about to be printed
    for build_view in (data.head, data.describe):
        print(build_view())

In [None]:
def print_label_distribution(labels):
    """
    Print, for every column of the dataframe, how often each value occurs
    arguments:
        labels [pandas.DataFrame]: dataframe whose columns are counted one by one
    """
    column_names = labels.columns
    print("\nLabel Distribution:")
    for name in column_names:
        print(f"\nColumn: {name}")
        counts = labels[name].value_counts()
        print(counts)

In [None]:
# Loading Data
train_data = load_data('train_x.csv')
# train_y.csv carries no index column (CustomDataset also reads it without one).
# Reading it with index_col=0 would turn its first label column into the row
# index and hide it from the label statistics below, so read it directly.
train_labels = pd.read_csv(os.path.join(DATA_DIR, 'train_y.csv'))

In [None]:
# Basic Data Inspection: first rows and describe() for the comments and for the labels
print_data_summary(train_data, "Train")
print_data_summary(train_labels, "Train Labels")

Train Data:
                                                  string
index                                                   
0                             even up here.......BLACKS!
1      Blame men.  There's always an excuse to blame ...
2      You have no business making any comments on th...
3      "Let's get the black folks and the white folks...
4      I guess the issue is people not willing to put...
            string
count       269038
unique      267694
top     Well said.
freq            16
Train Labels Data:
      female  LGBTQ  christian  muslim  other_religions  black  white  \
male                                                                    
0          0      0          0       0                0      1      0   
1          1      0          0       0                0      0      0   
0          0      0          0       0                0      0      0   
0          0      0          0       0                0      1      1   
0          0      0          0       

In [None]:
# Print the value counts of every label column (text output, no plot)
print_label_distribution(train_labels)


Label Distribution:

Column: female
0    232794
1     36244
Name: female, dtype: int64

Column: LGBTQ
0    260618
1      8420
Name: LGBTQ, dtype: int64

Column: christian
0    242300
1     26738
Name: christian, dtype: int64

Column: muslim
0    255084
1     13954
Name: muslim, dtype: int64

Column: other_religions
0    262494
1      6544
Name: other_religions, dtype: int64

Column: black
0    259142
1      9896
Name: black, dtype: int64

Column: white
0    252340
1     16698
Name: white, dtype: int64

Column: identity_any
0    155568
1    113470
Name: identity_any, dtype: int64

Column: severe_toxicity
0    269031
1         7
Name: severe_toxicity, dtype: int64

Column: obscene
0    267355
1      1683
Name: obscene, dtype: int64

Column: threat
0    268379
1       659
Name: threat, dtype: int64

Column: insult
0    250565
1     18473
Name: insult, dtype: int64

Column: identity_attack
0    261236
1      7802
Name: identity_attack, dtype: int64

Column: sexual_explicit
0    267691
1  

Given the observed imbalances in the dataset, we have chosen to implement data augmentation on the comments labeled 1. This will involve generating additional comments by substituting words with their synonyms.

# Data Augmentation

In [None]:
def get_synonyms(word):
    """
    Get synonyms of a word from WordNet
    arguments:
        word [str]: word to look up
    returns:
        synonyms [list of str]: lower-cased synonyms containing only the
            letters a-z and spaces, sorted alphabetically. The word itself
            (compared case-insensitively) and candidates that become empty
            after filtering are left out.
    """
    allowed_chars = set(' abcdefghijklmnopqrstuvwxyz')
    lowered_word = word.lower()
    synonyms = set()

    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            candidate = lemma.name().replace("_", " ").replace("-", " ").lower()
            candidate = "".join(char for char in candidate if char in allowed_chars).strip()
            # Skip empty leftovers (e.g. purely numeric lemmas) and the word itself:
            # candidates are lower-cased, so the comparison has to be as well
            if candidate and candidate != lowered_word:
                synonyms.add(candidate)

    # Sorted so the result does not depend on set iteration order, which
    # keeps a seeded random.choice over it reproducible between runs
    return sorted(synonyms)

In [None]:
def synonym_replacement(words, n):
    """
    Replace up to n distinct words in the text with one of their synonyms.
    arguments:
        words [str]: text to augment; it is split on whitespace
        n [int]: maximum number of distinct words to replace. With n <= 0
            nothing is replaced.
    returns:
        sentence [str]: the tokens joined by single spaces, every occurrence
            of a chosen word being replaced by the same synonym
    """
    tokens = words.split()

    # The original loop only checked the budget after a replacement, so n <= 0
    # still swapped one word
    if n <= 0:
        return ' '.join(tokens)

    # dict.fromkeys de-duplicates while keeping first-occurrence order. Going
    # through a set would make the pre-shuffle order depend on string hashing,
    # and with it the outcome of a seeded shuffle. Stop words are matched
    # case-insensitively so that e.g. "You" is skipped like "you".
    candidates = list(dict.fromkeys(tok for tok in tokens if tok.lower() not in stop_words))
    random.shuffle(candidates)
    num_replaced = 0

    for target in candidates:
        synonyms = get_synonyms(target)

        if len(synonyms) >= 1:
            substitute = random.choice(list(synonyms))
            tokens = [substitute if tok == target else tok for tok in tokens]
            num_replaced += 1

            if num_replaced >= n:
                break

    return ' '.join(tokens)


In [None]:
# Testing the result of the data augmentation function
for i in range(5):
    original_text = train_data.iloc[i, 0]
    print(f"Sample {i}:")
    # Show the target column 'y' (the one worst_group_accuracy scores against);
    # the first column of the label frame is an identity flag, not the label
    print("Label:", train_labels['y'].iloc[i])
    print("Original Data:", original_text)
    print("Augmented Data:", synonym_replacement(synonym_replacement(original_text, n=2), n=2))
    print()

Sample 0:
Label: 0
Original Data: even up here.......BLACKS!
Augmented Data: bang up here.......BLACKS!

Sample 1:
Label: 1
Original Data: Blame men.  There's always an excuse to blame men for the failures of women.
Augmented Data: Blame men. There's always an apologise to blame humankind for the loser of women.

Sample 2:
Label: 0
Original Data: You have no business making any comments on this site, Craig.  For all the  bigot and racists comments that you've personally posted on this site against rural Alaska Natives, the homeless, and the LGBT community, you should be banned from this site.  Alaska and the nation don't need your kind breathing our air.  No.  It's not time to move on.  Every time you rear your bigot, racist head, I'll be  there to call you down and so should everyone else that is exposed to your garbage.
Augmented Data: You have no business making any comments on this site, Craig. For all the bigot and racists comments that you've in person posted on this site against

## Pre-processing

In [None]:
class CustomDataset(Dataset):
    """
    A custom dataset class for text data, supporting different models and data modes.
    Supports data augmentation and various preprocessing steps for transformers and LSTM.

    arguments:
        data_dir [str]: folder containing the '<split>_x.csv' / '<split>_y.csv' files
        mode [str]: 'train', 'val', 'full_train' (train and val concatenated)
            or 'test' (no label file is read)
        model_type [str]: 'bert' (items are token ids + attention mask) or
            'lstm' (items are count vectors produced by the vectorizer)
        max_len [int]: length the BERT tokenizer pads/truncates to
        vectorizer: vectorizer to use for 'lstm'; fitted here in 'train' and
            'full_train' mode, mandatory (already fitted) in the other modes
        augment [bool]: if True, the text of every sample whose target is 1 is
            replaced by a synonym-substituted version
    """
    def __init__(self, data_dir, mode, model_type='bert', max_len=512, vectorizer=None, augment=False):
        super(CustomDataset, self).__init__()
        assert mode in ['train', 'val', 'full_train', 'test']
        # Fail early: with an unknown model type __getitem__ would silently return None
        if model_type not in ('bert', 'lstm'):
            raise ValueError(f"Unknown model_type {model_type!r}; expected 'bert' or 'lstm'")
        self.mode = mode
        self.model_type = model_type
        self.max_len = max_len

        if self.mode == 'full_train':
            train_data = pd.read_csv(os.path.join(data_dir, 'train_x.csv'), index_col=0)
            val_data = pd.read_csv(os.path.join(data_dir, 'val_x.csv'), index_col=0)
            self.data = pd.concat([train_data, val_data]).reset_index(drop=True)

            train_label = pd.read_csv(os.path.join(data_dir, 'train_y.csv'))
            val_label = pd.read_csv(os.path.join(data_dir, 'val_y.csv'))
            self.label = pd.concat([train_label, val_label]).reset_index(drop=True)
        else:
            self.data = pd.read_csv(os.path.join(data_dir, f'{mode}_x.csv'), index_col=0)
            if self.mode != 'test':
                self.label = pd.read_csv(os.path.join(data_dir, f'{mode}_y.csv'))

        # The target is the second-to-last column of the label file. Snapshot it
        # now: looking it up by position on every access would return a different
        # column as soon as anything appends a column to `self.label`.
        if self.mode == 'test':
            self.targets = None
        else:
            self.targets = self.label.iloc[:, -2].to_numpy(copy=True)

        # Augment before fitting the vectorizer so its vocabulary is built from
        # the texts the dataset will actually serve (synonyms included)
        if augment:
            self.apply_data_augmentation()

        if self.model_type == 'bert':
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        elif self.model_type == 'lstm':
            if self.mode in ['train', 'full_train']:
                self.vectorizer = CountVectorizer() if vectorizer is None else vectorizer
                self.vectorizer.fit(self.data.values.flatten().tolist())
            else:
                if vectorizer is None:
                    raise ValueError("Vectorizer missing for LSTM test mode")
                self.vectorizer = vectorizer

    def apply_data_augmentation(self):
        """
        Replace, in place, the text of every sample whose target equals 1 by
        its augmented version. Raises ValueError when no labels are loaded.
        """
        targets = getattr(self, 'targets', None)
        if targets is None:
            raise ValueError("Data augmentation needs labels and cannot be used in test mode")

        texts = self.data.iloc[:, 0]
        # Build the whole column once instead of writing cell by cell through iloc
        self.data.iloc[:, 0] = [
            self.augment_text(text) if target == 1 else text
            for text, target in zip(texts, targets)
        ]

    def augment_text(self, text):
        """Return `text` with up to two distinct words replaced by synonyms."""
        return synonym_replacement(text, n=2)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """
        Return sample `idx`.
            bert: (input_ids, attention_mask, y, idx), without y in test mode
            lstm: (x, y, idx), without y in test mode
        """
        x = self.data.iloc[idx, 0]

        if self.model_type == 'bert':
            tokens = self.tokenizer.encode_plus(
                x,
                max_length=self.max_len,
                truncation=True,
                padding='max_length',
                return_tensors='pt',
            )
            input_ids = tokens['input_ids'].flatten()
            attention_mask = tokens['attention_mask'].flatten()

            if self.mode == 'test':
                return input_ids, attention_mask, idx
            else:
                y = torch.tensor(self.targets[idx])
                return input_ids, attention_mask, y, idx

        elif self.model_type == 'lstm':
            # transform() is given a one-element list, so x has a leading axis of size 1
            x = self.vectorizer.transform([x]).toarray()
            x = torch.tensor(x).float()

            if self.mode == 'test':
                return x, idx
            else:
                y = torch.tensor(self.targets[idx])
                return x, y, idx


In [None]:
# Creating training and validation datasets for the BERT model.
# Augmentation is a training-time technique: the validation set keeps its
# original texts so the reported scores are measured on unaltered comments.
train_dataset_bert = CustomDataset(DATA_DIR, 'train', model_type='bert', augment=True)
val_dataset_bert = CustomDataset(DATA_DIR, 'val', model_type='bert', augment=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Creating dataloaders for BERT model datasets.
# Only the training loader shuffles; validation keeps the dataset order.
train_dataloader_bert = DataLoader(train_dataset_bert, batch_size=128, shuffle=True)
val_dataloader_bert = DataLoader(val_dataset_bert, batch_size=128, shuffle=False)

In [None]:
# Creating training and validation datasets for the LSTM model.
# The validation set reuses the vectorizer fitted on the training texts and is
# not augmented, so it is evaluated on the original comments.
train_dataset_lstm = CustomDataset(DATA_DIR, 'train', model_type='lstm', augment=True)
val_dataset_lstm = CustomDataset(DATA_DIR, 'val', model_type='lstm', vectorizer=train_dataset_lstm.vectorizer, augment=False)

In [None]:
# Creating dataloaders for LSTM model datasets
# Only the training loader shuffles; validation keeps the dataset order.
train_dataloader_lstm = DataLoader(train_dataset_lstm, batch_size=1024, shuffle=True)
val_dataloader_lstm = DataLoader(val_dataset_lstm, batch_size=1024, shuffle=False)

## Defining Models

LSTM Classifier

In [None]:
class LSTMClassifier(nn.Module):
    """
    A LSTM-based classifier for sequence modeling
    This model utilizes a Long Short-Term Memory (LSTM) layer followed by a fully connected layer

    arguments:
        input_size [int]: number of features per time step
        hidden_size [int]: size of the LSTM hidden state
        num_layers [int]: number of stacked LSTM layers
        num_classes [int]: number of outputs of the final linear layer
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(LSTMClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """
        arguments:
            x [torch.Tensor]: input of shape (batch, seq_len, input_size)
        returns:
            probabilities [torch.Tensor]: sigmoid outputs of shape (batch, num_classes)
        """
        # Follow the device the weights live on instead of a notebook-level
        # global, so the module also works when it is moved or used on its own
        x = x.to(next(self.parameters()).device)
        # nn.LSTM starts from zero hidden/cell states when none are passed,
        # which is what the explicit zero tensors used to provide
        out, _ = self.lstm(x)
        # Keep only the output of the last time step
        out = out[:, -1, :]
        out = self.fc(out)
        return torch.sigmoid(out)

BERT Classifier

In [None]:
from transformers import BertModel

class BERTClassifier(nn.Module):
    """
    A classifier that utilizes a pre-trained BERT (Bidirectional Encoder Representations from Transformers) model

    arguments:
        model_name [str]: name or path of the pre-trained checkpoint to load
            (defaults to the small 4-layer / 256-hidden uncased BERT)
    """
    def __init__(self, model_name='google/bert_uncased_L-4_H-256_A-4'):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        # Size the head from the loaded config rather than hard-coding 256, so
        # other checkpoints can be used without touching the class
        self.fc = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """
        arguments:
            input_ids [torch.Tensor]: token ids, one row per sample
            attention_mask [torch.Tensor]: attention mask matching input_ids
        returns:
            probabilities [torch.Tensor]: sigmoid output of the linear head, one value per sample
        """
        # With return_dict=False the model returns a tuple; the second element
        # is the pooled output fed to the classification head
        _, pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
        return torch.sigmoid(self.fc(pooled_output))


## Defining training and evaluation functions

Evaluation Function

In [None]:
def worst_group_accuracy(prediction, y):
    """
        Compute the worst group accuracy, with the groups being defined by ['male', 'female', 'LGBTQ',
        'christian', 'muslim', 'other_religions', 'black', 'white'] for positive and negative toxicity.
        arguments:
            prediction [pandas.DataFrame]: dataframe with 2 columns (index and pred). When the
                'index' column is present it identifies the row of `y` each prediction belongs
                to; otherwise the dataframe's own index is used.
            y [pandas.DataFrame]: dataframe containing the metadata (left unmodified)
        returns:
            wga [float]: worst group accuracy
    """
    # Align on the sample ids, not on the positional RangeIndex: predictions
    # collected from a shuffled dataloader arrive in arbitrary order
    if 'index' in prediction.columns:
        prediction = prediction.set_index('index')

    # Work on a copy so no 'pred' column is appended to the caller's frame
    # (callers that read label columns by position rely on its layout)
    scored = y.copy()
    scored.loc[prediction.index, 'pred'] = prediction['pred']

    categories = ['male', 'female', 'LGBTQ', 'christian', 'muslim', 'other_religions', 'black', 'white']
    accuracies = []
    for category in categories:
        for label in [0, 1]:
            group = scored.loc[scored[category] == label]
            # A prediction counts as positive above the 0.5 threshold
            group_accuracy = (group['y'] == (group['pred'] > 0.5)).mean()
            accuracies.append(group_accuracy)
    wga = np.min(accuracies)
    return wga

In [None]:
def evaluate_model(model, dataloader, criterion):
    """
    Evaluate the model on a given dataloader.
    argument:
        model [torch.nn.Module]: model to evaluate
        dataloader [torch.utils.data.DataLoader]: dataloader on which to evaluate
        criterion [torch.nn.modules.loss]: desired loss to compute
    returns:
        dataset_loss [float]: computed loss on the dataset
        dataset_metric [float]: computed metric on the dataset
    """
    model.eval()
    losses, predictions, indices = [], [], []
    with torch.no_grad():
        for batch in tqdm(dataloader, leave=False):
            if isinstance(model, BERTClassifier):
                input_ids, attention_mask, y, idx = batch
                input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
                pred = model(input_ids, attention_mask)
            else:
                x, y, idx = batch
                x = x.to(device)
                pred = model(x)

            # reshape(-1) instead of squeeze(): squeeze() turns a batch of one
            # sample into a 0-d tensor whose tolist() is a bare float, which
            # cannot be passed to list.extend
            pred = pred.reshape(-1)
            target = y.to(device).reshape(-1).float()
            loss = criterion(pred, target)
            # Weight the batch loss by the batch size so the mean is per sample
            losses.extend([loss.item()] * len(target))
            predictions.extend(pred.tolist())
            indices.extend(idx.tolist())

    # Index the frame by sample id so each prediction lines up with its own label row
    pred_df = pd.DataFrame({'index': indices, 'pred': predictions}, index=indices)
    dataset_loss = np.mean(losses)
    # Hand over a copy: the dataset reads its target by column position, so
    # its label frame must never gain columns as a side effect of scoring
    dataset_metric = worst_group_accuracy(pred_df, dataloader.dataset.label.copy())
    return dataset_loss, dataset_metric


Training Function

In [None]:
def train_model(model, optimizer, criterion, dataloader):
    """
    Train a model for one epoch.
    arguments:
        model [torch.nn.Module]: model to train
        optimizer [torch.optim]: optimizer used for training
        criterion [torch.nn.modules.loss]: desired loss to compute
        dataloader [torch.utils.data.DataLoader]: dataloader used for training
    returns:
        dataset_loss [float]: computed loss on the dataset
        dataset_metric [float]: computed metric on the dataset
    """
    model.train()
    losses, predictions, indices = [], [], []
    for batch in tqdm(dataloader, leave=False):
        optimizer.zero_grad()
        if isinstance(model, BERTClassifier):
            input_ids, attention_mask, y, idx = batch
            input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
            pred = model(input_ids, attention_mask)
        else:
            x, y, idx = batch
            x = x.to(device)
            pred = model(x)

        # reshape(-1) instead of squeeze(): squeeze() turns a batch of one
        # sample into a 0-d tensor whose tolist() is a bare float, which
        # cannot be passed to list.extend
        pred = pred.reshape(-1)
        target = y.to(device).reshape(-1).float()
        loss = criterion(pred, target)
        loss.backward()
        optimizer.step()

        # Weight the batch loss by the batch size so the mean is per sample
        losses.extend([loss.item()] * len(target))
        predictions.extend(pred.detach().tolist())
        indices.extend(idx.tolist())

    # The training loader may shuffle, so predictions arrive in arbitrary order:
    # index the frame by sample id to line each one up with its own label row
    pred_df = pd.DataFrame({'index': indices, 'pred': predictions}, index=indices)
    dataset_loss = np.mean(losses)
    # Hand over a copy: the dataset reads its target by column position, so
    # its label frame must never gain columns between epochs
    dataset_metric = worst_group_accuracy(pred_df, y=dataloader.dataset.label.copy())
    return dataset_loss, dataset_metric


## Train and Evaluate Both Models

In [None]:
# Define Loss Function and Evaluation Metric
# BCELoss takes probabilities; both classifiers end their forward() with a sigmoid
criterion = nn.BCELoss()
# NOTE(review): `metric` is not referenced by any other cell shown here; the
# score reported by train_model/evaluate_model comes from worst_group_accuracy
metric = F1Score(task='binary')

LSTM Classifier

In [None]:
# LSTM Model Configuration (training happens in the next cell)
# One input feature per vocabulary entry of the fitted vectorizer
input_size = len(train_dataset_lstm.vectorizer.get_feature_names_out())
hidden_size = 128
num_layers = 2
num_classes = 1

In [None]:
# Train and Evaluate LSTM Model
# A single epoch over the training loader (its returned loss/metric are discarded),
# then loss and worst-group accuracy on the validation loader
lstm_model = LSTMClassifier(input_size, hidden_size, num_layers, num_classes).to(device)
lstm_optimizer = optim.AdamW(lstm_model.parameters(), lr=0.05, weight_decay=0.1)
train_model(lstm_model, lstm_optimizer, criterion, train_dataloader_lstm)
lstm_val_loss, lstm_val_metric = evaluate_model(lstm_model, val_dataloader_lstm, criterion)

  0%|          | 0/263 [00:00<?, ?it/s]

  0%|          | 0/45 [00:00<?, ?it/s]

In [None]:
# Display the LSTM validation results: (loss, worst-group accuracy)
lstm_val_loss, lstm_val_metric

(0.20741829273974066, 0.7730024213075061)

BERT Classifier

In [None]:
# BERT Model Configuration and Training
# A single epoch over the training loader (its returned loss/metric are discarded),
# then loss and worst-group accuracy on the validation loader
bert_model = BERTClassifier().to(device)
bert_optimizer = optim.AdamW(bert_model.parameters(), lr=5e-5, weight_decay=0.01)
train_model(bert_model, bert_optimizer, criterion, train_dataloader_bert)
bert_val_loss, bert_val_metric = evaluate_model(bert_model, val_dataloader_bert, criterion)

config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/45.1M [00:00<?, ?B/s]

  0%|          | 0/2102 [00:00<?, ?it/s]

  0%|          | 0/353 [00:00<?, ?it/s]

In [None]:
# Display the BERT validation results: (loss, worst-group accuracy)
bert_val_loss, bert_val_metric

(0.14876577649273773, 0.837772397094431)

The BERT model shows stronger validation performance, as indicated by its lower loss (0.1488) and higher worst-group accuracy (0.8377), compared to the LSTM model's loss (0.2074) and worst-group accuracy (0.7730).

# Evaluate BERT Model on Test Dataset

In [None]:
# Initialize test dataset and dataloader for BERT model evaluation
# Test mode loads no labels and applies no augmentation; order is kept (shuffle=False)
test_dataset_bert = CustomDataset(DATA_DIR, 'test', model_type='bert')
test_dataloader_bert = DataLoader(test_dataset_bert, batch_size=256, shuffle=False)

In [None]:
# Set BERT model to evaluation mode and initialize containers for predictions
bert_model.eval()
test_predictions_bert, indices = [], []

# Iterate over test data batches to generate predictions
with torch.no_grad():
    for batch in tqdm(test_dataloader_bert, leave=False):
        # Test-mode items are (input_ids, attention_mask, idx)
        input_ids, attention_mask, idx = batch[:3]
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
        # reshape(-1) keeps a 1-D tensor even when the last batch holds a single
        # sample; squeeze() would give a 0-d tensor whose tolist() is a bare
        # int, which list.extend cannot consume
        pred = bert_model(input_ids, attention_mask).reshape(-1)
        # Threshold the probabilities at 0.5 to get hard 0/1 predictions
        test_predictions_bert.extend((pred > 0.5).int().tolist())
        indices.extend(idx.tolist())

  0%|          | 0/523 [00:00<?, ?it/s]

In [None]:
# Create a DataFrame from the predictions and save to a CSV file
# NOTE(review): `indices` holds the positional sample numbers returned by the
# dataset, not the values of the index column of test_x.csv; confirm this is
# the ID the submission format expects
pred_df = pd.DataFrame({'ID': indices, 'pred': test_predictions_bert})
pred_df.to_csv('predictions_bert.csv', index=False)