In [1]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

cp: cannot stat 'kaggle.json': No such file or directory


In [2]:
!kaggle datasets download -d mrmorj/hate-speech-and-offensive-language-dataset

Dataset URL: https://www.kaggle.com/datasets/mrmorj/hate-speech-and-offensive-language-dataset
License(s): CC0-1.0
Downloading hate-speech-and-offensive-language-dataset.zip to /content
  0% 0.00/1.01M [00:00<?, ?B/s]
100% 1.01M/1.01M [00:00<00:00, 38.1MB/s]


In [3]:
import zipfile
zip_ref = zipfile.ZipFile('/content/hate-speech-and-offensive-language-dataset.zip', 'r')
zip_ref.extractall('/content')
zip_ref.close()

In [4]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from numpy import array
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
import matplotlib.pyplot as plt

In [5]:
df = pd.read_csv('/content/labeled_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [6]:
df = df.head(2000)

In [7]:
df['tweet'][0]

"!!! RT @mayasolovely: As a woman you shouldn't complain about cleaning up your house. &amp; as a man you should always take the trash out..."

In [8]:
df.shape

(2000, 7)

### Data Preprocessing

In [9]:
df['tweet'][2]

# We can see that our text contains punctuations, brackets, HTML tags and numbers
# We will preprocess this text in the next section

'!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby4life: You ever fuck a bitch and she start to cry? You be confused as shit'

In [10]:
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    '''Removes HTML tags: replaces anything between opening and closing <> with empty space'''

    return TAG_RE.sub('', text)

In [11]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [12]:
def preprocess_text(sen):
    '''Cleans text data up, leaving only 2 or more char long non-stepwords composed of A-Z & a-z only
    in lowercase, and removes words starting with @ and RT'''

    sentence = sen.lower()

    # Remove html tags
    sentence = remove_tags(sentence)

    # Remove RT and words starting with @
    sentence = re.sub(r'\brt\b', '', sentence)
    sentence = re.sub(r'@\w+', '', sentence)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)  # When we remove apostrophe from the word "Mark's", the apostrophe is replaced by an empty space. Hence, we are left with single character "s" that we are removing here.

    # Remove multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)  # Next, we remove all the single characters and replace it by a space which creates multiple spaces in our text. Finally, we remove the multiple spaces from our text as well.

    # Remove Stopwords
    pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
    sentence = pattern.sub('', sentence)

    return sentence

In [13]:
df["tweet"] = df['tweet'].apply(preprocess_text)

In [14]:
df['tweet'][2]
# before preprocessing

' dawg ever fuck bitch start cry confused shit'

In [15]:
df["class"] = df["class"].map({0:1,1:0,2:0})
y = df["class"]
df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,0,woman complain cleaning house amp man always ...
1,1,3,0,3,0,0,boy dats cold tyga dwn bad cuffin dat hoe st ...
2,2,3,0,3,0,0,dawg ever fuck bitch start cry confused shit
3,3,3,0,2,1,0,look like tranny
4,4,6,0,6,0,0,shit hear might true might faker bitch told ya


In [16]:
X = df["tweet"]
y = df["class"]

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# The train set will be used to train our deep learning models
# while test set will be used to evaluate how well our model performs

In [18]:
X_train.shape

(1600,)

#### RoBERTa LSTM

In [19]:
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaModel
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.model_selection import train_test_split
import pandas as pd
import gc

In [20]:
# Preprocess text using RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def preprocess_text_batch(text_batch):
    inputs = tokenizer.batch_encode_plus(text_batch, return_tensors='tf', padding='max_length', truncation=True, max_length=768)
    return inputs

# def preprocess_text_batch(text_batch):
#     inputs = tokenizer.batch_encode_plus(text_batch, return_tensors='tf', padding='max_length', truncation=True, max_length=384)
#     return inputs

# Preprocess train and test data in batches

def preprocess_data_in_batches(X_data):
    batch_size = 8
    num_batches = len(X_data) // batch_size + 1
    X_embeddings = []

    for i in range(num_batches):
        print(f"This is: {i+1}")
        # Clear GPU memory
        tf.keras.backend.clear_session()
        gc.collect()

        batch_text = X_data[i * batch_size : (i + 1) * batch_size]

        if not batch_text.empty:  # Check if batch_text is not empty
            batch_inputs = preprocess_text_batch(batch_text)
            batch_embeddings = roberta_model(batch_inputs)['last_hidden_state']
            X_embeddings.append(batch_embeddings)

            # Explicitly release GPU memory
            del batch_text
            del batch_inputs
            del batch_embeddings
            gc.collect()

            # Concatenate embeddings if there are enough batches
            if len(X_embeddings) >= 10:
                X_embeddings_concat = tf.concat(X_embeddings, axis=0)
                X_embeddings = [X_embeddings_concat]

    return tf.concat(X_embeddings, axis=0)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [21]:
# Load RoBERTa model
roberta_model = TFRobertaModel.from_pretrained('roberta-base')

# Get RoBERTa embeddings for train and test data
X_train_embeddings = preprocess_data_in_batches(X_train)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'roberta.embeddings.position_ids', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

This is: 1
This is: 2
This is: 3
This is: 4
This is: 5
This is: 6
This is: 7
This is: 8
This is: 9
This is: 10
This is: 11
This is: 12
This is: 13
This is: 14
This is: 15
This is: 16
This is: 17
This is: 18
This is: 19
This is: 20
This is: 21
This is: 22
This is: 23
This is: 24
This is: 25
This is: 26
This is: 27
This is: 28
This is: 29
This is: 30
This is: 31
This is: 32
This is: 33
This is: 34
This is: 35
This is: 36
This is: 37
This is: 38
This is: 39
This is: 40
This is: 41
This is: 42
This is: 43
This is: 44
This is: 45
This is: 46
This is: 47
This is: 48
This is: 49
This is: 50
This is: 51
This is: 52
This is: 53
This is: 54
This is: 55
This is: 56
This is: 57
This is: 58
This is: 59
This is: 60
This is: 61
This is: 62
This is: 63
This is: 64
This is: 65
This is: 66
This is: 67
This is: 68
This is: 69
This is: 70
This is: 71
This is: 72
This is: 73
This is: 74
This is: 75
This is: 76
This is: 77
This is: 78
This is: 79
This is: 80
This is: 81
This is: 82
This is: 83
This is: 84
T

In [22]:
X_test_embeddings = preprocess_data_in_batches(X_test)

This is: 1
This is: 2
This is: 3
This is: 4
This is: 5
This is: 6
This is: 7
This is: 8
This is: 9
This is: 10
This is: 11
This is: 12
This is: 13
This is: 14
This is: 15
This is: 16
This is: 17
This is: 18
This is: 19
This is: 20
This is: 21
This is: 22
This is: 23
This is: 24
This is: 25
This is: 26
This is: 27
This is: 28
This is: 29
This is: 30
This is: 31
This is: 32
This is: 33
This is: 34
This is: 35
This is: 36
This is: 37
This is: 38
This is: 39
This is: 40
This is: 41
This is: 42
This is: 43
This is: 44
This is: 45
This is: 46
This is: 47
This is: 48
This is: 49
This is: 50
This is: 51


In [23]:
# Build RoBERTa-LSTM model
def build_roberta_lstm_model():
    model = Sequential()
    model.add(LSTM(128, input_shape=(None, 768)))
    model.add(Dense(1, activation='sigmoid'))
    return model

roberta_lstm_model = build_roberta_lstm_model()

# Compile model
roberta_lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train model
roberta_lstm_model.fit(X_train_embeddings, y_train.values, batch_size=32, epochs=5, validation_split=0.2)

# Evaluate model
loss, accuracy = roberta_lstm_model.evaluate(X_test_embeddings, y_test.values)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Loss: 0.16599102318286896, Test Accuracy: 0.9424999952316284


In [25]:
!pip install innvestigate

Collecting innvestigate
  Downloading innvestigate-2.1.2-py3-none-any.whl (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.8/66.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow<2.15,>=2.6 (from innvestigate)
  Downloading tensorflow-2.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (489.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m489.9/489.9 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboard<2.15,>=2.14 (from tensorflow<2.15,>=2.6->innvestigate)
  Downloading tensorboard-2.14.1-py3-none-any.whl (5.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m96.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tensorflow-estimator<2.15,>=2.14.0 (from tensorflow<2.15,>=2.6->innvestigate)
  Downloading tensorflow_estimator-2.14.0-py2.py3-none-any.whl (440 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m440.7/440.7 kB[0m [31m43.6 MB/s

In [26]:
import numpy as np
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaModel
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import matplotlib.pyplot as plt
import innvestigate

In [29]:
import tensorflow as tf
tf.compat.v1.disable_eager_execution()

In [31]:
# Function to preprocess text
def preprocess_text_batch(text_batch):
    inputs = tokenizer.batch_encode_plus(text_batch, return_tensors='tf', padding='max_length', truncation=True, max_length=128)
    return inputs


In [33]:
# Example text
example_text = ["dawg ever fuck bitch start cry confused shit"]

# Wrap the model inference in tf.function to avoid eager execution issues
@tf.function
def get_embeddings(input_ids, attention_mask):
    return roberta_model(input_ids, attention_mask=attention_mask)['last_hidden_state']

# Preprocess the text
inputs = preprocess_text_batch(example_text)
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']

# Get RoBERTa embeddings
embeddings = get_embeddings(input_ids, attention_mask)

# Create an analyzer
analyzer = innvestigate.create_analyzer('gradient', roberta_lstm_model)

# Analyze the input
analysis = analyzer.analyze(np.array(embeddings))

# Visualize the results
def visualize_saliency(text, analysis):
    tokenized_text = tokenizer.tokenize(text[0])
    analysis = np.squeeze(analysis)
    scores = np.sum(analysis, axis=1)
    plt.figure(figsize=(10, 1))
    plt.bar(range(len(tokenized_text)), scores)
    plt.xticks(range(len(tokenized_text)), tokenized_text, rotation=90)
    plt.show()

visualize_saliency(example_text, analysis)

RuntimeError: Attempting to capture an EagerTensor without building a function.

In [28]:
# Using innvestigate for LRP
analyzer = innvestigate.create_analyzer('lrp.z', roberta_lstm_model)

# Analyze the input
analysis = analyzer.analyze(np.array(embeddings))

# Visualize the results
visualize_saliency(example_text, analysis)


Check triggered by layers: [<keras.src.layers.rnn.lstm.LSTM object at 0x782df03821d0>]


NotImplementedError: 

#### Multilingual BERT (mBERT)

In [None]:
!pip install transformers

In [None]:
from transformers import BertTokenizer, TFBertModel

# Preprocess text using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

def preprocess_text_batch(text_batch):
    inputs = tokenizer.batch_encode_plus(text_batch, return_tensors='tf', padding='max_length', truncation=True, max_length=128)
    return inputs

# Preprocess data in batches
def preprocess_data_in_batches(X_data):
    batch_size = 16
    num_batches = len(X_data) // batch_size + 1
    X_embeddings = []

    for i in range(num_batches):
        print(f"Processing batch {i+1}/{num_batches}")
        # Clear GPU memory
        tf.keras.backend.clear_session()
        gc.collect()

        batch_text = X_data[i * batch_size : (i + 1) * batch_size]

        if not batch_text.empty:
            batch_inputs = preprocess_text_batch(batch_text)
            batch_embeddings = bert_model(batch_inputs)['last_hidden_state']
            X_embeddings.append(batch_embeddings)

            # Explicitly release GPU memory
            del batch_text
            del batch_inputs
            del batch_embeddings
            gc.collect()

            # Concatenate embeddings if there are enough batches
            if len(X_embeddings) >= 10:
                X_embeddings_concat = tf.concat(X_embeddings, axis=0)
                X_embeddings = [X_embeddings_concat]

    return tf.concat(X_embeddings, axis=0)

In [None]:
# Load mBERT model
bert_model = TFBertModel.from_pretrained('bert-base-multilingual-cased')

# Get mBERT embeddings for train and test data
X_train_embeddings = preprocess_data_in_batches(X_train)

In [None]:
X_test_embeddings = preprocess_data_in_batches(X_test)

In [None]:
# Build BERT-LSTM model
def build_bert_lstm_model():
    model = Sequential()
    model.add(LSTM(128, input_shape=(None, 768)))
    model.add(Dense(1, activation='sigmoid'))
    return model

bert_lstm_model = build_bert_lstm_model()

# Compile model
bert_lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train model
bert_lstm_model.fit(X_train_embeddings, y_train.values, batch_size=32, epochs=5, validation_split=0.2)

# Evaluate model
loss, accuracy = bert_lstm_model.evaluate(X_test_embeddings, y_test.values)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')