# We are going to use Tansformers architecture to classify a speech text as Hate speech or not, in this project. 

# The dataset which we are going to use here is the Twitter Hate Speech and Offensive Language Dataset (HSOL Dataset), having around 30,000 twitter tweets. To know more about the dataset, following link can be navigated: 

https://paperswithcode.com/dataset/hate-speech-and-offensive-language

# Let's first clone the repository having the dataset

In [1]:
cd /content/drive/MyDrive

/content/drive/MyDrive


In [2]:
! git clone https://github.com/t-davidson/hate-speech-and-offensive-language.git

fatal: destination path 'hate-speech-and-offensive-language' already exists and is not an empty directory.


In [3]:
import pandas as pd
import re
import sklearn
from sklearn.model_selection import train_test_split
import numpy as np

In [4]:
data = pd.read_csv("/content/drive/MyDrive/hate-speech-and-offensive-language/data/labeled_data.csv")

In [5]:
data.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


# Let's first clean all the tweets. 

In [6]:
def clean_tweet(single_tweet):

  single_tweet = single_tweet.lower().strip()
  single_tweet = re.sub("(@[A-Za-z0-9]+)", "", single_tweet)
  single_tweet = re.sub("([^0-9A-Za-z \t])", "", single_tweet)
  return single_tweet

In [7]:
data["tweet"] = data["tweet"].apply(clean_tweet)

In [8]:
data.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,rt as a woman you shouldnt complain about cl...
1,1,3,0,3,0,1,rt boy dats coldtyga dwn bad for cuffin dat ...
2,2,3,0,3,0,1,rt dawg rt you ever fuck a bitch and she st...
3,3,3,0,2,1,1,rt ganderson based she look like a tranny
4,4,6,0,6,0,1,rt the shit you hear about me might be true ...


In [9]:
data.drop(["Unnamed: 0"],axis=1,inplace=True)

In [10]:
data.head()

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,0,3,2,rt as a woman you shouldnt complain about cl...
1,3,0,3,0,1,rt boy dats coldtyga dwn bad for cuffin dat ...
2,3,0,3,0,1,rt dawg rt you ever fuck a bitch and she st...
3,3,0,2,1,1,rt ganderson based she look like a tranny
4,6,0,6,0,1,rt the shit you hear about me might be true ...


In [11]:
input_tweets = data['tweet'].values
class_labels = data['class'].values

In [12]:
input_tweets

array([' rt  as a woman you shouldnt complain about cleaning up your house amp as a man you should always take the trash out',
       ' rt  boy dats coldtyga dwn bad for cuffin dat hoe in the 1st place',
       ' rt  dawg rt  you ever fuck a bitch and she start to cry you be confused as shit',
       ...,
       'young buck wanna eat dat nigguh like i aint fuckin dis up again',
       'youu got wild bitches tellin you lies',
       'ruffled  ntac eileen dahlia  beautiful color combination of pink orange yellow amp white a coll httptcoh0dyebvnzb'],
      dtype=object)

In [13]:
np.unique(class_labels)

array([0, 1, 2])

In [14]:
train_tweets, val_tweets, train_labels, val_labels = train_test_split(input_tweets, class_labels)

# Let's import the pretrained tokenizer of ALBERT from the transformers library to tokenize the tweets but for that we have to import transformers library and for that we have to first install transformers library.

In [15]:
! pip install transformers
!pip install git+https://github.com/PytorchLightning/pytorch-lightning.git@master --upgrade
!pip install SentencePiece

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 5.3 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 7.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 61.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 69.7 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 45.2 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found exis

In [16]:
from transformers import AlbertTokenizer

In [17]:
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

Downloading:   0%|          | 0.00/742k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/684 [00:00<?, ?B/s]

In [18]:
train_tokens = tokenizer(list(train_tweets), return_tensors="pt", padding=True, truncation=True, max_length=64)
val_tokens = tokenizer(list(val_tweets), return_tensors="pt", padding=True, truncation=True, max_length=64)

In [19]:
device = "cuda"

In [20]:
import torch

In [21]:
training_data_tokens = [train_tokens["input_ids"].to(device), train_tokens["attention_mask"].to(device),
      train_tokens["token_type_ids"].to(device), torch.tensor(train_labels).to(device)]

In [22]:
cv_data_tokens = [val_tokens["input_ids"].to(device), val_tokens["attention_mask"].to(device),
      val_tokens["token_type_ids"].to(device), torch.tensor(val_labels).to(device)]

In [23]:
from torch.utils.data import DataLoader, TensorDataset
import pytorch_lightning as pl

In [24]:
batch_size = 32

class DataGenerator(pl.LightningDataModule):

    def __init__(self, training_data_tokens, cv_data_tokens):

        super().__init__()
        self.trn = #Write your code here
        self.val = #Write your code here

    def train_dataloader(self): 
      return self.trn

    def val_dataloader(self): 
      return self.val

In [25]:
gen = DataGenerator(training_data_tokens, cv_data_tokens)

In [26]:
from transformers import AlbertModel

In [27]:
albert_model = AlbertModel.from_pretrained("albert-base-v2")

Downloading:   0%|          | 0.00/45.2M [00:00<?, ?B/s]

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertModel: ['predictions.LayerNorm.bias', 'predictions.decoder.bias', 'predictions.dense.weight', 'predictions.LayerNorm.weight', 'predictions.decoder.weight', 'predictions.dense.bias', 'predictions.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [28]:
class AlbertClassifier(pl.LightningModule):

  def __init__(self, dropout_prob,hidden_layer_dim, output_dim):

    super().__init__()
    self.albert_mdl = albert_model
    self.dropout_layer = torch.nn.Dropout(dropout_prob)
    self.first_linear_layer = torch.nn.Linear(hidden_layer_dim,hidden_layer_dim)
    self.second_linear_layer = torch.nn.Linear(hidden_layer_dim, output_dim)
    self.loss = torch.nn.NLLLoss()

  def forward(self, input_ids, attention_mask, token_ids):

    #Write the code for forward pass here

  def training_step(self, batch, idx):

    albert_pred = self(batch[0], batch[1], batch[2])
    albert_loss = self.loss(albert_pred, batch[3].view(-1))
    return albert_loss

  def validation_step(self, batch, idx):

    albert_pred = self(batch[0], batch[1], batch[2])
    albert_loss = self.loss(albert_pred, batch[3].view(-1))
    return albert_loss

  def configure_optimizers(self):

    return torch.optim.Adam(self.parameters(),lr=1e-5)

In [29]:
cls = #Write the code to initialize the albert model

In [30]:
train_obj = pl.Trainer(max_epochs=5,gpus=1)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [31]:
import os

In [32]:
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [33]:
train_obj.fit(cls.to(device),gen)

Missing logger folder: /content/drive/MyDrive/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name                | Type        | Params
----------------------------------------------------
0 | albert_mdl          | AlbertModel | 11.7 M
1 | dropout_layer       | Dropout     | 0     
2 | first_linear_layer  | Linear      | 590 K 
3 | second_linear_layer | Linear      | 2.3 K 
4 | loss                | NLLLoss     | 0     
----------------------------------------------------
12.3 M    Trainable params
0         Non-trainable params
12.3 M    Total params
49.106    Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [34]:
validation_batch = next(iter(gen.val))

In [35]:
cls.to(device)

AlbertClassifier(
  (albert_mdl): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)

In [36]:
val_pred = cls(validation_batch[0], validation_batch[1], validation_batch[2])

In [37]:
val_pred_label = val_pred.data.max(1)[1].cpu().numpy()

In [38]:
val_gt_label = validation_batch[3].reshape(batch_size).cpu().numpy()

In [39]:
precision_score = sklearn.metrics.precision_score(val_gt_label,val_pred_label)

In [40]:
print(precision_score)

0.9615384615384616


In [41]:
recall_score = sklearn.metrics.recall_score(val_gt_label,val_pred_label)

In [42]:
print(recall_score)

0.9615384615384616


In [43]:
accuracy_score = sklearn.metrics.accuracy_score(val_gt_label,val_pred_label)

In [44]:
print(accuracy_score)

0.9375
