# Install Libraries

In [1]:
%%capture
!pip install transformers

# Import Libraries

In [2]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import transformers
from transformers import AutoTokenizer, AutoModelForPreTraining

# Load Dataset

In [3]:
# @title Dataset Folder Location
# Folder that holds the train.csv / test.csv / val.csv splits loaded below.
# NOTE(review): this looks like a mounted Google Drive path, but the notebook
# never mounts the drive itself - presumably that is done by hand; confirm.
dataset_folder_location = (
    '/content/drive/MyDrive/B.Tech. Final Year Project/Offensive Language Detection/Datasets/m_dataset_21_9/dataset_12_10/'
)

In [4]:
# @title Dataset Wrapper
class TextDatasetWrapper(Dataset):
    """Map-style dataset exposing ``(text, label)`` pairs from one CSV split.

    The file ``<dataset_type>.csv`` inside ``dataset_folder_location`` is read
    once, in full, into ``self.df``.

    Args:
        dataset_folder_location: Folder containing the split files. A trailing
            path separator is accepted but no longer required.
        dataset_type: Split name without the ``.csv`` extension (e.g. ``'train'``).
        text_col: Name of the column holding the raw text.
        label_col: Name of the column holding the label.
    """

    def __init__(self, dataset_folder_location, dataset_type, text_col='text', label_col='hate'):
        # os.path.join only inserts a separator when one is missing, so both
        # 'folder/' and 'folder' resolve to the same file.
        csv_path = os.path.join(dataset_folder_location, dataset_type + '.csv')
        self.df = pd.read_csv(csv_path)
        self.__text_col__ = text_col
        self.__label_col__ = label_col

    def __getitem__(self, index):
        """Return the ``(text, label)`` pair at row position ``index``.

        Access is positional (``.iloc``), so it does not depend on the frame's
        index labels, and an out-of-range position raises ``IndexError`` -
        which is what lets plain ``for item in dataset`` iteration terminate.
        """
        text = self.df[self.__text_col__].iloc[index]
        label = self.df[self.__label_col__].iloc[index]
        return text, label

    def __len__(self):
        """Return the number of rows in the loaded split."""
        return len(self.df)

In [5]:
# @title Load Dataset using Wrapper
# Build one dataset wrapper per split; each reads '<split>.csv' from the
# dataset folder.
df_train, df_test, df_val = (
    TextDatasetWrapper(dataset_folder_location, dataset_type=split_name)
    for split_name in ('train', 'test', 'val')
)

## Generate Batches from Dataset

In [6]:
# @title Batch Size
# Number of texts grouped into each batch by the data loaders.
batch_size = 128

In [7]:
# @title Generate Batches
# One sequential (unshuffled) loader per split: keeping the source order means
# the i-th output row later lines up with the i-th row of the split's CSV.
df_train_batches, df_test_batches, df_val_batches = (
    DataLoader(split, batch_size=batch_size, shuffle=False)
    for split in (df_train, df_test, df_val)
)

# Load Tokenizer and Transformer
The transformer used is [BanglaBERT](https://huggingface.co/csebuetnlp/banglabert).

In [8]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [9]:
# Fetch the BanglaBERT tokenizer and weights from the Hugging Face Hub.
# The pre-training model class is loaded, so the forward pass exposes the
# `logits` entry that the embedding step reads later on.
banglabert_checkpoint = "csebuetnlp/banglabert"
tokenizer = AutoTokenizer.from_pretrained(banglabert_checkpoint)
model = AutoModelForPreTraining.from_pretrained(banglabert_checkpoint)
# Deliberately the last expression: moves the weights to `device` and echoes
# the module structure as the cell output.
model.to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/586 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/528k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

ElectraForPreTraining(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((7

# Check Dataset Details
Here we find the number of texts whose token count reaches a given limit, their row ids, and the maximum token count in each split.

In [10]:
def token_details(df, greater_than=256, max_tokens=False, limit_exceed_ids=False, print_ids_counts=False):
    """Count the texts in ``df`` whose token count reaches a threshold.

    Each value of the ``text`` column is tokenized with the notebook-level
    ``tokenizer`` (no truncation) and the length of its ``input_ids`` is
    compared against ``greater_than``.

    Args:
        df: DataFrame with a ``text`` column of strings.
        greater_than: Token-count threshold. Previously ignored in favour of a
            hard-coded 256; it is now honoured.
            NOTE(review): the comparison is inclusive (``>=``), as in the
            original code, even though the name suggests ``>`` - confirm.
        max_tokens: If true, include the largest token count seen in the result.
        limit_exceed_ids: If true, include the index labels of the texts that
            reached the threshold in the result.
        print_ids_counts: If true, print index, word count and token count for
            every text that reached the threshold.

    Returns:
        ``count`` alone when neither ``max_tokens`` nor ``limit_exceed_ids`` is
        set. Otherwise a tuple starting with ``count``, followed by the largest
        token count (if ``max_tokens``) and then the list of index labels (if
        ``limit_exceed_ids``).
    """
    longest = 0  # avoids shadowing the builtin `max`
    flagged_ids = []
    for row in tqdm(df.itertuples(), total=len(df), leave=False):
        # A plain Python list is enough to measure the length; no tensor needed.
        n_tokens = len(tokenizer(row.text)['input_ids'])
        longest = max(longest, n_tokens)
        if n_tokens >= greater_than:
            if print_ids_counts:
                print('Index: ', row.Index,
                      '\tNumber of Words: ', len(row.text.split()),
                      '\tNumber of Tokens: ', n_tokens)
            flagged_ids.append(row.Index)
    count = len(flagged_ids)

    # Assemble the optional extras in the same order as before.
    result = (count,)
    if max_tokens:
        result += (longest,)
    if limit_exceed_ids:
        result += (flagged_ids,)
    # With no extras requested the caller gets the bare integer, not a 1-tuple.
    return count if len(result) == 1 else result

In [11]:
# @title Details from Train Set
# Tokenize every training text once and report how many reach the default
# threshold, together with the longest token count seen.
max_count, max_tokens = token_details(df=df_train.df, max_tokens=True)
print('Number of texts that have tokens more than limit:', max_count)
print('Maximum number of tokens in the dataset         :', max_tokens)

  0%|          | 0/63241 [00:00<?, ?it/s]

Number of texts that have tokens more than limit: 83
Maximum number of tokens in the dataset         : 848


In [12]:
# @title Details from Test Set
# Same token statistics for the test split (reuses the result names of the
# previous cell, so the values shown always belong to the last cell run).
max_count, max_tokens = token_details(df=df_test.df, max_tokens=True)
print('Number of texts that have tokens more than limit:', max_count)
print('Maximum number of tokens in the dataset         :', max_tokens)

  0%|          | 0/18069 [00:00<?, ?it/s]

Number of texts that have tokens more than limit: 30
Maximum number of tokens in the dataset         : 775


In [13]:
# @title Details from Validation Set
# Same token statistics for the validation split.
max_count, max_tokens = token_details(df=df_val.df, max_tokens=True)
print('Number of texts that have tokens more than limit:', max_count)
print('Maximum number of tokens in the dataset         :', max_tokens)

  0%|          | 0/9035 [00:00<?, ?it/s]

Number of texts that have tokens more than limit: 10
Maximum number of tokens in the dataset         : 470


# Generate Embeddings

In [14]:
def text_to_embedding_batches(df_batches, max_length=256):
    """Run every batch of texts through the model and stack the outputs.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        df_batches: Iterable of batches whose first element is the batch of
            raw texts (e.g. a ``DataLoader`` over ``TextDatasetWrapper``).
        max_length: Length every text is padded or truncated to. Defaults to
            256, the value that used to be hard-coded.

    Returns:
        One tensor with a row per input text, concatenated along dim 0 in
        iteration order. ``torch.cat`` rejects an empty list, so
        ``df_batches`` must yield at least one batch.

    NOTE(review): the rows are the ``logits`` entry of the model output. With
    the pre-training model class loaded in this notebook the result has
    ``max_length`` columns, which looks like one head score per token rather
    than a hidden-state embedding - confirm this is the intended feature.
    """
    batch_outputs = []
    with torch.no_grad():  # inference only; no autograd graph is needed
        for batch in tqdm(df_batches, desc='Genarating Embeddings'):
            encoded = tokenizer(
                batch[0],
                return_tensors="pt",
                padding='max_length',
                max_length=max_length,
                truncation=True,
            ).to(device)
            batch_outputs.append(model(**encoded)['logits'])
            # Drop the per-batch inputs before asking CUDA to release its
            # unused cached memory.
            del encoded
            torch.cuda.empty_cache()
    return torch.cat(batch_outputs, dim=0)

In [15]:
# @title Embeddings of Train Set
# Run the whole training split through the model; the trailing expression
# echoes the size of the result as a quick sanity check.
df_train_embeddings = text_to_embedding_batches(df_batches=df_train_batches)
df_train_embeddings.size()

Genarating Embeddings:   0%|          | 0/495 [00:00<?, ?it/s]

torch.Size([63241, 256])

In [16]:
# @title Embeddings of Test Set
# Run the whole test split through the model and echo the size of the result.
df_test_embeddings = text_to_embedding_batches(df_batches=df_test_batches)
df_test_embeddings.size()

Genarating Embeddings:   0%|          | 0/142 [00:00<?, ?it/s]

torch.Size([18069, 256])

In [17]:
# @title Embeddings of Validation Set
# Run the whole validation split through the model and echo the size of the
# result.
df_val_embeddings = text_to_embedding_batches(df_batches=df_val_batches)
df_val_embeddings.size()

Genarating Embeddings:   0%|          | 0/71 [00:00<?, ?it/s]

torch.Size([9035, 256])

# Save the Embeddings

In [18]:
# Destination folder for the exported matrices (train.npy / test.npy / val.npy).
embedding_storage_location = '/content/drive/MyDrive/B.Tech. Final Year Project/Offensive Language Detection/Datasets/m_dataset_21_9/dataset_embeddings_12_10/'
# exist_ok=True creates the folder (and any missing parents) and is a no-op
# when it already exists, avoiding the separate check-then-create step.
os.makedirs(embedding_storage_location, exist_ok=True)
# NumPy can only read host memory, so each tensor is copied to the CPU first.
np.save(os.path.join(embedding_storage_location, 'train.npy'), df_train_embeddings.to('cpu').numpy())
np.save(os.path.join(embedding_storage_location, 'test.npy'), df_test_embeddings.to('cpu').numpy())
np.save(os.path.join(embedding_storage_location, 'val.npy'), df_val_embeddings.to('cpu').numpy())