In [1]:
from transformers import BertForSequenceClassification, BertConfig, CharacterBertModel, \
CharacterBertForPreTraining, CharacterBertConfig, CharacterBertTokenizer
import torch
import os

In [2]:
# Read the CharacterBERT configuration from ../data/character-bert and build the
# pre-training model from that config alone (no from_pretrained call on the model,
# so the checkpoint weights are loaded separately via load_state_dict).
model_config = CharacterBertConfig.from_pretrained('../data/character-bert')
model = CharacterBertForPreTraining(model_config)

In [3]:
# Load the pre-training checkpoint onto the CPU.
# NOTE(review): despite its name, `output_directory` is passed straight to
# torch.load, so it has to be the path of the checkpoint *file* ("model").
# NOTE(review): torch.load unpickles its input - only open checkpoints you trust.
output_directory = "model"

checkpoint = torch.load(
    output_directory, map_location="cpu"
)

In [4]:
# Copy the weights stored under the checkpoint's 'model' key into the model;
# strict=True requires the checkpoint keys to match the model's keys exactly.
model.load_state_dict(checkpoint['model'], strict=True)

<All keys matched successfully>

In [5]:
# Export the model with save_pretrained into the relative directory
# 'character-bert-hindi' so it can be reloaded with from_pretrained.
model.save_pretrained('character-bert-hindi')

In [6]:
from transformers import set_seed

# Fix the global random seed via transformers.set_seed.
# Alternative seed kept for reference: set_seed(42)
set_seed(30)

In [7]:
#### LOADING BERT FOR CLASSIFICATION ####
# Only the bert-base-uncased *configuration* is fetched; the classifier is then
# constructed from that config (there is no from_pretrained call on the model).
# This rebinds `model`, which previously held the CharacterBertForPreTraining instance.

config = BertConfig.from_pretrained('bert-base-uncased', num_labels=2)  # binary classification
model = BertForSequenceClassification(config=config)

In [8]:
# Inspect the embedding layer of the newly built BERT classifier.
model.bert.embeddings.word_embeddings  # wordpiece embeddings

Embedding(30522, 768, padding_idx=0)

In [9]:
#### REPLACING BERT WITH CHARACTER_BERT ####
# Load the CharacterBERT encoder from the character-bert-hindi directory and swap
# it in for the classifier's BERT encoder; only `model.bert` is replaced, the rest
# of the classifier is left as it is.
# The path is a raw string: sequences such as "\D" or "\c" in a normal string
# literal are invalid escapes (a warning on recent Python versions).
# NOTE(review): this absolute Windows path is machine-specific - consider the
# relative 'character-bert-hindi' directory passed to save_pretrained instead.

character_bert_model = CharacterBertModel.from_pretrained(
    r"E:\Documents\Character Bert\Hate Speech\character-bert-hindi")
model.bert = character_bert_model

Some weights of the model checkpoint at E:\Documents\Character Bert\Hate Speech\character-bert-hindi were not used when initializing CharacterBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing CharacterBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CharacterBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# Inspect the same attribute again after the encoder swap.
model.bert.embeddings.word_embeddings  # wordpieces are replaced with a CharacterCNN

CharacterCnn(
  (char_conv_0): Conv1d(16, 32, kernel_size=(1,), stride=(1,))
  (char_conv_1): Conv1d(16, 32, kernel_size=(2,), stride=(1,))
  (char_conv_2): Conv1d(16, 64, kernel_size=(3,), stride=(1,))
  (char_conv_3): Conv1d(16, 128, kernel_size=(4,), stride=(1,))
  (char_conv_4): Conv1d(16, 256, kernel_size=(5,), stride=(1,))
  (char_conv_5): Conv1d(16, 512, kernel_size=(6,), stride=(1,))
  (char_conv_6): Conv1d(16, 1024, kernel_size=(7,), stride=(1,))
  (_highways): Highway(
    (_layers): ModuleList(
      (0-1): 2 x Linear(in_features=2048, out_features=4096, bias=True)
    )
  )
  (_projection): Linear(in_features=2048, out_features=768, bias=True)
)

In [11]:
from datasets import load_dataset

# Load the CSV as a single "train" split. The path is a raw string so the
# Windows-style backslashes stay literal ("\d" and "\H" are invalid escape
# sequences in a normal string literal).
hate_speech_dataset = load_dataset("csv", data_files=r"..\datasets\Hindi Hate Speech.csv", split="train")

Found cached dataset csv (C:/Users/arifa/.cache/huggingface/datasets/csv/default-60cb38e7639cbd07/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


In [12]:
# Display the dataset summary (columns and row count).
hate_speech_dataset

Dataset({
    features: ['Text', 'Label'],
    num_rows: 3654
})

In [13]:
# Switch the dataset's output format to pandas so the column accesses in the
# following cells can use pandas methods.
hate_speech_dataset.set_format("pandas")

In [14]:
# Class distribution of the "Label" column, and the number of distinct classes in it.
label_counts = hate_speech_dataset["Label"].value_counts()
num_labels = len(label_counts.index)

In [15]:
# Display the per-class counts.
label_counts

Label
HOF    1991
NOT    1663
Name: count, dtype: int64

In [16]:
# Length of the longest entry in the "Text" column, measured with .str.len().
# NOTE(review): that is a length in characters, not in tokens, despite the
# variable name; the value is not referenced again in the cells shown here.
max_token_length = max(hate_speech_dataset['Text'].str.len())
max_token_length

280

In [17]:
# Undo the pandas output format set earlier before tokenising.
hate_speech_dataset.reset_format()

In [18]:
# Instantiate the CharacterBERT tokenizer with strip_accents and do_lower_case
# both passed as None.
# NOTE(review): presumably so Devanagari diacritics and casing are left untouched -
# confirm how the tokenizer interprets None for these options.
tokenizer = CharacterBertTokenizer(strip_accents=None, do_lower_case=None)

In [19]:
def tokenize_function(example):
    """Tokenise the "Text" field of ``example`` with the notebook's tokenizer.

    Truncation is enabled with a maximum length of 128; whatever the tokenizer
    returns is handed back unchanged.
    """
    texts = example["Text"]
    encoded = tokenizer(texts, truncation=True, max_length=128)
    return encoded

In [20]:
from transformers import DataCollatorWithPadding

# Tokenise the whole dataset in batched mode, dropping the raw "Text" column, and
# create the padding collator that the DataLoaders use as collate_fn.
tokenized_dataset = hate_speech_dataset.map(tokenize_function, batched=True, remove_columns=["Text"])
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Loading cached processed dataset at C:\Users\arifa\.cache\huggingface\datasets\csv\default-60cb38e7639cbd07\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-da7e04c595b2fd23.arrow


In [21]:
# Display the columns produced by tokenisation.
tokenized_dataset

Dataset({
    features: ['Label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3654
})

In [22]:
# Keep only the examples whose input_ids contain the tokenizer's unknown-token id.
# The predicate returns an explicit bool rather than the example dict / None.
# NOTE(review): input_ids are nested here (one list of character ids per token),
# so this membership test can only match if unk_token_id has that same per-token
# form - confirm, otherwise an empty result says nothing about unknown tokens.
temp = tokenized_dataset.filter(lambda x: tokenizer.unk_token_id in x["input_ids"])

Loading cached processed dataset at C:\Users\arifa\.cache\huggingface\datasets\csv\default-60cb38e7639cbd07\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-e326bae26cda2748.arrow


In [23]:
# Display how many examples matched the unknown-token filter.
temp

Dataset({
    features: ['Label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 0
})

In [24]:
def assign_label(example):
    """Replace the string class in ``example['Label']`` with its integer id.

    "HOF" becomes 0 and "NOT" becomes 1. The example is modified in place and
    also returned; any other label raises ``KeyError``.
    """
    label_to_id = dict(HOF=0, NOT=1)
    label_name = example['Label']
    example['Label'] = label_to_id[label_name]
    return example

In [25]:
# Turn the string labels into integer ids, rename the column to "labels" and make
# the dataset return torch tensors; the last line displays the resulting columns.
tokenized_dataset = (
    tokenized_dataset
    .map(assign_label)
    .rename_column("Label", "labels")
)
tokenized_dataset.set_format("torch")
tokenized_dataset.column_names

Loading cached processed dataset at C:\Users\arifa\.cache\huggingface\datasets\csv\default-60cb38e7639cbd07\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-16a09898a64377b1.arrow


['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [26]:
# Collate the first ten examples into one padded batch and decode every row back
# to text, to eyeball the tokenisation and the padding.
samples = [tokenized_dataset[i] for i in range(10)]

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] तीन दिन से हथेली और लण्ड दोनों में खुजली हो रही थी. न पैसा मिला न चूत. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

'>>> [CLS] जिन'नहीं वो निहायत ही मादरचोद किस्म की सुअर प्रजाति के लोग हैं, जिनका DNA तेरे जैसे लोगों से मिलता - जुलता है । [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

'>>> [CLS] सुंदरता सस्ती है. चरित्र महंगा है. घड़ी सस्ती है. समय महंगा है. शरीर सस्ता है. जीवन महंगा है. रिश्ता सस्ता है. लेकिन निभाना महंगा है. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

'>>> [CLS] कभी चाल, कभी मकसद, कभी मंसूबे यार होते हैं. इस दौर के'नमस्ते'के भी मतलब हजार होते हैं. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

'>>> [CLS] से याद आया, बरकी को काट खाओ, और कुत्ते से प्यार'दिखाओ '! कमाल करते हो पांडेजी! [SEP] [PAD] [PA

In [27]:
# Show the first ten rows of the untokenised dataset for comparison with the
# decoded samples.
hate_speech_dataset[:10]

{'Text': ['\u200d तीन दिन से हथेली और लण्ड दोनों में खुजली हो रही थी . न पैसा मिला न चूत .',
  "जिन ' नहीं वो निहायत ही मादरचोद किस्म की सुअर प्रजाति के लोग हैं , जिनका DNA तेरे जैसे लोगों से मिलता - जुलता है ।",
  'सुंदरता सस्ती है . चरित्र महंगा है . घड़ी सस्ती है . समय महंगा है . शरीर सस्ता है . जीवन महंगा है . रिश्ता सस्ता है . लेकिन निभाना महंगा है .',
  "कभी चाल , कभी मकसद , कभी मंसूबे यार होते हैं . इस दौर के ' नमस्ते ' के भी मतलब हजार होते हैं .",
  "से याद आया , बरकी को काट खाओ , और कुत्ते से प्यार ' दिखाओ ' ! कमाल करते हो पांडेजी !",
  'कैसे मुकदमा लङेगी फातिमा रन्ङी तू अपनी तन्हाई का हर हिन्दु लन्ङ गवाह है तेरी मक्का मदीना मे चुदाई का अल्ला झूठा मुल्ला झूठा झूठी सारी खुदाई चुत मिले अगर मुल्ली की तो कर दो जम के चुदाई वैसमोल ने किया कमाल ।',
  "भगवा ' देश और क्रिकेट दोनों के लिए पनौती है! धत् ।",
  "दिल्ली में मूर्ति तोड़ने की बात थी वहां तो अपनी ' सेना ' लेकर पहुंच गए थे! जहां एक ' जिन्दा ' आदमी को तोड़ दिया गया , वहां कब जाओगे ? कोई नी ट्वीट कर दिए ना काम पूरा हुआ!",
  'जी ,

In [28]:
# Split the tokenised dataset 80/20 into train and test parts with a fixed seed.
# NOTE(review): nothing is downsampled here despite the variable name - this is
# just the train/test split.
downsampled_dataset = tokenized_dataset.train_test_split(
    train_size=0.8, seed=42
)
downsampled_dataset

Loading cached split indices for dataset at C:\Users\arifa\.cache\huggingface\datasets\csv\default-60cb38e7639cbd07\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-e6a71cf59bde7e9a.arrow and C:\Users\arifa\.cache\huggingface\datasets\csv\default-60cb38e7639cbd07\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-6bb16ea0f9858540.arrow


DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2923
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 731
    })
})

In [29]:
from torch.utils.data import DataLoader
# Batch size shared by both loaders (16 is the alternative value).
batch_size = 32

# Arguments common to the training and the evaluation loader.
loader_kwargs = dict(batch_size=batch_size, collate_fn=data_collator)

# Only the training loader shuffles its examples.
train_dataloader = DataLoader(downsampled_dataset["train"], shuffle=True, **loader_kwargs)
eval_dataloader = DataLoader(downsampled_dataset["test"], **loader_kwargs)

In [30]:
# Pull a single batch from the training loader and list the shape of every tensor in it.
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

{'labels': torch.Size([32]),
 'input_ids': torch.Size([32, 51, 50]),
 'token_type_ids': torch.Size([32, 51]),
 'attention_mask': torch.Size([32, 51])}

In [31]:
# Sanity check before training: one forward pass on the batch fetched above,
# printing the loss and the shape of the logits.
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(0.7786, grad_fn=<NllLossBackward0>) torch.Size([32, 2])


In [32]:
from transformers import AdamW

# Optimise all model parameters with AdamW at a learning rate of 3e-5
# (5e-5 is the commented-out alternative below).
# NOTE(review): recent transformers releases deprecate transformers.AdamW in
# favour of torch.optim.AdamW - check against the installed version.
# optimizer = AdamW(model.parameters(), lr=5e-5)
optimizer = AdamW(model.parameters(), lr=3e-5)

In [33]:
from transformers import get_scheduler

# Linear learning-rate schedule without warm-up; the total number of steps is
# num_epochs passes over the training loader (one scheduler step per batch).
num_epochs = 3
# num_epochs = 6
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

276


In [34]:
import torch

# Use the GPU when one is available, otherwise stay on the CPU, and move the
# model there; the last line displays the chosen device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cuda')

In [35]:
from tqdm.auto import tqdm

# Fine-tuning loop: num_epochs passes over the training loader, one optimiser
# step and one scheduler step per batch.
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch to the same device as the model.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Apply the update, advance the schedule, then clear the gradients so
        # they do not accumulate into the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/276 [00:00<?, ?it/s]

In [36]:
import evaluate

# Load the F1 metric from the `evaluate` library; the evaluation loops feed it
# batch by batch and then call compute().
metric = evaluate.load("f1")
# Example usage kept for reference (note that it refers to `f1_metric`, not `metric`):
# results = f1_metric.compute(predictions=[0, 1], references=[0, 1], average="macro")
# print(results)

In [41]:
# Evaluate on the held-out split: feed each batch's predictions to the F1 metric
# and also collect them in `y_preds` for the classification report.
model.eval()
y_preds = []
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    # Store plain Python ints; extending with the tensor itself would append one
    # 0-dim tensor per example.
    y_preds.extend(predictions.cpu().tolist())
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute(average="macro")

{'f1': 0.8217022604157803}

In [42]:
# Ground-truth labels of the test split, in dataset order (the evaluation loader
# is created without shuffle, so the predictions follow the same order).
y_true = downsampled_dataset["test"]["labels"]

In [43]:
from sklearn.metrics import classification_report
# Per-class report; the order of target_names follows the HOF -> 0, NOT -> 1
# mapping used in assign_label.
target_names = ["HOF", "NOT"]
print(classification_report(y_true, y_preds,target_names=target_names))

              precision    recall  f1-score   support

         HOF       0.84      0.84      0.84       404
         NOT       0.80      0.81      0.80       327

    accuracy                           0.82       731
   macro avg       0.82      0.82      0.82       731
weighted avg       0.82      0.82      0.82       731



In [72]:
import gc
# Run the garbage collector and ask PyTorch to release its cached GPU memory
# before the cross-validation cell.
gc.collect()
torch.cuda.empty_cache()

In [73]:
import numpy as np
from tqdm.auto import tqdm
from sklearn.model_selection import StratifiedKFold

from datasets import DatasetDict
from transformers import get_scheduler

# 5-fold stratified cross-validation of the CharacterBERT classifier.
# Every fold builds a new model, fine-tunes it on the training part and appends
# the macro-F1 obtained on the held-out part to `scores`.
scores = []

# Fold-invariant settings, defined once instead of inside the loop.
batch_size = 32  # alternative value: 16
num_epochs = 3   # alternative value: 6
# Raw string: "\D", "\C", "\H" and "\c" are invalid escapes in a normal literal.
# NOTE(review): absolute, machine-specific path - consider a relative one.
character_bert_path = r"E:\Documents\Character Bert\Hate Speech\character-bert-hindi"

# First make the kfold object
folds = StratifiedKFold(n_splits=5)

# Shuffle into a separate name so that running this cell again starts from the
# same `tokenized_dataset` and therefore reproduces the same folds.
shuffled_dataset = tokenized_dataset.shuffle(seed=30)

# Now make our splits based off of the labels.
# A dummy feature array of the right length is enough here: the split is passed
# the labels separately and yields index arrays.
splits = folds.split(np.zeros(shuffled_dataset.num_rows), shuffled_dataset["labels"])

for train_idxs, val_idxs in splits:
    fold_dataset = DatasetDict({
        "train": shuffled_dataset.select(train_idxs),
        "validation": shuffled_dataset.select(val_idxs),
    })

    train_dataloader = DataLoader(
        fold_dataset["train"], shuffle=True, batch_size=batch_size, collate_fn=data_collator
    )
    eval_dataloader = DataLoader(
        fold_dataset["validation"], batch_size=batch_size, collate_fn=data_collator
    )

    #### LOADING BERT FOR CLASSIFICATION ####
    config = BertConfig.from_pretrained('bert-base-uncased', num_labels=num_labels)
    model = BertForSequenceClassification(config=config)

    #### REPLACING BERT WITH CHARACTER_BERT ####
    character_bert_model = CharacterBertModel.from_pretrained(character_bert_path)
    model.bert = character_bert_model

    model.to(device)
    optimizer = AdamW(model.parameters(), lr=3e-5)

    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )
    print(num_training_steps)

    progress_bar = tqdm(range(num_training_steps))

    # Fine-tune on this fold's training part.
    model.train()
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

    # Score this fold on its held-out part.
    model.eval()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

    f1_score = metric.compute(average="macro")
    scores.append(f1_score['f1'])
    print(f1_score)

    # NOTE(review): `model` and `optimizer` are still referenced at this point, so
    # this probably frees little until they are rebound in the next fold - confirm.
    gc.collect()
    torch.cuda.empty_cache()

Loading cached shuffled indices for dataset at C:\Users\arifa\.cache\huggingface\datasets\csv\default-60cb38e7639cbd07\0.0.0\6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1\cache-50946bf913459348.arrow
Some weights of the model checkpoint at E:\Documents\Character Bert\Hate Speech\character-bert-hindi were not used when initializing CharacterBertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing CharacterBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CharacterBertModel fr

276


  0%|          | 0/276 [00:00<?, ?it/s]

{'f1': 0.8287538728935238}


Some weights of the model checkpoint at E:\Documents\Character Bert\Hate Speech\character-bert-hindi were not used when initializing CharacterBertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing CharacterBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CharacterBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


276


  0%|          | 0/276 [00:00<?, ?it/s]

{'f1': 0.809804823602166}


Some weights of the model checkpoint at E:\Documents\Character Bert\Hate Speech\character-bert-hindi were not used when initializing CharacterBertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing CharacterBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CharacterBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


276


  0%|          | 0/276 [00:00<?, ?it/s]

{'f1': 0.8439028907152051}


Some weights of the model checkpoint at E:\Documents\Character Bert\Hate Speech\character-bert-hindi were not used when initializing CharacterBertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing CharacterBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CharacterBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


276


  0%|          | 0/276 [00:00<?, ?it/s]

{'f1': 0.8340708919174961}


Some weights of the model checkpoint at E:\Documents\Character Bert\Hate Speech\character-bert-hindi were not used when initializing CharacterBertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing CharacterBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CharacterBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


276


  0%|          | 0/276 [00:00<?, ?it/s]

{'f1': 0.8220450924923325}


In [74]:
# Display the macro-F1 score of every fold.
scores

[0.8287538728935238,
 0.809804823602166,
 0.8439028907152051,
 0.8340708919174961,
 0.8220450924923325]

In [76]:
# Mean macro-F1 across the folds.
sum(scores) / len(scores)

0.8277155143241446