In [12]:
model_name = "ai4bharat/indic-bert"


In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, Trainer, TrainingArguments, EarlyStoppingCallback
from datasets import load_dataset, Dataset, ClassLabel
from sklearn.model_selection import train_test_split
import torch
from torch import nn
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import numpy as np
import pandas as pd

torch.cuda.empty_cache()

In [8]:
df = pd.read_csv('data.csv')

In [3]:
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)


In [4]:
data_train = Dataset.from_pandas(train_data, preserve_index=False)
data_test = Dataset.from_pandas(test_data, preserve_index=False)

In [10]:
labels = ClassLabel(num_classes=4, names=['Offensive-Ind', 'Not-Offensive', 'Offensive-Group','Offensive-Untargetted'], names_file=None, id=None)


In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

#labels = ClassLabel(num_classes=4, names=['Offensive-Ind', 'Not-Offensive', 'Offensive-Group','Offensive-Untargetted'], names_file=None, id=None)

def tokenize_function(examples):
    tokens =  tokenizer(examples["text"], padding=True, truncation=True, max_length=128)
    tokens['label'] = labels.str2int(examples['label'])
    return tokens

tokenized_train = data_train.map(tokenize_function, batched=True)
tokenized_test = data_test.map(tokenize_function, batched=True)

tokenized_train.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
tokenized_test.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

NameError: name 'model_name' is not defined

In [25]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [11]:
id2label = {idx:labels.int2str(idx) for idx in range(4)}
label2id = {v:k for k,v in id2label.items()}

In [27]:
class WeightedLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        #Feed inputs to model and extract logits
        outputs = model(**inputs)
        logits = outputs.get("logits")
        #Extract labels
        labels = inputs.get("labels")
        # Define loss function with class weights
        loss_func = nn.CrossEntropyLoss()
        # Compute loss
        loss = loss_func(logits, labels)
        return (loss, outputs) if return_outputs else loss 

In [28]:
model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                             num_labels=4,
                                                             id2label=id2label,
                                                             label2id=label2id)

loading configuration file https://huggingface.co/ai4bharat/indic-bert/resolve/main/config.json from cache at /home/sudheesh/.cache/huggingface/transformers/2d290a1a22a5f80e173def8b2f31f12d68a957542e6769ab06bfc3de06bc49f4.06ba3893e888d6ff1388c45cdbee1fb785542ae22b70ff159f55da323230a159
Model config AlbertConfig {
  "_name_or_path": "ai4bharat/indic-bert",
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 128,
  "eos_token_id": 3,
  "gap_size": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0,
  "hidden_size": 768,
  "id2label": {
    "0": "Offensive-Ind",
    "1": "Not-Offensive",
    "2": "Offensive-Group",
    "3": "Offensive-Untargetted"
  },
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 3072,
  "label2id": {
    "Not-Offensive": 1,
    "Offensive-Group": 2,
    "Offensive-Ind": 0,
    "Offensive-Untargetted": 3
  },
  "layer_norm_eps": 1e-12,
  "max_position_em

In [29]:
def compute_metrics(pred):

    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    accuracy = accuracy_score(y_true=labels, y_pred=preds)
    recall = recall_score(y_true=labels, y_pred=preds, average='weighted')
    precision = precision_score(y_true=labels, y_pred=preds, average='weighted')
    f1 = f1_score(y_true=labels, y_pred=preds, average='weighted')

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [30]:
batch_size = 16
epochs = 30
leaning_rate = 2e-5


# Log the training loss at each epoch
logging_steps = len(tokenized_train) // batch_size

training_args = TrainingArguments(output_dir='Indic-BERT',
                                    num_train_epochs=epochs,
                                    learning_rate=leaning_rate,
                                    per_device_train_batch_size=batch_size,
                                    per_device_eval_batch_size=batch_size,
                                    weight_decay=0.01,
                                    evaluation_strategy="steps",
                                    logging_steps=logging_steps,
                                    fp16=True,
                                    eval_steps=250,
                                    load_best_model_at_end=True)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [31]:
trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

Using cuda_amp half precision backend


In [32]:
import warnings
warnings.filterwarnings('ignore')

In [33]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10088
  Num Epochs = 30
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 9480


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
250,No log,0.206996,0.961554,0.924586,0.961554,0.942707
500,No log,0.199482,0.961554,0.924586,0.961554,0.942707
750,0.246900,0.173212,0.961554,0.924586,0.961554,0.942707
1000,0.246900,0.176837,0.961554,0.924586,0.961554,0.942707
1250,0.246900,0.177243,0.961554,0.924586,0.961554,0.942707
1500,0.188100,0.175878,0.961554,0.924586,0.961554,0.942707
1750,0.188100,0.17951,0.959572,0.937353,0.959572,0.948317
2000,0.166100,0.205691,0.959968,0.930161,0.959968,0.944234
2250,0.166100,0.212143,0.957194,0.935823,0.957194,0.946366
2500,0.166100,0.199047,0.953627,0.938595,0.953627,0.945676


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2523
  Batch size = 32
The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2523
  Batch size = 32
Saving model checkpoint to Indic-BERT/checkpoint-500
Configuration saved in Indic-BERT/checkpoint-500/config.json
Model weights saved in Indic-BERT/checkpoint-500/pytorch_model.bin
tokenizer config file saved in Indic-BERT/checkpoint-500/tokenizer_config.json
Special tokens file saved in Indic-BERT/checkpoint-500/special_tokens_map.

TrainOutput(global_step=2750, training_loss=0.1702787101052024, metrics={'train_runtime': 402.0706, 'train_samples_per_second': 752.704, 'train_steps_per_second': 23.578, 'total_flos': 524714847240192.0, 'train_loss': 0.1702787101052024, 'epoch': 8.7})

In [1]:


from transformers import TextClassificationPipeline, AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import  ClassLabel


df = pd.read_csv('data.csv')
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

model_name = "Indic-BERT/checkpoint-1500"
tokenizer = AutoTokenizer.from_pretrained(model_name)



labels = ClassLabel(num_classes=4, names=['Offensive-Ind', 'Not-Offensive', 'Offensive-Group','Offensive-Untargetted'], names_file=None, id=None)
id2label = {idx:labels.int2str(idx) for idx in range(4)}
label2id = {v:k for k,v in id2label.items()}

model =AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=4,id2label=id2label,label2id=label2id)


pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=False)
y_pred = pipe(test_data['text'].tolist())
y_pred = [y_pred[i]["label"] for i in range(len(y_pred))]
y_true = test_data['label'].tolist()
print(classification_report(y_true, y_pred))



                       precision    recall  f1-score   support

        Not-Offensive       0.96      1.00      0.98      2426
      Offensive-Group       0.00      0.00      0.00        29
        Offensive-Ind       0.00      0.00      0.00        33
Offensive-Untargetted       0.00      0.00      0.00        35

             accuracy                           0.96      2523
            macro avg       0.24      0.25      0.25      2523
         weighted avg       0.92      0.96      0.94      2523



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
