<a href="https://colab.research.google.com/github/cfsandu/MLProject/blob/main/ToxicComments2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m79.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m98.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 KB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import tensorflow as tf
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments


In [4]:
df = pd.read_csv('train.txt',
    #'C:/Users/Florin/toxic-comment-classification/train.csv/train.csv',
    nrows=9000,
    ##encoding='utf-8',
    sep='\t',
    names=["id","comment_text","toxic","severe_toxic","obscene","threat","insult","identity_hate"]
    )

In [5]:
LABEL_COLUMNS=["toxic","severe_toxic","obscene","threat","insult","identity_hate"]
labels = LABEL_COLUMNS
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}

train_df, val_df = train_test_split(df,test_size=0.05)
print(train_df.shape)

(8550, 8)


In [6]:
def strlist_to_intlist(lst):
  retlst = []
  for s in lst:
    try:
      intobj = int(s)
    except ValueError:
      intobj = 0
    retlst.append(intobj)

  return retlst

class ToxComDS(Dataset):
    def __init__(self,
                 data: pd.DataFrame,
                 tokenizer: BertTokenizer,
                 max_token_len:int = 128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx: int):
        dr = self.data.iloc[idx]
        com_txt = dr.comment_text
        labels = list(dr[LABEL_COLUMNS])
        labi = strlist_to_intlist(labels)  #int(i) for i in labels]
        #print(labels)
        #print(labi)

        encoding = self.tokenizer.encode_plus(
            com_txt,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return dict (
            comment_text = com_txt,
            input_ids = encoding['input_ids'].flatten(),
            attention_mask = encoding['attention_mask'].flatten(),
            labels = torch.FloatTensor(labi)
        )


In [7]:
#--TEST DS
BERT_MODEL_NAME = 'bert-base-uncased'#'bert-base-cased'
#tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)

train_dataset = ToxComDS(train_df, tokenizer)
val_dataset = ToxComDS(val_df, tokenizer)
test_dataset = ToxComDS(val_df, tokenizer)

sample_item = train_dataset[16]

print(sample_item.keys())
print(sample_item['labels'])
print(sample_item['input_ids'].shape)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

dict_keys(['comment_text', 'input_ids', 'attention_mask', 'labels'])
tensor([0., 0., 0., 0., 0., 0.])
torch.Size([128])


In [8]:
#--METRICS
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result
    

In [9]:
batch_size = 8
metric_name = "f1"

training_args = TrainingArguments(
    f"bert-finetuned-sem_eval-english",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
    ###metric_for_best_model=metric_name
    #push_to_hub=True,
)
model = AutoModelForSequenceClassification.from_pretrained(
    BERT_MODEL_NAME, 
    problem_type="multi_label_classification", 
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id)

#-----------TEST Clf(BERT_MODEL)---------------------
outputs = model(input_ids=train_dataset[16]['input_ids'].unsqueeze(0), labels=train_dataset[16]['labels'].unsqueeze(0))
print(outputs)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)



Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

SequenceClassifierOutput(loss=tensor(0.7125, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[ 0.0836,  0.2613,  0.0729, -0.2169, -0.1938,  0.1764]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [10]:
import numpy as np
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.06,0.058506,0.697143,0.791343,0.928889
2,0.0408,0.055685,0.724324,0.819419,0.926667


TrainOutput(global_step=2138, training_loss=0.0646525664904945, metrics={'train_runtime': 468.7145, 'train_samples_per_second': 36.483, 'train_steps_per_second': 4.561, 'total_flos': 1124840158156800.0, 'train_loss': 0.0646525664904945, 'epoch': 2.0})

In [25]:
trainer.evaluate()

{'eval_loss': 0.05116533488035202,
 'eval_f1': 0.617283950617284,
 'eval_roc_auc': 0.7683003408887028,
 'eval_accuracy': 0.92,
 'eval_runtime': 2.26,
 'eval_samples_per_second': 110.62,
 'eval_steps_per_second': 14.159,
 'epoch': 2.0}

In [None]:
text = train_dataset[2]['comment_text']
print(text)

encoding = tokenizer(text, return_tensors="pt")
encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

outputs = trainer.model(**encoding)
logits = outputs.logits
print(outputs.logits)
print(logits.shape)

In [25]:
# apply sigmoid + threshold
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits.squeeze().cpu())
predictions = np.zeros(probs.shape)
predictions[np.where(probs >= 0.5)] = 1
# turn predicted id's into actual label names
predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
print(predicted_labels)

[]
