In [1]:
# the notebook's main objective is to filter and prepare the dataset to train a summarizer on it.
import os, sys
from pathlib import Path
def _find_project_root(start, marker='src'):
    """Return the closest directory at or above `start` whose listing contains `marker`.

    The search climbs one directory at a time. If the filesystem root is reached
    without finding `marker`, FileNotFoundError is raised: the parent of the root
    is the root itself, so an unguarded loop would never terminate.
    """
    candidate = Path(start)
    while marker not in os.listdir(candidate):
        if candidate.parent == candidate:
            raise FileNotFoundError(
                f"no directory containing '{marker}' found at or above '{start}'"
            )
        candidate = candidate.parent
    return candidate


HOME = os.getcwd()

# project root: first directory at or above the working directory that contains 'src'
current = _find_project_root(HOME)

PARENT_DIR = str(current)
DATA_FOLDER = os.path.join(PARENT_DIR, 'src', 'data')
data_path = os.path.join(DATA_FOLDER, 'filtered.tsv')

# extend the import path with the project root and three directories directly under it
sys.path.append(PARENT_DIR)
for folder_name in ('data_analysis', 'evaluation', 'text_processing'):
    sys.path.append(os.path.join(PARENT_DIR, folder_name))

In [2]:
import pandas as pd
# read the first 10000 rows of train.csv, keeping every column except 'id'
data = pd.read_csv(
    os.path.join(DATA_FOLDER, 'train.csv'),
    usecols=lambda column_name: column_name != 'id',
    nrows=10000,
)
data.head()

# merge the six toxicity sub-labels into one binary label:
# 1 when their sum is positive, 0 otherwise.
sub_label_total = (
    data['toxic']
    + data['severe_toxic']
    + data['obscene']
    + data['threat']
    + data['insult']
    + data['identity_hate']
)
data['is_toxic'] = (sub_label_total > 0).astype(int)
def prepare_data(row):
    """Add a binary 'is_toxic' entry to `row` and return the same object.

    `row` must support item access for the six sub-label keys 'toxic',
    'severe_toxic', 'obscene', 'threat', 'insult' and 'identity_hate'.
    'is_toxic' is set to 1 when the sum of those values is positive, else 0.
    The row is modified in place.
    """
    # the key is 'obscene'; the earlier spelling 'obcene' raised KeyError on every call
    sub_labels = ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')
    row['is_toxic'] = int(sum(row[label] for label in sub_labels) > 0)
    return row
# new_data = data.apply(prepare_data, axis='index')
# drop the six sub-label columns (the merged 'is_toxic' column stays) and
# rename 'comment_text' to 'text'
new_data = (
    data
    .drop(columns=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])
    .rename(columns={'comment_text': 'text'})
)
new_data['is_toxic'].value_counts()
# write the result next to the raw data, without the index column
new_data.to_csv(
    os.path.join(DATA_FOLDER, 'toxicity_data.csv'),
    index=False,
)

In [3]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AutoModel, AutoTokenizer
# the tokenizer and the classifier weights are both loaded from the same checkpoint
TOXICITY_CHECKPOINT = 'SkolkovoInstitute/roberta_toxicity_classifier'
toxic_tokenizer = RobertaTokenizer.from_pretrained(TOXICITY_CHECKPOINT)
toxic_classifier = RobertaForSequenceClassification.from_pretrained(TOXICITY_CHECKPOINT)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at SkolkovoInstitute/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
import datasets
# reload the CSV written by the preparation cell as the 'train' split of a `datasets` dataset
toxicity_csv_path = os.path.join(DATA_FOLDER, 'toxicity_data.csv')
data = datasets.load_dataset('csv', data_files=toxicity_csv_path, split='train')

import torch
from transformers import AutoTokenizer, BartForSequenceClassification, AutoModelForSequenceClassification

checkpoint = 'facebook/bart-base'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Only the four classification-head parameters stay trainable; every other
# parameter has its gradient switched off. The names left trainable are printed.
trainable_names = {
    'classification_head.dense.weight',
    'classification_head.dense.bias',
    'classification_head.out_proj.weight',
    'classification_head.out_proj.bias',
}
for param_name, param in model.named_parameters():
    if param_name in trainable_names:
        print(param_name)
    else:
        param.requires_grad = False

Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 7557.30it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 1299.35it/s]
Generating train split: 10000 examples [00:00, 187186.38 examples/s]
Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.out_proj.bias', 'classification_head.dense.weight', 'classification_head.dense.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


classification_head.dense.weight
classification_head.dense.bias
classification_head.out_proj.weight
classification_head.out_proj.bias


In [6]:
from typing import Dict
from torch.nn.functional import softmax
    
# def process_labels(batch: Dict, device: str):
#     model_input = toxic_tokenizer(batch['text'], return_tensors='pt', truncation=True, padding=True)  
#     model_input = {k: v.to(device) for k, v in model_input.items()}
#     toxic_classifier.to(device)
#     model_input['label'] = softmax(toxic_classifier(**model_input).logits, dim=1)
#     return model_input

# DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# d = data.map(lambda b : process_labels(b, device=DEVICE), batched=True, batch_size=4)

# def process_data(batch: Dict):
#     model_input = tokenizer(batch['text'], truncation=True)
#     model_input['label'] = batch['label']
#     return model_input

# d = d.map(lambda b : process_labels(b, device=DEVICE), batched=True, batch_size=4).remove_columns(['is_toxic', 'text'])

In [7]:
# use the data collator
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

# collator built around the BART `tokenizer`; it is handed to the trainer as `data_collator`
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# loader = DataLoader(d, batch_size=4, shuffle=True, collate_fn=data_collator)
# next(iter(loader))

In [8]:
# NOTE(review): this dataset is read from disk, but no active cell visible in this notebook
# writes it (the `data.map(...)` pipeline is commented out) — confirm the folder exists
# before a fresh Restart & Run All. Presumably it holds the tokenized, labelled examples.
d  = datasets.load_from_disk(os.path.join(DATA_FOLDER, f'_toxicity_data_{10000}'))

In [9]:
from transformers import TrainingArguments
# one epoch, batches of 8 per device for both training and evaluation;
# trainer output goes to the "test-trainer" directory
training_args = TrainingArguments(
    output_dir="test-trainer",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
)

In [10]:
from torch import nn
# NOTE(review): `l` is not referenced by any other visible cell; CustomTrainer builds its
# own CrossEntropyLoss — confirm whether this cell is still needed.
l = nn.CrossEntropyLoss()

In [11]:
from torch import nn
from transformers import Trainer


class CustomTrainer(Trainer):
    """Trainer whose loss is the cross-entropy between the model's logits and the batch labels."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the cross-entropy loss for one batch.

        Args:
            model: the model being trained; called as `model(**inputs)`.
            inputs: batch mapping; its "labels" entry is popped (the mapping is
                modified) and the remaining entries are forwarded to the model.
            return_outputs: when true, return `(loss, outputs)` instead of `loss`.
            num_items_in_batch: accepted for compatibility with newer
                `transformers` releases, which pass it to `compute_loss`; unused
                here because the loss is averaged over the batch.

        Returns:
            The loss, or a `(loss, outputs)` tuple when `return_outputs` is true.
        """
        labels = inputs.pop("labels")
        # forward pass without the labels, so the loss is computed only here
        outputs = model(**inputs)
        loss = nn.CrossEntropyLoss()(outputs.logits, labels)
        return (loss, outputs) if return_outputs else loss

In [12]:
from transformers import Trainer
from torch import nn 


# wire the model, the training configuration, the dataset loaded from disk and the
# padding collator into the custom trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=d,
    tokenizer=tokenizer,
)

from src.training_utilities.pytorch_utilities import cleanup
# NOTE(review): `cleanup` is a project helper whose body is not visible here; presumably it
# releases cached memory before training starts — confirm.
cleanup()

In [13]:
# run training with the arguments configured above (one epoch, batch size 8 per device)
trainer.train()

  0%|          | 0/1250 [00:00<?, ?it/s]You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 40%|████      | 500/1250 [01:28<01:52,  6.66it/s]

{'loss': 0.2649, 'learning_rate': 3e-05, 'epoch': 0.4}


 80%|████████  | 1000/1250 [02:59<00:43,  5.70it/s]

{'loss': 0.199, 'learning_rate': 1e-05, 'epoch': 0.8}


100%|██████████| 1250/1250 [03:45<00:00,  5.53it/s]

{'train_runtime': 225.8387, 'train_samples_per_second': 44.279, 'train_steps_per_second': 5.535, 'train_loss': 0.2230909912109375, 'epoch': 1.0}





TrainOutput(global_step=1250, training_loss=0.2230909912109375, metrics={'train_runtime': 225.8387, 'train_samples_per_second': 44.279, 'train_steps_per_second': 5.535, 'train_loss': 0.2230909912109375, 'epoch': 1.0})

In [None]:
# display the model's repr as the cell output
model