# Script for fine-tuning a T5-small model to rewrite toxic text as neutral text

## Imports

In [1]:
import pandas as pd
import wandb
import numpy as np
from datasets import load_metric
from datasets import Dataset
from transformers import (AutoTokenizer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer)
from torch.utils.data import Dataset as D
import pickle



## Preparation for training

In [2]:
class MyDataset(D):
    """Toxic / neutral sentence pairs built from a tab-separated file.

    The file must provide the columns ``reference``, ``translation``,
    ``ref_tox`` and ``trn_tox``; its first column is used as the index.
    For every row, the text with the higher toxicity score is stored in the
    ``toxic`` column and the other one in ``normal``. Rows whose two scores
    are equal (or cannot be compared, e.g. NaN) belong to neither group and
    are left out of ``data``.

    Attributes:
        raw_data: the file exactly as loaded by ``pd.read_csv``.
        data: DataFrame with the columns ``toxic``, ``normal`` and
            ``toxic_reduction`` (absolute difference of the two scores).
    """

    def __init__(self, data_path):
        """Load ``data_path`` and build the toxic/normal pair table.

        Args:
            data_path: path (or file-like object) of the tab-separated file.
        """
        self.raw_data = pd.read_csv(data_path, sep='\t', index_col=0)
        raw = self.raw_data

        # Which side of each pair is the more toxic one.
        ref_is_toxic = raw['ref_tox'] > raw['trn_tox']
        trn_is_toxic = raw['ref_tox'] < raw['trn_tox']

        data = pd.DataFrame()
        data['toxic'] = pd.concat([
            raw.loc[ref_is_toxic, 'reference'],
            raw.loc[trn_is_toxic, 'translation'],
        ])
        data['normal'] = pd.concat([
            raw.loc[ref_is_toxic, 'translation'],
            raw.loc[trn_is_toxic, 'reference'],
        ])
        # Assignment aligns on the index, so only the kept rows get a value.
        data['toxic_reduction'] = (raw['ref_tox'] - raw['trn_tox']).abs()
        self.data = data

    def __len__(self):
        """Return the number of usable pairs.

        This is the length of ``data``, not of ``raw_data``: rows with equal
        scores are dropped, so using the raw length would let callers index
        past the end of ``data``.
        """
        return len(self.data)

    def __getitem__(self, idx):
        """Return the pair at positional index ``idx`` as a pandas Series."""
        return self.data.iloc[idx]

## Training

In [3]:
if __name__ == '__main__':
    # Fine-tunes the "t5-small" checkpoint to map the "toxic" column to the
    # "normal" column, evaluating once per epoch and logging to Weights & Biases.

    # prepare the data
    # NOTE(security): unpickling can execute arbitrary code; only load this
    # file if it was produced by a trusted step of this project. The pickled
    # object must expose a ``data`` DataFrame with "toxic" and "normal" columns.
    df = pd.read_pickle("../../data/interim/text_dataset.pkl").data
    df = df[["toxic", "normal"]]
    # Fixed seed so the train/test split is the same on every run.
    dataset = Dataset.from_pandas(df).train_test_split(test_size=0.2, seed=42)


    # tokenize text
    checkpoint = "t5-small"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    def preprocess_function(data):
        """Tokenize one batch: "toxic" becomes the input, "normal" the labels."""
        inputs = list(data["toxic"])
        targets = list(data["normal"])
        # NOTE(review): is_split_into_words=True assumes each example is already
        # a list of words rather than a plain string -- TODO confirm against the
        # contents of the pickle; drop the flag if the columns hold raw strings.
        model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True, is_split_into_words=True)
        return model_inputs

    tokenized_dataset = dataset.map(preprocess_function, batched=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    # The collator expects the model object (not the checkpoint name) so that
    # it can build decoder_input_ids from the labels.
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


    # initialize trainer and start training
    folder = '../../models/T5-small'

    training_args = Seq2SeqTrainingArguments(
        output_dir=folder,
        overwrite_output_dir=True,
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        save_steps=100,
        save_total_limit=3,
        num_train_epochs=10,
        predict_with_generate=True,
        fp16=True,
        generation_max_length=133,
        weight_decay=1e-6,
        gradient_accumulation_steps=4,
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset['train'],
        eval_dataset=tokenized_dataset['test'],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Never hard-code the API key in the notebook: wandb.login() picks it up
    # from the WANDB_API_KEY environment variable or the local netrc file.
    wandb.login()
    trainer.train()
    trainer.save_model('../../models/saved')

    # Afterwards you can use either one of the auto-saved checkpoints in
    # `folder` or the final model written to the "saved" folder.

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

  0%|          | 0/463 [00:00<?, ?ba/s]

  0%|          | 0/116 [00:00<?, ?ba/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdr-zhur0[0m ([33mtttttttttt[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.15.12 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.15.9
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20231104_210118-7wi6i9y9[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mmild-eon-1[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/tttttttttt/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/tttttttttt/huggingface/runs/7wi6i9y9[0m
You're using a T5TokenizerFast tokenizer. Please note that w

Epoch,Training Loss,Validation Loss
0,3.2752,2.996546
1,3.1299,2.907479
2,3.0709,2.863309
4,3.0303,2.833433
4,3.0041,2.814165
5,2.9811,2.802497
6,2.9696,2.789715
8,2.9649,2.78359
8,2.9594,2.780123
9,2.9482,2.779049


