### Import libraries

In [1]:
import os
import sys

import torch
from torch import nn
from transformers import AutoTokenizer, AutoModel, AdamW

# Make the project-local `src` package importable. The checkout location can be
# overridden with the JIGSAW_PROJECT_ROOT environment variable; the default is
# the path that was previously hard-coded, so existing setups keep working.
PROJECT_ROOT = os.environ.get("JIGSAW_PROJECT_ROOT", "D:/_jupyter/kaggle/")
if PROJECT_ROOT not in sys.path:  # re-running the cell must not pile up duplicates
    sys.path.append(PROJECT_ROOT)

from src.data.jigsaw_dataset import JigsawDataset
from src.utils.common import set_seed, get_hash_name

# Name produced by the project helper (size=12); it is stored in
# CONFIG["hash_name"] and used as the prefix of CONFIG["group"].
HASH_NAME = get_hash_name(size=12)

### Set up Weights & Biases (W&B)

In [2]:
# Start a Weights & Biases run for experiment tracking.
import wandb
# W&B project that this notebook's runs are logged under.
PROJECT_NAME = "jigsaw"
# Side effect: initialises a W&B run as soon as this cell executes (the captured
# output shows it reusing an already-logged-in account).
wandb.init(project=PROJECT_NAME)
# Not referenced in the visible cells. NOTE(review): presumably meant as the
# `anonymous` setting for wandb when no API key is available — confirm or remove.
anony = None

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mbigshanedogg[0m (use `wandb login --relogin` to force relogin)


### Set CONFIG

In [3]:
# Central run configuration. Built from keyword arguments; the resulting dict
# has the same keys, values and insertion order as the equivalent literal.
CONFIG = dict(
    # --- reproducibility ---
    seed=20211115,
    # --- input data ---
    data_path="../data/jigsaw-toxic-severity-rating/validation_data.csv",
    encoding="utf-8",
    extension="csv",
    # --- model / batching ---
    epochs=3,
    model_name="roberta-base",
    train_batch_size=32,
    valid_batch_size=32,
    timesteps=128,
    # --- optimiser / scheduler ---
    learning_rate=1e-4,
    scheduler="CosineAnnealingLR",
    min_lr=1e-6,
    T_max=500,
    weight_decay=1e-6,
    # --- training scheme ---
    n_fold=5,
    n_accumulate=1,
    num_classes=1,
    margin=0.5,
    # --- runtime ---
    # First CUDA device when one is available, otherwise the CPU.
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    nprocs=1,
    hash_name=HASH_NAME,
)

# Entries derived from the values above are added after construction.
CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG["model_name"])
CONFIG["group"] = f"{HASH_NAME}-Baseline"

In [4]:
# Seed first so that everything done while building the dataset is reproducible.
set_seed(CONFIG["seed"])

# Build the dataset from CONFIG; note that the *training* batch size is used.
jigsaw_dataset = JigsawDataset(
    data_path=CONFIG["data_path"],
    tokenizer=CONFIG["tokenizer"],
    timesteps=CONFIG["timesteps"],
    batch_size=CONFIG["train_batch_size"],
    device=CONFIG["device"],
    nprocs=CONFIG["nprocs"],
    encoding=CONFIG["encoding"],
    extension=CONFIG["extension"],
)

Preprocessing data:   0%|                                                                    | 0/30108 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (743 > 512). Running this sequence through the model will result in indexing errors
Preprocessing data: 100%|██████████████████████████████████████████████████████| 30108/30108 [00:22<00:00, 1313.52it/s]


### Build Model

In [None]:
class JigsawModel(nn.Module):
    """Pretrained transformer backbone followed by dropout and a linear head.

    Args:
        model_name: Checkpoint identifier passed to ``AutoModel.from_pretrained``.
        num_classes: Output width of the linear head. When ``None`` (default)
            the value is read from ``CONFIG['num_classes']``, which is what the
            class always did before this parameter existed.
    """

    def __init__(self, model_name, num_classes=None):
        super().__init__()
        if num_classes is None:
            num_classes = CONFIG['num_classes']
        self.model = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(p=0.2)
        # Size the head from the loaded backbone instead of hard-coding 768, so
        # changing the checkpoint to one with a different hidden width does not
        # produce a shape mismatch in the linear layer.
        self.fc = nn.Linear(self.model.config.hidden_size, num_classes)

    def forward(self, ids, mask):
        """Run the backbone and map its second output to ``num_classes`` values.

        Args:
            ids: Token ids, forwarded to the backbone as ``input_ids``.
            mask: Attention mask, forwarded as ``attention_mask``.
                (presumably both are (batch, timesteps) tensors — confirm
                against what JigsawDataset yields)

        Returns:
            The output of the linear head applied to the dropped-out features.
        """
        out = self.model(input_ids=ids, attention_mask=mask,
                         output_hidden_states=False)
        # Index 1 of the backbone output is used as the sentence-level feature.
        # NOTE(review): for BERT/RoBERTa-style models this is the pooled output;
        # confirm it exists before switching to another architecture.
        pooled = self.drop(out[1])
        return self.fc(pooled)