

# Jigsaw Rate Severity of Toxic Comments

In [1]:
!nvidia-smi

Thu Jan 27 00:44:39 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.46       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0    44W / 400W |      0MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import os
import pathlib
from pathlib import Path
from matplotlib import pyplot as plt

In [3]:
USERID = 'calpis10000'
EX_NO = 'jigsaw-calpis-001'
UPLOAD_DIR = Path('/content/model')
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)

In [4]:
out_path = Path(f"../output/{EX_NO}")
out_path.mkdir(parents=True, exist_ok=True)

In [5]:
# copy input
!cp -r -f "../input/" "."

In [6]:
# install librariess
!pip install transformers

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 5.1 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 71.1 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 73.5 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 78.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 7.2 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  A

In [7]:
# ----------------------------------------------
# Path
# ----------------------------------------------
import pathlib
from pathlib import Path
import sys

INPUT_DIR_0 = Path('./input/jigsaw-toxic-severity-rating/')
INPUT_DIR_1 = Path('./input/jigsaw-toxic-comment-classification-challenge/')
INPUT_DIR_2 = Path('./input/jigsaw-unintended-bias-in-toxicity-classification/')


# ----------------------------------------------
# Load Libraries
# ----------------------------------------------
import os
import math
import random
import time

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AdamW
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import BertForSequenceClassification, BertConfig, BertModel
from transformers import get_cosine_schedule_with_warmup

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split

import gc
gc.enable()


# ----------------------------------------------
# Set Globals
# ----------------------------------------------
FOLDS = 5
BATCH_SIZE = 32
NUM_EPOCHS = 3
MAX_LEN = 128
LEANING_RATE = 1e-5
DEVICE = "cuda" if torch.cuda.is_available() else "cpu" # cudaがなければcpuを使えばいいじゃない
DEBUG = False
TRAIN = True
RUN_VALID = True
TOKENIZER = 'roberta-large'
PRETRAINED = 'roberta-large'

os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [8]:
# ----------------------------------------------
# Load Data
# ----------------------------------------------
# INPUT_0: This Competition
submission = pd.read_csv(INPUT_DIR_0/'sample_submission.csv')
val_data = pd.read_csv(INPUT_DIR_0/'validation_data.csv')
test = pd.read_csv(INPUT_DIR_0/'comments_to_score.csv')
print('load data: this competition')

# INPUT_1: 1st Competition
train_1st = pd.read_csv(INPUT_DIR_1/'train.csv')
test_1st = pd.read_csv(INPUT_DIR_1/'test.csv')
test_labels_1st = pd.read_csv(INPUT_DIR_1/'test_labels.csv')
print('load data: 1st competition')

# INPUT_2: 2nd Competition
#train_2nd = pd.read_csv(INPUT_DIR_2/'train.csv')
#test_2nd = pd.read_csv(INPUT_DIR_2/'test.csv')
#idt_indiv_anno = pd.read_csv(INPUT_DIR_2/'identity_individual_annotations.csv')
#tox_indiv_anno = pd.read_csv(INPUT_DIR_2/'toxicity_individual_annotations.csv')
#print('load data: 2nd competition')


# ----------------------------------------------
# Set SEED
# ----------------------------------------------
# seed
SEED = 2021
def set_seed(SEED):
    random.seed(SEED)
    np.random.seed(SEED)
    os.environ['PYTHONHASHSEED'] = str(SEED)
    
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    
set_seed(SEED)

load data: this competition
load data: 1st competition


In [9]:
# ----------------------------------------------
# Sampling: Train_1st
# ----------------------------------------------
toxic_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

test_1st_l = pd.merge(test_1st, test_labels_1st, on='id', how='left').query("toxic != -1")
train_src = pd.concat([train_1st, test_1st_l], axis='rows')
train_src['target'] = train_src[toxic_cols].values.max(axis=1)

sample_num = (train_src['target'] > 0).sum()

train_tg0 = train_src[train_src['target'] == 0].sample(int(sample_num*1.0), random_state=SEED)
train_tg1 = train_src[train_src['target'] > 0]
train = pd.concat([train_tg0, train_tg1], axis='rows')

val_comment_unq = pd.concat([val_data['less_toxic'], val_data['more_toxic']]).unique()
duplicate_idx = np.isin(train['comment_text'], val_comment_unq)
train = train.iloc[~duplicate_idx]
print('Sampling: Train')
print(f'shape: {train.shape}')

Sampling: Train
shape: (41003, 9)


In [10]:
train.target.value_counts()

0    22025
1    18978
Name: target, dtype: int64

In [11]:
# ----------------------------------------------
# Create Tokenizer
# ----------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
print('create: tokenizer')


# ----------------------------------------------
# Preprocess func
# ----------------------------------------------
# Preprocess
import string
import re
import collections
from bs4 import BeautifulSoup
import nltk
#nltk.download('stopwords')
#nltk.download('averaged_perceptron_tagger')

# https://www.kaggle.com/manabendrarout/pytorch-roberta-ranking-baseline-jrstc-train
def text_cleaning(text):
    '''
    Cleans text into a basic form for NLP. Operations include the following:-
    1. Remove special charecters like &, #, etc
    2. Removes extra spaces
    3. Removes embedded URL links
    4. Removes HTML tags
    5. Removes emojis
    
    text - Text piece to be cleaned.
    '''
    template = re.compile(r'https?://\S+|www\.\S+') #Removes website links
    text = template.sub(r'', text)
    
    soup = BeautifulSoup(text, 'lxml') #Removes HTML tags
    only_text = soup.get_text()
    text = only_text
    
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    
    text = re.sub(r"[^a-zA-Z\d]", " ", text) #Remove special Charecters
    text = re.sub(' +', ' ', text) #Remove Extra Spaces
    text = text.strip() # remove spaces at the beginning and at the end of string

    return text


def text_normalization(s:pd.Series):
    x = s.apply(text_cleaning)
    return x


Downloading:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

create: tokenizer


In [12]:
# ----------------------------------------------
# Dataset Class
# ----------------------------------------------
class Jigsaw1stDataset(Dataset):
    def __init__(self, df, inference_only=False):
        super().__init__
        
        self.df = df
        self.inference_only = inference_only
        
        if not self.inference_only:
            self.target = torch.tensor(df[toxic_cols].values, dtype=torch.float32)
        
        self.encoded = tokenizer.batch_encode_plus(
            text_normalization(df['comment_text']).tolist(),
            padding='max_length',
            max_length=MAX_LEN,
            truncation=True,
            return_attention_mask=True
        )
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        input_ids = torch.tensor(self.encoded['input_ids'][index])
        attention_mask = torch.tensor(self.encoded['attention_mask'][index])
        
        if self.inference_only:
            return {'input_ids': input_ids,
                    'attention_mask': attention_mask
                    }
        else:
            target = self.target[index]
            return {'input_ids': input_ids,
                    'attention_mask': attention_mask, 
                    'target': target}


# ----------------------------------------------
# Model Class
# ----------------------------------------------
class AttentionHead(nn.Module):
    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)
        self.out_features = hidden_dim

    def forward(self, features):
        att = torch.tanh(self.W(features))
        score = self.V(att)
        attention_weights = torch.softmax(score, dim=1)
        context_vector = attention_weights * features
        context_vector = torch.sum(context_vector, dim=1)

        return context_vector

class Jigsaw1stModel(nn.Module):
    def __init__(self):
        super().__init__()
        config = AutoConfig.from_pretrained(PRETRAINED)
        self.pre_model = AutoModel.from_pretrained(PRETRAINED)
        self.head = AttentionHead(config.hidden_size, config.hidden_size,1)
        self.dropout = nn.Dropout(0.3)
        self.regressor = nn.Linear(config.hidden_size, 6)
    
    def forward(self, input_ids, attention_mask):
        pre_out = self.pre_model(input_ids=input_ids, attention_mask=attention_mask)
        x0 = pre_out['last_hidden_state']
        x1 = self.head(x0)
        x2 = self.dropout(x1)
        x3 = self.regressor(x2)
        return x3

In [13]:
#j_ds = Jigsaw1stDataset(train)
#model = Jigsaw1stModel().to(DEVICE)

In [14]:
#output = model(j_ds[:2]['input_ids'].to(DEVICE), j_ds[:2]['attention_mask'].to(DEVICE))

In [15]:
#output

In [16]:
# ----------------------------------------------
# func: valid, predict
# ----------------------------------------------
def valid_mse(model, dataloader):
    model.eval()
    mse_sum = 0
    
    with torch.no_grad():
        for batch_idx, data in enumerate(dataloader):
            input_ids = data['input_ids'].to(DEVICE)
            attention_mask = data['attention_mask'].to(DEVICE)
            target = data['target'].to(DEVICE)
            
            output = model(input_ids, attention_mask)
            
            mse_sum = nn.MSELoss(reduction='sum')(output.flatten(), target).item()
            
    return mse_sum/(len(dataloader.dataset))


def valid_bce(model, dataloader):
    model.eval()
    score_sum = 0
    
    with torch.no_grad():
        for batch_idx, data in enumerate(dataloader):
            input_ids = data['input_ids'].to(DEVICE)
            attention_mask = data['attention_mask'].to(DEVICE)
            target = data['target'].to(DEVICE)
            
            output = model(input_ids, attention_mask)
            score_sum += nn.BCELoss(reduction='sum')(output.flatten(), target).item()
            
    return score_sum/(len(dataloader.dataset))

def valid_bcelogit(model, dataloader):
    model.eval()
    score_sum = 0
    
    with torch.no_grad():
        for batch_idx, data in enumerate(dataloader):
            input_ids = data['input_ids'].to(DEVICE)
            attention_mask = data['attention_mask'].to(DEVICE)
            target = data['target'].to(DEVICE)
            
            output = model(input_ids, attention_mask)
            score_sum += nn.BCEWithLogitsLoss(reduction='sum')(output, target).item()
            
    return score_sum/(len(dataloader.dataset)*target.shape[1])

def predict(model, dataloader):
    model.eval()
    result = np.zeros((len(dataloader.dataset), 6))
    idx = 0
    
    with torch.no_grad():
        for batch_idx, data in enumerate(dataloader):
            input_ids = data['input_ids'].to(DEVICE)
            attention_mask = data['attention_mask'].to(DEVICE)
            
            output = model(input_ids, attention_mask)
            result[idx:idx + output.shape[0], :] = output.to('cpu')
            
            idx += output.shape[0]
            
    return result


# ----------------------------------------------
# func: train
# ----------------------------------------------
def train_fn(
    model,
    save_path,
    train_loader,
    val_loader,
    optimizer,
    scheduler=None,
    num_epochs=NUM_EPOCHS
):

    best_score = np.inf
    best_epoch = 0
    log_interval = 100 # TODO: 冒頭で設定する

    start = time.time()

    for epoch in range(num_epochs):
        val_score = None

        for batch_idx, data in enumerate(train_loader):
            input_ids = data['input_ids'].to(DEVICE)
            attention_mask = data['attention_mask'].to(DEVICE)
            target = data['target'].to(DEVICE)

            optimizer.zero_grad()
            model.train()

            output = model(input_ids, attention_mask)
            loss = nn.BCEWithLogitsLoss()(output, target)

            loss.backward()
            optimizer.step()

            if scheduler:
                scheduler.step()

            if (batch_idx > 0) & (batch_idx % log_interval == 0):
                val_score = valid_bcelogit(model, val_loader)
                print(f"Epoch {epoch+1}, Step {batch_idx+1}, train_loss: {loss:0.5f}, val_loss: {val_score:0.5f}")
                if val_score < best_score:
                    print(f"Model Inproved: {best_score} ----> {val_score}")
                    best_score = val_score
                    torch.save(model.state_dict(), save_path)

            del input_ids
            del attention_mask
            del target
            del output
            torch.cuda.empty_cache()

    print(f"elasped time: {time.time() - start: 0.3}")
    start = time.time()

    return best_score


# ----------------------------------------------
# func: create optimizer
# ----------------------------------------------
def create_optimizer(model):
    named_params = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optim_params = []
    for idx_, (name_, params_) in enumerate(named_params):
        weight_decay = 0 if name_ in no_decay else 0.01
        optim_params.append({'params':params_,
                            'weight_decay': weight_decay,
                            #'lr': 1e-5
                            })

    return AdamW(optim_params)

In [17]:
# ----------------------------------------------
# Main Loop
# ----------------------------------------------
if TRAIN:
    val_scores = []
    from sklearn.model_selection import KFold
    kfold = KFold(n_splits=FOLDS, random_state=SEED, shuffle=True)

    for fold, (train_idx, val_idx) in enumerate(kfold.split(train)): 
        print(f"*** FOLD {fold+1} / {FOLDS}***")

        save_path = f"/content/model/model_{fold+1}.pth"

        train_set = Jigsaw1stDataset(train.iloc[train_idx])
        valid_set = Jigsaw1stDataset(train.iloc[val_idx])

        train_loader = DataLoader(train_set,
                                batch_size=BATCH_SIZE,
                                shuffle=True,
                                drop_last=True,
                                num_workers=2)
        valid_loader = DataLoader(valid_set,
                                batch_size=BATCH_SIZE,
                                shuffle=False,
                                drop_last=False,
                                num_workers=2)

        model = Jigsaw1stModel().to(DEVICE)
        optimizer = AdamW(model.parameters(), lr=LEANING_RATE)
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_training_steps=NUM_EPOCHS*len(train_loader),
            num_warmup_steps=50
        )

        val_scores.append(
            train_fn(model, save_path, train_loader, valid_loader, optimizer, scheduler=scheduler)
        )

        del model
        torch.cuda.empty_cache()

        print(val_scores)
        print("Mean:", np.array(val_scores).mean())

*** FOLD 1 / 5***


Downloading:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1, Step 101, train_loss: 0.21463, val_loss: 0.18566
Model Inproved: inf ----> 0.18565587123729288
Epoch 1, Step 201, train_loss: 0.15968, val_loss: 0.15956
Model Inproved: 0.18565587123729288 ----> 0.15956111106766738
Epoch 1, Step 301, train_loss: 0.12397, val_loss: 0.14478
Model Inproved: 0.15956111106766738 ----> 0.14477552129345997
Epoch 1, Step 401, train_loss: 0.18689, val_loss: 0.15610
Epoch 1, Step 501, train_loss: 0.14095, val_loss: 0.14358
Model Inproved: 0.14477552129345997 ----> 0.14357956539773942
Epoch 1, Step 601, train_loss: 0.14462, val_loss: 0.13803
Model Inproved: 0.14357956539773942 ----> 0.13803072699452737
Epoch 1, Step 701, train_loss: 0.09218, val_loss: 0.15377
Epoch 1, Step 801, train_loss: 0.13090, val_loss: 0.14059
Epoch 1, Step 901, train_loss: 0.19561, val_loss: 0.14106
Epoch 1, Step 1001, train_loss: 0.20450, val_loss: 0.14214
Epoch 2, Step 101, train_loss: 0.09138, val_loss: 0.13782
Model Inproved: 0.13803072699452737 ----> 0.13782483334222112
Epoch

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1, Step 101, train_loss: 0.29180, val_loss: 0.18114
Model Inproved: inf ----> 0.1811438584208503
Epoch 1, Step 201, train_loss: 0.13903, val_loss: 0.17233
Model Inproved: 0.1811438584208503 ----> 0.17232830025474022
Epoch 1, Step 301, train_loss: 0.09819, val_loss: 0.15589
Model Inproved: 0.17232830025474022 ----> 0.1558879045169586
Epoch 1, Step 401, train_loss: 0.10795, val_loss: 0.14489
Model Inproved: 0.1558879045169586 ----> 0.14489325632924197
Epoch 1, Step 501, train_loss: 0.14878, val_loss: 0.14040
Model Inproved: 0.14489325632924197 ----> 0.14040032844739594
Epoch 1, Step 601, train_loss: 0.20187, val_loss: 0.13663
Model Inproved: 0.14040032844739594 ----> 0.1366293041976357
Epoch 1, Step 701, train_loss: 0.12488, val_loss: 0.15726
Epoch 1, Step 801, train_loss: 0.17942, val_loss: 0.13429
Model Inproved: 0.1366293041976357 ----> 0.1342884589018281
Epoch 1, Step 901, train_loss: 0.12926, val_loss: 0.13612
Epoch 1, Step 1001, train_loss: 0.14740, val_loss: 0.13668
Epoch 2,

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1, Step 101, train_loss: 0.17725, val_loss: 0.17646
Model Inproved: inf ----> 0.1764550816405669
Epoch 1, Step 201, train_loss: 0.15695, val_loss: 0.19812
Epoch 1, Step 301, train_loss: 0.11968, val_loss: 0.16225
Model Inproved: 0.1764550816405669 ----> 0.16224818410308642
Epoch 1, Step 401, train_loss: 0.15798, val_loss: 0.15140
Model Inproved: 0.16224818410308642 ----> 0.1514015944150484
Epoch 1, Step 501, train_loss: 0.08634, val_loss: 0.14084
Model Inproved: 0.1514015944150484 ----> 0.14084188569800007
Epoch 1, Step 601, train_loss: 0.16986, val_loss: 0.14607
Epoch 1, Step 701, train_loss: 0.12018, val_loss: 0.14214
Epoch 1, Step 801, train_loss: 0.11741, val_loss: 0.14623
Epoch 1, Step 901, train_loss: 0.12355, val_loss: 0.13975
Model Inproved: 0.14084188569800007 ----> 0.13975296448639862
Epoch 1, Step 1001, train_loss: 0.17958, val_loss: 0.13402
Model Inproved: 0.13975296448639862 ----> 0.1340237859787507
Epoch 2, Step 101, train_loss: 0.12139, val_loss: 0.13948
Epoch 2, S

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1, Step 101, train_loss: 0.21068, val_loss: 0.16419
Model Inproved: inf ----> 0.16418881987653128
Epoch 1, Step 201, train_loss: 0.21711, val_loss: 0.15035
Model Inproved: 0.16418881987653128 ----> 0.15035124030297364
Epoch 1, Step 301, train_loss: 0.18702, val_loss: 0.15429
Epoch 1, Step 401, train_loss: 0.19129, val_loss: 0.14626
Model Inproved: 0.15035124030297364 ----> 0.14625648929336207
Epoch 1, Step 501, train_loss: 0.09235, val_loss: 0.14193
Model Inproved: 0.14625648929336207 ----> 0.14193191853238316
Epoch 1, Step 601, train_loss: 0.18864, val_loss: 0.13406
Model Inproved: 0.14193191853238316 ----> 0.13405966470881206
Epoch 1, Step 701, train_loss: 0.11372, val_loss: 0.14358
Epoch 1, Step 801, train_loss: 0.13997, val_loss: 0.13498
Epoch 1, Step 901, train_loss: 0.13774, val_loss: 0.13318
Model Inproved: 0.13405966470881206 ----> 0.13317955645603863
Epoch 1, Step 1001, train_loss: 0.13184, val_loss: 0.14831
Epoch 2, Step 101, train_loss: 0.16567, val_loss: 0.13004
Model

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1, Step 101, train_loss: 0.16076, val_loss: 0.18052
Model Inproved: inf ----> 0.1805243036320539
Epoch 1, Step 201, train_loss: 0.16575, val_loss: 0.15898
Model Inproved: 0.1805243036320539 ----> 0.15897566652394893
Epoch 1, Step 301, train_loss: 0.10934, val_loss: 0.14901
Model Inproved: 0.15897566652394893 ----> 0.1490100239447462
Epoch 1, Step 401, train_loss: 0.10063, val_loss: 0.14392
Model Inproved: 0.1490100239447462 ----> 0.14392138160220005
Epoch 1, Step 501, train_loss: 0.08678, val_loss: 0.14755
Epoch 1, Step 601, train_loss: 0.12683, val_loss: 0.15227
Epoch 1, Step 701, train_loss: 0.17480, val_loss: 0.14562
Epoch 1, Step 801, train_loss: 0.16728, val_loss: 0.14335
Model Inproved: 0.14392138160220005 ----> 0.1433517573458877
Epoch 1, Step 901, train_loss: 0.13331, val_loss: 0.14049
Model Inproved: 0.1433517573458877 ----> 0.14048946482136968
Epoch 1, Step 1001, train_loss: 0.16676, val_loss: 0.13846
Model Inproved: 0.14048946482136968 ----> 0.13846309527633635
Epoch 2

In [21]:
def predict(model, dataloader):
    model.eval()
    result = np.zeros((len(dataloader.dataset), 6))
    idx = 0
    
    with torch.no_grad():
        for batch_idx, data in enumerate(dataloader):
            input_ids = data['input_ids'].to(DEVICE)
            attention_mask = data['attention_mask'].to(DEVICE)
            
            output = model(input_ids, attention_mask)
            result[idx:idx + output.shape[0], :] = output.to('cpu')
            
            idx += output.shape[0]
            
    return result

# Predict or Load Valid data

In [23]:
model_path = UPLOAD_DIR
models = sorted([str(i) for i in list(model_path.iterdir())])[1:]
print(models)

['/content/model/model_1.pth', '/content/model/model_2.pth', '/content/model/model_3.pth', '/content/model/model_4.pth', '/content/model/model_5.pth']


In [24]:
from sklearn.preprocessing import MinMaxScaler
scaler_ = MinMaxScaler()
oof_sc = scaler_.fit_transform(oof_preds)

In [25]:
# val: less, more別々にdf作成 -> スコアを予測し、「more > less」である率を計測する。
val_less = val_data[['less_toxic']].rename({'less_toxic': 'comment_text'}, axis='columns')
val_more = val_data[['more_toxic']].rename({'more_toxic': 'comment_text'}, axis='columns')

In [26]:
def calc_val(model, model_path, less_dataset, more_dataset):
    less_pred = np.zeros((FOLDS, len(less_dataset), 6))
    more_pred = np.zeros((FOLDS, len(more_dataset), 6))

    less_loader = DataLoader(less_dataset, batch_size=BATCH_SIZE,
                             drop_last=False, shuffle=False, num_workers=2)
    more_loader = DataLoader(more_dataset, batch_size=BATCH_SIZE,
                             drop_last=False, shuffle=False, num_workers=2)

    for i, model_ in enumerate(model_path):
        print(f"model-{i}: start")

        model.to(DEVICE)
        model.load_state_dict(torch.load(model_))

        less_pred[i, :] = predict(model, less_loader)
        more_pred[i, :] = predict(model, more_loader)
        print(f"model-{i}: complete")

    less_mean = less_pred.mean(axis=0)
    #less_mean = scaler_.transform(less_mean)
    more_mean = more_pred.mean(axis=0)
    #more_mean = scaler_.transform(more_mean)

    val_scores = pd.DataFrame({'worker': val_data['worker'].head(len(less_dataset)),
                               'less_score': less_mean.sum(axis=1),
                               'more_score': more_mean.sum(axis=1),})
    val_scores['score_diff'] = val_scores['more_score'] - val_scores['less_score']
    val_scores['correct_ans'] = val_scores['score_diff'] > 0

    acc = val_scores['correct_ans'].sum() / len(val_scores)

    print(f"accuracy: {acc}")
    print(f"{val_scores['correct_ans'].sum()} / {len(val_scores)}")
    return less_mean, more_mean, val_scores

In [28]:
if RUN_VALID:
  model = Jigsaw1stModel()
  less_dataset = Jigsaw1stDataset(val_less, inference_only=True)
  more_dataset = Jigsaw1stDataset(val_more, inference_only=True)

  less_, more_, scores = calc_val(model, models, less_dataset, more_dataset)
  pd.DataFrame(less_).to_csv(out_path/'less_df.csv', index=False)
  pd.DataFrame(more_).to_csv(out_path/'more_df.csv', index=False)
  scores.to_csv(out_path/'out_score.csv', index=False)
  scores.head()

else:
  less_df = pd.read_csv(out_path/'less_df.csv')
  more_df = pd.read_csv(out_path/'more_df.csv')

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


model-0: start
model-0: complete
model-1: start
model-1: complete
model-2: start
model-2: complete
model-3: start
model-3: complete
model-4: start
model-4: complete
accuracy: 0.7031021655373987
21169 / 30108
