

# Jigsaw Rate Severity of Toxic Comments

In [1]:
# Print the GPU attached to this runtime (model, driver / CUDA version, memory usage).
!nvidia-smi

Sun Feb  6 15:07:16 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P0    25W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Install the libraries that the import cell below needs (-q keeps pip's output short).
!pip install transformers -q
!pip install sentencepiece -q

[K     |████████████████████████████████| 3.5 MB 15.7 MB/s 
[K     |████████████████████████████████| 596 kB 68.2 MB/s 
[K     |████████████████████████████████| 6.8 MB 53.6 MB/s 
[K     |████████████████████████████████| 895 kB 63.8 MB/s 
[K     |████████████████████████████████| 67 kB 6.6 MB/s 
[K     |████████████████████████████████| 1.2 MB 13.3 MB/s 
[?25h

In [3]:
# ----------------------------------------------
# Load Libraries
# ----------------------------------------------
import pathlib
from pathlib import Path
import sys
import os
import math
import random
import time

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AdamW
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import BertForSequenceClassification, BertConfig, BertModel
from transformers import get_cosine_schedule_with_warmup

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split

import gc
gc.enable()

In [28]:
# ----------------------------------------------
# Config
# ----------------------------------------------
# Run-mode switches read by later cells:
#   COLAB     - selects INPUT_BASE and enables the input-copy cell
#   DEBUG     - when True the training frame is down-sampled to 100 rows
#   TRAIN     - gates the cross-validation training loop
#   RUN_VALID - gates re-scoring of the validation pairs with the saved checkpoints
COLAB = True
DEBUG = False
TRAIN = True
RUN_VALID = True

# Experiment identifiers and the directory the fold checkpoints are written to.
USERID = 'calpis10000'
EX_NO = 'jigsaw-calpis-012'
UPLOAD_DIR = Path('/content/model')
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)

# Output / input locations relative to the parent of the working directory.
COLAB_BASE_DIR = Path(f"../")
OUT_DIR = COLAB_BASE_DIR/f"output/{EX_NO}"
COLAB_INPUT_DIR = COLAB_BASE_DIR/"input"

# Base directory and per-dataset directories for the CSV files loaded below.
INPUT_BASE = Path('./') if COLAB else Path('../')
INPUT_DIR_0 = INPUT_BASE/'input/jigsaw-toxic-severity-rating/'
INPUT_DIR_1 = INPUT_BASE/'input/PuseudoLabelingJigsaw/jigsaw-toxic-comment-classification-challenge/'
INPUT_DIR_2 = INPUT_BASE/'input/PuseudoLabelingJigsaw/jigsaw-unintended-bias-in-toxicity-classification/'
INPUT_DIR_R = INPUT_BASE/'input/PuseudoLabelingJigsaw/ruddit-dataset/'

# Training hyper-parameters.
#   FOLDS         - number of cross-validation splits
#   NUM_CLASSES   - width of the regression output
#   MAX_LEN       - token length every comment is padded / truncated to
#   LEANING_RATE  - (sic) learning rate handed to AdamW in the training loop
#   LOG_INTERVAL  - number of batches between validation passes inside train_fn
# NOTE(review): BATCH_SIZE_PRED is not referenced by any cell visible here - confirm it is still needed.
FOLDS = 5
NUM_CLASSES = 1
NUM_EPOCHS = 6
BATCH_SIZE = 32
BATCH_SIZE_PRED = 512
MAX_LEN = 128
LEANING_RATE = 2e-5
LOG_INTERVAL = 100
DEVICE = "cuda" if torch.cuda.is_available() else "cpu" # fall back to the CPU when CUDA is not available
PRETRAINED = 'unitary/multilingual-toxic-xlm-roberta'
TOKENIZER = PRETRAINED

os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [31]:
if COLAB:
  # Create the experiment output directory, then copy the shared input directory
  # into the current working directory (INPUT_BASE is './' when COLAB is True).
  OUT_DIR.mkdir(parents=True, exist_ok=True)
  !cp -r -f {str(COLAB_INPUT_DIR)}/ "."

In [6]:
# ----------------------------------------------
# Load Data
# ----------------------------------------------
# INPUT_0: This Competition
# Three CSV files from the competition directory. `val_data` is read later through its
# 'less_toxic', 'more_toxic' and 'worker' columns.
submission = pd.read_csv(INPUT_DIR_0/'sample_submission.csv')
val_data = pd.read_csv(INPUT_DIR_0/'validation_data.csv')
test = pd.read_csv(INPUT_DIR_0/'comments_to_score.csv')
print('load data: this competition')

# INPUT_1: 1st Competition
#train_1st = pd.read_csv(INPUT_DIR_1/'train.csv')
#test_1st = pd.read_csv(INPUT_DIR_1/'test.csv')
#test_labels_1st = pd.read_csv(INPUT_DIR_1/'test_labels.csv')
#print('load data: 1st competition')

# INPUT_2: 2nd Competition
#train_2nd = pd.read_csv(INPUT_DIR_2/'train.csv')
#test_2nd = pd.read_csv(INPUT_DIR_2/'test.csv')
#idt_indiv_anno = pd.read_csv(INPUT_DIR_2/'identity_individual_annotations.csv')
#tox_indiv_anno = pd.read_csv(INPUT_DIR_2/'toxicity_individual_annotations.csv')
#print('load data: 2nd competition')

# INPUT_R: Ruddit Competition
#ruddit = pd.read_csv('/content/input/PuseudoLabelingJigsaw/rudddit-dataset/PseudoLabelDataset (1).csv')
#print('load data: ruddit competition')

# twitter_pseudo_label
# Pseudo-labelled tweets; they become the training frame further down.
# The path is built with the `/` operator. The right-hand segment must NOT start with a
# slash: pathlib treats an absolute segment as a new root and would drop INPUT_BASE.
twitter = pd.read_csv(INPUT_BASE/'input/PuseudoLabelingJigsaw/toxic-twitter-dataset/PseudoLabelDataset (2).csv')
print('load data: twitter')

# 1st_pseudo_label
#train_1st_pseudo = pd.read_csv('/content/input/PuseudoLabelingJigsaw/jigsaw-toxic-comment-classification-challenge/PseudoLabelDataset.csv')
#print('load data: 1st_pseudo_label')


# ----------------------------------------------
# Set SEED
# ----------------------------------------------
# seed
SEED = 2021


def set_seed(SEED):
    """Seed every random number generator this notebook uses.

    SEED -- integer applied to PYTHONHASHSEED, Python's `random`, NumPy and torch
            (CPU generator plus the current and all CUDA devices). cuDNN is also
            switched to deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(SEED)

    # Python and NumPy generators.
    for seeder in (random.seed, np.random.seed):
        seeder(SEED)

    # torch generators: CPU, current CUDA device, every CUDA device.
    for seeder in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seeder(SEED)

    torch.backends.cudnn.deterministic = True


set_seed(SEED)

load data: this competition
load data: twitter


In [7]:
# ----------------------------------------------
# Sampling: Train
# ----------------------------------------------
# Training frame: the tweets with their text / pseudo-label columns renamed to the
# 'comment_text' / 'target' names that the dataset class reads.
train = twitter.rename(columns=dict(tweet='comment_text', pseudo_label='target'))
print(f'shape: {train.shape}')

shape: (56745, 4)


In [8]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0.1,Unnamed: 0,Toxicity,comment_text,target
0,0,0,@user when a father is dysfunctional and is s...,-0.06326
1,1,0,@user @user thanks for #lyft credit i can't us...,-0.62195
2,2,0,bihday your majesty,-0.131646
3,3,0,#model i love u take with u all the time in ...,-0.405339
4,4,0,factsguide: society now #motivation,-0.65809


In [9]:
from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold

# The target is continuous, so stratify on a binned version of it: cut the range into
# 10 equal-width intervals and replace every interval with an ordinal code.
train['sc_grp'] = pd.cut(train['target'], 10)
grp_to_num = {v:n for n, v in enumerate(train['sc_grp'].unique().sort_values())}
train['sc_grp'] = train['sc_grp'].map(grp_to_num)

kf = StratifiedKFold(n_splits=FOLDS, random_state=SEED, shuffle=True)

# kf.split() yields *positional* row indices, so the fold id is written with .iloc;
# .loc would treat them as index labels and only works while the frame still has its
# default RangeIndex (it breaks, for example, when this cell is re-run after sampling).
# Pre-filling the column with an integer keeps its dtype integral instead of the float
# column that results from filling a brand-new (all-NaN) column piece by piece.
train['kfold'] = -1
kfold_col = train.columns.get_loc('kfold')
for fold, (_, val_) in enumerate(kf.split(X=train, y=train['sc_grp'])):
  train.iloc[val_, kfold_col] = fold

assert (train['kfold'] >= 0).all(), "every row must be assigned to a validation fold"

if DEBUG:
  train = train.sample(100)

In [10]:
# Row count and mean target per fold.
train.groupby('kfold')['target'].agg(['count', 'mean'])

Unnamed: 0_level_0,count,mean
kfold,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,11349,-0.123472
1.0,11349,-0.123142
2.0,11349,-0.122904
3.0,11349,-0.124516
4.0,11349,-0.122794


In [11]:
# Number of rows whose fold id is missing.
train['kfold'].isnull().sum()

0

In [12]:
# ----------------------------------------------
# Create Tokenizer
# ----------------------------------------------
# Notebook-level tokenizer; the dataset class reads this global when it encodes text.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
print('create: tokenizer')


# ----------------------------------------------
# Preprocess func
# ----------------------------------------------
# Preprocess
import string
import re
import collections
from bs4 import BeautifulSoup
import nltk
#nltk.download('stopwords')
#nltk.download('averaged_perceptron_tagger')

# https://www.kaggle.com/manabendrarout/pytorch-roberta-ranking-baseline-jrstc-train
def text_cleaning(text):
    '''
    Reduce a raw comment to space-separated ASCII letters and digits.

    Steps, in order:
    1. Drop http(s):// and www. links
    2. Drop HTML markup, keeping only the text content
    3. Drop emoji / pictograph code points
    4. Turn every character that is not a letter or digit into a space
    5. Collapse runs of spaces and trim both ends

    text - the string to clean.
    Returns the cleaned string.
    '''
    # 1. links
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    without_links = url_pattern.sub(r'', text)

    # 2. markup
    plain = BeautifulSoup(without_links, 'lxml').get_text()

    # 3. emoji
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    without_emoji = emoji_pattern.sub(r'', plain)

    # 4. / 5. keep letters and digits only, then normalise the spacing
    alnum_only = re.sub(r"[^a-zA-Z\d]", " ", without_emoji)
    single_spaced = re.sub(' +', ' ', alnum_only)
    return single_spaced.strip()


def text_normalization(s:pd.Series):
    """Run `text_cleaning` on every element of `s` and return the resulting Series."""
    return s.apply(text_cleaning)


Downloading:   0%|          | 0.00/211 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/635 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

create: tokenizer


In [13]:
# ----------------------------------------------
# Dataset Class
# ----------------------------------------------
class TweetJigsawDataset(Dataset):
    """Dataset that tokenizes the 'comment_text' column once, at construction time.

    The notebook-level `tokenizer` and `MAX_LEN` are used for encoding; every text is
    padded / truncated to MAX_LEN tokens.

    Parameters
    ----------
    df : DataFrame with a 'comment_text' column and, unless `inference_only` is set,
         a 'target' column.
    inference_only : when True no targets are read and items carry no 'target' key.

    Each item is a dict with 'input_ids' and 'attention_mask' tensors, plus a float32
    'target' scalar in training mode.
    """
    def __init__(self, df, inference_only=False):
        # Actually call the parent initializer (the bare attribute access did nothing).
        super().__init__()

        self.df = df
        self.inference_only = inference_only

        if not self.inference_only:
            self.target = torch.tensor(df['target'].values, dtype=torch.float32)

        self.encoded = tokenizer.batch_encode_plus(
            #text_normalization(df['comment_text']).tolist(),
            df['comment_text'].tolist(),
            padding='max_length',
            max_length=MAX_LEN,
            truncation=True,
            return_attention_mask=True
        )

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        item = {
            'input_ids': torch.tensor(self.encoded['input_ids'][index]),
            'attention_mask': torch.tensor(self.encoded['attention_mask'][index]),
        }
        if not self.inference_only:
            item['target'] = self.target[index]
        return item


In [14]:
# ----------------------------------------------
# Model Class
# ----------------------------------------------
class AttentionHead(nn.Module):
    """Attention pooling: collapse axis 1 of `features` into one weighted-sum vector.

    A score per position is computed as V(tanh(W(features))), normalised with a softmax
    over axis 1, and used to weight `features` before summing over that axis.

    Parameters
    ----------
    in_features : size of the last axis of the input (and of the pooled output).
    hidden_dim  : width of the intermediate projection used to compute the scores.
    num_targets : accepted for interface compatibility; not used by this module.
    """
    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)
        # The pooled vector is a weighted sum of the *input* features, so its width is
        # in_features (not hidden_dim).
        self.out_features = in_features

    def forward(self, features, attention_mask=None):
        """Pool `features` over axis 1.

        features       : tensor whose axis 1 is pooled and whose last axis has
                         in_features entries (assumed (batch, seq, in_features)).
        attention_mask : optional (batch, seq) tensor; positions equal to 0 receive
                         zero attention weight. When omitted every position takes part
                         in the softmax, exactly as before this argument existed.
        Returns a (batch, in_features) tensor.
        """
        att = torch.tanh(self.W(features))
        score = self.V(att)
        if attention_mask is not None:
            # Use the most negative finite value rather than -inf so that a fully
            # masked row still yields finite weights instead of NaN.
            padding = attention_mask.unsqueeze(-1) == 0
            score = score.masked_fill(padding, torch.finfo(score.dtype).min)
        attention_weights = torch.softmax(score, dim=1)
        return torch.sum(attention_weights * features, dim=1)

class TweetJigsawModel(nn.Module):
    """Pretrained encoder -> attention pooling -> linear layer with NUM_CLASSES outputs."""
    def __init__(self):
        super().__init__()
        # Load the checkpoint's configuration and zero both dropout probabilities
        # before the encoder is instantiated from it.
        config = AutoConfig.from_pretrained(PRETRAINED)
        config.attention_probs_dropout_prob = 0.0
        config.hidden_dropout_prob = 0.0
        self.config = config
        self.pre_model = AutoModel.from_pretrained(PRETRAINED, config=config)

        width = config.hidden_size
        self.head = AttentionHead(width, width, 1)
        # Registered on the module but not applied anywhere in forward().
        self.dropout = nn.Dropout(0.3)
        self.regressor = nn.Linear(width, NUM_CLASSES)

    def forward(self, input_ids, attention_mask):
        """Return the regressor output for a batch; the mask is passed to the encoder only."""
        encoder_out = self.pre_model(input_ids=input_ids, attention_mask=attention_mask)
        token_states = encoder_out['last_hidden_state']
        pooled = self.head(token_states)
        return self.regressor(pooled)

In [15]:
#j_ds = TweetJigsawDataset(train)
#model = TweetJigsawModel().to(DEVICE)
#output = model(j_ds[:2]['input_ids'].to(DEVICE), j_ds[:2]['attention_mask'].to(DEVICE))

In [16]:
#j_ds[0:2]['target']

In [17]:
# ----------------------------------------------
# func: valid, predict
# ----------------------------------------------
def valid_mse(model, dataloader):
    """Return the mean squared error of `model` over the samples served by `dataloader`.

    The model is put in eval mode and run without gradients. Squared errors are summed
    per batch and divided by the size of `dataloader.dataset`.
    """
    model.eval()
    criterion = nn.MSELoss(reduction='sum')
    squared_error_total = 0

    with torch.no_grad():
        for batch in dataloader:
            prediction = model(batch['input_ids'].to(DEVICE),
                               batch['attention_mask'].to(DEVICE))
            expected = batch['target'].to(DEVICE)
            squared_error_total += criterion(prediction.flatten(), expected).item()

    return squared_error_total/(len(dataloader.dataset))


def valid_bce(model, dataloader):
    """Return the binary cross-entropy per sample of `model` over `dataloader`.

    The flattened model output is passed straight to nn.BCELoss. Losses are summed per
    batch and divided by the size of `dataloader.dataset`.
    """
    model.eval()
    criterion = nn.BCELoss(reduction='sum')
    loss_total = 0

    with torch.no_grad():
        for batch in dataloader:
            prediction = model(batch['input_ids'].to(DEVICE),
                               batch['attention_mask'].to(DEVICE))
            expected = batch['target'].to(DEVICE)
            loss_total += criterion(prediction.flatten(), expected).item()

    return loss_total/(len(dataloader.dataset))

def valid_bcelogit(model, dataloader):
    """Return the mean binary cross-entropy (with logits) per target element.

    The model is put in eval mode and run without gradients. Logits are reshaped to the
    target's shape, so both (batch, classes) targets and 1-D (batch,) targets paired
    with (batch, 1) logits are accepted, matching the flattening done by the other
    validation metrics. The summed loss is divided by the number of target elements
    actually scored, counted while iterating rather than read from the last batch.

    Raises ZeroDivisionError when `dataloader` yields no batches.
    """
    model.eval()
    criterion = nn.BCEWithLogitsLoss(reduction='sum')
    score_sum = 0
    element_count = 0

    with torch.no_grad():
        for batch_idx, data in enumerate(dataloader):
            input_ids = data['input_ids'].to(DEVICE)
            attention_mask = data['attention_mask'].to(DEVICE)
            target = data['target'].to(DEVICE)

            output = model(input_ids, attention_mask)
            # Align the logits with the target layout before computing the loss.
            output = output.reshape(target.shape)
            score_sum += criterion(output, target).item()
            element_count += target.numel()

    return score_sum/element_count

def predict(model, dataloader):
    """Run `model` over `dataloader` and return its outputs as one NumPy array.

    The result has one row per item of `dataloader.dataset` and NUM_CLASSES columns;
    rows are filled in the order the batches arrive. The model is put in eval mode and
    run without gradients.
    """
    model.eval()
    total_rows = len(dataloader.dataset)
    result = np.zeros((total_rows, NUM_CLASSES))
    cursor = 0

    with torch.no_grad():
        for batch in dataloader:
            output = model(batch['input_ids'].to(DEVICE),
                           batch['attention_mask'].to(DEVICE))
            stop = cursor + output.shape[0]
            result[cursor:stop, :] = output.to('cpu')
            cursor = stop

    return result


# ----------------------------------------------
# func: train
# ----------------------------------------------
def train_fn(
    model,
    save_path,
    train_loader,
    val_loader,
    optimizer,
    scheduler=None,
    num_epochs=NUM_EPOCHS
):
    """Train `model` with an MSE loss and keep the best-validating weights on disk.

    Parameters
    ----------
    model        : module called as model(input_ids, attention_mask).
    save_path    : file the state_dict is written to whenever validation improves.
    train_loader : yields dicts with 'input_ids', 'attention_mask' and 'target'.
    val_loader   : loader handed to valid_mse for every validation pass.
    optimizer    : stepped once per batch.
    scheduler    : optional; stepped once per batch, after the optimizer.
    num_epochs   : number of passes over `train_loader`.

    Validation runs every LOG_INTERVAL batches (never on the first batch) and, in
    addition, once at the end of each epoch unless the last batch was just validated.
    The end-of-epoch pass guarantees that a checkpoint is written even when an epoch
    has fewer batches than LOG_INTERVAL (for example in DEBUG runs).

    Returns the lowest validation MSE observed (np.inf if nothing was validated).
    """
    best_score = np.inf
    log_interval = LOG_INTERVAL
    criterion = nn.MSELoss()

    def _validate_and_checkpoint(epoch, step, train_loss, best):
        # Score the current weights; save them and return the new best when they improve.
        val_score = valid_mse(model, val_loader)
        print(f"Epoch {epoch+1}, Step {step}, train_loss: {train_loss:0.5f}, val_loss: {val_score:0.5f}")
        if val_score < best:
            print(f"Model Inproved: {best} ----> {val_score}")
            torch.save(model.state_dict(), save_path)
            return val_score
        return best

    start = time.time()

    for epoch in range(num_epochs):
        last_step = 0
        last_loss = None
        validated = False

        for batch_idx, data in enumerate(train_loader):
            input_ids = data['input_ids'].to(DEVICE)
            attention_mask = data['attention_mask'].to(DEVICE)
            target = data['target'].to(DEVICE)

            optimizer.zero_grad()
            # valid_mse() leaves the model in eval mode, so switch back on every step.
            model.train()

            output = model(input_ids, attention_mask)
            loss = criterion(output.flatten(), target)

            loss.backward()
            optimizer.step()

            if scheduler is not None:
                scheduler.step()

            last_step, last_loss = batch_idx + 1, loss.item()
            validated = batch_idx > 0 and batch_idx % log_interval == 0
            if validated:
                best_score = _validate_and_checkpoint(epoch, last_step, last_loss, best_score)

            del input_ids
            del attention_mask
            del target
            del output
            torch.cuda.empty_cache()

        # End-of-epoch validation (skipped for an empty loader or a just-validated batch).
        if last_step and not validated:
            best_score = _validate_and_checkpoint(epoch, last_step, last_loss, best_score)

    print(f"elasped time: {time.time() - start: 0.3}")

    return best_score


# ----------------------------------------------
# func: create optimizer
# ----------------------------------------------
def create_optimizer(model):
    """Build an AdamW optimizer that exempts bias and LayerNorm parameters from weight decay.

    Parameter names are dotted paths (e.g. '...LayerNorm.weight'), so a parameter is
    exempt when any `no_decay` marker occurs *inside* its name; an equality / membership
    test against the marker list would never match. Parameters are collected into (at
    most) two groups: weight_decay 0.01 and weight_decay 0.

    Returns the AdamW instance built from those groups with its default settings.
    """
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    decayed, exempt = [], []
    for name_, params_ in model.named_parameters():
        is_exempt = any(marker in name_ for marker in no_decay)
        (exempt if is_exempt else decayed).append(params_)

    optim_params = [
        {'params': decayed, 'weight_decay': 0.01},
        {'params': exempt, 'weight_decay': 0},
    ]
    # Do not hand the optimizer a group without parameters.
    optim_params = [group for group in optim_params if group['params']]

    return AdamW(optim_params)

In [18]:
# ----------------------------------------------
# func: Validation
# ----------------------------------------------
def calc_val_in_step(model, less_dataset, more_dataset):
    """Print how often `model` scores the 'more' item above its paired 'less' item.

    Both datasets are predicted in order (no shuffling, no dropped batch) and compared
    element-wise. Two lines are printed: the accuracy, and the raw
    "<correct> / <total>" counts. Nothing is returned.
    """
    predictions = []
    for dataset in (less_dataset, more_dataset):
        loader = DataLoader(dataset, batch_size=BATCH_SIZE,
                            drop_last=False, shuffle=False, num_workers=2)
        predictions.append(predict(model, loader))
    less_pred, more_pred = predictions

    n_correct = (less_pred < more_pred).sum()
    n_pairs = len(less_pred)
    acc = n_correct / n_pairs
    print(f"accuracy: {acc}")
    print(f"{n_correct} / {n_pairs}")
    #return less_pred, more_pred

In [19]:
# ----------------------------------------------
# Main Loop
# ----------------------------------------------
if TRAIN:
    val_scores = []
    # Validation pairs: build one frame for the 'less toxic' side and one for the
    # 'more toxic' side, score both, and measure how often more > less.
    less_df_src = val_data[['less_toxic']].rename({'less_toxic': 'comment_text'}, axis='columns')
    more_df_src = val_data[['more_toxic']].rename({'more_toxic': 'comment_text'}, axis='columns')
    less_dataset = TweetJigsawDataset(less_df_src, inference_only=True)
    more_dataset = TweetJigsawDataset(more_df_src, inference_only=True)

    for fold in range(FOLDS):
        print(f"*** FOLD {fold+1} / {FOLDS}***")

        # Same location as before, derived from the configured checkpoint directory.
        save_path = str(UPLOAD_DIR/f"model_{fold+1}.pth")

        train_set = TweetJigsawDataset(train[train['kfold'] != fold])
        valid_set = TweetJigsawDataset(train[train['kfold'] == fold])

        train_loader = DataLoader(train_set,
                                batch_size=BATCH_SIZE,
                                shuffle=True,
                                drop_last=True,
                                num_workers=2)
        valid_loader = DataLoader(valid_set,
                                batch_size=BATCH_SIZE,
                                shuffle=False,
                                drop_last=False,
                                num_workers=2)

        model = TweetJigsawModel().to(DEVICE)
        optimizer = AdamW(model.parameters(), lr=LEANING_RATE)
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_training_steps=NUM_EPOCHS*len(train_loader),
            num_warmup_steps=50
        )

        val_scores.append(
            train_fn(model, save_path, train_loader, valid_loader, optimizer, scheduler=scheduler)
        )

        # train_fn writes a checkpoint whenever the validation score improves, and the
        # in-memory model holds the last-step weights. Reload the saved weights so the
        # pairwise accuracy below describes the checkpoint that is used later. A finite
        # score means a checkpoint was written during this run.
        if np.isfinite(val_scores[-1]):
            model.load_state_dict(torch.load(save_path, map_location=DEVICE))
        calc_val_in_step(model, less_dataset, more_dataset)

        # Release this fold's model and optimizer state before the next fold starts.
        del model, optimizer, scheduler
        gc.collect()
        torch.cuda.empty_cache()

        print(val_scores)
        print("Mean:", np.array(val_scores).mean())

*** FOLD 1 / 5***


Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Some weights of the model checkpoint at unitary/multilingual-toxic-xlm-roberta were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at unitary/multilingual-toxic-xlm-roberta and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

Epoch 1, Step 101, train_loss: 0.01709, val_loss: 0.01575
Model Inproved: inf ----> 0.015745844069304724
Epoch 1, Step 201, train_loss: 0.00414, val_loss: 0.00505
Model Inproved: 0.015745844069304724 ----> 0.005046741718373663
Epoch 1, Step 301, train_loss: 0.00338, val_loss: 0.00279
Model Inproved: 0.005046741718373663 ----> 0.002787921366135983
Epoch 1, Step 401, train_loss: 0.00137, val_loss: 0.00166
Model Inproved: 0.002787921366135983 ----> 0.001659630286720752
Epoch 1, Step 501, train_loss: 0.00108, val_loss: 0.00125
Model Inproved: 0.001659630286720752 ----> 0.001249553542986285
Epoch 1, Step 601, train_loss: 0.00114, val_loss: 0.00099
Model Inproved: 0.001249553542986285 ----> 0.000988085376114672
Epoch 1, Step 701, train_loss: 0.00074, val_loss: 0.00086
Model Inproved: 0.000988085376114672 ----> 0.0008552674404260519
Epoch 1, Step 801, train_loss: 0.00071, val_loss: 0.00077
Model Inproved: 0.0008552674404260519 ----> 0.0007716154000633327
Epoch 1, Step 901, train_loss: 0.00068

Some weights of the model checkpoint at unitary/multilingual-toxic-xlm-roberta were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at unitary/multilingual-toxic-xlm-roberta and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

Epoch 1, Step 101, train_loss: 0.00376, val_loss: 0.00358
Model Inproved: inf ----> 0.0035820825292797527
Epoch 1, Step 201, train_loss: 0.00176, val_loss: 0.00184
Model Inproved: 0.0035820825292797527 ----> 0.001836736148446654
Epoch 1, Step 301, train_loss: 0.00094, val_loss: 0.00092
Model Inproved: 0.001836736148446654 ----> 0.0009230783168009508
Epoch 1, Step 401, train_loss: 0.00043, val_loss: 0.00066
Model Inproved: 0.0009230783168009508 ----> 0.000663347098658301
Epoch 1, Step 501, train_loss: 0.00082, val_loss: 0.00060
Model Inproved: 0.000663347098658301 ----> 0.0006037658615585482
Epoch 1, Step 601, train_loss: 0.00038, val_loss: 0.00055
Model Inproved: 0.0006037658615585482 ----> 0.0005485962506267786
Epoch 1, Step 701, train_loss: 0.00036, val_loss: 0.00041
Model Inproved: 0.0005485962506267786 ----> 0.00040991978042982457
Epoch 1, Step 801, train_loss: 0.00022, val_loss: 0.00035
Model Inproved: 0.00040991978042982457 ----> 0.00034683232986507665
Epoch 1, Step 901, train_lo

Some weights of the model checkpoint at unitary/multilingual-toxic-xlm-roberta were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at unitary/multilingual-toxic-xlm-roberta and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

Epoch 1, Step 101, train_loss: 0.00294, val_loss: 0.00211
Model Inproved: inf ----> 0.0021132013743848503
Epoch 1, Step 201, train_loss: 0.00071, val_loss: 0.00108
Model Inproved: 0.0021132013743848503 ----> 0.0010774535995939512
Epoch 1, Step 301, train_loss: 0.00053, val_loss: 0.00071
Model Inproved: 0.0010774535995939512 ----> 0.0007070938868899315
Epoch 1, Step 401, train_loss: 0.00036, val_loss: 0.00049
Model Inproved: 0.0007070938868899315 ----> 0.0004914198983294899
Epoch 1, Step 501, train_loss: 0.00074, val_loss: 0.00048
Model Inproved: 0.0004914198983294899 ----> 0.00048483140175855705
Epoch 1, Step 601, train_loss: 0.00044, val_loss: 0.00058
Epoch 1, Step 701, train_loss: 0.00039, val_loss: 0.00036
Model Inproved: 0.00048483140175855705 ----> 0.00035576519789974305
Epoch 1, Step 801, train_loss: 0.00056, val_loss: 0.00047
Epoch 1, Step 901, train_loss: 0.00018, val_loss: 0.00034
Model Inproved: 0.00035576519789974305 ----> 0.0003439142684754963
Epoch 1, Step 1001, train_loss

Some weights of the model checkpoint at unitary/multilingual-toxic-xlm-roberta were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at unitary/multilingual-toxic-xlm-roberta and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

Epoch 1, Step 101, train_loss: 0.00129, val_loss: 0.00337
Model Inproved: inf ----> 0.003372309498205953
Epoch 1, Step 201, train_loss: 0.00222, val_loss: 0.00145
Model Inproved: 0.003372309498205953 ----> 0.001448830841746779
Epoch 1, Step 301, train_loss: 0.00163, val_loss: 0.00110
Model Inproved: 0.001448830841746779 ----> 0.0010979645954048974
Epoch 1, Step 401, train_loss: 0.00065, val_loss: 0.00071
Model Inproved: 0.0010979645954048974 ----> 0.0007133790169080923
Epoch 1, Step 501, train_loss: 0.00035, val_loss: 0.00057
Model Inproved: 0.0007133790169080923 ----> 0.0005680551949889453
Epoch 1, Step 601, train_loss: 0.00022, val_loss: 0.00044
Model Inproved: 0.0005680551949889453 ----> 0.00043695815430614527
Epoch 1, Step 701, train_loss: 0.00038, val_loss: 0.00039
Model Inproved: 0.00043695815430614527 ----> 0.0003891122276802921
Epoch 1, Step 801, train_loss: 0.00040, val_loss: 0.00040
Epoch 1, Step 901, train_loss: 0.00025, val_loss: 0.00038
Model Inproved: 0.000389112227680292

Some weights of the model checkpoint at unitary/multilingual-toxic-xlm-roberta were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at unitary/multilingual-toxic-xlm-roberta and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

Epoch 1, Step 101, train_loss: 0.00578, val_loss: 0.00770
Model Inproved: inf ----> 0.007702389064838481
Epoch 1, Step 201, train_loss: 0.00266, val_loss: 0.00203
Model Inproved: 0.007702389064838481 ----> 0.002027705932656924
Epoch 1, Step 301, train_loss: 0.00065, val_loss: 0.00115
Model Inproved: 0.002027705932656924 ----> 0.0011518586214268425
Epoch 1, Step 401, train_loss: 0.00090, val_loss: 0.00081
Model Inproved: 0.0011518586214268425 ----> 0.0008139503291008866
Epoch 1, Step 501, train_loss: 0.00065, val_loss: 0.00061
Model Inproved: 0.0008139503291008866 ----> 0.0006120846104489012
Epoch 1, Step 601, train_loss: 0.00046, val_loss: 0.00053
Model Inproved: 0.0006120846104489012 ----> 0.0005273945401956493
Epoch 1, Step 701, train_loss: 0.00022, val_loss: 0.00047
Model Inproved: 0.0005273945401956493 ----> 0.0004731725924917098
Epoch 1, Step 801, train_loss: 0.00031, val_loss: 0.00039
Model Inproved: 0.0004731725924917098 ----> 0.0003917792720326438
Epoch 1, Step 901, train_loss:

# Predict or Load Valid data

In [23]:
model_path = UPLOAD_DIR
# Select the fold checkpoints by name instead of dropping the first directory entry:
# slicing with [1:] discards a real checkpoint whenever the directory holds no stray
# entry sorting ahead of the model files.
models = sorted(str(p) for p in model_path.glob('model_*.pth'))
print(models)

['/content/model/model_1.pth', '/content/model/model_2.pth', '/content/model/model_3.pth', '/content/model/model_4.pth', '/content/model/model_5.pth']


In [24]:
# Validation pairs: build one frame for the 'less toxic' side and one for the
# 'more toxic' side, so both can be scored and the rate of more > less measured.
val_less = val_data[['less_toxic']].rename(columns={'less_toxic': 'comment_text'})
val_more = val_data[['more_toxic']].rename(columns={'more_toxic': 'comment_text'})

In [25]:
def calc_val(model, model_path, less_dataset, more_dataset):
    """Score the validation pairs with every checkpoint and report the ensemble accuracy.

    Parameters
    ----------
    model        : module whose weights are overwritten with each checkpoint in turn.
    model_path   : sequence of checkpoint file paths (state_dicts).
    less_dataset : dataset of the 'less toxic' side of each pair.
    more_dataset : dataset of the 'more toxic' side of each pair, in the same order.

    The prediction buffers are sized from len(model_path) and NUM_CLASSES, so the
    ensemble mean covers exactly the checkpoints given and has one column per model
    output (no duplicated columns, no all-zero rows for missing checkpoints).

    Returns (less_mean, more_mean, val_scores): the per-sample ensemble means, each of
    shape (n_samples, NUM_CLASSES), and a frame with the worker id, both summed scores,
    their difference and whether the pair was ranked correctly.
    """
    n_models = len(model_path)
    less_pred = np.zeros((n_models, len(less_dataset), NUM_CLASSES))
    more_pred = np.zeros((n_models, len(more_dataset), NUM_CLASSES))

    less_loader = DataLoader(less_dataset, batch_size=BATCH_SIZE,
                             drop_last=False, shuffle=False, num_workers=2)
    more_loader = DataLoader(more_dataset, batch_size=BATCH_SIZE,
                             drop_last=False, shuffle=False, num_workers=2)

    model.to(DEVICE)
    for i, model_ in enumerate(model_path):
        print(f"model-{i}: start")

        # map_location lets checkpoints saved on a GPU be loaded on a CPU-only runtime.
        model.load_state_dict(torch.load(model_, map_location=DEVICE))

        less_pred[i, :] = predict(model, less_loader)
        more_pred[i, :] = predict(model, more_loader)
        print(f"model-{i}: complete")

    less_mean = less_pred.mean(axis=0)
    #less_mean = scaler_.transform(less_mean)
    more_mean = more_pred.mean(axis=0)
    #more_mean = scaler_.transform(more_mean)

    val_scores = pd.DataFrame({'worker': val_data['worker'].head(len(less_dataset)),
                               'less_score': less_mean.sum(axis=1),
                               'more_score': more_mean.sum(axis=1),})
    val_scores['score_diff'] = val_scores['more_score'] - val_scores['less_score']
    val_scores['correct_ans'] = val_scores['score_diff'] > 0

    acc = val_scores['correct_ans'].sum() / len(val_scores)

    print(f"accuracy: {acc}")
    print(f"{val_scores['correct_ans'].sum()} / {len(val_scores)}")
    return less_mean, more_mean, val_scores

In [32]:
if RUN_VALID:
  # Score the validation pairs with every saved checkpoint and persist the results.
  model = TweetJigsawModel()
  less_dataset = TweetJigsawDataset(val_less, inference_only=True)
  more_dataset = TweetJigsawDataset(val_more, inference_only=True)

  less_, more_, scores = calc_val(model, models, less_dataset, more_dataset)
  pd.DataFrame(less_).to_csv(OUT_DIR/'less_df.csv', index=False)
  pd.DataFrame(more_).to_csv(OUT_DIR/'more_df.csv', index=False)
  scores.to_csv(OUT_DIR/'out_score.csv', index=False)

else:
  # Reuse the results written by an earlier run, including the per-pair score table
  # so that `scores` is defined on both branches.
  less_df = pd.read_csv(OUT_DIR/'less_df.csv')
  more_df = pd.read_csv(OUT_DIR/'more_df.csv')
  scores = pd.read_csv(OUT_DIR/'out_score.csv')

# Only the last top-level expression of a cell is displayed; inside the `if` body the
# preview was silently discarded.
scores.head()

Some weights of the model checkpoint at unitary/multilingual-toxic-xlm-roberta were not used when initializing XLMRobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at unitary/multilingual-toxic-xlm-roberta and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

model-0: start
model-0: complete
model-1: start
model-1: complete
model-2: start
model-2: complete
model-3: start
model-3: complete
model-4: start
model-4: complete
accuracy: 0.7141955626411585
21503 / 30108
