In [1]:
import sys
if 'google.colab' in sys.modules:  # colab特有の処理_2回目以降
  # Google Driveのマウント
  from google.colab import drive
  drive.mount('/content/drive')

  # ライブラリのパス指定
  sys.path.append('/content/drive/MyDrive/Colab_Files/kaggle/commonlit/XX_modules')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# データセットをDriveから取得
!mkdir -p 'input'
!mkdir -p 'clrp-pre-trained'

!cp -r '/content/drive/MyDrive/Colab_Files/kaggle/commonlit/00_input/commonlitreadabilityprize/' '/content/input'
!cp -r '/content/drive/MyDrive/Colab_Files/kaggle/commonlit/97_pre_trained/clrp_pretrained_manish_epoch5/pre-trained-roberta/clrp_roberta_large/' '/content/clrp-pre-trained'

In [3]:
from pathlib import Path

# input
if 'kaggle_web_client' in sys.modules:  # kaggle環境
    DATA_DIR = Path('../input/commonlitreadabilityprize/')

elif 'google.colab' in sys.modules: # Colab環境
    DATA_DIR = Path('/content/input/commonlitreadabilityprize')

else:
    DATA_DIR = Path('../00_input/commonlitreadabilityprize/')

In [4]:
from pathlib import Path

# tokenizer
if 'kaggle_web_client' in sys.modules:  # kaggle環境
    TOKENIZER_DIR = '../input/roberta-transformers-pytorch/roberta-large'
elif 'google.colab' in sys.modules: # Colab環境
    TOKENIZER_DIR = '/content/clrp-pre-trained/clrp_roberta_large' # 仮で、毎回DLする想定のモデル名を指定。あとで変更予定。
else:
    TOKENIZER_DIR = 'roberta-large'

In [5]:
from pathlib import Path

# pre-trained model
if 'kaggle_web_client' in sys.modules:  # kaggle環境
    PRE_TRAINED_MODEL_DIR = '../input/roberta-transformers-pytorch/roberta-large'
elif 'google.colab' in sys.modules: # Colab環境
    PRE_TRAINED_MODEL_DIR = '/content/clrp-pre-trained/clrp_roberta_large' # 仮で、毎回DLする想定のモデル名を指定。あとで変更予定。
else:
    PRE_TRAINED_MODEL_DIR = 'roberta-large'

In [6]:
UPLOAD_DIR = Path('/content/model')
EX_NO = '051-train-03'  # 実験番号などを入れる、folderのpathにする
USERID = 'calpis10000'

In [7]:
import subprocess
import shlex

def gpuinfo():
    """
    Returns size of total GPU RAM and used GPU RAM.

    Parameters
    ----------
    None

    Returns
    -------
    info : dict
        Total GPU RAM in integer for key 'total_MiB'.
        Used GPU RAM in integer for key 'used_MiB'.
    """

    command = 'nvidia-smi -q -d MEMORY | sed -n "/FB Memory Usage/,/Free/p" | sed -e "1d" -e "4d" -e "s/ MiB//g" | cut -d ":" -f 2 | cut -c2-'
    commands = [shlex.split(part) for part in command.split(' | ')]
    for i, cmd in enumerate(commands):
        if i==0:
            res = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        else:
            res = subprocess.Popen(cmd, stdin=res.stdout, stdout=subprocess.PIPE)
    total, used = map(int, res.communicate()[0].decode('utf-8').strip().split('\n'))
    info = {'total_MiB':total, 'used_MiB':used}
    return info


# Overview
This nb is based on copy from https://www.kaggle.com/andretugan/lightweight-roberta-solution-in-pytorch .

Acknowledgments(from base nb): 
some ideas were taken from kernels by [Torch](https://www.kaggle.com/rhtsingh) and [Maunish](https://www.kaggle.com/maunish).

In [8]:
import os
import math
import random
import time

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AdamW # optimizer
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoConfig
from transformers import get_cosine_schedule_with_warmup # scheduler
from pytorch_memlab import profile
import pytorch_memlab
from pytorch_memlab import MemReporter

from sklearn.model_selection import KFold, StratifiedKFold

import gc
gc.enable()

In [9]:
NUM_FOLDS = 5 # K Fold
NUM_EPOCHS = 8 # Epochs
BATCH_SIZE = 8 # Batch Size
MAX_LEN = 300 # ベクトル長
EVAL_SCHEDULE = [(0.55, 64), (-1., 32)] # schedulerの何らかの設定？
ROBERTA_PATH = PRE_TRAINED_MODEL_DIR # roberta pre-trainedモデル(モデルとして指定)
TOKENIZER_PATH = TOKENIZER_DIR # roberta pre-trainedモデル(Tokenizerとして指定)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu" # cudaがなければcpuを使えばいいじゃない

In [10]:
def set_random_seed(random_seed):
    random.seed(random_seed)
    np.random.seed(random_seed)
    os.environ["PYTHONHASHSEED"] = str(random_seed)

    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)

    torch.backends.cudnn.deterministic = True# cudnnによる最適化で結果が変わらないためのおまじない 

In [11]:
# train, testを読む
train_df = pd.read_csv(DATA_DIR/"train.csv")

# Remove incomplete entries if any.
train_df.drop(train_df[(train_df.target == 0) & (train_df.standard_error == 0)].index,
              inplace=True)
train_df.reset_index(drop=True, inplace=True)

test_df = pd.read_csv(DATA_DIR/"test.csv")
submission_df = pd.read_csv(DATA_DIR/"sample_submission.csv")

In [12]:
train_df.head()


Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [13]:
# tokenizerを指定
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

# Dataset

In [14]:
# Dataset用のClass。おそらく、trainとtestでインスタンスを生成し、DataFrameと同じように扱えるような思想。
class LitDataset(Dataset):
    def __init__(self, df, inference_only=False):
        super().__init__()

        self.df = df        
        self.inference_only = inference_only # Testデータ用フラグ
        self.text = df.excerpt.tolist() # 分析対象カラムをlistにする。(分かち書きではなく、Seriesをlistへ変換するような処理)
        #self.text = [text.replace("\n", " ") for text in self.text] # 単語単位で分かち書きする場合
        
        if not self.inference_only:
            self.target = torch.tensor(df.target.values, dtype=torch.float32) # trainのみ、targetをtensorに変換
            self.standard_error = torch.tensor(df.standard_error.values, dtype=torch.float32) 

        self.encoded = tokenizer.batch_encode_plus( # textをtokenize
            self.text,
            padding = 'max_length',            
            max_length = MAX_LEN,
            truncation = True, # 最大長を超える文字は切り捨て
            return_attention_mask=True
        )        
 

    def __len__(self):
        return len(self.df)

    
    def __getitem__(self, index): # 変換結果を返す
        input_ids = torch.tensor(self.encoded['input_ids'][index])
        attention_mask = torch.tensor(self.encoded['attention_mask'][index])
        
        if self.inference_only:
            return (input_ids, attention_mask)            
        else:
            target = self.target[index]
            standard_error = self.standard_error[index]
            return (input_ids, attention_mask, target, standard_error)

# Model
The model is inspired by the one from [Maunish](https://www.kaggle.com/maunish/clrp-roberta-svm).

In [15]:
class LitModel(nn.Module):
    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained(ROBERTA_PATH) # pretrainedからconfigを読み込み
        config.update({"output_hidden_states":True, # config更新: embedding層を抽出
                       "hidden_dropout_prob": 0.0, # config更新: dropoutしない
                       "layer_norm_eps": 1e-7}) # config更新: layer normalizationのepsilon                      
        
        self.roberta = AutoModel.from_pretrained(ROBERTA_PATH, config=config) # cpuで処理する
            
        self.attention = nn.Sequential(# attentionレイヤー            
            nn.Linear(config.hidden_size, 512),      
            nn.Tanh(),                       
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )
        
        self.layer_norm = nn.LayerNorm(config.hidden_size) # layer_normレイヤー
        
        self.regressor = nn.Sequential( # 出力レイヤー                    
            nn.Linear(config.hidden_size, 2)                        
        )

    def forward(self, input_ids, attention_mask):
        roberta_output = self.roberta(input_ids=input_ids, # robertaに入力データを流し、出力としてrobertaモデル(layerの複合体)を得る
                                      attention_mask=attention_mask)     

        last_hidden_state = roberta_output.hidden_states[-1] # robertaモデルの最後のlayerを得る
        weights = self.attention(last_hidden_state) # robertaの最後のlayerをattentionへ入力し、出力として重みを得る                
        context_vector = torch.sum(weights * last_hidden_state, dim=1) # 重み×最後の層を足し合わせて文書ベクトルとする。
        norm_embeddings = self.layer_norm(context_vector)
        return self.regressor(norm_embeddings) # 文書ベクトルを線形層に入力し、targetを出力する

        # https://www.kaggle.com/rhtsingh/utilizing-transformer-representations-efficiently
        #last_hidden_state = roberta_output[0]
        #input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        #sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        #sum_mask = input_mask_expanded.sum(1)
        #sum_mask = torch.clamp(sum_mask, min=1e-9)
        #mean_embeddings = sum_embeddings / sum_mask

        
        # Now we reduce the context vector to the prediction score.
        #return self.regressor(mean_embeddings) # 文書ベクトルを線形層に入力し、targetを出力する

In [16]:
# 評価指標(MSE)の計算。最終的に、ルートしてRMSEにすると思われる。
def eval_mse(model, data_loader):
    """Evaluates the mean squared error of the |model| on |data_loader|"""
    model.eval() # evalモードを選択。Batch Normとかdropoutをしなくなる           
    mse_mean_sum = 0
    mse_std_sum = 0

    with torch.no_grad(): # 勾配の計算をしないBlock
        for batch_num, (input_ids, attention_mask, target, standard_error) in enumerate(data_loader): # data_loaderからinput, attentin_mask, targetをbatchごとに取り出す
            input_ids = input_ids.to(DEVICE)   
            attention_mask = attention_mask.to(DEVICE)   
            target = target.to(DEVICE)      
            standard_error = standard_error.to(DEVICE) 
            
            output = model(input_ids, attention_mask) # 取得した値をモデルへ入力し、出力として予測値を得る。

            mse_mean_sum += nn.MSELoss(reduction="sum")(output[:,0].flatten(), target).item() # 誤差の合計を得る(Batchごとに計算した誤差を足し上げる)
            mse_std_sum += nn.MSELoss(reduction="sum")(output[:,1].flatten(), target).item() # 誤差の合計を得る(Batchごとに計算した誤差を足し上げる)

    del input_ids
    del attention_mask
    del target

    mse_mean_result = mse_mean_sum / len(data_loader.dataset)
    mse_std_result = mse_std_sum / len(data_loader.dataset)
  
    return mse_mean_result, mse_std_result # 誤差の合計をdataset長で除し、mseを取得＆返す

In [17]:
# 推論結果を返す
def predict(model, data_loader):
    """Returns an np.array with predictions of the |model| on |data_loader|"""
    model.eval() # evalモード(dropout, batch_normしない)

    result = np.zeros(len(data_loader.dataset)) # 結果をdataset長のzero配列として用意
    index = 0
    
    with torch.no_grad(): # 勾配の計算をしないblock(inputすると、現状の重みによる推論結果を返す)
        for batch_num, (input_ids, attention_mask) in enumerate(data_loader): # data_loaderからbatchごとにinputを得る
            input_ids = input_ids.to(DEVICE)
            attention_mask = attention_mask.to(DEVICE)
                        
            output = model(input_ids, attention_mask) # modelにinputを入力し、予測結果を得る。

            result[index : index + output[:,0].shape[0]] = output[:,0].flatten().to("cpu") # result[index ~ predの長さ]へ、予測結果を格納
            index += pred.shape[0] # indexを更新

    return result # 全batchで推論が終わったら、結果を返す

In [18]:
# 学習
def train(model, # モデル
          model_path, # モデルのアウトプット先
          train_loader, # train-setのdata_loader
          val_loader, # valid-setのdata_loader
          optimizer, # optimizer
          scheduler=None, # scheduler, デフォルトはNone
          num_epochs=NUM_EPOCHS # epoch数、notebook冒頭で指定した値
         ):    
    
    best_val_rmse = None
    best_epoch = 0
    step = 0
    last_eval_step = 0
    eval_period = EVAL_SCHEDULE[0][1] # eval期間(って何？) 冒頭で決めたEVAL_SCHEDULEの最初のtupleの[1]を取得

    start = time.time() # 時間計測用

    for epoch in range(num_epochs): # 指定したEpoch数だけ繰り返し
        val_rmse = None         

        for batch_num, (input_ids, attention_mask, target, standard_error) in enumerate(train_loader): # train_loaderからinput, targetを取得
            input_ids = input_ids.to(DEVICE) # inputをDEVICEへ突っ込む
            attention_mask = attention_mask.to(DEVICE)       
            target = target.to(DEVICE)
            standard_error = standard_error.to(DEVICE)  

            optimizer.zero_grad() # 勾配を初期化            
            model.train() # 学習モード開始

            output = model(input_ids, attention_mask) # input,attention_maskを入力し、予測結果を得る
            
            # by Yirun Zhang: https://www.kaggle.com/c/commonlitreadabilityprize/discussion/239421
            #p = torch.distributions.Normal(output[:,0], torch.sqrt(output[:,1]**2))
            #q = torch.distributions.Normal(target, standard_error)
            #kl_vector = torch.distributions.kl_divergence(p, q)
            #loss = kl_vector.mean()

            # by cccntu: https://www.kaggle.com/c/commonlitreadabilityprize/discussion/239421
            crit = torch.nn.GaussianNLLLoss()
            logits = output[:,0] # num_labels = 2
            standard_error = output[:,1]
            loss = crit(input=logits, target=target, var=standard_error ** 2) # var needs to be positive

            loss.backward() # 誤差逆伝播法により勾配を得る
            optimizer.step() # 重みを更新する

            if scheduler:
                scheduler.step() # schedulerが与えられた場合は、schedulerの学習率更新
            
            if step >= last_eval_step + eval_period: # batchを回すごとにstepを増やしていって、「前回evalしたstep + eval_period(16)」を超えたら実行。
                # Evaluate the model on val_loader.
                elapsed_seconds = time.time() - start # 経過時間
                num_steps = step - last_eval_step # 経過ステップ数
                print(f"\n{num_steps} steps took {elapsed_seconds:0.3} seconds")
                last_eval_step = step # 前回stepの更新
                
                # valid-setによるrmse計算
                train_mean_mse = nn.MSELoss(reduction="mean")(output[:,0].flatten(), target) 
                train_std_mse = nn.MSELoss(reduction="mean")(torch.sqrt(output[:,1]**2).flatten(), standard_error) 

                train_mean_rmse = math.sqrt(train_mean_mse)
                train_std_rmse = math.sqrt(train_std_mse)

                val_mean_mse, val_std_mse = eval_mse(model, val_loader)
                val_mean_rmse = math.sqrt(val_mean_mse)                            
                val_std_rmse = math.sqrt(val_std_mse)                            

                print(f"Epoch: {epoch} batch_num: {batch_num}")
                print(f"train_rmse_target: {train_mean_rmse:0.4}",
                      f"train_rmse_stderror: {train_std_rmse:0.4}",
                      f"train_kl_div: {loss:0.4}",
                      )
                print(f"val_rmse_target: {val_mean_rmse:0.4}",
                      f"val_rmse_stderror: {val_std_rmse:0.4}"
                      )

                for rmse, period in EVAL_SCHEDULE: # eval_periodをvalid-rmseで切り替える処理
                    if val_mean_rmse >= rmse: # valid rmseをEVAL_SCHEDULEと比較し、0項 > valid rmseとなるまで回す : EVAL_SCHEDULE = [(0.50, 16), (0.49, 8), (0.48, 4), (0.47, 2), (-1., 1)]
                        eval_period = period # eval_periodを更新
                        break                               

                if not best_val_rmse or val_mean_rmse < best_val_rmse: # 初回(best_val_rmse==None), またはbest_val_rmseを更新したらモデルを保存する
                    best_val_rmse = val_mean_rmse
                    best_epoch = epoch
                    torch.save(model.state_dict(), model_path) # 最高の自分を保存
                    print(f"New best_val_rmse: {best_val_rmse:0.4}")
                else:       
                    print(f"Still best_val_rmse: {best_val_rmse:0.4}", # 更新されない場合は、元のスコアを表示
                          f"(from epoch {best_epoch})")      
                                                  
                start = time.time()
            
            # batchごとにメモリ解放
            del input_ids
            del attention_mask
            del target
            torch.cuda.empty_cache()                                            
            step += 1
    
    return best_val_rmse

In [19]:
# optimizerの作成
def create_optimizer(model):
    named_parameters = list(model.named_parameters()) # モデルパラメータの取得
    
    roberta_parameters = list(model.roberta.named_parameters())[:-2] # パラメータをroberta用、attention用、regressor用に格納。(直接引っ張ってくる形式に変更)
    attention_parameters = list(model.attention.named_parameters())
    regressor_parameters = list(model.regressor.named_parameters())
    norm_parameters = list(model.layer_norm.named_parameters())

    attention_group = [params for (name, params) in attention_parameters] # attention用パラメータをリストとして取得
    regressor_group = [params for (name, params) in regressor_parameters] # reg用パラメータをリストとして取得
    norm_group = [params for (name, params) in norm_parameters] # reg用パラメータをリストとして取得

    parameters = []
    parameters.append({"params": attention_group}) # パラメータをリストに辞書として格納していく
    parameters.append({"params": regressor_group})
    parameters.append({"params": norm_group})

    for layer_num, (name, params) in enumerate(roberta_parameters): # レイヤーごとにname, paramsを取得していろんな処理
        weight_decay = 0.0 if "bias" in name else 0.01

        lr = 1e-5

        if layer_num >= 69:        
            lr = 2e-5

        if layer_num >= 133:
            lr = 5e-5

        parameters.append({"params": params,
                           "weight_decay": weight_decay,
                           "lr": lr})

    return AdamW(parameters) # 最終的に、AdamWにパラメータを入力する。


In [20]:
# https://www.kaggle.com/abhishek/step-1-create-folds
def create_folds(data, num_splits, SEED, return_df=False):
    # we create a new column called kfold and fill it with -1
    data["kfold"] = -1
    
    # the next step is to randomize the rows of the data
    data = data.sample(frac=1).reset_index(drop=True)

    # calculate number of bins by Sturge's rule
    # I take the floor of the value, you can also
    # just round it
    num_bins = int(np.floor(1 + np.log2(len(data))))
    
    # bin targets
    data.loc[:, "bins_tg"] = pd.cut(
        data["target"], bins=num_bins, labels=False
    ).map(lambda x: str(x))

    # bin standard_error
    data.loc[:, "bins_std"] = pd.cut(
        data["standard_error"], bins=num_bins, labels=False
    )

    # bins
    data.loc[:, "bins"] = data['bins_tg'].map(lambda x: str(x)) + data['bins_std'].map(lambda x: str(x))

    # initiate the kfold class from model_selection module
    kf = StratifiedKFold(n_splits=num_splits, random_state=SEED, shuffle=True)

    # note that, instead of targets, we use bins!
    if return_df:
      for f, (t_, v_) in enumerate(kf.split(X=data, y=data.bins.values)):
        data.loc[v_, 'kfold'] = f
      return data
    else:
      return kf.split(X=data, y=data.bins.values)

In [21]:
def train_and_save_model(train_indices, val_indices, model_path):
    train_dataset = LitDataset(train_df.loc[train_indices]) # train, validのDataset
    val_dataset = LitDataset(train_df.loc[val_indices])
        
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              drop_last=True, shuffle=True, num_workers=2) # train, validのDataLoader
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE,
                            drop_last=False, shuffle=False, num_workers=2)    

    model = LitModel().to(DEVICE) # modelをDEVICEへぶち込む
    optimizer = create_optimizer(model) # optimizerをモデルから作成
    scheduler = get_cosine_schedule_with_warmup( # schedulerを作成
        optimizer,
        num_training_steps=NUM_EPOCHS * len(train_loader),
        num_warmup_steps=50)    
    rmse = train(model, model_path, train_loader, val_loader, optimizer, scheduler=scheduler)

    del train_dataset
    del val_dataset
    del train_loader
    del val_loader
    del model
    del optimizer
    del scheduler
    gc.collect() 
    torch.cuda.empty_cache()
    return rmse

In [22]:
# 実行処理。 KFold & 学習
SEED = 1000
list_val_rmse = []

set_random_seed(SEED)
kfold = create_folds(train_df, NUM_FOLDS, SEED=SEED, return_df=False) # binsで切る場合

for fold, (train_indices, val_indices) in enumerate(kfold):    
    print(f"\nFold {fold + 1}/{NUM_FOLDS}")
    print(gpuinfo())
    model_path = f"model_{fold + 1}.pth" # model_fold数_.pth
    set_random_seed(SEED + fold) # SEEDはfold別に変わるようにする
    list_val_rmse.append(train_and_save_model(train_indices, val_indices, model_path))

    print("\nPerformance estimates:")
    print(list_val_rmse)
    print("Mean:", np.array(list_val_rmse).mean())
    print(gpuinfo())




Fold 1/5
{'total_MiB': 16280, 'used_MiB': 2}


Some weights of the model checkpoint at /content/clrp-pre-trained/clrp_roberta_large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /content/clrp-pre-trained/clrp_roberta_large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN thi


64 steps took 72.8 seconds
Epoch: 0 batch_num: 64
train_rmse_target: 1.159 train_rmse_stderror: 0.0 train_kl_div: 0.8463
val_rmse_target: 1.211 val_rmse_stderror: 3.217
New best_val_rmse: 1.211

64 steps took 71.8 seconds
Epoch: 0 batch_num: 128
train_rmse_target: 0.7985 train_rmse_stderror: 0.0 train_kl_div: 0.4679
val_rmse_target: 1.116 val_rmse_stderror: 2.59
New best_val_rmse: 1.116

64 steps took 71.8 seconds
Epoch: 0 batch_num: 192
train_rmse_target: 0.7539 train_rmse_stderror: 0.0 train_kl_div: 0.2347
val_rmse_target: 1.036 val_rmse_stderror: 2.303
New best_val_rmse: 1.036

64 steps took 71.8 seconds
Epoch: 0 batch_num: 256
train_rmse_target: 0.8315 train_rmse_stderror: 0.0 train_kl_div: 0.2171
val_rmse_target: 0.8661 val_rmse_stderror: 2.222
New best_val_rmse: 0.8661

64 steps took 72.0 seconds
Epoch: 1 batch_num: 37
train_rmse_target: 0.7237 train_rmse_stderror: 0.0 train_kl_div: 0.2056
val_rmse_target: 0.7146 val_rmse_stderror: 2.167
New best_val_rmse: 0.7146

64 steps took 

Some weights of the model checkpoint at /content/clrp-pre-trained/clrp_roberta_large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /content/clrp-pre-trained/clrp_roberta_large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN thi


64 steps took 72.5 seconds
Epoch: 0 batch_num: 64
train_rmse_target: 1.009 train_rmse_stderror: 2.21 train_kl_div: 0.5084
val_rmse_target: 0.9098 val_rmse_stderror: 0.9929
New best_val_rmse: 0.9098

64 steps took 71.7 seconds
Epoch: 0 batch_num: 128
train_rmse_target: 0.7558 train_rmse_stderror: 2.343 train_kl_div: 0.3723
val_rmse_target: 0.6756 val_rmse_stderror: 0.956
New best_val_rmse: 0.6756

64 steps took 71.6 seconds
Epoch: 0 batch_num: 192
train_rmse_target: 0.3471 train_rmse_stderror: 1.652 train_kl_div: -0.1112
val_rmse_target: 0.6438 val_rmse_stderror: 0.9626
New best_val_rmse: 0.6438

64 steps took 71.7 seconds
Epoch: 0 batch_num: 256
train_rmse_target: 1.405 train_rmse_stderror: 4.461 train_kl_div: 0.9919
val_rmse_target: 1.201 val_rmse_stderror: 1.645
Still best_val_rmse: 0.6438 (from epoch 0)

64 steps took 71.9 seconds
Epoch: 1 batch_num: 37
train_rmse_target: 1.885 train_rmse_stderror: 5.953 train_kl_div: 1.291
val_rmse_target: 1.611 val_rmse_stderror: 2.32
Still best_

Some weights of the model checkpoint at /content/clrp-pre-trained/clrp_roberta_large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /content/clrp-pre-trained/clrp_roberta_large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN thi


64 steps took 72.5 seconds
Epoch: 0 batch_num: 64
train_rmse_target: 1.211 train_rmse_stderror: 0.0 train_kl_div: 0.6875
val_rmse_target: 1.007 val_rmse_stderror: 2.425
New best_val_rmse: 1.007

64 steps took 71.6 seconds
Epoch: 0 batch_num: 128
train_rmse_target: 0.6009 train_rmse_stderror: 0.0 train_kl_div: 0.02684
val_rmse_target: 1.036 val_rmse_stderror: 1.928
Still best_val_rmse: 1.007 (from epoch 0)

64 steps took 72.1 seconds
Epoch: 0 batch_num: 192
train_rmse_target: 0.5558 train_rmse_stderror: 0.0 train_kl_div: -0.04053
val_rmse_target: 0.7149 val_rmse_stderror: 1.907
New best_val_rmse: 0.7149

64 steps took 71.7 seconds
Epoch: 0 batch_num: 256
train_rmse_target: 0.5628 train_rmse_stderror: 0.0 train_kl_div: -0.02533
val_rmse_target: 0.6877 val_rmse_stderror: 1.997
New best_val_rmse: 0.6877

64 steps took 71.9 seconds
Epoch: 1 batch_num: 37
train_rmse_target: 0.5272 train_rmse_stderror: 0.0 train_kl_div: -0.1555
val_rmse_target: 0.6016 val_rmse_stderror: 1.87
New best_val_rms

Some weights of the model checkpoint at /content/clrp-pre-trained/clrp_roberta_large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /content/clrp-pre-trained/clrp_roberta_large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN thi


64 steps took 72.4 seconds
Epoch: 0 batch_num: 64
train_rmse_target: 0.9348 train_rmse_stderror: 0.0 train_kl_div: 0.4592
val_rmse_target: 0.8217 val_rmse_stderror: 2.053
New best_val_rmse: 0.8217

64 steps took 71.6 seconds
Epoch: 0 batch_num: 128
train_rmse_target: 0.7085 train_rmse_stderror: 0.0 train_kl_div: 0.4601
val_rmse_target: 0.9222 val_rmse_stderror: 2.574
Still best_val_rmse: 0.8217 (from epoch 0)

64 steps took 72.2 seconds
Epoch: 0 batch_num: 192
train_rmse_target: 0.6004 train_rmse_stderror: 0.0 train_kl_div: 0.1748
val_rmse_target: 0.9558 val_rmse_stderror: 2.265
Still best_val_rmse: 0.8217 (from epoch 0)

64 steps took 71.5 seconds
Epoch: 0 batch_num: 256
train_rmse_target: 0.9017 train_rmse_stderror: 0.0 train_kl_div: 0.4631
val_rmse_target: 0.7249 val_rmse_stderror: 1.992
New best_val_rmse: 0.7249

64 steps took 71.9 seconds
Epoch: 1 batch_num: 37
train_rmse_target: 0.9268 train_rmse_stderror: 0.0 train_kl_div: 0.5096
val_rmse_target: 0.8051 val_rmse_stderror: 2.507

Some weights of the model checkpoint at /content/clrp-pre-trained/clrp_roberta_large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /content/clrp-pre-trained/clrp_roberta_large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN thi


64 steps took 72.4 seconds
Epoch: 0 batch_num: 64
train_rmse_target: 0.592 train_rmse_stderror: 0.0 train_kl_div: -0.004132
val_rmse_target: 0.8503 val_rmse_stderror: 2.084
New best_val_rmse: 0.8503

64 steps took 71.9 seconds
Epoch: 0 batch_num: 128
train_rmse_target: 0.7921 train_rmse_stderror: 0.0 train_kl_div: 0.5019
val_rmse_target: 0.9865 val_rmse_stderror: 2.557
Still best_val_rmse: 0.8503 (from epoch 0)

64 steps took 71.5 seconds
Epoch: 0 batch_num: 192
train_rmse_target: 1.038 train_rmse_stderror: 0.0 train_kl_div: 0.5391
val_rmse_target: 1.01 val_rmse_stderror: 2.261
Still best_val_rmse: 0.8503 (from epoch 0)

64 steps took 71.5 seconds
Epoch: 0 batch_num: 256
train_rmse_target: 1.112 train_rmse_stderror: 0.0 train_kl_div: 0.6096
val_rmse_target: 1.007 val_rmse_stderror: 2.27
Still best_val_rmse: 0.8503 (from epoch 0)

64 steps took 71.8 seconds
Epoch: 1 batch_num: 37
train_rmse_target: 1.125 train_rmse_stderror: 0.0 train_kl_div: 0.6189
val_rmse_target: 1.005 val_rmse_stde

In [23]:
print(list_val_rmse)

[0.4806371926256924, 0.6437857646853802, 0.47795954871135965, 0.4817149298321294, 0.5946783716826446]


In [24]:
#rep = MemReporter(model)
#rep.report()

In [25]:
#rep = MemReporter(model.roberta)
#rep.report()

In [26]:
#gpuinfo()

In [27]:
#del model
#del optimizer 
#del train_loader
#del val_loader
#del scheduler 
#del list_val_rmse
#del train_indices
#del val_indices
#del tokenizer
#torch.cuda.empty_cache()
#gpuinfo()

# upload models

In [28]:
%cd
!mkdir .kaggle
!mkdir /content/model
!cp /content/drive/MyDrive/Colab_Files/kaggle-api/kaggle.json .kaggle/

!cp -r /content/model_1.pth /content/model/model_1.pth
!cp -r /content/model_2.pth /content/model/model_2.pth
!cp -r /content/model_3.pth /content/model/model_3.pth
!cp -r /content/model_4.pth /content/model/model_4.pth
!cp -r /content/model_5.pth /content/model/model_5.pth

/root


In [29]:
def dataset_upload():
    import json
    from kaggle.api.kaggle_api_extended import KaggleApi

    id = f'{USERID}/{EX_NO}'

    dataset_metadata = {}
    dataset_metadata['id'] = id
    dataset_metadata['licenses'] = [{'name': 'CC0-1.0'}]
    dataset_metadata['title'] = f'{EX_NO}'

    with open(UPLOAD_DIR / 'dataset-metadata.json', 'w') as f:
        json.dump(dataset_metadata, f, indent=4)

    api = KaggleApi()
    api.authenticate()

    # データセットがない場合
    if f'{USERID}/{EX_NO}' not in [str(d) for d in api.dataset_list(user=USERID, search=f'"{EX_NO}"')]:
        api.dataset_create_new(folder=UPLOAD_DIR,
                               convert_to_csv=False,
                               dir_mode='skip')
    # データセットがある場合
    else:
        api.dataset_create_version(folder=UPLOAD_DIR,
                                   version_notes='update',
                                   convert_to_csv=False,
                                   delete_old_versions=True,
                                   dir_mode='skip')
dataset_upload()



  1%|          | 8.87M/1.33G [00:00<00:21, 66.1MB/s]

Starting upload for file model_2.pth


100%|██████████| 1.33G/1.33G [00:20<00:00, 68.5MB/s]
  1%|          | 10.9M/1.33G [00:00<00:12, 112MB/s]

Upload successful: model_2.pth (1GB)
Starting upload for file model_5.pth


100%|██████████| 1.33G/1.33G [00:12<00:00, 113MB/s]
  0%|          | 0.00/1.33G [00:00<?, ?B/s]

Upload successful: model_5.pth (1GB)
Starting upload for file model_4.pth


100%|██████████| 1.33G/1.33G [00:21<00:00, 65.2MB/s]
  1%|          | 6.87M/1.33G [00:00<00:23, 61.5MB/s]

Upload successful: model_4.pth (1GB)
Starting upload for file model_3.pth


100%|██████████| 1.33G/1.33G [00:15<00:00, 92.6MB/s]
  1%|          | 6.87M/1.33G [00:00<00:22, 61.7MB/s]

Upload successful: model_3.pth (1GB)
Starting upload for file model_1.pth


100%|██████████| 1.33G/1.33G [00:20<00:00, 70.6MB/s]


Upload successful: model_1.pth (1GB)
