In [1]:
import os
import gc
import sys
import cv2
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold,StratifiedKFold

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Sampler, SequentialSampler

import multiprocessing
import more_itertools

from transformers import (AutoModel, AutoConfig, AutoTokenizer, AutoModelForSequenceClassification)

from colorama import Fore, Back, Style
# Short aliases for colorama's foreground colours and the style-reset code.
y_, r_, g_ = Fore.YELLOW, Fore.RED, Fore.GREEN
b_, m_, c_ = Fore.BLUE, Fore.MAGENTA, Fore.CYAN
sr_ = Style.RESET_ALL

In [2]:
from pathlib import Path

# Root of the attached input folder; its 'scripts' sub-folder is resolved below.
in_folder_path = Path('../input') / 'final-clrp-roberta-large-finetuned'
scripts_dir = in_folder_path.joinpath('scripts')

In [3]:
# Run the helper scripts shipped with the dataset directly in the notebook's
# global namespace. Later cells use the names `Config`, `CLRPDataset` and
# `CLRPModel`, which are presumably defined by these scripts.
# NOTE(review): exec() runs arbitrary code read from disk -- only point
# `scripts_dir` at a trusted folder.
os.chdir(scripts_dir)
try:
    exec(Path("config.py").read_text())
    exec(Path("dataset.py").read_text())
    exec(Path("model.py").read_text())
finally:
    # Always return to the working directory, even when a script raises,
    # so later relative paths and the submission file are not affected.
    os.chdir('/kaggle/working')

In [4]:
# Number of checkpoints to ensemble, and where they are stored.
n_models = 5
models_folder_path = in_folder_path / 'models'

# Test passages to score and the tokenizer used to encode them.
test_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")
tokenizer = AutoTokenizer.from_pretrained('../input/huggingface-roberta/roberta-large')

# Filled by the inference loop: one list of predictions per model.
models_preds = list()

In [5]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU and all CUDA devices), and configures cuDNN for
    deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # The variable Python reads is PYTHONHASHSEED. Setting it here only
    # affects interpreters started afterwards (e.g. subprocesses); the hash
    # seed of the running interpreter is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; safe without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick algorithms per run, which
    # can change results between runs; it must be off for reproducibility.
    torch.backends.cudnn.benchmark = False

In [6]:
# Run each saved checkpoint over the test set and append one list of
# predictions per model to `models_preds`.
for model_num in range(n_models):
    seed_everything(seed=Config.seed + model_num)
    print(f'Inference#{model_num+1}/{n_models}')

    test_ds = CLRPDataset(data=test_df, tokenizer=tokenizer, max_len=Config.max_len, is_test=True)
    test_sampler = SequentialSampler(test_ds)  # keep rows in test_df order
    test_dataloader = DataLoader(test_ds, sampler=test_sampler, batch_size=Config.batch_size)

    # Release the previous iteration's model before loading the next one so
    # its memory can be reclaimed instead of both being alive at once.
    model = None
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # torch.load returns the model object itself here (it is moved to the
    # device and called directly), so no fresh CLRPModel/AutoConfig is built
    # first -- that instance was discarded immediately by this assignment.
    # map_location lets a checkpoint saved on GPU load on a CPU-only machine.
    # NOTE(review): torch.load unpickles arbitrary objects -- load trusted
    # files only. On PyTorch >= 2.6 the default weights_only=True rejects
    # fully pickled modules; pass weights_only=False there if needed.
    model = torch.load(
        models_folder_path / f'CLRPModel_{model_num}.pt',
        map_location=Config.device,
    ).to(Config.device)
    model.eval()

    all_preds = []
    with torch.no_grad():
        for batch in test_dataloader:
            sent_id = batch['input_ids'].to(Config.device)
            mask = batch['attention_mask'].to(Config.device)
            preds = model(sent_id, mask)
            all_preds.extend(preds.flatten().cpu().tolist())

    models_preds.append(all_preds)

Inference#1/5
Inference#2/5
Inference#3/5
Inference#4/5
Inference#5/5


In [7]:
# Stack the per-model prediction lists into one array (one row per model).
models_preds = np.array(models_preds)
print(models_preds.shape)
print(models_preds)

# Ensemble: average over the model axis to get one value per test row.
all_preds = np.mean(models_preds, axis=0)
print(all_preds.shape)

# Assemble the submission table and write it to disk.
submission_columns = dict(id=test_df['id'], target=all_preds)
result_df = pd.DataFrame(submission_columns)
result_df.to_csv('submission.csv', index=False)
result_df.head(10)

(5, 7)
[[-0.55113876 -0.34380105 -0.49665588 -2.36000586 -1.78107643 -1.15263045
   0.24086733]
 [-0.63002014 -0.35222095 -0.39688    -2.2310462  -1.81024551 -1.16065764
   0.4702352 ]
 [-0.54211235 -0.44907802 -0.51143807 -2.34893084 -1.90068424 -0.96490979
   0.16695571]
 [-0.63765979 -0.28730297 -0.37110433 -2.30051732 -1.8460176  -1.28922296
   0.39916942]
 [-0.39447218 -0.34987035 -0.37423345 -2.294415   -1.78868854 -1.23625112
   0.30187508]]
(7,)


Unnamed: 0,id,target
0,c0f722661,-0.551081
1,f0953f0a5,-0.356455
2,0df072751,-0.430062
3,04caf4e0c,-2.306983
4,0e63f8bea,-1.825342
5,12537fe78,-1.160734
6,965e592c0,0.315821
