In [None]:
import re
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from torch import nn, optim
from torch.utils.data import  DataLoader, Dataset
import transformers
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')

from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold

# --- Input CSV locations ---------------------------------------------------
path_train = '../input/commonlitreadabilityprize/train.csv'
path_test = '../input/commonlitreadabilityprize/test.csv'
path_sub = '../input/commonlitreadabilityprize/sample_submission.csv'

# --- Reproducibility: NumPy and PyTorch share one seed -----------------------
SEED = 13
np.random.seed(SEED)
torch.manual_seed(SEED)

# --- Device: use the GPU when PyTorch can see one, otherwise the CPU ---------
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# --- Training table, regression target and stratification bins ---------------
train_data = pd.read_csv(path_train)
test_data = pd.read_csv(path_test)

# Bin count is floor(1 + log2(n_rows)); the target is then cut into that many
# integer-labelled bins, which are later used to stratify the CV folds.
num_bins = int(np.floor(1 + np.log2(len(train_data))))
train_data.loc[:, 'bins'] = pd.cut(
    train_data['target'],
    bins=num_bins,
    labels=False,
)

target = train_data['target'].to_numpy()
bins = train_data['bins'].to_numpy()


def rmse_score(y_true,y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

    The value is the square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)


def clean_text(txt):
    """Lower-case ``txt`` and collapse every run of non-letters into one space.

    ``txt`` is first converted with ``str()``, so non-string values are accepted.
    Leading/trailing runs also become a single space (no stripping is done).
    """
    lowered = str(txt).lower()
    return re.sub('[^A-Za-z]+', ' ', lowered)


# Frames fed to the transformer feature extractor. The model input column
# `txt` is the raw excerpt; the `clean_text` pass is currently switched off
# (previously: .apply(lambda x: clean_text(x))).
df = pd.read_csv(path_train)
df['txt'] = df['excerpt']

test = pd.read_csv(path_test)
test['txt'] = test['excerpt']

# Cell output: a peek at the test frame plus the transformers version in use.
test.head(2), transformers.__version__

In [None]:
class CL_Dataset(Dataset):
    """Torch ``Dataset`` that tokenizes the ``txt`` column of a DataFrame per item.

    Every item is a dict holding ``input_ids`` and ``attention_mask`` (both
    ``torch.long``) and ``target`` (``torch.float``).
    """

    def __init__(
        self,
        data: pd.DataFrame,
        token,
        max_len: int = 256,
        test: bool = False,
    ) -> None:
        """Store the frame, the tokenizer and the tokenization settings.

        Args:
            data: frame with a ``txt`` column and, unless ``test`` is True,
                a ``target`` column.
            token: tokenizer callable. It is invoked with the text plus
                ``truncation``, ``max_length``, ``padding``,
                ``add_special_tokens``, ``return_attention_mask`` and
                ``return_token_type_ids`` keyword arguments and must return a
                mapping containing ``input_ids`` and ``attention_mask``.
            max_len: value forwarded to the tokenizer as ``max_length``.
            test: when True the ``target`` column is not read and a
                placeholder target of 0 is returned instead.
        """
        self.data = data
        self.max_len = max_len
        self.test = test
        self.token = token

    def __len__(self) -> int:
        """Number of rows in the wrapped frame."""
        return self.data.shape[0]

    def __getitem__(self, idx: int) -> dict:
        """Tokenize row ``idx`` (positional) and return its tensors."""
        text = self.data.txt.iloc[idx]
        encode = self.token(
            text,
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            add_special_tokens=True,
            return_attention_mask=True,
            return_token_type_ids=False
            )
        if self.test:
            # No labels at inference time; keep the key so batches have one schema.
            target = 0
        else:
            target = self.data.target.iloc[idx]

        ids = encode["input_ids"]
        mask = encode["attention_mask"]

        return {
            "input_ids": torch.tensor(ids, dtype=torch.long),
            "attention_mask": torch.tensor(mask, dtype=torch.long),
            'target': torch.tensor(target, dtype = torch.float)
        }
    

class BertModel(nn.Module): 
    """Pretrained transformer backbone used as a sentence-embedding extractor.

    ``forward`` returns the layer-normalized mean (over the sequence axis) of
    the backbone's last hidden state: one vector per input sequence.
    """

    def __init__(self, name_model):
        """Load the backbone from ``name_model`` and build the head layers.

        Args:
            name_model: model name or path accepted by
                ``transformers.AutoConfig`` / ``AutoModel.from_pretrained``.
        """
        super().__init__() 

        config = transformers.AutoConfig.from_pretrained(name_model)
        self.model = transformers.AutoModel.from_pretrained(name_model, config=config)

        # Size the head from the backbone instead of assuming 1024 everywhere;
        # 1024 stays as the fallback so existing usage is unchanged.
        hidden_size = getattr(config, "hidden_size", 1024)

        self.layer_norm1 = nn.LayerNorm(hidden_size)
        # l1/l2 are not used by forward().
        # NOTE(review): presumably kept so the fine-tuned checkpoints' state_dict
        # keys still match when loaded -- confirm before removing.
        self.l1 = nn.Linear(hidden_size, 512)
        self.l2 = nn.Linear(512, 1)

        self._init_weights(self.layer_norm1)
        self._init_weights(self.l1)
        self._init_weights(self.l2)
 
    def _init_weights(self, module):
        """Initialise ``module`` in place according to its type.

        Linear/Embedding weights are drawn from N(0, 0.02); Linear biases and
        the Embedding padding row are zeroed; LayerNorm gets weight 1, bias 0.
        Other module types are left untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
    
    def forward(self, input_ids, attention_mask):
        """Return the pooled, layer-normalized embedding for each sequence.

        Args:
            input_ids: token ids passed to the backbone.
            attention_mask: mask passed to the backbone.

        Returns:
            Tensor obtained by averaging the backbone's first output over
            dimension 1 and applying ``layer_norm1``.
        """
        # Keyword arguments: not every backbone takes the mask as its second
        # positional parameter, so positional passing can bind it wrongly.
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs[0]
        # NOTE(review): the mean runs over every position, padded ones included.
        # Left as is -- the loaded checkpoints were presumably trained with this
        # pooling, so changing it would alter the extracted features.
        out = torch.mean(last_hidden_state, 1)
        out = self.layer_norm1(out)
        return out 
    

@torch.no_grad()
def inference2(path_pretrain: list, data: pd.DataFrame, batch_size: int = 1, max_len=None):
    """Extract one embedding per row of ``data`` with a fine-tuned ``BertModel``.

    Args:
        path_pretrain: two-item sequence; item 0 is the backbone/tokenizer path
            given to ``from_pretrained``, item 1 is the ``state_dict`` file
            loaded into ``BertModel``.
        data: frame with a ``txt`` column (consumed through ``CL_Dataset`` in
            test mode, so no ``target`` column is needed).
        batch_size: rows per forward pass. Defaults to 1, the original value.
        max_len: tokenizer ``max_length``. ``None`` (default) uses the
            notebook-level ``MAX_LEN``, looked up when the function is called.

    Returns:
        ``np.ndarray`` stacking the model output of every row, in row order.
    """
    backbone_path, weights_path = path_pretrain[0], path_pretrain[1]
    if max_len is None:
        max_len = MAX_LEN

    tokenizer = transformers.AutoTokenizer.from_pretrained(backbone_path)
    model = BertModel(backbone_path)
    # map_location lets checkpoints saved from a GPU load on a CPU-only machine.
    model.load_state_dict(
        torch.load(weights_path, map_location=device)
    )

    model.to(device)
    model.eval()

    dataset = CL_Dataset(data, tokenizer, max_len, True)
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        # pinned host memory only helps host-to-GPU copies
        pin_memory=(device == 'cuda'),
        shuffle=False
    )

    all_pred = []
    for batch in tqdm(loader):
        # 'target' is a placeholder in test mode and is not a model input.
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'target'}
        out = model(**inputs)
        all_pred.extend(out.squeeze(-1).detach().cpu().numpy())
    return np.array(all_pred)

In [None]:
# Tokenizer max_length used by inference2 for every checkpoint below.
MAX_LEN = 248

# One fine-tuned checkpoint per block. `a` is [backbone path, weights file];
# each checkpoint produces train features (fN) and test features (fN_test).
# Shapes noted for the first pair: ((2834, 1024), (7, 1024))

# checkpoint 0
a = [
    '../input/hugg-hub/roberta/large',
    '../input/train-autonl-change-head-roberta/Model_roberta_model_0.pth',
]
f1, f1_test = inference2(a, df), inference2(a, test)

# checkpoint 1
a = [
    '../input/hugg-hub/roberta/large',
    '../input/train-autonl-change-head-roberta/Model_roberta_model_1.pth',
]
f2, f2_test = inference2(a, df), inference2(a, test)

# checkpoint 2
a = [
    '../input/hugg-hub/roberta/large',
    '../input/train-autonl-change-head-roberta/Model_roberta_model_2.pth',
]
f3, f3_test = inference2(a, df), inference2(a, test)

# checkpoint 3
a = [
    '../input/hugg-hub/roberta/large',
    '../input/train-autonl-change-head-roberta/Model_roberta_model_3.pth',
]
f4, f4_test = inference2(a, df), inference2(a, test)

# checkpoint 4
a = [
    '../input/hugg-hub/roberta/large',
    '../input/train-autonl-change-head-roberta/Model_roberta_model_4.pth',
]
f5, f5_test = inference2(a, df), inference2(a, test)

100%|██████████| 2834/2834 [01:44<00:00, 27.18it/s]
100%|██████████| 7/7 [00:00<00:00, 26.19it/s]
100%|██████████| 2834/2834 [01:43<00:00, 27.47it/s]
100%|██████████| 7/7 [00:00<00:00, 26.86it/s]
100%|██████████| 2834/2834 [01:43<00:00, 27.37it/s]
100%|██████████| 7/7 [00:00<00:00, 26.92it/s]
100%|██████████| 2834/2834 [01:43<00:00, 27.45it/s]
100%|██████████| 7/7 [00:00<00:00, 25.59it/s]
100%|██████████| 2834/2834 [01:42<00:00, 27.54it/s]
100%|██████████| 7/7 [00:00<00:00, 25.66it/s]


In [None]:
def get_preds_svm(X,y,X_test,RidgeReg=0,bins=bins,nfolds=10,C=8,kernel='rbf'):
    """Fit an SVR (or Ridge) per stratified fold and average the test predictions.

    Args:
        X: training feature matrix, indexable by integer index arrays.
        y: training targets aligned with ``X``.
        X_test: test feature matrix; ``X_test.shape[0]`` predictions are returned.
        RidgeReg: truthy selects ``Ridge(alpha=80.0)``; otherwise an ``SVR`` with
            ``C``, ``kernel`` and ``gamma='auto'`` is used.
        bins: stratification labels aligned with ``X``. The default is the
            notebook-level ``bins`` array captured when the function is defined.
        nfolds: number of CV folds, and therefore the number of models averaged.
        C: SVR regularisation parameter (ignored for Ridge).
        kernel: SVR kernel (ignored for Ridge).

    Returns:
        1-D array: mean over the ``nfolds`` fold models of their predictions
        on ``X_test``. Per-fold and mean validation RMSE are printed.
    """
    scores = []
    preds = np.zeros(X_test.shape[0])

    # n_splits must follow nfolds: the accumulated predictions are divided by
    # nfolds below, so a fixed split count would mis-scale the average whenever
    # a caller passes nfolds != 10.
    kfold = StratifiedKFold(n_splits=nfolds,shuffle=True,random_state=13)
    for k, (train_idx,valid_idx) in enumerate(kfold.split(X,bins)):
        if RidgeReg:
            print("ridge...")
            model = Ridge(alpha=80.0)
        else:
            model = SVR(C=C,kernel=kernel,gamma='auto')
        X_train,y_train = X[train_idx], y[train_idx]
        X_valid,y_valid = X[valid_idx], y[valid_idx]

        model.fit(X_train,y_train)
        prediction = model.predict(X_valid)
        # Same value as before: squared error is symmetric in its arguments.
        score = rmse_score(y_valid,prediction)
        print(f'Fold {k} , rmse score: {score}')
        scores.append(score)
        preds += model.predict(X_test)

    print("mean rmse",np.mean(scores))
    return preds/nfolds

In [None]:
# `gc` is used below but was never imported in this notebook, so the cell
# raised NameError on a fresh kernel; import it here so the cell runs.
import gc

# For each checkpoint's features: fit the SVR and Ridge stacks, then drop the
# feature matrices and collect garbage before moving to the next checkpoint.
svm_preds0 = get_preds_svm(f1,target,f1_test)
ridge_preds0 = get_preds_svm(f1,target,f1_test,RidgeReg=1)
del f1,f1_test
gc.collect()

svm_preds1 = get_preds_svm(f2,target,f2_test)
ridge_preds1 = get_preds_svm(f2,target,f2_test,RidgeReg=1)
del f2,f2_test
gc.collect()

svm_preds2 = get_preds_svm(f3,target,f3_test)
ridge_preds2 = get_preds_svm(f3,target,f3_test,RidgeReg=1)
del f3,f3_test
gc.collect()

svm_preds3 = get_preds_svm(f4,target,f4_test)
ridge_preds3 = get_preds_svm(f4,target,f4_test,RidgeReg=1)
del f4,f4_test
gc.collect()

svm_preds4 = get_preds_svm(f5,target,f5_test)
ridge_preds4 = get_preds_svm(f5,target,f5_test,RidgeReg=1)
del f5,f5_test
gc.collect()

Fold 0 , rmse score: 0.3058170718558509
Fold 1 , rmse score: 0.2872988386502568
Fold 2 , rmse score: 0.29262937725758875
Fold 3 , rmse score: 0.30057279139325893
Fold 4 , rmse score: 0.2744710902744394
Fold 5 , rmse score: 0.26813241268625937
Fold 6 , rmse score: 0.2702235520666963
Fold 7 , rmse score: 0.3165840947719694
Fold 8 , rmse score: 0.34478206986823035
Fold 9 , rmse score: 0.3215765205850262
mean rmse 0.2982087819409576
ridge...
Fold 0 , rmse score: 0.308659955097698
ridge...
Fold 1 , rmse score: 0.28873094089967183
ridge...
Fold 2 , rmse score: 0.29897856609655066
ridge...
Fold 3 , rmse score: 0.30474019696729027
ridge...
Fold 4 , rmse score: 0.2765261191423368
ridge...
Fold 5 , rmse score: 0.2672864545633807
ridge...
Fold 6 , rmse score: 0.26835640841627695
ridge...
Fold 7 , rmse score: 0.3168510974492049
ridge...
Fold 8 , rmse score: 0.3492517882722234
ridge...
Fold 9 , rmse score: 0.32646390869590375
mean rmse 0.3005845435600537
Fold 0 , rmse score: 0.27950989330885095
Fol

0

In [None]:
# Final blend: unweighted mean of the five SVR and the five Ridge predictions.
# SVR-only alternative (disabled):
# svm_preds = (svm_preds1 + svm_preds2 + svm_preds3 + svm_preds4 + svm_preds0)/5
all_preds = (
    svm_preds1 + svm_preds2 + svm_preds3 + svm_preds4 + svm_preds0
    + ridge_preds0 + ridge_preds1 + ridge_preds2 + ridge_preds3 + ridge_preds4
) / 10