In [25]:
import os
import pandas as pd
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoConfig, AutoModel
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding

from custom.pooling import *

# Model

In [2]:
class CFG_model23:
    num_workers=4
    path="../input/model23/"
    model_name = 'model23'
    config_path=path+'config.pth'
    model="microsoft-deberta-v3-large"
    gradient_checkpointing=False
    batch_size=12
    target_cols=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    seed=42
    n_fold=5 
    trn_fold=[0, 1, 2, 3, 4]
    max_len=1462
    set_from_df = True
    pooling_type = 'MeanPooling'

    
    
class CFG_model52:
    num_workers=4
    path="../input/model52/"
    model_name = 'model52'
    config_path=path+'config.pth'
    model="microsoft-deberta-v3-large"
    gradient_checkpointing=False
    batch_size=12
    target_cols=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    seed=42
    n_fold=5 
    trn_fold=[0, 1, 2, 3, 4]
    max_len=1462
    set_from_df = True
    pooling_type = 'MeanPooling'
    
    
class CFG_model68:
    num_workers=4
    path="../input/model68/"
    model_name = 'model68'
    config_path=path+'config.pth'
    model="microsoft-deberta-v3-large"
    gradient_checkpointing=False
    batch_size=8
    target_cols=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    seed=42
    n_fold=5 
    trn_fold=[0, 1, 2, 3, 4]
    max_len=1462
    set_from_df = True
    pooling_type = 'WeightedLayerPooling'
    
    
class CFG_model70:
    num_workers=4
    path="../input/model70/"
    model_name = 'model70'
    config_path=path+'config.pth'
    model="microsoft-deberta-v3-large"
    gradient_checkpointing=False
    batch_size=8
    target_cols=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    seed=42
    n_fold=5 
    trn_fold=[0, 1, 2, 3, 4]
    max_len=1462
    set_from_df = True
    pooling_type = 'LSTMPooling'
    hidden_size = 512
    
    
class CFG_model71:
    num_workers=4
    path="../input/model71/"
    model_name = 'model71'
    config_path=path+'config.pth'
    model="microsoft-deberta-v3-large"
    gradient_checkpointing=False
    batch_size=8
    target_cols=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    seed=42
    n_fold=5 
    trn_fold=[0, 1, 2, 3, 4]
    max_len=1462
    set_from_df = True
    pooling_type = 'WeightedLayerPooling'

In [3]:
cfg_list = [
    CFG_model23,
    CFG_model52, 
    CFG_model68,
    CFG_model70,
    CFG_model71
]

In [4]:
class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
        else:
            self.config = torch.load(config_path)
            
        if pretrained:
            self.backbone = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.backbone = AutoModel.from_config(self.config)
        
        if cfg.pooling_type == 'MeanPooling':
            self.pool = MeanPooling()
        elif cfg.pooling_type == 'WeightedLayerPooling':
            self.pool = WeightedLayerPooling(self.config.num_hidden_layers)
        elif cfg.pooling_type == 'LSTMPooling':
            self.pool =  LSTMPooling(self.config.num_hidden_layers,
                                       self.config.hidden_size,
                                       self.cfg.hidden_size,
                                       0.1,
                                       is_lstm=True
                           )
        else:
            raise ValueError('Unknown pooling type')
        
        
        if cfg.pooling_type == 'GRUPooling':
            self.fc = nn.Linear(self.cfg.hidden_size, 6)
        elif cfg.pooling_type == 'LSTMPooling':
            self.fc = nn.Linear(self.cfg.hidden_size, 6)
        else:
            self.fc = nn.Linear(self.config.hidden_size, 6)

        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.backbone(**inputs)
        
        last_hidden_states = outputs[0]
        
        if self.cfg.pooling_type == 'MeanPooling':
            feature = self.pool(last_hidden_states, inputs['attention_mask'])
        elif self.cfg.pooling_type == 'WeightedLayerPooling':
            all_hidden_states = torch.stack(outputs[1])
            feature = self.pool(all_hidden_states)
        elif self.cfg.pooling_type in ['GRUPooling', 'LSTMPooling']:
            all_hidden_states = torch.stack(outputs[1])
            feature = self.pool(all_hidden_states)
        else:
            raise ValueError('Unknown pooling type')
        
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output

In [5]:
CFG = cfg_list[0]
model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)

In [6]:
fold = 0
state = torch.load(CFG.path+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                           map_location=torch.device('cpu'))
        
model.load_state_dict(state['model'])

<All keys matched successfully>

# Setting up Data

In [8]:
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.path+'tokenizer/')

In [31]:
# Read data.
tmp = pd.read_csv("../input/prep_cleaned_train_context_5fold.csv", lineterminator="\n")

In [32]:
tmp.shape

(615170, 20)

In [33]:
class custom_dataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.texts = df['text'].values
        self.labels = df['target'].values
        
    def prepare_input(self, text):
        inputs = self.tokenizer.encode_plus(
            text, 
            return_tensors = None, 
            add_special_tokens = True, 
            max_length = self.max_len,
            pad_to_max_length = True,
            truncation = True
        )
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype = torch.long)
        return inputs
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, item):
        inputs = self.prepare_input(self.texts[item])
        label = torch.tensor(self.labels[item], dtype = torch.float)
        return inputs, label
    
def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

<__main__.CFG_model23 at 0x7f9efe430c10>

In [37]:
x_train = tmp[tmp['topic_fold'] != fold]
x_val = tmp[tmp['topic_fold'] == fold]
valid_labels = x_val['target'].values

max_len = 650
train_dataset = custom_dataset(x_train, CFG.tokenizer, max_len=max_len)
valid_dataset = custom_dataset(x_val, CFG.tokenizer, max_len=max_len)

train_loader = DataLoader(
    train_dataset, 
    batch_size = 6, 
    shuffle = True, 
    num_workers = 0, 
    pin_memory = True, 
    drop_last = True
)
valid_loader = DataLoader(
    valid_dataset, 
    batch_size = 6, 
    shuffle = False, 
    num_workers = 0, 
    pin_memory = True, 
    drop_last = False
)

In [38]:
x = next(iter(train_loader))



In [39]:
x

[{'input_ids': tensor([[    1,  8462,  1077,  ...,     0,     0,     0],
          [    1, 12981,  1628,  ...,     0,     0,     0],
          [    1, 81954,  5166,  ...,     0,     0,     0],
          [    1, 11515,  1628,  ...,     0,     0,     0],
          [    1,  1093, 79041,  ...,     0,     0,     0],
          [    1, 15466, 90611,  ...,     0,     0,     0]]),
  'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0]]),
  'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 0, 0, 0]])},
 tensor([0., 1., 1., 0., 0., 0.])]

In [47]:
len(x)

2

In [42]:
_ = model.eval()

with torch.no_grad():
    z = model(x[0])

In [43]:
z

tensor([[1.0854, 1.7214, 1.9825, 1.6532, 1.9636, 1.9142],
        [1.0259, 1.6969, 2.0091, 1.6750, 1.9524, 1.7996],
        [1.4662, 1.8144, 2.5486, 1.7427, 2.3742, 2.4337],
        [1.4639, 1.9721, 2.1411, 1.9079, 2.2122, 1.8311],
        [0.8910, 1.3641, 1.7269, 1.4557, 1.7484, 1.6525],
        [1.0355, 1.8134, 2.1787, 1.7844, 2.0550, 1.8263]])

In [44]:
def get_param_counts(model):
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    nontrainable_params = total_params - trainable_params
    
    return total_params, trainable_params, nontrainable_params

get_param_counts(model)

(433931270, 433931270, 0)

In [50]:
model

CustomModel(
  (backbone): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128015, 1024, padding_idx=0)
      (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0): DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (key_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (value_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=True)
            

In [54]:
torch.nn.Sequential(**list(model.children())[:-1])

TypeError: type object argument after ** must be a mapping, not list