In [1]:
import sys
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
sys.path.insert(0, PATH_TO_TRANSFORMERS_WITH_FAST_TOKENIZER_FOR_DEBERTA_V2)
import torch as t
t.autograd.set_grad_enabled(False)
from tqdm import tqdm
from glob import glob
import pandas as pd
import numpy as np
import re
from transformers import DebertaV2TokenizerFast, DebertaV2Model, DebertaV2Config
import dill as pickle

In [2]:
class TvmLongformer(t.nn.Module):
    def __init__(self):
        super().__init__()
        self.feats = DebertaV2Model.from_pretrained('microsoft/deberta-v2-xlarge')
        self.feats.pooler = None
        self.class_projector = t.nn.Sequential(
            t.nn.LayerNorm(1536),
            t.nn.Linear(1536, 15)
        )
    def forward(self, tokens, mask):
        return self.class_projector(self.feats(tokens, mask, return_dict=False)[0])
    
model = TvmLongformer()

Some weights of the model checkpoint at microsoft/deberta-v2-xlarge were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
debertav3_fix_text = lambda x: x.replace('\n', '‽')
class Dataset(t.utils.data.Dataset):
    def __init__(self, files):
        tokenizer = DebertaV2TokenizerFast.from_pretrained('microsoft/deberta-v2-xlarge')
        tokenizer.model_max_length = 4096
        self.tokenizer = tokenizer
        self.inverse_tokenizer_vocab = {y: x for x, y in tokenizer.vocab.items()}
        files = set(files)
        self.texts = {}
        for file_id in files:
            with open(f'train/{file_id}.txt') as f:
                self.texts[file_id] = f.read().strip()
        self.keys = sorted(self.texts.keys())
        self.space_regex = re.compile('[\s\n]')
    def __len__(self):
        return len(self.keys)
    def __getitem__(self, ix):
        tokens_array = np.zeros(4096, 'i8')
        mask_array = np.zeros(4096, 'f4')
        offsets_array = np.zeros((4096, 2), 'i4')
        
        text = self.texts[self.keys[ix]]
        key = self.keys[ix]
        tokenizer_outs = self.tokenizer(debertav3_fix_text(text), return_offsets_mapping=True)
        tokenizer_outs['input_ids'] = [x if x != 126599 else 128000 for x in tokenizer_outs['input_ids']]
        tokens = np.array(tokenizer_outs['input_ids'], 'i8')
        mask = np.array(tokenizer_outs['attention_mask'], 'f4')
        default_offset_mappings = tokenizer_outs['offset_mapping']
        ids = tokenizer_outs['input_ids']
        num_tokens = len(ids)
        offset_mappings = [(0,0)]
        for ix in range(1, num_tokens - 1):
            a, b = default_offset_mappings[ix]
            token = self.inverse_tokenizer_vocab[ids[ix]]
            if len(token) > 1 and token[0] == '▁' and ix != 1:
                a += 1
            offset_mappings.append((a, b))
        offset_mappings.append((0,0))
        
        offsets = np.vstack(offset_mappings).astype('i4')
        
        
        tokens_array[:len(tokens)] = tokens
        mask_array[:len(tokens)] = mask
        offsets_array[:len(tokens)] = offsets
        
        index_map = []
        current_word = 0
        blank = False
        for char_ix in range(text.index(text.strip()[0]), len(text)):
            if self.space_regex.match(text[char_ix]) is not None:
                blank = True
            elif blank:
                current_word += 1
                blank = False
            index_map.append(current_word)
            
        return tokens_array, mask_array, offsets_array, index_map, key, len(tokens)
    
first_batch = True
def collate_fn(ins):
    global first_batch
    if first_batch:
        max_len = 2048
        first_batch = False
    else:
        max_len = (max(x[-1] for x in ins) + 7) // 8 * 8
    return tuple(t.from_numpy(np.concatenate([ins[z][x][None, :max_len]
                                              for z in range(len(ins))]))
                 for x in range(len(ins[0]) - 3)) \
                 + ([x[-3] for x in ins], [x[-2] for x in ins], np.array([x[-1] for x in ins]),)    


label_names = ['None', 'Lead', 'Position', 'Evidence', 'Claim',
               'Concluding Statement', 'Counterclaim', 'Rebuttal']


In [None]:
checkpoints = glob('checkpoints_xlarge_v2/extra_distclean_mse*')

In [None]:
#sort by fold
#sorted_checkpoints = sorted(checkpoints, key=lambda x: int(x.split('/')[-1].split('_')[3][-1]))

In [5]:
len(checkpoints)

5

In [6]:
model.eval().cuda();

In [7]:
with open('id_to_ix_map.pickle', 'rb') as f:
    id_to_ix_map = {x.split('/')[-1].split('.')[0]: y for x, y in pickle.load(f).items()}
with open('data_splits.pickle', 'rb') as f:
    data_splits = pickle.load(f)

In [8]:
all_outs = np.zeros((len(glob('train/*.txt')), 2048, 15), 'f4')
all_bounds = np.zeros((len(glob('train/*.txt')), 2048, 2), 'i4')
all_token_nums = np.zeros((len(glob('train/*.txt')),), 'i4')
all_word_indices = []
all_sample_ids = []
ix = 0
for fold_ix, checkpoint in enumerate(sorted_checkpoints):
    val_files = data_splits[0][250]['normed'][fold_ix]
    dataset = t.utils.data.DataLoader(Dataset(val_files), collate_fn=collate_fn,
                                  batch_size=1, num_workers=2)
    model.load_state_dict(t.load(checkpoint), strict=False);
    for batch in tqdm(dataset):
        tokens, mask, bounds, word_indices, sample_ids, num_tokens = batch
        num_tokens = num_tokens[0]
        batch_size, batch_len = tokens.shape[:2]
        outs = t.log_softmax(model(tokens.cuda(), mask.cuda()), -1)
        all_outs[ix: ix + batch_size, :num_tokens - 2] = outs[:, 1:num_tokens - 1].cpu().numpy()
        all_bounds[ix: ix + batch_size, :num_tokens - 2] = bounds[:, 1:num_tokens - 1]
        all_token_nums[ix: ix + batch_size] = num_tokens - 2
        all_word_indices.extend(word_indices)
        all_sample_ids.extend(sample_ids)
        ix += batch_size

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 3140/3140 [02:16<00:00, 22.99it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 3121/3121 [02:17<00:00, 22.70it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 3139/3139 [02:17<00:00, 22.83it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 3096/3096 [02:15<00:00, 22.86it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 3098/3098 [02:16<00:00, 22.68it/s]


In [None]:
model_name = 

In [None]:
if not os.path.exists(f'oof_ps/{model_name}'):
    os.makedirs(f'oof_ps/{model_name}')

In [9]:
for (array, 
    array_name) in zip((all_outs, all_bounds, all_token_nums, all_word_indices, all_sample_ids),
                        'all_outs, all_bounds, all_token_nums, all_word_indices, all_sample_ids'.split(', ')):
    np.save(f'oof_ps/{model_name}/{array_name}.npy', array)

  arr = np.asanyarray(arr)
