In [1]:
import sys

# Root directory of the competition data; essays are read from its `train/` subfolder.
DATASET_PATH = '../../../feedback-prize-2021'

# The bundled transformers sources go to the front of the search path so they are
# found before any installed copy; the project folders are appended after that.
sys.path.insert(0, '../codes/new_transformers_branch/transformers/src')
sys.path.extend(['../codes', '..'])

In [2]:
import os
import os.path as osp

import re
import pickle
import random
import easydict

from glob import glob
from tqdm.auto import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from new_transformers import DebertaV2TokenizerFast

import torch
from torch.nn import functional as F

# Turn autograd off for the whole notebook: every model call below is a forward pass
# only, so no computation graph needs to be recorded.
torch.autograd.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7f3e3f120a90>

In [3]:
from module.metric import calc_acc, process_sample, make_match_dict

from module.utils import get_data_files
from module.dataset import get_dataloader
from module.loss import get_criterion
from module.optimizer import get_optimizer
from module.scheduler import get_scheduler
from model.model import get_model

In [4]:
# global variables
# Seed passed to seed_everything(); also used as the key into the data-split pickle.
seed = 0

# change here
# NOTE(review): ckpt_path is not referenced in any visible cell; the inference cell
# lists its checkpoints from '../result/random_deletion_0.1' instead. Confirm whether
# this value is still needed.
ckpt_path = '../input/feedbackv2/debertav1/*'

In [5]:
# Settings object handed to get_model(); attribute access (args.model, ...) works
# because it is an EasyDict.
args = easydict.EasyDict(dict(
    model='microsoft/deberta-v3-large-ducky',
    grad_checkpt=True,
    cnn1d=False,
    extra_dense=False,
    device=0,
    ddp=False,
))

In [7]:
def seed_everything(seed=0):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and ``CUBLAS_WORKSPACE_CONFIG``, and switches
    cuDNN to deterministic, non-benchmarking mode.

    Args:
        seed: Integer seed shared by all generators.
    """
    os.environ.update({
        'PYTHONHASHSEED': str(seed),
        'CUBLAS_WORKSPACE_CONFIG': ":4096:8",
    })

    # Same seed for each library-level generator.
    for apply_seed in (random.seed, np.random.seed,
                       torch.manual_seed, torch.cuda.manual_seed_all):
        apply_seed(seed)

    # Trade cuDNN autotuning for run-to-run reproducibility.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
    
# Seed all generators once with the notebook-level `seed`.
seed_everything(seed)

In [8]:
# NOTE(review): `seed` repeats the value already set in the globals cell, and the
# `val_fold = 0` assignment is overwritten by the loop variable below (it ends as 4).
seed = 0
val_fold = 0

# Re-key the id map by bare file name: the last path component with everything from
# the first '.' onwards removed.
with open('../data_file/id_to_ix_map.pickle', 'rb') as f:
    id_to_ix_map = {x.split('/')[-1].split('.')[0]: y for x, y in pickle.load(f).items()}
with open('../data_file/data_splits.pickle', 'rb') as f:
    data_splits = pickle.load(f)

# val_files_all[k] holds the validation file ids of fold k.
# The [250]['normed'] keys select one particular split stored in the pickle; their
# meaning is not visible in this notebook.
val_files_all = []
for val_fold in range(5):
    # NOTE(review): train_ids and val_ids are computed but not used in the visible
    # cells; only val_files is kept.
    train_ids = [
        id_to_ix_map[x] 
        for fold in range(5) if fold != val_fold 
        for x in data_splits[seed][250]['normed'][fold]
    ]
    val_files = data_splits[seed][250]['normed'][val_fold]
    val_ids = [id_to_ix_map[x] for x in val_files]
    val_files_all.append(val_files)

In [13]:
class Dataset(torch.utils.data.Dataset):
    """Essays tokenised for inference and padded to a fixed length.

    For every id in ``files`` the text of ``<DATASET_PATH>/train/<id>.txt`` is read
    and stripped. Two versions are kept: the raw text and the result of the
    notebook-level ``fix_text`` helper, which therefore must be defined before
    the dataset is instantiated.

    Each item is the tuple
    ``(tokens, mask, offsets, index_map, text_id, num_tokens)`` where

    * ``tokens``  - ``int64`` array of length ``MAX_LEN``, zero padded,
    * ``mask``    - ``float32`` attention mask of length ``MAX_LEN``, zero padded,
    * ``offsets`` - ``int32`` array ``(MAX_LEN, 2)`` with the tokenizer's
      ``offset_mapping``, zero padded,
    * ``index_map`` - list giving, for every character of the raw text (from the
      first non-blank character on), the index of the whitespace-delimited
      word it belongs to; blanks are assigned to the preceding word,
    * ``text_id`` - the essay id,
    * ``num_tokens`` - number of real (unpadded) tokens.
    """

    # Fixed padded length of every array returned by __getitem__.
    MAX_LEN = 2048

    # Token id rewritten after tokenisation and the id it is replaced with.
    # NOTE(review): presumably this maps the id produced for the '‽' newline
    # placeholder onto a token added to the model's vocabulary - TODO confirm.
    _REPLACED_TOKEN_ID = 126861
    _REPLACEMENT_TOKEN_ID = 128000

    def __init__(self, files):
        tokenizer = DebertaV2TokenizerFast.from_pretrained('microsoft/deberta-v3-large')
        tokenizer.model_max_length = self.MAX_LEN
        self.tokenizer = tokenizer

        self.texts = {}
        self.raw_texts = {}

        for file in files:
            file_name = osp.join(DATASET_PATH, 'train', file + '.txt')
            with open(file_name) as f:
                text = f.read().strip()
            self.texts[file] = fix_text(text)
            self.raw_texts[file] = text

        self.text_ids = list(self.texts.keys())
        # Raw string: '\s' in a normal string literal is an invalid escape sequence.
        self.space_regex = re.compile(r'[\s\n]')

    def __len__(self):
        return len(self.text_ids)

    def __getitem__(self, ix):
        text_id = self.text_ids[ix]
        text = self.texts[text_id]
        raw_text = self.raw_texts[text_id]

        tokenizer_outs = self.tokenizer(text, return_offsets_mapping=True)
        input_ids = [
            self._REPLACEMENT_TOKEN_ID if token_id == self._REPLACED_TOKEN_ID else token_id
            for token_id in tokenizer_outs['input_ids']
        ]

        tokens = np.array(input_ids, 'i8')
        mask = np.array(tokenizer_outs['attention_mask'], 'f4')
        offsets = np.vstack(tokenizer_outs['offset_mapping']).astype('i4')

        num_tokens = len(tokens)
        if num_tokens > self.MAX_LEN:
            # Previously this surfaced as an opaque NumPy broadcasting ValueError.
            raise ValueError(
                f'{text_id}: {num_tokens} tokens do not fit into MAX_LEN={self.MAX_LEN}')

        # np.zeros already provides the padding, so only the prefix is written.
        tokens_array = np.zeros(self.MAX_LEN, 'i8')
        mask_array = np.zeros(self.MAX_LEN, 'f4')
        offsets_array = np.zeros((self.MAX_LEN, 2), 'i4')
        tokens_array[:num_tokens] = tokens
        mask_array[:num_tokens] = mask
        offsets_array[:num_tokens] = offsets

        # Character -> word index. A new word starts at the first non-blank
        # character that follows at least one blank. An empty/blank text yields an
        # empty map instead of raising IndexError.
        index_map = []
        first_char = len(raw_text) - len(raw_text.lstrip())
        current_word = 0
        blank = False
        for char in raw_text[first_char:] if raw_text.strip() else '':
            if self.space_regex.match(char) is not None:
                blank = True
            elif blank:
                current_word += 1
                blank = False
            index_map.append(current_word)

        return tokens_array, mask_array, offsets_array, index_map, text_id, num_tokens
    

In [25]:
first_batch = True
def collate_fn(ins):
    """Stack dataset items into a batch, trimming the padding.

    All fields except the last three of each item are arrays; they are cut to a
    common length along their first axis and stacked into tensors. The last three
    fields are returned as a list, a list and a NumPy array respectively.

    The very first call keeps up to 2048 positions; afterwards the length is the
    largest final field (token count) in the batch rounded up to a multiple of 8.
    NOTE(review): the full-width first batch is presumably there to hit peak
    memory use up front - TODO confirm.

    Args:
        ins: List of items as produced by the dataset's ``__getitem__``.

    Returns:
        Tuple of stacked tensors followed by the three untouched trailing fields.
    """
    global first_batch
    if not first_batch:
        longest = max(sample[-1] for sample in ins)
        max_len = (longest + 7) // 8 * 8
    else:
        first_batch = False
        max_len = 2048

    num_array_fields = len(ins[0]) - 3
    stacked = tuple(
        torch.from_numpy(
            np.concatenate([sample[field][None, :max_len] for sample in ins]))
        for field in range(num_array_fields)
    )

    third_last = [sample[-3] for sample in ins]
    second_last = [sample[-2] for sample in ins]
    lengths = np.array([sample[-1] for sample in ins])
    return stacked + (third_last, second_last, lengths,)

In [15]:
def map_span_to_word_indices(span, index_map, bounds):
    """Translate a token span into a (first_word, last_word) pair.

    Args:
        span: Sequence whose first two entries are the first and last token
            index of the span (both inclusive).
        index_map: Per-character lookup returning a word index.
        bounds: 2-D array indexed as ``bounds[token, 0]`` (start character) and
            ``bounds[token, 1]`` (end character, exclusive).

    Returns:
        Tuple with the word index at the span's first character and the word
        index at its last character.
    """
    first_token, last_token = span[0], span[1]
    start_char = bounds[first_token, 0]
    last_char = bounds[last_token, 1] - 1
    return (index_map[start_char], index_map[last_char])

def calc_entity_score(span, ps, c):
    """Average per-token score of category ``c`` over a token span.

    The first token contributes its value in column ``2*c - 1`` and every later
    token of the span its value in column ``2*c``; the sum is divided by the
    number of tokens in the span.

    Args:
        span: ``(start, end)`` token indices, both inclusive.
        ps: 2-D array of per-token scores, indexed ``[token, column]``.
        c: Category number.

    Returns:
        The mean score as a scalar.
    """
    s, e = span
    begin_col = c * 2 - 1
    inside_col = c * 2
    total = ps[s, begin_col] + ps[s + 1: e + 1, inside_col].sum()
    span_len = e - s + 1
    return total / span_len

In [16]:
def extract_entities(ps, n):
    """Decode per-token tag scores into entity spans.

    The tag of a token is the argmax over the last axis of ``ps``: ``0`` is
    "outside", odd values ``2k - 1`` begin an entity of category ``k`` and even
    values ``2k`` continue one. Only positions ``1 .. n - 2`` are scanned, i.e. the
    first and last of the ``n`` positions are skipped.

    Args:
        ps: Array of shape ``(tokens, tags)`` holding the tag scores.
        n: Number of positions to consider.

    Returns:
        Dict mapping category number to a list of ``(start, end)`` token spans,
        both ends inclusive, in order of appearance.
    """
    cat_ps = ps.argmax(-1)
    all_entities = {}
    current_cat = None
    current_start = None

    def _close(last_ix):
        # Record the entity that is currently open, ending at `last_ix`.
        all_entities.setdefault(current_cat, []).append((current_start, last_ix))

    for ix in range(1, n - 1):
        tag = cat_ps[ix]
        if tag % 2 == 1:
            # B-LABEL(1, 3, 5, 7, ...): finish any open entity, then open a new one.
            if current_cat is not None:
                _close(ix - 1)
            current_cat = (tag + 1) // 2
            current_start = ix
        elif current_cat is not None and tag != current_cat * 2:
            # Either O (0) or a continuation tag of a different category: the open
            # entity ends on the previous token.
            _close(ix - 1)
            current_cat = None

    # An entity still open after the scan runs up to the last scanned position.
    if current_cat is not None:
        _close(ix)

    return all_entities

In [17]:
# Build the network from the settings in `args`; the per-fold weights are loaded into
# this same instance later via load_state_dict.
model = get_model(args)


Using Ducky Modified DebertaV2



Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
def fix_text(x):
    """Return ``x`` with every newline replaced by the '‽' placeholder character."""
    return x.replace('\n', '‽')

In [26]:
# Category names.
# NOTE(review): label_names is not referenced in the visible cells.
label_names = ['None', 'Lead', 'Position', 'Evidence', 'Claim',
               'Concluding Statement', 'Counterclaim', 'Rebuttal']


# Accumulators over folds: the first three collect one array per fold, the last two
# are flat lists extended fold after fold.
outs_f = []
bounds_f = []
token_nums_f = []
word_indices_f = []
sample_ids_f = []

# Order the checkpoint files by the single character that follows 'fold' in the file
# name, so that checkpoints[k] is the file for fold k.
# NOTE(review): this assumes every entry in the directory contains 'fold' (otherwise
# .index raises ValueError) and that fold numbers have one digit.
checkpoints = os.listdir('../result/random_deletion_0.1')
checkpoints = sorted(checkpoints, key=lambda checkpoint: checkpoint[checkpoint.index('fold') + 4])
checkpoints = [osp.join('../result/random_deletion_0.1', checkpoint) for checkpoint in checkpoints]


# Out-of-fold inference: each fold's validation essays are scored with that fold's
# checkpoint.
for val_fold in range(5):
    val_files = val_files_all[val_fold]
    # `dataset` is first the Dataset and is then rebound to the DataLoader wrapping it.
    dataset = Dataset(val_files)
    dataset = torch.utils.data.DataLoader(dataset, collate_fn=collate_fn,
                                      batch_size=1, num_workers=2, shuffle=False)

    model.eval().cuda();

    # Per-fold buffers sized for the full padded length (2048 positions, 15 output
    # columns); positions beyond a batch's trimmed length stay zero.
    all_outs = np.zeros((len(val_files), 2048, 15), 'f4')
    all_bounds = np.zeros((len(val_files), 2048, 2), 'i4')
    all_token_nums = np.zeros(len(val_files), 'i4')
    all_word_indices = []
    all_sample_ids = []
    # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
    model.load_state_dict(torch.load(checkpoints[val_fold]));
    # Write position of the next batch inside the per-fold buffers.
    ix = 0
    for batch in tqdm(dataset, leave=False):
        tokens, mask, bounds, word_indices, sample_ids, num_tokens = batch
        batch_size, batch_len = tokens.shape[:2]
        with torch.no_grad():
            # The raw model outputs are stored; the softmax variant below is disabled.
            # outs = t.softmax(model(tokens.cuda(), mask.cuda()), -1)
            outs = model(tokens.cuda(), mask.cuda())
        # The buffer starts at zero, so += simply stores this batch's outputs.
        all_outs[ix: ix + batch_size, :batch_len] += outs.cpu().numpy()
        all_bounds[ix: ix + batch_size, :batch_len] = bounds
        all_token_nums[ix: ix + batch_size] = num_tokens
        all_word_indices.extend(word_indices)
        all_sample_ids.extend(sample_ids)
        ix += batch_size
    
    # Hand this fold's results over to the cross-fold accumulators.
    outs_f.append(all_outs)
    bounds_f.append(all_bounds)
    token_nums_f.append(all_token_nums)
    word_indices_f += all_word_indices
    sample_ids_f += all_sample_ids

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/3140 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/3121 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/3139 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/3096 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/3098 [00:00<?, ?it/s]

In [31]:
# Merge the per-fold results into single containers. The all_* names used inside the
# inference loop are rebound here to the cross-fold versions.
all_outs = np.concatenate(outs_f)
all_bounds = np.concatenate(bounds_f)
all_token_nums = np.concatenate(token_nums_f)
all_word_indices = word_indices_f
all_sample_ids = sample_ids_f
# Re-read the stripped raw text of every scored essay, keyed by sample id.
all_texts = {}
for sid in all_sample_ids:
    fname = osp.join(f'{DATASET_PATH}/train/', sid+'.txt')
    with open(fname) as f:
        all_texts[sid] = f.read().strip()

In [32]:
# Sanity check: all five containers should report the same number of samples.
all_outs.shape, all_bounds.shape, all_token_nums.shape, len(all_word_indices), len(all_sample_ids)

((15594, 2048, 15), (15594, 2048, 2), (15594,), 15594, 15594)

In [34]:
# Bundle every out-of-fold artefact under the name of the variable that holds it.
oofs = dict(
    all_outs=all_outs,
    all_bounds=all_bounds,
    all_token_nums=all_token_nums,
    all_word_indices=all_word_indices,
    all_sample_ids=all_sample_ids,
    all_texts=all_texts,
)

In [36]:
# Persist the out-of-fold bundle. The context manager closes (and thereby flushes)
# the file deterministically instead of leaving the handle to the garbage collector.
with open('oofs_ducky.pkl', 'wb') as f:
    pickle.dump(oofs, f)