In [1]:
import sys
sys.path.insert(0, '../input/feedbackv1/codes')
import torch as t
t.autograd.set_grad_enabled(False)
from longformer.longformer import Longformer, LongformerConfig
from tqdm import tqdm
from glob import glob
import pandas as pd
import numpy as np
import re
from transformers import RobertaTokenizerFast, DebertaModel


In [2]:
class TvmLongformer(t.nn.Module):
    """Longformer encoder followed by a per-token classification head.

    The head is ``LayerNorm -> Linear`` applied to the first element of the
    encoder output, so ``forward`` returns one score vector per token.

    Args:
        pretrained_dir: Directory passed to both ``LongformerConfig.from_pretrained``
            and ``Longformer.from_pretrained``.
        hidden_size: Width of the encoder output fed to the head.
        num_labels: Number of output classes per token.

    The defaults reproduce the original hard-coded setup. The attribute names
    ``feats`` and ``class_projector`` are kept because checkpoints are loaded
    into this module with ``load_state_dict``.
    """

    def __init__(self,
                 pretrained_dir='../input/feedbackv1/pretrained_checkpoints/longformer-large-4096/',
                 hidden_size=1024,
                 num_labels=15):
        super().__init__()
        config = LongformerConfig.from_pretrained(pretrained_dir)
        config.attention_mode = 'sliding_chunks'
        self.feats = Longformer.from_pretrained(pretrained_dir, config=config)
        # forward() only reads element 0 of the encoder output, so the pooler is dropped.
        self.feats.pooler = None
        self.class_projector = t.nn.Sequential(
            t.nn.LayerNorm(hidden_size),
            t.nn.Linear(hidden_size, num_labels)
        )

    def forward(self, tokens, mask):
        """Return per-token class scores.

        ``tokens`` and ``mask`` are forwarded positionally to the encoder
        unchanged; the head is applied to element 0 of its tuple output.
        """
        return self.class_projector(self.feats(tokens, mask, return_dict=False)[0])
    
class Deberta(t.nn.Module):
    """DeBERTa encoder followed by a per-token classification head.

    The head is ``LayerNorm -> Linear`` applied to the first element of the
    encoder output, so ``forward`` returns one score vector per token.

    Args:
        pretrained_dir: Directory passed to ``DebertaModel.from_pretrained``.
        hidden_size: Width of the encoder output fed to the head.
        num_labels: Number of output classes per token.

    The defaults reproduce the original hard-coded setup. The attribute names
    ``feats`` and ``class_projector`` are kept because checkpoints are loaded
    into this module with ``load_state_dict``.
    """

    def __init__(self,
                 pretrained_dir='../input/feedbackv1/pretrained_checkpoints/deberta_large/',
                 hidden_size=1024,
                 num_labels=15):
        super().__init__()
        self.feats = DebertaModel.from_pretrained(pretrained_dir)
        # forward() only reads element 0 of the encoder output; mirror the
        # Longformer wrapper and clear any pooler attribute.
        self.feats.pooler = None
        self.class_projector = t.nn.Sequential(
            t.nn.LayerNorm(hidden_size),
            t.nn.Linear(hidden_size, num_labels)
        )

    def forward(self, tokens, mask):
        """Return per-token class scores.

        ``tokens`` and ``mask`` are forwarded positionally to the encoder
        unchanged; the head is applied to element 0 of its tuple output.
        """
        return self.class_projector(self.feats(tokens, mask, return_dict=False)[0])
    
# Build each backbone once from its pretrained directory. The fine-tuned
# weights are swapped in later, per checkpoint, with load_state_dict.
deberta = Deberta()
longformer = TvmLongformer()

Some weights of the model checkpoint at ../input/feedbackv1/pretrained_checkpoints/longformer-large-4096/ were not used when initializing Longformer: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing Longformer from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Longformer from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
class LongformerDataset(t.utils.data.Dataset):
    """Test essays tokenized for the Longformer model.

    Every ``*.txt`` file under the test directory is read once in ``__init__``
    (whitespace-stripped) and keyed by its file name without extension.

    Each item is a 6-tuple
    ``(tokens, mask, offsets, index_map, key, num_tokens)`` where the first
    three are fixed-size NumPy buffers (4096 slots, zero padded), ``index_map``
    maps each character of the essay to a whitespace-delimited word index,
    ``key`` is the essay id and ``num_tokens`` is the unpadded token count.

    Note: the tokenizer is called without truncation, so an essay producing
    more than 4096 tokens will fail when copied into the fixed-size buffers.
    """

    def __init__(self):
        tokenizer = RobertaTokenizerFast.from_pretrained('../input/feedbackv1/tokenizer')
        tokenizer.model_max_length = 4096
        self.tokenizer = tokenizer
        self.texts = {}
        for fname in glob('../input/feedback-prize-2021/test/*.txt'):
            with open(fname) as f:
                self.texts[fname.split('/')[-1].split('.')[0]] = f.read().strip()
        self.keys = list(self.texts.keys())
        # Raw string: '\s' inside a normal literal is an invalid escape
        # sequence and triggers a warning on recent Python versions.
        self.space_regex = re.compile(r'[\s\n]')

    def __len__(self):
        """Number of essays found in the test directory."""
        return len(self.keys)

    def _build_index_map(self, text):
        """Map every character (from the first non-blank one) to a word index.

        Whitespace characters keep the index of the preceding word; the index
        advances on the first non-whitespace character after a blank run.
        Empty or all-whitespace text yields an empty list instead of raising.
        """
        index_map = []
        current_word = 0
        blank = False
        # Same start position as text.index(text.strip()[0]), but well defined
        # when there is no non-whitespace character at all.
        first_char = len(text) - len(text.lstrip())
        for char_ix in range(first_char, len(text)):
            if self.space_regex.match(text[char_ix]) is not None:
                blank = True
            elif blank:
                current_word += 1
                blank = False
            index_map.append(current_word)
        return index_map

    def __getitem__(self, ix):
        """Tokenize essay ``ix`` and return the padded arrays plus metadata."""
        tokens_array = np.zeros(4096, 'i8')
        mask_array = np.zeros(4096, 'f4')
        offsets_array = np.zeros((4096, 2), 'i4')

        key = self.keys[ix]
        text = self.texts[key]
        tokenizer_outs = self.tokenizer(text, return_offsets_mapping=True)
        tokens = np.array(tokenizer_outs['input_ids'], 'i8')
        mask = np.array(tokenizer_outs['attention_mask'], 'f4')
        offsets = np.vstack(tokenizer_outs['offset_mapping']).astype('i4')

        # Mask value 2 on the first/last token and on token ids 4, 116, 328.
        # NOTE(review): presumably 2 requests global attention in this
        # Longformer implementation and the ids are sentence-ending
        # punctuation in the tokenizer vocabulary -- confirm.
        mask[0] = 2
        mask[-1] = 2
        for special_id in (4, 116, 328):
            mask[tokens == special_id] = 2

        num_tokens = len(tokens)
        tokens_array[:num_tokens] = tokens
        mask_array[:num_tokens] = mask
        offsets_array[:num_tokens] = offsets

        index_map = self._build_index_map(text)

        return tokens_array, mask_array, offsets_array, index_map, key, num_tokens
    
def collate_fn(ins):
    """Batch Longformer samples, trimming padding to a multiple of 512 tokens.

    Each sample is ``(array, array, array, index_map, key, num_tokens)``.
    The leading arrays are cut to the batch's longest sequence (rounded up to
    the next multiple of 512), stacked along a new first axis and converted to
    tensors. The last three fields are returned as a list, a list and a NumPy
    array respectively.
    """
    longest = max(sample[-1] for sample in ins)
    max_len = (longest + 511) // 512 * 512

    # Everything except the trailing three metadata fields is an array to stack.
    num_array_fields = len(ins[0]) - 3
    stacked = tuple(
        t.from_numpy(np.stack([sample[field][:max_len] for sample in ins]))
        for field in range(num_array_fields)
    )

    index_maps = [sample[-3] for sample in ins]
    keys = [sample[-2] for sample in ins]
    lengths = np.array([sample[-1] for sample in ins])
    return stacked + (index_maps, keys, lengths)


# Human-readable class names, indexed by category id (0 is 'None').
label_names = ['None', 'Lead', 'Position', 'Evidence', 'Claim',
               'Concluding Statement', 'Counterclaim', 'Rebuttal']

# The loader captures the collate_fn object defined in this cell. A later cell
# rebinds the name collate_fn, but this loader keeps using the version above.
longformer_dataset = t.utils.data.DataLoader(LongformerDataset(), collate_fn=collate_fn,
                                  batch_size=4, num_workers=2)

In [4]:
class DebertaDataset(t.utils.data.Dataset):
    """Test essays tokenized for the DeBERTa model.

    Every ``*.txt`` file under the test directory is read once in ``__init__``
    (whitespace-stripped) and keyed by its file name without extension. The
    tokenizer is loaded from the same directory the Longformer dataset uses.

    Each item is a 6-tuple
    ``(tokens, mask, offsets, index_map, key, num_tokens)`` where the first
    three are fixed-size NumPy buffers (4096 slots, zero padded), ``index_map``
    maps each character of the essay to a whitespace-delimited word index,
    ``key`` is the essay id and ``num_tokens`` is the unpadded token count.

    Note: the tokenizer is called without truncation, so an essay producing
    more than 4096 tokens will fail when copied into the fixed-size buffers.
    """

    def __init__(self):
        tokenizer = RobertaTokenizerFast.from_pretrained('../input/feedbackv1/tokenizer')
        tokenizer.model_max_length = 4096
        self.tokenizer = tokenizer
        self.texts = {}
        for fname in glob('../input/feedback-prize-2021/test/*.txt'):
            with open(fname) as f:
                self.texts[fname.split('/')[-1].split('.')[0]] = f.read().strip()
        self.keys = list(self.texts.keys())
        # Raw string: '\s' inside a normal literal is an invalid escape
        # sequence and triggers a warning on recent Python versions.
        self.space_regex = re.compile(r'[\s\n]')

    def __len__(self):
        """Number of essays found in the test directory."""
        return len(self.keys)

    def _build_index_map(self, text):
        """Map every character (from the first non-blank one) to a word index.

        Whitespace characters keep the index of the preceding word; the index
        advances on the first non-whitespace character after a blank run.
        Empty or all-whitespace text yields an empty list instead of raising.
        """
        index_map = []
        current_word = 0
        blank = False
        # Same start position as text.index(text.strip()[0]), but well defined
        # when there is no non-whitespace character at all.
        first_char = len(text) - len(text.lstrip())
        for char_ix in range(first_char, len(text)):
            if self.space_regex.match(text[char_ix]) is not None:
                blank = True
            elif blank:
                current_word += 1
                blank = False
            index_map.append(current_word)
        return index_map

    def __getitem__(self, ix):
        """Tokenize essay ``ix`` and return the padded arrays plus metadata."""
        tokens_array = np.zeros(4096, 'i8')
        mask_array = np.zeros(4096, 'f4')
        offsets_array = np.zeros((4096, 2), 'i4')

        key = self.keys[ix]
        text = self.texts[key]
        tokenizer_outs = self.tokenizer(text, return_offsets_mapping=True)
        tokens = np.array(tokenizer_outs['input_ids'], 'i8')
        mask = np.array(tokenizer_outs['attention_mask'], 'f4')
        offsets = np.vstack(tokenizer_outs['offset_mapping']).astype('i4')

        num_tokens = len(tokens)
        tokens_array[:num_tokens] = tokens
        mask_array[:num_tokens] = mask
        offsets_array[:num_tokens] = offsets

        index_map = self._build_index_map(text)

        return tokens_array, mask_array, offsets_array, index_map, key, num_tokens
    
# Module-level flag consumed by collate_fn below: True until the first batch
# has been collated in the current process.
first_batch = True


def collate_fn(ins):
    """Batch DeBERTa samples, trimming padding to a multiple of 8 tokens.

    Each sample is ``(array, array, array, index_map, key, num_tokens)``.
    The very first batch collated in a process is always cut to 2048 tokens
    regardless of content; afterwards the cut is the batch's longest sequence
    rounded up to the next multiple of 8.
    NOTE(review): the fixed 2048 first batch is presumably a warm-up at the
    largest expected size -- confirm.

    Returns the stacked arrays as tensors followed by the list of index maps,
    the list of keys and a NumPy array of token counts.
    """
    global first_batch

    lengths = [sample[-1] for sample in ins]
    if first_batch:
        first_batch = False
        max_len = 2048
    else:
        max_len = (max(lengths) + 7) // 8 * 8

    # Everything except the trailing three metadata fields is an array to stack.
    num_array_fields = len(ins[0]) - 3
    stacked = tuple(
        t.from_numpy(np.stack([sample[field][:max_len] for sample in ins]))
        for field in range(num_array_fields)
    )
    index_maps = [sample[-3] for sample in ins]
    keys = [sample[-2] for sample in ins]
    return stacked + (index_maps, keys, np.array(lengths))


# Human-readable class names, indexed by category id (0 is 'None').
# Same list as the one defined in the Longformer dataset cell.
label_names = ['None', 'Lead', 'Position', 'Evidence', 'Claim',
               'Concluding Statement', 'Counterclaim', 'Rebuttal']

# Uses the collate_fn defined in this cell (the one driven by first_batch),
# one essay per batch.
deberta_dataset = t.utils.data.DataLoader(DebertaDataset(), collate_fn=collate_fn,
                                  batch_size=1, num_workers=2)

In [5]:
def map_span_to_word_indices(span, index_map, bounds):
    """Convert a token span into a (first_word, last_word) pair.

    Args:
        span: Pair of token indices; element 0 is the first token and
            element 1 the last token of the span.
        index_map: Sequence mapping a character position to a word index.
        bounds: 2-D array indexed as ``bounds[token, 0]`` / ``bounds[token, 1]``
            giving each token's character offsets.

    Returns:
        Tuple ``(index_map[start_char], index_map[end_char - 1])``.
    """
    first_token = span[0]
    last_token = span[1]
    start_char = bounds[first_token, 0]
    # Step back one position from the end offset (presumably exclusive --
    # confirm against the tokenizer) to land on the span's last character.
    last_char = bounds[last_token, 1] - 1
    return (index_map[start_char], index_map[last_char])

def calc_entity_score(span, ps, c):
    """Average per-token score of a span for category ``c``.

    The first token contributes ``ps[start, 2*c - 1]`` and every following
    token up to and including ``end`` contributes ``ps[token, 2*c]``; the sum
    is divided by the number of tokens in the span.

    Args:
        span: ``(start, end)`` token indices, both inclusive.
        ps: 2-D array indexed as ``ps[token, label]``.
        c: Category id.
    """
    start, end = span
    begin_part = ps[start, c * 2 - 1]
    inside_part = ps[start + 1: end + 1, c * 2].sum()
    num_tokens = end - start + 1
    return (begin_part + inside_part) / num_tokens

def extract_entities(ps, n, score_thresholds=None, min_lengths=None):
    """Decode per-token label scores into token spans per category.

    Label layout as used by the decoding below: label ``0`` means "no entity",
    an odd label ``2*c - 1`` starts a span of category ``c`` and the even label
    ``2*c`` continues it. Tokens ``1 .. n-2`` are scanned, so the first and
    last of the ``n`` tokens are skipped.

    A span is closed when a new span starts, when label 0 is seen, or when a
    continuation label of a different category is seen. Spans are then dropped
    if they are shorter than the category's minimum length or if
    ``calc_entity_score`` is not strictly above the category's threshold.

    Args:
        ps: 2-D array indexed as ``ps[token, label]``.
        n: Number of real (unpadded) tokens in ``ps``.
        score_thresholds: Optional sequence indexed by category id (index 0
            unused). Defaults to the original values.
            NOTE(review): the default thresholds are scaled by 5, presumably
            because the caller sums scores over several checkpoints -- confirm.
        min_lengths: Optional minimum span length in tokens for categories
            1, 2, ... in order. Defaults to ``(2, 2, 5, 2, 4, 3, 2)``.

    Returns:
        Dict mapping category id to a list of ``(start, end)`` token index
        pairs, both inclusive. A category that had spans before filtering
        stays in the dict, possibly with an empty list.
    """
    if score_thresholds is None:
        score_thresholds = [None] + [x * 5 for x in [-5, -3.4, -2.065, -2.5, -6, -5.75, -5.5]]
    if min_lengths is None:
        min_lengths = (2, 2, 5, 2, 4, 3, 2)

    cat_ps = ps.argmax(-1)
    all_entities = {}
    current_cat = None
    current_start = None

    def close_span(last_token):
        # Record the open span; reads current_cat/current_start at call time.
        all_entities.setdefault(current_cat, []).append((current_start, last_token))

    for ix in range(1, n - 1):
        label = cat_ps[ix]
        if label % 2 == 1:
            # Start label: finish any open span, then open a new one here.
            if current_cat is not None:
                close_span(ix - 1)
            current_cat = (label + 1) // 2
            current_start = ix
        elif label == 0:
            if current_cat is not None:
                close_span(ix - 1)
            current_cat = None
        elif current_cat is not None and label != current_cat * 2:
            # Continuation label of a different category ends the open span.
            close_span(ix - 1)
            current_cat = None
    if current_cat is not None:
        # A span still open after the scan ends at the last scanned token.
        close_span(ix)

    for cat_ix, min_len in enumerate(min_lengths, 1):
        if cat_ix in all_entities:
            all_entities[cat_ix] = [
                span for span in all_entities[cat_ix]
                if span[1] - span[0] + 1 >= min_len
                and calc_entity_score(span, ps, cat_ix) > score_thresholds[cat_ix]
            ]

    return all_entities

In [6]:
# Collect every fine-tuned checkpoint to ensemble. The inference loop decides
# which model a checkpoint belongs to from the last character of its path.
checkpoints = glob('../input/feedbackv1/weights/*attn2') + glob('../input/feedbackv1/weights/mnli*')
checkpoints.extend(glob('../input/feedbackv2/debertav1/*'))
# Switch both models to eval mode and move them to the GPU; the trailing ';'
# suppresses the module repr in the notebook output.
longformer.eval().cuda();
deberta.eval().cuda();

In [7]:
# Ensemble inference: run every checkpoint over the test essays and accumulate
# the per-token log-softmax outputs (summed, not averaged) into all_outs.
num_test_files = len(glob('../input/feedback-prize-2021/test/*.txt'))
# Width of the per-essay accumulation buffers, in tokens.
max_tokens = 2048

all_outs = np.zeros((num_test_files, max_tokens, 15), 'f4')
all_bounds = np.zeros((num_test_files, max_tokens, 2), 'i4')
all_token_nums = np.zeros((num_test_files,), 'i4')
all_word_indices = []
all_sample_ids = []
for checkpoint_ix, checkpoint in enumerate(checkpoints):
    # Paths ending in '2' are loaded into the Longformer, everything else
    # into DeBERTa.
    if checkpoint[-1] == '2':
        longformer.load_state_dict(t.load(checkpoint))
        model = longformer
        dataset = longformer_dataset
        is_longformer = True
    else:
        deberta.load_state_dict(t.load(checkpoint))
        model = deberta
        dataset = deberta_dataset
        is_longformer = False
    ix = 0
    for batch in dataset:
        tokens, mask, bounds, word_indices, sample_ids, num_tokens = batch
        batch_size, batch_len = tokens.shape[:2]
        if is_longformer:
            # The Longformer receives the raw mask, including the value-2 entries.
            outs = t.log_softmax(model(tokens.cuda(), mask.cuda()), -1)
        else:
            # DeBERTa gets a plain 0/1 mask.
            outs = t.log_softmax(model(tokens.cuda(), (mask.cuda() > 0).float()), -1)
        # A batch may be padded wider than the buffers; clip so the
        # accumulation cannot fail with a shape mismatch.
        keep = min(batch_len, max_tokens)
        all_outs[ix: ix + batch_size, :keep] += outs[:, :keep].cpu().numpy()
        if checkpoint_ix == 0:
            # Per-essay metadata is recorded once, from the first checkpoint's loader.
            all_bounds[ix: ix + batch_size, :keep] = bounds[:, :keep].numpy()
            # Keep token counts consistent with the clipped buffers.
            all_token_nums[ix: ix + batch_size] = np.minimum(num_tokens, max_tokens)
            all_word_indices.extend(word_indices)
            all_sample_ids.extend(sample_ids)
        ix += batch_size

In [8]:
# Turn the accumulated scores into submission rows: one row per predicted
# entity, with the covered word indices joined by spaces.
sub_sample_ids = []
sub_cat_names = []
sub_spans = []
for sample_ix in range(len(all_token_nums)):
    token_entities = extract_entities(all_outs[sample_ix], all_token_nums[sample_ix])
    for cat_ix, token_spans in token_entities.items():
        for token_span in token_spans:
            # Translate the token span into first/last word indices.
            first_word, last_word = map_span_to_word_indices(
                token_span, all_word_indices[sample_ix], all_bounds[sample_ix])
            sub_sample_ids.append(all_sample_ids[sample_ix])
            sub_cat_names.append(label_names[cat_ix])
            sub_spans.append(' '.join(map(str, range(first_word, last_word + 1))))

In [9]:
# Write one row per predicted entity to submission.csv (no index column);
# the trailing ';' suppresses the cell's output.
pd.DataFrame({'id': sub_sample_ids, 
              'class': sub_cat_names,
              'predictionstring': sub_spans}).to_csv('submission.csv', index=False);