In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '3'
import sys
sys.path.insert(0, '/home/qwe/projects/feedback/models_training/longformer/sumbission/codes')
import torch as t
t.autograd.set_grad_enabled(False)
from longformer.longformer import Longformer, LongformerConfig
from tqdm import tqdm
from glob import glob
import pandas as pd
import numpy as np
import re
from transformers import RobertaTokenizerFast
import dill as pickle


In [2]:
class TvmLongformer(t.nn.Module):
    """Longformer encoder followed by a per-token classification head.

    The encoder is loaded from a pretrained checkpoint directory with its
    attention mode forced to ``'sliding_chunks'`` and its pooler removed.
    The head is ``LayerNorm(1024) -> Linear(1024, 15)``, so every token
    position gets 15 logits.

    Args:
        pretrained_dir: Directory holding the pretrained Longformer config
            and weights. ``None`` (the default) falls back to
            ``DEFAULT_PRETRAINED_DIR``, which is the path this notebook
            originally hard-coded.
    """

    # Single source of truth for the checkpoint location (it used to be
    # repeated verbatim for the config and for the weights).
    DEFAULT_PRETRAINED_DIR = (
        '/home/qwe/projects/feedback/models_training/longformer/'
        'sumbission_large/pretrained_checkpoints/longformer-large-4096/'
    )

    def __init__(self, pretrained_dir=None):
        super().__init__()
        if pretrained_dir is None:
            pretrained_dir = self.DEFAULT_PRETRAINED_DIR
        config = LongformerConfig.from_pretrained(pretrained_dir)
        config.attention_mode = 'sliding_chunks'
        self.feats = Longformer.from_pretrained(pretrained_dir, config=config)
        # The pooled output is never used by forward(); drop the module.
        self.feats.pooler = None
        self.class_projector = t.nn.Sequential(
            t.nn.LayerNorm(1024),
            t.nn.Linear(1024, 15)
        )

    def forward(self, tokens, mask):
        """Return per-token logits for a batch.

        Args:
            tokens: Token ids passed straight to the encoder.
            mask: Attention mask passed straight to the encoder.
                NOTE(review): callers in this notebook use the values
                0/1/2; the meaning of each value is defined by the
                Longformer implementation, not here - confirm.

        Returns:
            The class projector applied to the first element of the
            encoder output; the last dimension has size 15.
        """
        return self.class_projector(self.feats(tokens, mask, return_dict=False)[0])
    
# Build the network once. Construction loads the pretrained encoder weights;
# the classification head starts freshly initialised.
model = TvmLongformer()

Some weights of the model checkpoint at /home/qwe/projects/feedback/models_training/longformer/sumbission_large/pretrained_checkpoints/longformer-large-4096/ were not used when initializing Longformer: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing Longformer from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Longformer from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
class Dataset(t.utils.data.Dataset):
    """Tokenised essays, padded to a fixed length, for inference.

    Each essay ``<file_id>.txt`` is read from ``../../train/`` and stripped of
    leading/trailing whitespace at construction time. Items are served in
    sorted ``file_id`` order.

    Args:
        files: Iterable of file ids (file names without the ``.txt``
            suffix). Duplicates are ignored.
    """

    # Length every per-sample array is padded to.
    MAX_LEN = 4096
    # Token ids whose mask entry is raised to 2, in addition to the first and
    # last token. NOTE(review): presumably the ids of '.', '?' and '!' in the
    # tokenizer vocabulary - confirm against the tokenizer files.
    GLOBAL_ATTENTION_TOKEN_IDS = (4, 116, 328)

    def __init__(self, files):
        tokenizer = RobertaTokenizerFast.from_pretrained('/home/qwe/projects/feedback/models_training/longformer/sumbission_large/tokenizer')
        tokenizer.model_max_length = self.MAX_LEN
        self.tokenizer = tokenizer
        self.texts = {}
        for file_id in set(files):
            with open(f'../../train/{file_id}.txt') as f:
                self.texts[file_id] = f.read().strip()
        self.keys = sorted(self.texts.keys())
        # Raw string: '\s' in a normal string literal is an invalid escape
        # sequence and triggers a warning on recent Python versions.
        self.space_regex = re.compile(r'[\s\n]')

    def __len__(self):
        return len(self.keys)

    def __getitem__(self, ix):
        """Return the padded arrays and metadata for the ``ix``-th essay.

        Returns:
            A 6-tuple ``(tokens, mask, offsets, index_map, key, n_tokens)``:

            * ``tokens``  - int64 array of length ``MAX_LEN``, zero padded.
            * ``mask``    - float32 array of length ``MAX_LEN``; 0 on padding,
              the tokenizer's attention mask elsewhere, and 2 on the first
              token, the last token and ``GLOBAL_ATTENTION_TOKEN_IDS``.
            * ``offsets`` - int32 array ``(MAX_LEN, 2)`` with the tokenizer's
              offset mapping, zero padded.
            * ``index_map`` - list giving, for every character from the first
              non-whitespace character onwards, the index of the
              whitespace-separated word it belongs to (whitespace characters
              carry the index of the preceding word). Empty for an empty or
              whitespace-only text.
            * ``key``     - the file id.
            * ``n_tokens`` - number of tokens before padding.

        Raises:
            ValueError: If the essay tokenises to more than ``MAX_LEN``
                tokens.
        """
        key = self.keys[ix]
        text = self.texts[key]

        tokenizer_outs = self.tokenizer(text, return_offsets_mapping=True)
        tokens = np.array(tokenizer_outs['input_ids'], 'i8')
        num_tokens = len(tokens)
        if num_tokens > self.MAX_LEN:
            # Previously surfaced as an opaque numpy broadcasting ValueError.
            raise ValueError(
                f'{key}: {num_tokens} tokens do not fit into {self.MAX_LEN}')

        mask = np.array(tokenizer_outs['attention_mask'], 'f4')
        mask[0] = 2
        mask[-1] = 2
        mask[np.isin(tokens, self.GLOBAL_ATTENTION_TOKEN_IDS)] = 2

        offsets = np.vstack(tokenizer_outs['offset_mapping']).astype('i4')

        tokens_array = np.zeros(self.MAX_LEN, 'i8')
        mask_array = np.zeros(self.MAX_LEN, 'f4')
        offsets_array = np.zeros((self.MAX_LEN, 2), 'i4')
        tokens_array[:num_tokens] = tokens
        mask_array[:num_tokens] = mask
        offsets_array[:num_tokens] = offsets

        # Position of the first non-whitespace character. Equivalent to
        # text.index(text.strip()[0]) but does not crash on empty text.
        start = len(text) - len(text.lstrip())
        index_map = []
        current_word = 0
        blank = False
        for char_ix in range(start, len(text)):
            if self.space_regex.match(text[char_ix]) is not None:
                blank = True
            elif blank:
                # First character after a whitespace run starts a new word.
                current_word += 1
                blank = False
            index_map.append(current_word)

        return tokens_array, mask_array, offsets_array, index_map, key, num_tokens
    
first_batch = True
def collate_fn(ins):
    """Stack per-sample tuples into one batch, trimming the padding.

    Every sample is a tuple whose last three entries are, in order, the
    word-index list, the sample key and the unpadded token count; all
    earlier entries are numpy arrays padded along their first axis.

    The very first call (per process) cuts the arrays to 2048 positions.
    Later calls cut to the longest token count in the batch rounded up to
    a multiple of 512.

    Returns:
        One torch tensor per array field (stacked along a new leading batch
        axis), followed by the list of word-index lists, the list of keys
        and a numpy array of token counts.
    """
    global first_batch
    if not first_batch:
        longest = max(sample[-1] for sample in ins)
        max_len = (longest + 511) // 512 * 512
    else:
        first_batch = False
        max_len = 2048

    num_array_fields = len(ins[0]) - 3
    stacked = tuple(
        t.from_numpy(np.stack([sample[field][:max_len] for sample in ins]))
        for field in range(num_array_fields)
    )
    word_indices = [sample[-3] for sample in ins]
    keys = [sample[-2] for sample in ins]
    lengths = np.array([sample[-1] for sample in ins])
    return stacked + (word_indices, keys, lengths)


# Discourse label names. Not referenced by any other cell visible here.
# NOTE(review): the model head emits 15 logits per token while this list has
# 8 entries - presumably 'None' plus two variants of each other label;
# confirm against the training code.
label_names = ['None', 'Lead', 'Position', 'Evidence', 'Claim',
               'Concluding Statement', 'Counterclaim', 'Rebuttal']


In [4]:
# Paths of the fine-tuned weight files, one per fold (glob order is
# arbitrary; the inference cell sorts them before use).
checkpoints = glob('../longformer/sumbission_large/weights/fold*attn2')
# Switch to eval mode and move the model to the GPU. The trailing ';' keeps
# the notebook from printing the module repr.
model.eval().cuda();

In [5]:
# NOTE(review): dill/pickle can execute arbitrary code while loading - only
# use these files if they come from a trusted source.
# Re-key the stored map by bare file id: last path component, text before the
# first '.'.
with open('../../id_to_ix_map.pickle', 'rb') as f:
    id_to_ix_map = {x.split('/')[-1].split('.')[0]: y for x, y in pickle.load(f).items()}
# Fold definitions; the inference cell reads data_splits[0][250]['normed'][fold].
with open('../../data_splits.pickle', 'rb') as f:
    data_splits = pickle.load(f)

In [None]:
# Out-of-fold inference: for every fold, load that fold's weights and predict
# on the fold's validation essays, appending results to shared buffers.

# Number of token positions kept per essay in the output buffers.
MAX_TOKENS = 2048
# Count the essays once so the three buffers are guaranteed to agree in size.
num_train_files = len(glob('../../train/*.txt'))

all_outs = np.zeros((num_train_files, MAX_TOKENS, 15), 'f4')   # per-token log-probabilities
all_bounds = np.zeros((num_train_files, MAX_TOKENS, 2), 'i4')  # per-token character offsets
all_token_nums = np.zeros((num_train_files,), 'i4')            # unpadded token counts
all_word_indices = []  # per-essay character -> word index lists (ragged)
all_sample_ids = []    # per-essay file ids, in the order rows were written
ix = 0  # next free row in the buffers

# Checkpoints are ordered by the last character of the first '_'-separated
# part of the file name, read as an integer.
# NOTE(review): fold_ix is the position after sorting, not the parsed number;
# if a fold's checkpoint is missing, weights and validation files will be
# mismatched - confirm every fold is present.
for fold_ix, checkpoint in enumerate(sorted(checkpoints, key=lambda x: int(x.split('/')[-1].split('_')[0][-1]))):
    val_files = data_splits[0][250]['normed'][fold_ix]
    dataset = t.utils.data.DataLoader(Dataset(val_files), collate_fn=collate_fn,
                                      batch_size=1, num_workers=2)
    # Load to CPU first: without map_location, tensors are restored onto the
    # CUDA device index they were saved from, which may not exist here because
    # CUDA_VISIBLE_DEVICES exposes a single GPU. load_state_dict then copies
    # the values onto the model's own device.
    model.load_state_dict(t.load(checkpoint, map_location='cpu'))
    for batch in tqdm(dataset):
        tokens, mask, bounds, word_indices, sample_ids, num_tokens = batch
        batch_size, batch_len = tokens.shape[:2]
        if batch_len > MAX_TOKENS:
            # Same exception type numpy would raise on the assignment below,
            # but with an actionable message.
            raise ValueError(
                f'batch of length {batch_len} does not fit the '
                f'{MAX_TOKENS}-token output buffers (samples: {sample_ids})')
        outs = t.log_softmax(model(tokens.cuda(), mask.cuda()), -1)
        all_outs[ix: ix + batch_size, :batch_len] = outs.cpu().numpy()
        all_bounds[ix: ix + batch_size, :batch_len] = bounds
        all_token_nums[ix: ix + batch_size] = num_tokens
        all_word_indices.extend(word_indices)
        all_sample_ids.extend(sample_ids)
        ix += batch_size

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 3140/3140 [03:59<00:00, 13.12it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 3121/3121 [03:59<00:00, 13.01it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 3139/3139 [03:57<00:00, 13.21it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 3086/3096 [03:51<00:00, 13.60it/s]

In [None]:
# Persist every out-of-fold result as ../oof_ps/longformer/<name>.npy.
# Create the target directory up front so the first save cannot fail on a
# missing path.
os.makedirs('../oof_ps/longformer', exist_ok=True)
for (array,
    array_name) in zip((all_outs, all_bounds, all_token_nums, all_word_indices, all_sample_ids),
                        'all_outs, all_bounds, all_token_nums, all_word_indices, all_sample_ids'.split(', ')):
    if isinstance(array, list) and any(isinstance(item, list) for item in array):
        # A list of variable-length lists cannot be converted implicitly:
        # recent NumPy raises ValueError for ragged input instead of building
        # an object array. Pack it explicitly into a 1-D object array whose
        # elements are the original lists (read back with allow_pickle=True).
        packed = np.empty(len(array), dtype=object)
        for position, item in enumerate(array):
            packed[position] = item
        array = packed
    np.save(f'../oof_ps/longformer/{array_name}.npy', array)