### Inference is based on the following Kaggle kernels:
https://www.kaggle.com/akuritsyn/two-longformers-are-better-than-1
and
https://www.kaggle.com/hengck23/1-birdformer-1-longformer-one-fold/

In [None]:
import gc
gc.enable()

import sys
sys.path.append("../tez/")

import os

import numpy as np
import pandas as pd
from scipy.stats import rankdata

import tez
import torch
import torch.nn as nn
from joblib import Parallel, delayed
from transformers import AutoConfig, AutoModel, AutoTokenizer
from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast

# added for metrics computation
sys.path.append("../src/")
from utils import FeedbackDatasetValid, score_feedback_comp, \
    score_feedback_comp_micro, calc_overlap, prepare_training_data, _prepare_training_data_helper

In [None]:
# Make CUDA device 0 the current device for everything that follows.
torch.cuda.set_device(0)

# Number of ensemble members; used as the index range into the per-model
# lists of mod_args and as the divisor when averaging probabilities.
n_models = 5

# Width of the per-character / per-word probability arrays built during
# ensembling (target_id_map has 15 non-PAD labels, ids 0..14).
num_discourse_marker = 15

class mod_args:
    # Settings shared by the cells below. The list-valued attributes are
    # indexed in parallel by model index i in range(n_models).

    # Data directory: train_folds.csv is read from here, and
    # _prepare_test_data_helper reads "<input_path>/test/<id>.txt".
    input_path = "../data/"
    # Not referenced in this notebook's visible cells.
    # NOTE(review): presumably read by the helpers imported from utils - confirm.
    input = "../data/"
    # Per-model directory handed to the tokenizer's from_pretrained and to
    # AutoConfig.from_pretrained.
    model_def = [
        "../model/microsoft/deberta-large",
        "../model/microsoft/deberta-large",
        "../model/allenai/longformer-large-4096/",
        "../model/microsoft/deberta-v3-large",
        "../model/microsoft/deberta-v3-large",
    ]
    # Per-model directory holding the fine-tuned weights, loaded as
    # "model_<fold>.bin".
    model_path = [
        "../output/deberta-large1-1024",
        "../output/deberta-large2-1024",
        "../output/fb-longformer-large-1536/",
        "../output/deberta-v3-large1-1024",
        "../output/deberta-v3-large2-1024",
    ]
    # Per-model list of fold ids whose checkpoints are averaged.
    # folds[0][0] also selects the validation split used for scoring.
    folds = [[1], [1], [1], [1], [1]]
    # Not referenced in this notebook's visible cells.
    output = "."
    # Per-model batch size passed to model.predict.
    batch_size = [1, 1, 4, 4, 4]
    # Maximum sequence length passed to the dataset; also used as the token
    # cut-off when projecting token probabilities back onto characters.
    max_len = 4096

In [None]:
# Label names listed in id order: the position in this list is the class id.
# Every discourse type owns two consecutive ids (B- then I-); "O" comes last.
_LABELS = [
    "B-Lead",
    "I-Lead",
    "B-Position",
    "I-Position",
    "B-Evidence",
    "I-Evidence",
    "B-Claim",
    "I-Claim",
    "B-Concluding Statement",
    "I-Concluding Statement",
    "B-Counterclaim",
    "I-Counterclaim",
    "B-Rebuttal",
    "I-Rebuttal",
    "O",
]

# label name -> class id; "PAD" is the only entry outside the 0..14 range.
target_id_map = {label: index for index, label in enumerate(_LABELS)}
target_id_map["PAD"] = -100

# class id -> label name (inverse of target_id_map).
id_target_map = {index: label for label, index in target_id_map.items()}

In [None]:
class FeedbackDataset:
    """Index-addressable view over pre-tokenized samples.

    Each item is the sample's ``input_ids`` wrapped in the tokenizer's CLS
    and SEP ids, truncated so the result never exceeds ``max_len`` ids, plus
    an all-ones attention mask of the same length.
    """

    def __init__(self, samples, max_len, tokenizer):
        self.samples = samples
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.length = len(samples)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        # CLS first, then the raw token ids of the sample.
        token_ids = [self.tokenizer.cls_token_id, *self.samples[idx]["input_ids"]]

        # Leave exactly one slot free for the closing SEP id.
        token_ids = token_ids[: self.max_len - 1]
        token_ids.append(self.tokenizer.sep_token_id)

        return {
            "ids": token_ids,
            "mask": [1] * len(token_ids),
        }

In [None]:
class Collate:
    """Batch collator that pads ``ids`` / ``mask`` to the longest sample.

    Padding goes on the side given by ``tokenizer.padding_side``; ids are
    padded with ``tokenizer.pad_token_id`` and masks with ``0``. Both outputs
    are returned as ``torch.long`` tensors.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
        id_rows = [sample["ids"] for sample in batch]
        mask_rows = [sample["mask"] for sample in batch]

        # Longest sequence in this batch decides the padded width.
        batch_max = max(len(row) for row in id_rows)
        pad_on_right = self.tokenizer.padding_side == "right"

        def _pad(row, fill):
            filler = (batch_max - len(row)) * [fill]
            return row + filler if pad_on_right else filler + row

        output = dict()
        output["ids"] = torch.tensor(
            [_pad(row, self.tokenizer.pad_token_id) for row in id_rows],
            dtype=torch.long,
        )
        output["mask"] = torch.tensor(
            [_pad(row, 0) for row in mask_rows],
            dtype=torch.long,
        )
        return output

In [None]:
def _prepare_test_data_helper(args, tokenizer, ids):
    """Read and tokenize the test documents named by ``ids``.

    For each id the file ``<args.input_path>/test/<id>.txt`` is read and
    encoded without special tokens, with offset mapping requested.

    Returns a list of dicts with keys ``id``, ``input_ids``, ``text`` and
    ``offset_mapping``, in the same order as ``ids``.
    """

    def _load(doc_id):
        path = os.path.join(args.input_path, "test", doc_id + ".txt")
        with open(path, "r") as handle:
            text = handle.read()

        encoding = tokenizer.encode_plus(
            text,
            add_special_tokens=False,
            return_offsets_mapping=True,
        )
        return {
            "id": doc_id,
            "input_ids": encoding["input_ids"],
            "text": text,
            "offset_mapping": encoding["offset_mapping"],
        }

    return [_load(doc_id) for doc_id in ids]


def prepare_test_data(df, tokenizer, args, num_jobs=4):
    """Tokenize every distinct test document listed in ``df`` in parallel.

    Args:
        df: frame with an ``id`` column; each distinct id is processed once.
        tokenizer: tokenizer forwarded to ``_prepare_test_data_helper``.
        args: settings object forwarded to ``_prepare_test_data_helper``.
        num_jobs: number of worker processes and of id chunks; must be a
            positive integer. Defaults to 4, the previously hard-coded value.

    Returns:
        A flat list with the sample dicts produced by the helper for every
        chunk, concatenated in chunk order.
    """
    ids = df["id"].unique()
    # One chunk of ids per worker; chunks may be empty when there are fewer
    # ids than workers, in which case the helper returns an empty list.
    ids_splits = np.array_split(ids, num_jobs)

    results = Parallel(n_jobs=num_jobs, backend="multiprocessing")(
        delayed(_prepare_test_data_helper)(args, tokenizer, idx) for idx in ids_splits
    )

    test_samples = []
    for result in results:
        test_samples.extend(result)

    return test_samples

In [None]:
class FeedbackModel(tez.Model):
    """Transformer backbone followed by a per-token linear classifier.

    The backbone is built from the configuration only
    (``AutoModel.from_config``), so no pretrained weights are loaded here.
    ``forward`` returns softmax probabilities, not raw logits.
    """

    def __init__(self, model_name, num_labels):
        super().__init__()
        self.model_name = model_name
        self.num_labels = num_labels

        # Start from the stored configuration and override a few fields.
        config = AutoConfig.from_pretrained(model_name)
        overrides = {
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.15,
            "layer_norm_eps": 1e-7,
            "add_pooling_layer": False,
        }
        config.update(overrides)

        self.transformer = AutoModel.from_config(config)
        self.output = nn.Linear(config.hidden_size, self.num_labels)

    def forward(self, ids, mask):
        """Return ``(probabilities, 0, {})`` for a batch of token ids.

        The probabilities are a softmax over the last dimension of the
        classifier output. The constant ``0`` and the empty dict fill the
        second and third return slots.
        NOTE(review): presumably the loss / metrics slots expected by
        tez.Model - confirm.
        """
        hidden = self.transformer(ids, mask).last_hidden_state
        probabilities = torch.softmax(self.output(hidden), dim=-1)
        return probabilities, 0, {}

In [None]:
# Using ensembling approach from https://www.kaggle.com/hengck23/1-birdformer-1-longformer-one-fold/

# The validation fold is taken from the first model's first fold id; the same
# valid_df is then used for every model in the ensemble.
FOLD =  mod_args.folds[0][0]
print(FOLD)

df = pd.read_csv(os.path.join(mod_args.input_path, "train_folds.csv"))
valid_df = df[df["kfold"] == FOLD].reset_index(drop=True)

# One entry per model: that model's list of samples, each sample carrying a
# "probability" entry once inference has finished.
result_nn = []

for i in range(n_models):
    
    # deberta-v3 checkpoints get the fast DeBERTa-v2 tokenizer explicitly;
    # every other model goes through AutoTokenizer.
    if mod_args.model_def[i] == "../model/microsoft/deberta-v3-large":
        tokenizer = DebertaV2TokenizerFast.from_pretrained(mod_args.model_def[i])
    else:
        tokenizer = AutoTokenizer.from_pretrained(mod_args.model_def[i])
        
    collate = Collate(tokenizer)

    # Tokenize the validation documents with this model's tokenizer and order
    # them by ascending "text_len".
    # NOTE(review): the ensembling cell indexes result_nn[j][i] across models,
    # so it presumably relies on this sort giving the same document order for
    # every model - confirm.
    test_samples = prepare_training_data(valid_df, tokenizer, mod_args, num_jobs=4)
    test_samples.sort(key=lambda d: d["text_len"], reverse=False)
    test_dataset = FeedbackDatasetValid(test_samples, mod_args.max_len, tokenizer)

    # print(mod_args.model_def[i])
    # num_labels excludes the "PAD" entry of target_id_map.
    model = FeedbackModel(model_name=mod_args.model_def[i], num_labels=len(target_id_map) - 1)
    
    n_folds = len(mod_args.folds[i])
    # Fold-averaged predictions, one element per value yielded by predict().
    model_prob = []
    first_fold = mod_args.folds[i][0]
    
    for _fold in mod_args.folds[i]:
        # print(mod_args.model_path[i])
        model.load(os.path.join(mod_args.model_path[i], f"model_{_fold}.bin"), weights_only=True)
        model.eval()

        preds_iter = model.predict(test_dataset, batch_size=mod_args.batch_size[i], n_jobs=-1, collate_fn=collate,)

        # Position of the current yielded array within model_prob; only
        # advanced for folds after the first.
        current_idx = 0
        for preds in preds_iter:
            # Half precision, pre-divided by the fold count so that summing
            # over folds gives the mean.
            preds = preds.astype(np.float16)
            preds = preds / n_folds
            
            if _fold == first_fold:
                # The first fold creates the accumulator entries...
                model_prob.append(preds)
            else:
                # ...later folds add onto the matching entry.
                model_prob[current_idx] += preds
                current_idx += 1
            
        torch.cuda.empty_cache()
        gc.collect()
            
    # Flatten the per-yield arrays into one entry per sample and attach each
    # to its sample dict.
    model_prob1 = [item for sublist in model_prob for item in sublist]
    for j in range(len(test_samples)):
        test_samples[j]["probability"] = model_prob1[j]

    result_nn.append(test_samples)

In [None]:
# Minimum number of words a prediction of each class must span to be kept.
length_threshold = {
    'Lead'                : 6,
    'Position'            : 4,
    'Claim'               : 3,
    'Counterclaim'        : 7,
    'Rebuttal'            : 4,
    'Evidence'            : 14,
    'Concluding Statement': 7,
}

# Minimum mean per-word score a prediction of each class must reach to be kept.
probability_threshold = {
    'Lead'                : 0.55,
    'Position'            : 0.55,
    'Claim'               : 0.50,
    'Counterclaim'        : 0.50,
    'Rebuttal'            : 0.55,
    'Evidence'            : 0.60,
    'Concluding Statement': 0.60,
}

def do_threshold(submit_df, use=('length', 'probability')):
    """Drop predictions that are too short or too low-scoring for their class.

    Args:
        submit_df: frame with columns ``id``, ``class``, ``predictionstring``
            and, when the probability filter is used, ``score`` holding the
            string form of a list of per-word scores. Not modified.
        use: any collection containing ``'probability'`` and/or ``'length'``
            to select which filters run.

    Returns:
        A frame with columns ``id``, ``class`` and ``predictionstring``
        containing the surviving rows under their original index labels.
        Rows whose class has no entry in the threshold tables are kept.

    Raises:
        ValueError / SyntaxError: if a ``score`` entry is not a Python literal.
    """
    # Local import so the function does not depend on the notebook's import cell.
    import ast

    df = submit_df.copy()
    df = df.fillna('')

    if 'probability' in use:
        # literal_eval parses the stringified score list without executing
        # arbitrary expressions the way eval() would.
        mean_score = df['score'].apply(
            lambda s: np.mean(ast.literal_eval(s))
        ).astype(float)
        # Classes missing from the table map to NaN, which never compares
        # as "below threshold", so those rows are kept.
        floor = df['class'].map(probability_threshold).astype(float)
        df = df[~(mean_score < floor)]

    if 'length' in use:
        num_words = df['predictionstring'].apply(
            lambda s: len(s.split())
        ).astype(float)
        floor = df['class'].map(length_threshold).astype(float)
        df = df[~(num_words < floor)]

    df = df[['id', 'class', 'predictionstring']]
    return df

In [None]:
def text_to_word(text):
    """Split ``text`` on whitespace and locate every word in the original.

    Returns:
        ``(word, word_offset)`` where ``word`` is ``text.split()`` and
        ``word_offset[k]`` is the ``(start, end)`` character span of
        ``word[k]`` in ``text`` (``end`` exclusive).

    Raises:
        NotImplementedError: if a word cannot be found after the previous one.
    """
    word = text.split()
    word_offset = []

    cursor = 0
    for w in word:
        # Search from the end of the previous word; passing the start index
        # to find() avoids copying the remaining text for every word.
        begin = text.find(w, cursor)
        if begin == -1:
            raise NotImplementedError

        cursor = begin + len(w)
        word_offset.append((begin, cursor))

    return word, word_offset

def word_probability_to_predict_df(text_to_word_probability, id):
    """Turn per-word class probabilities into discourse-span predictions.

    Each word is labelled with its arg-max class. A span starts at any word
    whose label is neither ``O`` nor ``PAD`` and extends over the following
    words labelled with the matching ``I-`` class (for a ``B-`` start that is
    the next class id; for an ``I-`` start, the same id).

    Every word, including the last one, can start or extend a span, and
    inputs with zero or one word are handled without indexing past the end.

    Args:
        text_to_word_probability: 2-D array, one row of class probabilities
            per word.
        id: document id copied into every output row (the name shadows the
            builtin but is kept for caller compatibility).

    Returns:
        DataFrame with columns ``id``, ``class``, ``predictionstring``
        (space-separated word indices) and ``score`` (string form of the list
        of per-word maximum probabilities of the span).

    Raises:
        NotImplementedError: if a span-starting label is neither ``B-`` nor
            ``I-`` prefixed.
    """
    columns = ['id', 'class', 'predictionstring', 'score']
    len_word = len(text_to_word_probability)
    if len_word == 0:
        return pd.DataFrame([], columns=columns)

    word_predict = text_to_word_probability.argmax(-1)
    word_score = text_to_word_probability.max(-1)
    outside = (target_id_map['O'], target_id_map['PAD'])
    rows = []

    t = 0
    while t < len_word:
        b_marker_label = word_predict[t]
        if b_marker_label in outside:
            t += 1
            continue

        label_name = id_target_map[b_marker_label]
        if label_name[0] == 'B':
            i_marker_label = b_marker_label + 1
        elif label_name[0] == 'I':
            i_marker_label = b_marker_label
        else:
            raise NotImplementedError

        # Consume the start word, then every following continuation word.
        start = t
        t += 1
        while t < len_word and word_predict[t] == i_marker_label:
            t += 1
        end = t  # exclusive; the word at ``end`` is re-examined as a new start

        prediction_string = ' '.join(str(i) for i in range(start, end))
        discourse_score = word_score[start:end].tolist()
        rows.append((id, label_name[2:], prediction_string, str(discourse_score)))

    return pd.DataFrame(rows, columns=columns)

In [None]:
# from https://www.kaggle.com/kaggleqrdl/tensorflow-longformer-ner-postprocessing.
def jn(pst, start, end):
    """Return the items of ``pst[start:end]`` joined by single spaces."""
    return " ".join([str(x) for x in pst[start:end]])


def link_evidence(oof, thresh=1, thresh2=26):
    """Merge neighbouring Evidence predictions of the same document.

    Evidence rows of a document are visited in frame order. A row is linked
    onto the segment being built unless

    * its first word index is more than ``thresh`` past the segment's last
      word index, or
    * its first word index is more than ``thresh2`` past the segment's first
      word index,

    in which case the segment is closed and a new one starts. Linked
    prediction strings contain word indices only, and Evidence rows with an
    empty prediction string are skipped. Any number of documents, including
    one or none, is accepted.

    Args:
        oof: frame with at least ``id``, ``class`` and ``predictionstring``.
        thresh: maximum gap between consecutive Evidence rows to link them.
        thresh2: maximum distance from the start of a linked segment to the
            start of a row being linked onto it.

    Returns:
        Outer merge of the linked Evidence rows with all non-Evidence rows.
    """
    evidence = oof[oof['class'] == "Evidence"]
    others = oof[oof['class'] != "Evidence"]

    retval = []
    for idv in oof['id'].unique():
        doc_rows = evidence[evidence['id'] == idv]

        current = []  # word indices of the segment being built
        for prediction in doc_rows['predictionstring']:
            span = [int(x) for x in prediction.split()]
            if not span:
                continue
            if current and (
                span[0] > current[-1] + thresh
                or span[0] - current[0] > thresh2
            ):
                retval.append((idv, "Evidence", jn(current, 0, len(current))))
                current = []
            current = current + span

        if current:
            retval.append((idv, "Evidence", jn(current, 0, len(current))))

    roof = pd.DataFrame(retval, columns=['id', 'class', 'predictionstring'])
    roof = roof.merge(others, how='outer')
    return roof

In [None]:
# Number of validation documents, taken from the first model's sample list.
num_valid = len(result_nn[0])

# Collects one prediction frame per document; concatenated after the loop.
submit_df = []
for i in range(num_valid):
    
    # Text and id come from the first model's samples.
    # NOTE(review): assumes result_nn[j][i] is the same document for every
    # model j - confirm the per-model ordering.
    text_id = result_nn[0][i]["id"]
    text = result_nn[0][i]["text"]
    word, word_offset = text_to_word(text)
        
    # --- ensemble
    # Per-character accumulator: one row per character of the text, one
    # column per class.
    token_to_text_probability = np.full((len(text), num_discourse_marker), 0, np.float32)
    for j in range(n_models):
        # Drop the first position so that p[t] lines up with offset_mapping[t].
        # NOTE(review): presumably the dataset prepends one special token -
        # confirm against FeedbackDatasetValid.
        p = result_nn[j][i]["probability"][1:] # [:-1]

        # Add each token's probability vector to every character it covers.
        for t, (start, end) in enumerate(result_nn[j][i]["offset_mapping"]):
            # Stop at the token index given by the maximum sequence length.
            if t == mod_args.max_len - 1:
                break
            token_to_text_probability[start:end] += p[t]
    # Mean over the ensemble members.
    token_to_text_probability = token_to_text_probability / n_models
    # -------------
    
    # A word's probability is the mean over the characters it spans.
    text_to_word_probability = np.full((len(word), num_discourse_marker), 0, np.float32)
    for t, (start, end) in enumerate(word_offset):
        text_to_word_probability[t] = token_to_text_probability[start:end].mean(0)

    predict_df = word_probability_to_predict_df(text_to_word_probability, text_id)
    submit_df.append(predict_df)
    
    # if i % 300 == 0: print(i, text_id, len(text), len(word))
    
submit_df = pd.concat(submit_df).reset_index(drop=True)

# Post-processing: drop short / low-score predictions, then link Evidence spans.
submit_df = do_threshold(submit_df, use=['length', 'probability'])
submit_df = link_evidence(submit_df)
# submit_df.to_csv("submission.csv", index=False)

submit_df.head()

In [None]:
# Score the post-processed predictions against the validation fold using the
# helper imported from utils.
# NOTE(review): return_class_scores=True presumably adds per-class scores to
# the result - confirm in utils.score_feedback_comp.
print(f"FOLD : {FOLD}")
scr = score_feedback_comp(submit_df, valid_df, return_class_scores=True)
print(scr)