In [1]:
import torch
def getConfig():
    """Build and return the notebook's run configuration as a plain dict.

    Returns:
        dict with the keys ``model_name``, ``token_name``, ``max_length``,
        ``train_batch_size``, ``valid_batch_size``, ``epochs``,
        ``learning_rates``, ``max_grad_norm`` and ``device``.
        ``device`` is ``'cuda'`` when ``torch.cuda.is_available()`` is true,
        otherwise ``'cpu'``.
    """
    # Resolve the device once, up front, so the literal below stays static.
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

    config = dict(
        model_name='../input/checkpoint20000/checkpoint-20000',
        token_name='../input/py-bigbird-v26',
        max_length=1024,
        train_batch_size=12,
        valid_batch_size=8,
        epochs=20,
        learning_rates=[1e-5, 2.5e-5, 2.5e-6, 2.5e-6, 2.5e-7],
        max_grad_norm=10,
        device=device,
    )
    return config

In [2]:
from torch.utils.data import Dataset, DataLoader
import torch
import numpy as np
from scipy import stats
import pandas as pd
from transformers import AutoTokenizer,AutoModelForTokenClassification

# Return an array that maps character index to index of word in list of split() words


def split_mapping(unsplit):
    """Map each character offset of ``unsplit`` to its ``str.split()`` word index.

    Args:
        unsplit: the raw text.

    Returns:
        A NumPy integer array of length ``len(unsplit)``. Entry ``c`` holds the
        position (in ``unsplit.split()``) of the word covering character ``c``,
        or ``-1`` when the character belongs to no word (whitespace).
    """
    char_to_word = np.full(len(unsplit), -1)
    cursor = 0
    for word_no, word in enumerate(unsplit.split()):
        # Words come out of split() in text order, so the next occurrence at or
        # after the cursor is always the right one.
        start = unsplit.find(word, cursor)
        stop = start + len(word)
        char_to_word[start:stop] = word_no
        cursor = stop
    return char_to_word


def loadFromCSV(path='./train_NER.csv'):
    """Load the NER training table and restore its ``entities`` column to lists.

    pandas writes list-valued cells to CSV as their string representation, so
    each ``entities`` cell is parsed back into a Python object after reading.

    Args:
        path: CSV file to read. Defaults to ``./train_NER.csv``.

    Returns:
        The DataFrame read from ``path`` with ``entities`` parsed.

    Raises:
        ValueError / SyntaxError: if an ``entities`` cell is not a plain Python
            literal.
    """
    # Local import keeps this block self-contained.
    import ast

    train_text_df = pd.read_csv(path)
    # SECURITY: this used to be eval(), which executes whatever expression the
    # CSV cell contains. literal_eval only accepts literals (lists, strings,
    # numbers, ...), which is all that a serialised list of tags needs.
    train_text_df['entities'] = train_text_df['entities'].apply(ast.literal_eval)
    return train_text_df


class dataset(Dataset):
    """Token-classification dataset that aligns tokenizer tokens with split() words.

    Each item tokenizes one row's ``text`` and, for every token, works out which
    whitespace-delimited word (as produced by ``str.split()``) it belongs to.

    Args:
        dataframe: table with a ``text`` column, plus an ``entities`` column
            (one tag per split() word) when ``get_wids`` is false. Rows are
            looked up by label via ``dataframe.text[index]``.
        tokenizer: callable accepting ``return_offsets_mapping``, ``padding``,
            ``truncation`` and ``max_length`` whose result offers ``word_ids()``
            and an ``offset_mapping`` entry.
        max_len: value passed to the tokenizer as ``max_length``.
        get_wids: when true, no labels are built and a ``wids`` tensor
            (token -> split() word index, ``-1`` for none) is added to the item.
        standard: when true, ``offset_mapping`` is removed from the item.
    """

    def __init__(self, dataframe, tokenizer, max_len, get_wids,standard=False):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.get_wids = get_wids  # for validation
        output_labels = ['O', 'B-Lead', 'I-Lead', 'B-Position', 'I-Position', 'B-Claim', 'I-Claim', 'B-Counterclaim', 'I-Counterclaim',
                         'B-Rebuttal', 'I-Rebuttal', 'B-Evidence', 'I-Evidence', 'B-Concluding Statement', 'I-Concluding Statement']

        self.labels_to_ids = {v: k for k, v in enumerate(output_labels)}
        self.ids_to_labels = {k: v for k, v in enumerate(output_labels)}
        print(self.ids_to_labels)
        self.standard=standard

    @staticmethod
    def _dominant_word_index(split_idxs):
        """Return the split() word index covering most characters, or -1 if none.

        ``split_idxs`` is the slice of the character->word map spanned by one
        token. Characters mapped to -1 (whitespace) are ignored; ties go to the
        smallest word index. An empty slice (zero-width token) yields -1 rather
        than raising.
        """
        covered = split_idxs[split_idxs != -1]
        if covered.size == 0:
            return -1
        # np.unique returns sorted values, and argmax returns the first maximum,
        # so the smallest most-frequent index wins. This replaces
        # scipy.stats.mode(...).mode[0], which fails on SciPy versions where
        # `mode` returns a scalar instead of a length-1 array.
        values, counts = np.unique(covered, return_counts=True)
        return values[np.argmax(counts)]

    def __getitem__(self, index):
        """Tokenize row ``index`` and return a dict of tensors.

        The dict holds every tokenizer output as a tensor, plus ``labels``
        (per-token label ids, ``-100`` where no label applies; an empty tensor
        when ``get_wids`` is true) and, when ``get_wids`` is true, ``wids``.
        """
        # GET TEXT AND WORD LABELS
        text = self.data.text[index]
        word_labels = self.data.entities[index] if not self.get_wids else None

        # TOKENIZE TEXT
        encoding = self.tokenizer(text,
                                  return_offsets_mapping=True,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_len)

        word_ids = encoding.word_ids()
        split_word_ids = np.full(len(word_ids), -1)
        offset_to_wordidx = split_mapping(text)
        offsets = encoding['offset_mapping']

        # CREATE TARGETS AND MAPPING OF TOKENS TO SPLIT() WORDS
        label_ids = []
        # Iterate in reverse to label whitespace tokens until a Begin token is encountered
        for token_idx, word_idx in reversed(list(enumerate(word_ids))):

            if word_idx is None:
                # Tokens without a tokenizer word id get the ignore label.
                if not self.get_wids:
                    label_ids.append(-100)
                continue

            if offsets[token_idx] == (0, 0):
                # A (0, 0) span carries no text to align against.
                if not self.get_wids:
                    label_ids.append(-100)
                continue

            # Choose the split word that shares the most characters with the token if any
            span_start, span_end = offsets[token_idx][0], offsets[token_idx][1]
            split_index = self._dominant_word_index(
                offset_to_wordidx[span_start:span_end])

            if split_index != -1:
                if not self.get_wids:
                    label_ids.append(
                        self.labels_to_ids[word_labels[split_index]])
                split_word_ids[token_idx] = split_index
            elif label_ids and label_ids[-1] != -100 and self.ids_to_labels[label_ids[-1]][0] == 'I':
                # Even if we don't find a word, continue labeling 'I' tokens until a 'B' token is found.
                # NOTE: label_ids is only filled when get_wids is false, so this
                # branch is never taken in get_wids mode.
                split_word_ids[token_idx] = split_word_ids[token_idx + 1]
                if not self.get_wids:
                    label_ids.append(label_ids[-1])
            else:
                if not self.get_wids:
                    label_ids.append(-100)

        # Labels were collected back-to-front; restore token order.
        encoding['labels'] = list(reversed(label_ids))

        # CONVERT TO TORCH TENSORS
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        if self.get_wids:
            item['wids'] = torch.as_tensor(split_word_ids)

        if self.standard:
            del item["offset_mapping"]
        return item

    def __len__(self):
        """Number of rows in the wrapped dataframe (captured at construction)."""
        return self.len


def getSets(standard=False):
    """Build the train/validation datasets (and, optionally, their loaders).

    Essay ids are read from ``corrected.csv`` and split 90% / 10% with a fixed
    NumPy seed (42); the global seed is re-randomised afterwards. The text and
    entity rows come from ``loadFromCSV()``.

    Args:
        standard: forwarded to ``dataset``; also selects the return shape.

    Returns:
        ``(training_set, testing_set)`` when ``standard`` is true; otherwise
        ``(train_dataset, training_loader, test_dataset, testing_loader)``
        where the ``*_dataset`` values are DataFrames.
    """
    essays = pd.read_csv('corrected.csv')
    # CHOOSE VALIDATION INDEXES (that match my TF notebook)
    IDS = essays.id.unique()
    print('There are', len(IDS),
          'train texts. We will split 90% 10% for validation.')

    # TRAIN VALID SPLIT 90% 10% -- seeded so the split is repeatable.
    np.random.seed(42)
    n_train = int(0.9*len(IDS))
    train_idx = np.random.choice(np.arange(len(IDS)), n_train, replace=False)
    valid_idx = np.setdiff1d(np.arange(len(IDS)), train_idx)
    np.random.seed(None)

    # CREATE TRAIN SUBSET AND VALID SUBSET
    ner_df = loadFromCSV()
    config = getConfig()
    print(ner_df.head())
    data = ner_df[['id', 'text', 'entities']]

    in_train = data['id'].isin(IDS[train_idx])
    in_valid = data['id'].isin(IDS[valid_idx])
    # The training frame keeps only text/entities; the validation frame keeps id too.
    train_dataset = data.loc[in_train, ['text', 'entities']].reset_index(drop=True)
    test_dataset = data.loc[in_valid].reset_index(drop=True)

    print("FULL Dataset: {}".format(data.shape))
    print("TRAIN Dataset: {}".format(train_dataset.shape))
    print("TEST Dataset: {}".format(test_dataset.shape))

    # NOTE(review): the tokenizer is loaded from config['model_name'] here,
    # while the submission cell loads it from config['token_name'] -- confirm
    # this difference is intended.
    tokenizer = AutoTokenizer.from_pretrained(config['model_name'])
    max_length = config['max_length']
    training_set = dataset(train_dataset, tokenizer, max_length, False, standard)
    testing_set = dataset(test_dataset, tokenizer, max_length, True, standard)

    if standard:
        return training_set, testing_set

    # TRAIN DATASET AND VALID DATASET
    training_loader = DataLoader(training_set,
                                 batch_size=config['train_batch_size'],
                                 shuffle=True,
                                 num_workers=2,
                                 pin_memory=True)
    testing_loader = DataLoader(testing_set,
                                batch_size=config['valid_batch_size'],
                                shuffle=False,
                                num_workers=2,
                                pin_memory=True)
    return train_dataset, training_loader, test_dataset, testing_loader


In [3]:
import numpy as np
import torch
import pandas as pd
from transformers import BigBirdForTokenClassification, BigBirdConfig,TrainingArguments, Trainer


# Tag vocabulary: 'O' followed by a B-/I- pair for each discourse type.
# A tag's position in this list is its integer class id.
output_labels = [
    'O',
    'B-Lead', 'I-Lead',
    'B-Position', 'I-Position',
    'B-Claim', 'I-Claim',
    'B-Counterclaim', 'I-Counterclaim',
    'B-Rebuttal', 'I-Rebuttal',
    'B-Evidence', 'I-Evidence',
    'B-Concluding Statement', 'I-Concluding Statement',
]
# Forward (tag -> id) and reverse (id -> tag) lookup tables.
labels_to_ids = {tag: tag_id for tag_id, tag in enumerate(output_labels)}
ids_to_labels = dict(enumerate(output_labels))

def inference(model, config, batch):
    """Run the model on one batch and return one predicted tag per split() word.

    Args:
        model: callable invoked as ``model(ids, attention_mask=mask,
            return_dict=False)``; element 0 of its result is argmax-ed over the
            last axis to get a class id per token.
        config: dict providing ``'device'``.
        batch: dict with ``input_ids``, ``attention_mask`` and ``wids``
            tensors. ``wids`` gives the split() word index of each token, with
            ``-1`` meaning "no word".

    Returns:
        A list with one entry per text in the batch; each entry is the list of
        tag strings taken from the first token of every run of equal word ids.
    """
    # MOVE BATCH TO GPU AND INFER
    ids = batch["input_ids"].to(config['device'])
    mask = batch["attention_mask"].to(config['device'])
    # Prediction only: disable autograd so no graph is built or retained.
    with torch.no_grad():
        outputs = model(ids, attention_mask=mask, return_dict=False)
    all_preds = torch.argmax(outputs[0], dim=-1).cpu().numpy()

    # ITERATE THROUGH EACH TEXT AND GET PRED
    predictions = []
    for k, text_preds in enumerate(all_preds):
        token_preds = [ids_to_labels[i] for i in text_preds]

        prediction = []
        word_ids = batch['wids'][k].numpy()
        previous_word_idx = -1
        for idx, word_idx in enumerate(word_ids):
            if word_idx == -1:
                # Token is not attached to any word.
                continue
            if word_idx != previous_word_idx:
                # First token of a new word decides that word's tag.
                prediction.append(token_preds[idx])
                previous_word_idx = word_idx
        predictions.append(prediction)

    return predictions


def get_predictions(model, config, df, loader):
    """Predict word tags for every text and collapse them into labelled spans.

    Args:
        model: object with ``eval()``; passed through to ``inference``.
        config: passed through to ``inference``.
        df: DataFrame with an ``id`` column; row ``i`` must correspond to the
            ``i``-th text produced by ``loader``.
        loader: iterable of batches accepted by ``inference``.

    Returns:
        DataFrame with columns ``id``, ``class`` and ``new_predictionstring``
        (space-separated word indices), one row per kept span. The frame has
        these columns even when no span is kept.
    """
    # put model in evaluation mode
    model.eval()

    # GET WORD LABEL PREDICTIONS
    y_pred2 = []
    for batch in loader:
        y_pred2.extend(inference(model, config, batch))

    final_preds2 = []
    text_ids = df.id.values
    for i in range(len(df)):
        idx = text_ids[i]
        pred = y_pred2[i]  # Leave "B" and "I"
        j = 0
        while j < len(pred):
            cls = pred[j]
            if cls != 'O':
                # spans start with B; compare the rest of the run against the I- form
                cls = cls.replace('B', 'I')
            end = j + 1
            while end < len(pred) and pred[end] == cls:
                end += 1

            # Keep only non-'O' runs longer than 7 words.
            if cls != 'O' and cls != '' and end - j > 7:
                final_preds2.append((idx, cls.replace('I-', ''),
                                     ' '.join(map(str, list(range(j, end))))))

            j = end

    # Passing the columns to the constructor (rather than assigning them
    # afterwards) keeps this working when no span survived the filter.
    oof = pd.DataFrame(final_preds2,
                       columns=['id', 'class', 'new_predictionstring'])

    return oof


def calc_overlap(row):
    """
    Calculates the overlap between prediction and
    ground truth and overlap percentages used for determining
    true positives.

    ``row`` must expose ``new_predictionstring_pred`` and
    ``new_predictionstring_gt`` as space-separated strings.
    Returns ``[overlap_1, overlap_2]``: the intersection size divided by the
    ground-truth set size and by the prediction set size respectively.
    """
    set_pred = set(row.new_predictionstring_pred.split(' '))
    set_gt = set(row.new_predictionstring_gt.split(' '))
    # Splitting on ' ' never yields an empty list, so both sets are non-empty
    # and the divisions below cannot be by zero.
    inter = len(set_gt & set_pred)
    return [inter / len(set_gt), inter / len(set_pred)]


def score_feedback_comp(pred_df, gt_df):
    """
    A function that scores for the kaggle
        Student Writing Competition

    Uses the steps in the evaluation page here:
        https://www.kaggle.com/c/feedback-prize-2021/overview/evaluation

    Args:
        pred_df: DataFrame with ``id``, ``class``, ``new_predictionstring``.
        gt_df: DataFrame with ``id``, ``discourse_type``,
            ``new_predictionstring``.

    Returns:
        ``TP / (TP + 0.5 * (FP + FN))`` as a float, or ``0.0`` when there is
        nothing to score.
    """
    gt_df = gt_df[['id', 'discourse_type', 'new_predictionstring']] \
        .reset_index(drop=True).copy()
    pred_df = pred_df[['id', 'class', 'new_predictionstring']] \
        .reset_index(drop=True).copy()
    pred_df['pred_id'] = pred_df.index
    gt_df['gt_id'] = gt_df.index
    # Step 1. all ground truths and predictions for a given class are compared.
    joined = pred_df.merge(gt_df,
                           left_on=['id', 'class'],
                           right_on=['id', 'discourse_type'],
                           how='outer',
                           suffixes=('_pred', '_gt')
                           )
    if joined.empty:
        # No predictions and no ground truths: nothing to score.
        return 0.0

    # Rows present on only one side of the outer join get a blank string so
    # the overlap computation still works (and yields zero overlap).
    joined['new_predictionstring_gt'] = joined['new_predictionstring_gt'].fillna(' ')
    joined['new_predictionstring_pred'] = joined['new_predictionstring_pred'].fillna(
        ' ')

    overlaps = joined.apply(calc_overlap, axis=1)
    joined['overlaps'] = overlaps

    # 2. If the overlap between the ground truth and prediction is >= 0.5,
    # and the overlap between the prediction and the ground truth >= 0.5,
    # the prediction is a match and considered a true positive.
    # If multiple matches exist, the match with the highest pair of overlaps is taken.
    joined['overlap1'] = [pair[0] for pair in overlaps]
    joined['overlap2'] = [pair[1] for pair in overlaps]

    joined['potential_TP'] = (joined['overlap1'] >= 0.5) & (
        joined['overlap2'] >= 0.5)
    joined['max_overlap'] = joined[['overlap1', 'overlap2']].max(axis=1)
    candidates = joined[joined['potential_TP']]
    tp_pred_ids = candidates \
        .sort_values('max_overlap', ascending=False) \
        .groupby(['id', 'new_predictionstring_gt'])['pred_id'].first().values

    # 3. Any unmatched ground truths are false negatives
    # and any unmatched predictions are false positives.
    # The outer join leaves NaN in pred_id / gt_id for rows that exist on one
    # side only; those NaNs are not real predictions / ground truths and must
    # be dropped, otherwise each would be counted as one extra FP / FN.
    tp_lookup = set(tp_pred_ids)
    fp_pred_ids = [p for p in joined['pred_id'].dropna().unique()
                   if p not in tp_lookup]

    matched_gt_ids = set(candidates['gt_id'].unique())
    unmatched_gt_ids = [c for c in joined['gt_id'].dropna().unique()
                        if c not in matched_gt_ids]

    # Get numbers of each type
    TP = len(tp_pred_ids)
    FP = len(fp_pred_ids)
    FN = len(unmatched_gt_ids)
    # calc microf1
    denominator = TP + 0.5*(FP+FN)
    if denominator == 0:
        return 0.0
    my_f1_score = TP / denominator
    return my_f1_score


def inferencePipeline(test_dataset,testing_loader,model,config):  # note this doesn't run during submit
    """Score the model on the held-out 10% of ``corrected.csv``.

    The validation ids are recomputed with the same seeded 90/10 split of the
    unique ``id`` values that is used when the datasets are built, predictions
    are produced with ``get_predictions`` and each discourse class is scored
    with ``score_feedback_comp``.

    Args:
        test_dataset: DataFrame handed to ``get_predictions`` (needs ``id``).
        testing_loader: loader handed to ``get_predictions``.
        model, config: handed to ``get_predictions``.

    Returns:
        The mean of the per-class F1 scores, taken over every class that
        appears in either the predictions or the validation ground truth.
    """
    train_df = pd.read_csv('corrected.csv')
    IDS = train_df.id.unique()
    # Fixed seed so the held-out ids are the same on every run.
    np.random.seed(42)
    train_idx = np.random.choice(np.arange(len(IDS)),int(0.9*len(IDS)),replace=False)
    valid_idx = np.setdiff1d(np.arange(len(IDS)),train_idx)
    np.random.seed(None)

    valid = train_df.loc[train_df['id'].isin(IDS[valid_idx])]

    # OOF PREDICTIONS
    oof = get_predictions(model,config,test_dataset, testing_loader)

    # COMPUTE F1 SCORE
    # Score predicted classes first (original order), then any ground-truth
    # class the model never predicted. Leaving those out would drop their
    # zero score from the average and overstate the overall result.
    CLASSES = list(oof['class'].unique())
    CLASSES += [c for c in valid['discourse_type'].dropna().unique()
                if c not in CLASSES]

    f1s = []
    print()
    for c in CLASSES:
        pred_df = oof.loc[oof['class'] == c].copy()
        gt_df = valid.loc[valid['discourse_type'] == c].copy()
        f1 = score_feedback_comp(pred_df, gt_df)
        print(c, f1)
        f1s.append(f1)
    overall = np.mean(f1s)
    print()
    print('Overall', overall)
    print()
    return overall

In [4]:
import os
from tqdm import tqdm
def load_files(path):
    """Read every file in a directory into a two-column DataFrame.

    Args:
        path: directory to read; a trailing path separator is optional.

    Returns:
        DataFrame with ``id`` (the file name with ``'.txt'`` removed) and
        ``text`` (the file's full contents), in ``os.listdir`` order.
    """
    # Load the text files from the dir and build a Dataframe.
    names, text = [], []
    for f in tqdm(list(os.listdir(path))):
        names.append(f.replace('.txt', ''))
        # os.path.join works with or without a trailing separator on `path`;
        # the context manager closes each handle instead of leaking it.
        with open(os.path.join(path, f), 'r') as handle:
            text.append(handle.read())
    texts = pd.DataFrame({"id": names, "text": text})
    return texts

In [5]:
# --- Submission run: load model + tokenizer, predict on the test essays, write CSV ---
config = getConfig()

# Token-classification model with 15 output classes, moved to the configured device.
model = AutoModelForTokenClassification.from_pretrained(
    config['model_name'],
    num_labels=15,
)
model.to(config["device"])

# Test essays as a DataFrame with `id` and `text` columns.
testDF = load_files("../input/feedback-prize-2021/test/")

# Loader settings: fixed order so predictions line up with testDF rows.
test_params = dict(
    batch_size=config['valid_batch_size'],
    shuffle=False,
    num_workers=2,
    pin_memory=True,
)

tokenizer = AutoTokenizer.from_pretrained(config['token_name'])
# get_wids=True: no labels, but each item carries the token -> word index map.
test_texts_set = dataset(testDF, tokenizer, config['max_length'], True)
test_texts_loader = DataLoader(test_texts_set, **test_params)

sub = get_predictions(model, config, testDF, test_texts_loader)
sub.to_csv("submission.csv", index=False)

100%|██████████| 5/5 [00:00<00:00, 175.12it/s]

{0: 'O', 1: 'B-Lead', 2: 'I-Lead', 3: 'B-Position', 4: 'I-Position', 5: 'B-Claim', 6: 'I-Claim', 7: 'B-Counterclaim', 8: 'I-Counterclaim', 9: 'B-Rebuttal', 10: 'I-Rebuttal', 11: 'B-Evidence', 12: 'I-Evidence', 13: 'B-Concluding Statement', 14: 'I-Concluding Statement'}



To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /usr/local/src/pytorch/aten/src/ATen/native/BinaryOps.cpp:461.)
  return torch.floor_divide(self, other)
