In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer
# Tokenizer for the 'microsoft/deberta-v3-base' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')

# Integer class id for each `discourse_effectiveness` value.
LABEL_MAPPING = {
    name: class_id
    for class_id, name in enumerate(("Ineffective", "Adequate", "Effective"))
}

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
# Load the essays as a Series (named `essay_text` in the CSV) indexed by `essay_id`.
# `squeeze('columns')` collapses only the single remaining column; a bare
# `squeeze()` would also collapse the row axis and hand back a plain string
# if the CSV happened to contain exactly one essay.
essay = (
    pd.read_csv('../data/essay_processed.csv')
    .set_index('essay_id')
    .squeeze('columns')
)
essay

essay_id
007ACE74B050    hi, i'm isaac, i'm going to be writing about h...
00944C693682    limiting the usage of cars has personal and pr...
00BD97EA4041    should computers read the emotional expression...
00C6E82FE5BA    i think that it wouldn't be valueable to have ...
013B9AA6B9DB    what is that thing on mars? well, some people ...
                                      ...                        
FDF0AEEB14C3    going to school everyday can be difficult for ...
FE3CA06DDCA1    why is it when someone asks you for advice the...
FEF42864AE28    during a long day at school, have you ever tho...
FF9E0379CD98    some school offer distence learning as a optio...
FFA381E58FC6    some people may ask multiple people for advice...
Name: essay_text, Length: 4191, dtype: object

In [3]:
# Discourse-level training table; the columns used further down are essay_id,
# discourse_type, discourse_text_processed, discourse_effectiveness and kfold.
train_csv = '../data/train_processed.csv'
train = pd.read_csv(train_csv)
train

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,discourse_text_processed,kfold
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,"hi, i'm isaac, i'm going to be writing about h...",2
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,"on my perspective, i think that the face is a ...",2
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate,i think that the face is a natural landform be...,2
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate,"if life was on mars, we would know by now. the...",2
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate,people thought that the face was formed by ali...,2
...,...,...,...,...,...,...,...
36760,9f63b687e76a,FFA381E58FC6,For many people they don't like only asking on...,Claim,Adequate,for many people they don't like only asking on...,1
36761,9d5bd7d86212,FFA381E58FC6,also people have different views and opinions ...,Claim,Adequate,also people have different views and opinions ...,1
36762,f1b78becd573,FFA381E58FC6,Advice is something that can impact a persons ...,Position,Adequate,advice is something that can impact a persons ...,1
36763,cc184624ca8e,FFA381E58FC6,someone can use everything that many people sa...,Evidence,Ineffective,someone can use everything that many people sa...,1


In [4]:
def get_tag(discourse_type):
    """Return the marker placed in front of a discourse element.

    The marker is the lower-cased discourse type wrapped in angle brackets,
    e.g. ``'Lead'`` -> ``'<lead>'``.
    """
    return f'<{discourse_type.lower()}>'

def insert_tag(text, dtext, dtype, start=0):
    """Insert the tag for ``dtype`` before the first ``dtext`` found at or after ``start``.

    Args:
        text: Text to search and modify.
        dtext: Discourse text to locate inside ``text``.
        dtype: Discourse type; turned into a marker by ``get_tag``.
        start: Index in ``text`` at which the search begins.

    Returns:
        ``(new_text, sidx, eidx)``. ``new_text`` is ``text`` with ``' <tag> '``
        (the tag padded by one space on each side) inserted immediately before
        the match. ``sidx`` is the index at which the inserted marker begins and
        ``eidx`` is the index just past the end of the discourse text, both
        measured in ``new_text``.

    Raises:
        KeyError: If ``dtext`` does not occur in ``text`` at or after ``start``.
    """
    # Build the padded marker once; it is needed for both the insertion and
    # the end-offset computation.
    marker = ' ' + get_tag(dtype) + ' '
    sidx = text.find(dtext, start)
    if sidx == -1:
        # Keep the KeyError type callers may rely on, but say what was missing.
        raise KeyError(
            f'discourse text not found at or after index {start}: {dtext[:80]!r}'
        )
    text = text[:sidx] + marker + text[sidx:]
    eidx = sidx + len(marker) + len(dtext)
    return text, sidx, eidx

In [36]:
def _tag_essays(essay, train):
    """Insert a discourse tag before every discourse element of every essay."""
    # Split `train` by essay once instead of running a full boolean scan of the
    # table for every essay id. Row order inside each group is preserved.
    rows_by_essay = {eid: rows for eid, rows in train.groupby('essay_id', sort=False)}
    no_rows = train.iloc[0:0]  # same columns, zero rows: stands in for essays absent from `train`
    samples = []
    for eid in tqdm(essay.index):
        text = essay[eid]
        df = rows_by_essay.get(eid, no_rows)
        spans = []
        raw_labels = []
        eidx = 0  # search cursor: each element is looked up after the end of the previous one
        for _, row in df.iterrows():
            dtype = row['discourse_type']
            dtext = row['discourse_text_processed']
            raw_label = LABEL_MAPPING[row['discourse_effectiveness']]
            text, sidx, eidx = insert_tag(text, dtext, dtype, start=eidx)
            spans.append([sidx, eidx])
            raw_labels.append(raw_label)
        assert spans == sorted(spans), spans
        # Every discourse element of an essay must belong to the same fold.
        assert df['kfold'].nunique() == 1, df['kfold'].nunique()
        samples.append({'text': text, 'spans': spans, 'raw_labels': raw_labels, 'fold': df['kfold'].unique()[0]})
    return samples


def _label_first_tokens(sample, tokenizer):
    """Tokenize one tagged sample and label the first token of each span (in place)."""
    enc = tokenizer(sample['text'], return_offsets_mapping=True, add_special_tokens=False)
    offsets = enc['offset_mapping']
    spans = sample['spans']
    raw_labels = sample['raw_labels']
    seq_len = len(enc['input_ids'])
    # -100 marks tokens that carry no label (presumably the loss ignore index;
    # it matches the default `ignore_index` of torch.nn.CrossEntropyLoss).
    token_labels = [-100 for _ in range(seq_len)]
    label_positions = []
    span_no = 0
    for tok_idx in range(seq_len):
        if span_no == len(raw_labels):
            break
        span_start, span_end = spans[span_no]
        # The first token starting at or after the span start receives the
        # span's label; the search then moves on to the next span.
        if offsets[tok_idx][0] >= span_start and span_end > span_start:
            token_labels[tok_idx] = raw_labels[span_no]
            label_positions.append(tok_idx)
            span_no += 1
    sample['label_positions'] = label_positions
    sample['label'] = token_labels
    for key, value in enc.items():
        sample[key] = value
    nlabel_assigned = len([l for l in token_labels if l != -100])
    assert nlabel_assigned == len(raw_labels), f"{nlabel_assigned}, {len(raw_labels)}"


def prepare_data_token_cls(essay, train, tokenizer):
    """Build one token-classification sample per essay.

    Each essay gets a ``' <type> '`` marker inserted before every discourse
    element (via ``insert_tag``), is tokenized without special tokens, and the
    first token of each marked span is labelled with the element's
    effectiveness class. All other tokens get the label ``-100``. This is the
    "cls" labelling strategy; a "mean" variant that labels every token inside a
    span is not implemented here.

    Args:
        essay: Series of essay texts indexed by essay id.
        train: DataFrame with the columns ``essay_id``, ``discourse_type``,
            ``discourse_text_processed``, ``discourse_effectiveness`` and ``kfold``.
        tokenizer: Callable tokenizer that accepts ``return_offsets_mapping=True``
            and returns a mapping containing ``input_ids`` and ``offset_mapping``.

    Returns:
        List of dicts, one per essay, with the keys ``text`` (tagged essay),
        ``spans`` (``[start, end]`` character offsets per element),
        ``raw_labels`` (class id per element), ``fold``, ``label_positions``
        (token index labelled for each element), ``label`` (per-token labels)
        plus every key returned by the tokenizer.

    Raises:
        KeyError: From ``insert_tag`` when a discourse text cannot be found in
            its essay, or from ``LABEL_MAPPING`` for an unknown effectiveness value.
        AssertionError: If spans are not in increasing order, an essay's rows
            do not share exactly one fold (including essays with no rows), or
            not every element received a token label.
    """
    samples = _tag_essays(essay, train)
    for sample in tqdm(samples):
        _label_first_tokens(sample, tokenizer)
    return samples

In [71]:
# Stand-alone run of the tagging stage: insert a discourse tag before every
# discourse element of every essay and record its character span and label.
samples = []
for eid in tqdm(essay.index):
    text = essay[eid]
    df = train[train['essay_id'] == eid]
    idxs, labels = [], []
    eidx = 0  # search cursor: each element is looked up after the end of the previous one
    for row in df.itertuples(index=False):
        dtype = row.discourse_type
        dtext = row.discourse_text_processed
        label = LABEL_MAPPING[row.discourse_effectiveness]
        text, sidx, eidx = insert_tag(text, dtext, dtype, start=eidx)
        idxs.append([sidx, eidx])
        labels.append(label)
    # Spans must come out in increasing order, and an essay must sit in one fold.
    assert idxs == sorted(idxs), idxs
    assert df['kfold'].nunique() == 1, df['kfold'].nunique()
    samples.append(dict(text=text, spans=idxs, raw_labels=labels, fold=df['kfold'].unique()[0]))

100%|██████████████████████████████████████| 4191/4191 [00:06<00:00, 669.33it/s]


In [80]:
# Tokenize one long tagged essay into windows of at most 1024 tokens, with a
# stride of 256 between a truncated window and its overflow.
# 3218 is the index the max-length cell reports for the longest sample: (1649, 3218).
longest_idx = 3218
enc = tokenizer(
    samples[longest_idx]['text'],
    return_offsets_mapping=True,
    add_special_tokens=False,
    truncation=True,
    max_length=1024,
    stride=256,
    return_overflowing_tokens=True,
)
enc

{'input_ids': [[2569, 23960, 1504, 264, 262, 565, 263, 262, 565, 280, 268, 658, 261, 262, 2187, 265, 316, 1574, 516, 282, 757, 3153, 263, 16839, 260, 283, 262, 3063, 265, 262, 565, 261, 355, 516, 2544, 272, 3628, 760, 264, 413, 719, 265, 3146, 263, 3353, 263, 262, 384, 262, 658, 29187, 261, 269, 266, 568, 272, 634, 264, 282, 708, 265, 263, 854, 3287, 260, 262, 1574, 269, 264, 1643, 262, 355, 265, 315, 658, 371, 275, 52239, 51881, 61566, 272, 295, 3958, 264, 957, 262, 634, 265, 262, 658, 261, 2910, 262, 11992, 1575, 1015, 264, 638, 310, 263, 310, 10425, 264, 421, 328, 1647, 260, 2569, 19573, 1504, 278, 516, 282, 1594, 264, 957, 262, 634, 265, 1175, 266, 52239, 51881, 10339, 1574, 261, 266, 310, 12923, 2187, 263, 262, 5125, 2648, 516, 282, 1372, 1356, 417, 899, 354, 262, 58189, 260, 267, 340, 1023, 261, 262, 11992, 1575, 516, 282, 1594, 264, 638, 310, 2335, 1356, 262, 3063, 260, 2569, 48819, 1504, 262, 9074, 1603, 265, 25179, 261, 269, 265, 586, 266, 2818, 658, 275, 614, 7300, 265, 467, 

In [77]:
# Build the token-classification samples for every essay.
samples = prepare_data_token_cls(essay=essay, train=train, tokenizer=tokenizer)

100%|██████████████████████████████████████| 4191/4191 [00:06<00:00, 682.24it/s]
100%|██████████████████████████████████████| 4191/4191 [00:04<00:00, 954.88it/s]


In [79]:
# Token count of every prepared sample, computed once and reused for both statistics.
token_lengths = [len(s['input_ids']) for s in samples]
# (length of the longest token sequence, index of the sample that has it)
max(token_lengths), np.argmax(token_lengths)

(1649, 3218)

In [39]:
# Inspect one fully prepared sample.
sample_idx = 2
print(samples[sample_idx])

{'text': ' <lead> should computers read the emotional expressions of students in a classroom?  <position> no because, why should a computer know how your feeling?  <claim> it wouldn\'t change the emotion the students feeling. it also wouldn\'t help with the students education. its over all just a waste of time.  <evidence> the process begins when the computer puts together a 3-d digital model. there are 44 major muscles in your face that the computer has to detect. eckman classified six emotions happiness, surprise, anger, disgust, fear, and sadness. he then " associated each with characteristic movements of facial muscles." for example the frontalis pars lateralis muscle is above your eye which shows your surpried when it is raised. us humans alone can identify facial expressions on peoples faces. if you look at a friend you can tell how they are feeling at that moment. da vinci studied human anatomy to help paint the facial muscles percisely on the mona lisa painting. dr. huangs has 

In [13]:
# Number of essays assigned to each of the five folds.
for fold in range(5):
    n_in_fold = sum(1 for s in samples if s['fold'] == fold)
    print(fold, n_in_fold)

0 838
1 841
2 836
3 840
4 836


In [47]:
from transformers import AutoModelForTokenClassification
# Token-classification head with one output per effectiveness class. The class
# count is taken from LABEL_MAPPING so the head size cannot drift from the labels.
model = AutoModelForTokenClassification.from_pretrained(
    'microsoft/deberta-v3-base',
    num_labels=len(LABEL_MAPPING),
)

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForTokenClassification: ['lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.bias', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2ForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a Be

In [68]:
import numpy as np
import torch
from sklearn.metrics import log_loss
from scipy.special import softmax
def eval_token_cls_model(model, samples, device="cuda"):
    """Score a token-classification model with multi-class log loss.

    For every sample the model is run on the full ``input_ids`` sequence
    (batch size 1) and the logits at ``label_positions`` are collected. The
    softmax of all collected logits is compared against the concatenated
    ``raw_labels``.

    Args:
        model: Model whose output exposes ``.logits``; it is moved to ``device``.
        samples: Iterable of dicts with the keys ``input_ids``,
            ``label_positions`` and ``raw_labels``.
        device: Device the model and inputs are placed on.

    Returns:
        The log loss over all labelled positions, computed with classes ``[0, 1, 2]``.
    """
    model = model.to(device)
    # Evaluate in eval mode (dropout off) so the score is deterministic, and
    # restore whatever mode the caller had afterwards.
    was_training = model.training
    model.eval()
    predictions = []
    labels = []
    try:
        with torch.no_grad():  # inference only: do not build an autograd graph
            for sample in tqdm(samples):
                input_ids = torch.tensor(sample['input_ids']).unsqueeze(0).to(device)
                # Drop only the batch axis; a bare squeeze() would also drop the
                # sequence axis for a one-token input.
                logits = model(input_ids).logits.squeeze(0)
                # Explicit long dtype keeps an empty position list a valid index tensor.
                label_idxs = torch.tensor(
                    sample['label_positions'], dtype=torch.long, device=logits.device
                )
                prediction = logits[label_idxs].cpu().numpy()
                predictions.append(prediction)
                labels += sample['raw_labels']
    finally:
        model.train(was_training)
    predictions = np.vstack(predictions)
    probs = softmax(predictions, axis=1)
    # Keep probabilities away from exactly 0 and 1 before taking logs.
    probs = np.clip(probs, 1e-15, 1 - 1e-15)
    score = log_loss(labels, probs, labels=[0,1,2])
    return score
# Smoke test: score the first ten samples on CPU.
eval_token_cls_model(model, samples[:10], device='cpu')

100%|███████████████████████████████████████████| 10/10 [00:03<00:00,  2.73it/s]


1.2543640604073352