In [1]:
import typing as t
from pathlib import Path

import pandas as pd
from tqdm.notebook import tqdm
from transformers.models.auto.tokenization_auto import AutoTokenizer

In [2]:
tqdm.pandas()

In [3]:
DATASET_DIR = Path('/root/data')
FP_DATASET_DIR = DATASET_DIR / 'feedback-prize-effectiveness'

In [4]:
!ls -la $FP_DATASET_DIR

total 125520
drwxrwxr-x 4 root root      4096 Jul  4 00:40 .
drwxrwxr-x 3 root root      4096 Jul  3 18:14 ..
-rw-rw-r-- 1 root root       306 Jun 20 09:15 sample_submission.csv
drwxrwxr-x 2 root root      4096 Jul  3 14:14 test
-rw-rw-r-- 1 root root      2632 Jun 20 09:15 test.csv
drwxrwxr-x 2 root root    151552 Jul  3 14:14 train
-rw-rw-r-- 1 root root  10908376 Jun 20 09:15 train.csv
-rw-r--r-- 1 root root 106346745 Jul  4 00:40 train_ext.csv
-rw-rw-r-- 1 root root  11099291 Jun 27 13:49 train_with_pos.csv


In [5]:
!ls -la $FP_DATASET_DIR/train | head -n 10

total 18692
drwxrwxr-x 2 root root 151552 Jul  3 14:14 .
drwxrwxr-x 4 root root   4096 Jul  4 00:40 ..
-rw-rw-r-- 1 root root   3590 Jun 20 09:15 00066EA9880D.txt
-rw-rw-r-- 1 root root   1527 Jun 20 09:15 000E6DE9E817.txt
-rw-rw-r-- 1 root root   1395 Jun 20 09:15 0016926B079C.txt
-rw-rw-r-- 1 root root   4568 Jun 20 09:15 00203C45FC55.txt
-rw-rw-r-- 1 root root   1551 Jun 20 09:15 0029F4D19C3F.txt
-rw-rw-r-- 1 root root   1090 Jun 20 09:15 0045BE2791A2.txt
-rw-rw-r-- 1 root root   1846 Jun 20 09:15 004AC288D833.txt
ls: write error: Broken pipe


In [6]:
train_df = pd.read_csv(FP_DATASET_DIR / 'train_with_pos.csv')

In [7]:
train_df

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,discourse_text_token_len,pos
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,84,0
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,50,1
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate,25,2
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate,93,3
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate,23,4
...,...,...,...,...,...,...,...
36760,9f63b687e76a,FFA381E58FC6,For many people they don't like only asking on...,Claim,Adequate,25,0
36761,9d5bd7d86212,FFA381E58FC6,also people have different views and opinions ...,Claim,Adequate,12,1
36762,f1b78becd573,FFA381E58FC6,Advice is something that can impact a persons ...,Position,Adequate,27,2
36763,cc184624ca8e,FFA381E58FC6,someone can use everything that many people sa...,Evidence,Ineffective,93,3


In [8]:
def _get_essay_text(essay_id: str) -> str:
    with open(FP_DATASET_DIR / f'train/{essay_id}.txt') as f:
        return f.read().strip()

train_df['essay_text'] = train_df['essay_id'].progress_apply(_get_essay_text)

  0%|          | 0/36765 [00:00<?, ?it/s]

In [9]:
tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
# class _TokInputV1Getter:

#     def __init__(self, df: pd.DataFrame, tokenizer: AutoTokenizer):
#         self._df = df
#         self._tokenizer = tokenizer

#     def __call__(self, row: t.Dict[str, t.Any]) -> str:
#         (
#             id,
#             disc_type,
#             text,
#             target,
#             essay_id,
#             essay_text,
#          ) = (
#             str(row['discourse_id']),
#             str(row['discourse_type']),
#             str(row['discourse_text']),
#             str(row['discourse_effectiveness']),
#             str(row['essay_id']),
#             str(row['essay_text']),
#          )
#         sep = self._tokenizer.sep_token
#         other_disc_str = ', '.join([
#             row['discourse_type']
#             for _, row in self._df[self._df['essay_id'] == essay_id].sort_values('pos').iterrows()
#         ])
#         return f'{disc_type} {sep} {other_disc_str} {sep} {text}'


# _tok_input_v1_getter = _TokInputV1Getter(train_df, tokenizer)
# train_df['tokenizer_input_v1'] = train_df.progress_apply(_tok_input_v1_getter, axis=1)

In [11]:
class _TokInputV2Getter:
    _DISC_TYPE_SEP = '[TYPE]'
    _DISC_PAIR_SEP = '[PAIR]'

    def __init__(self, df: pd.DataFrame, tokenizer: AutoTokenizer):
        self._df = df
        self._tokenizer = tokenizer

    def __call__(self, row: t.Dict[str, t.Any]) -> str:
        (
            id,
            disc_type,
            text,
            target,
            essay_id,
            essay_text,
         ) = (
            str(row['discourse_id']),
            str(row['discourse_type']),
            str(row['discourse_text']),
            str(row['discourse_effectiveness']),
            str(row['essay_id']),
            str(row['essay_text']),
         )
        sep = self._tokenizer.sep_token
        this_disc_str = f'{disc_type} {self._DISC_TYPE_SEP} {text}'
        other_disc_str = f' {self._DISC_PAIR_SEP} '.join([
            str(row['discourse_type']) + f' {self._DISC_TYPE_SEP} ' + str(row['discourse_text'])
            for _, row in self._df[self._df['essay_id'] == essay_id].sort_values('pos').iterrows()
        ])
        return f'{this_disc_str} {sep} {other_disc_str}'


_tok_input_v2_getter = _TokInputV2Getter(train_df, tokenizer)
# train_df['tokenizer_input_v2'] = train_df.progress_apply(_tok_input_v2_getter, axis=1)

In [18]:
class _TokInputV3Getter:

    def __init__(self, df: pd.DataFrame, tokenizer: AutoTokenizer, convert_to_lowercase: bool = False):
        self._df = df
        self._tokenizer = tokenizer
        self._convert_to_lowercase = convert_to_lowercase

    def __call__(self, row: t.Dict[str, t.Any]) -> str:
        (
            id,
            disc_type,
            text,
            target,
            essay_id,
         ) = (
            str(row['discourse_id']),
            str(row['discourse_type']),
            str(row['discourse_text']),
            str(row['discourse_effectiveness']),
            str(row['essay_id']),
         )
        sep = self._tokenizer.sep_token
        essay_text = f'\n'.join([
            str(row['discourse_text'])
            for _, row in self._df[(self._df['essay_id'] == essay_id) & (self._df['discourse_id'] != id)].sort_values('pos').iterrows()
        ])
        result = f'{disc_type} {sep} {text} {sep} {essay_text}'
        if self._convert_to_lowercase:
            result = result.lower()
        return result

In [None]:
train_df['tokenizer_input_v3'] = train_df.progress_apply(_TokInputV3Getter(train_df, tokenizer), axis=1)

In [19]:
train_df['tokenizer_input_v3_lower'] = train_df.progress_apply(_TokInputV3Getter(train_df, tokenizer, convert_to_lowercase=True), axis=1)

  0%|          | 0/36765 [00:00<?, ?it/s]

In [20]:
class _TokLenGetter:

    def __init__(self, tokenizer: AutoTokenizer):
        self._tokenizer = tokenizer

    def __call__(self, text: str) -> int:
        return len(self._tokenizer(text)['input_ids'])


_tok_len_getter = _TokLenGetter(tokenizer)

# train_df['discourse_text_len'] = train_df['discourse_text'].progress_apply(_tok_len_getter)
# train_df['essay_text_len'] = train_df['essay_text'].progress_apply(_tok_len_getter)
# train_df['tokenizer_input_v2_len'] = train_df['tokenizer_input_v2'].progress_apply(_tok_len_getter)
train_df['tokenizer_input_v3_len'] = train_df['tokenizer_input_v3'].progress_apply(_tok_len_getter)
train_df['tokenizer_input_v3_lower_len'] = train_df['tokenizer_input_v3_lower'].progress_apply(_tok_len_getter)

  0%|          | 0/36765 [00:00<?, ?it/s]

  0%|          | 0/36765 [00:00<?, ?it/s]

In [21]:
train_df['tokenizer_input_v3_len'].describe()

count    36765.000000
mean       505.793635
std        243.457717
min         32.000000
25%        319.000000
50%        449.000000
75%        640.000000
max       1597.000000
Name: tokenizer_input_v3_len, dtype: float64

In [22]:
train_df['tokenizer_input_v3_lower_len'].describe()

count    36765.000000
mean       510.538964
std        243.705613
min         36.000000
25%        324.000000
50%        454.000000
75%        646.000000
max       1605.000000
Name: tokenizer_input_v3_lower_len, dtype: float64

In [17]:
len(train_df[train_df['tokenizer_input_v3_len'] > 1024]) / len(train_df)

0.046130830953352374

In [None]:
train_df

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,discourse_text_token_len,pos,essay_text,tokenizer_input_v2,tokenizer_input_v2_len
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,84,0,"Hi, i'm Isaac, i'm going to be writing about h...","Lead [TYPE] Hi, i'm Isaac, i'm going to be wri...",591
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,50,1,"Hi, i'm Isaac, i'm going to be writing about h...","Position [TYPE] On my perspective, I think tha...",557
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate,25,2,"Hi, i'm Isaac, i'm going to be writing about h...",Claim [TYPE] I think that the face is a natura...,532
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate,93,3,"Hi, i'm Isaac, i'm going to be writing about h...","Evidence [TYPE] If life was on Mars, we would ...",600
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate,23,4,"Hi, i'm Isaac, i'm going to be writing about h...",Counterclaim [TYPE] People thought that the fa...,531
...,...,...,...,...,...,...,...,...,...,...
36760,9f63b687e76a,FFA381E58FC6,For many people they don't like only asking on...,Claim,Adequate,25,0,Some people may ask multiple people for advice...,Claim [TYPE] For many people they don't like o...,228
36761,9d5bd7d86212,FFA381E58FC6,also people have different views and opinions ...,Claim,Adequate,12,1,Some people may ask multiple people for advice...,Claim [TYPE] also people have different views ...,215
36762,f1b78becd573,FFA381E58FC6,Advice is something that can impact a persons ...,Position,Adequate,27,2,Some people may ask multiple people for advice...,Position [TYPE] Advice is something that can i...,230
36763,cc184624ca8e,FFA381E58FC6,someone can use everything that many people sa...,Evidence,Ineffective,93,3,Some people may ask multiple people for advice...,Evidence [TYPE] someone can use everything tha...,296
