In [9]:
import random
import typing as t
from pathlib import Path

import pandas as pd

In [12]:
DATASET_DIR_PATH = Path('/root/data/datasets/feedback-prize-effectiveness')
TRAIN_CSV_PATH = DATASET_DIR_PATH / 'TRAIN_DB.csv'
TRAIN_ESSAY_TEXT_DIR_PATH = DATASET_DIR_PATH / 'train'

In [11]:
!ls -la $DATASET_DIR_PATH

total 425536
drwxrwxr-x 4 root root      4096 Jul 24 14:35 .
drwxrwxr-x 4 root root      4096 Jul 24 12:41 ..
-rw-r--r-- 1 root root 307210513 Jul 24 14:35 pl-2021.csv
-rw-rw-r-- 1 root root       306 Jun 20 09:15 sample_submission.csv
drwxrwxr-x 2 root root      4096 Jul  3 14:14 test
-rw-rw-r-- 1 root root      2632 Jun 20 09:15 test.csv
drwxrwxr-x 2 root root    151552 Jul  3 14:14 train
-rw-rw-r-- 1 root root  10908376 Jun 20 09:15 train.csv
-rw-r--r-- 1 root root 106346745 Jul  4 00:40 train_ext.csv
-rw-rw-r-- 1 root root  11099291 Jun 27 13:49 train_with_pos.csv


In [76]:
def get_discourse_slice_list(essay_text: str, disc_text_list: t.List[str]) -> t.List[t.Tuple[int, int]]:
    slice_list: t.List[t.Tuple[int, int]] = []
    last_pos_end = 0
    for disc_text in disc_text_list:
        disc_text = disc_text.strip()
        start_idx = essay_text.find(disc_text, last_pos_end)
        end_idx = start_idx + len(disc_text)
        slice_list.append((start_idx, end_idx))
        last_pos_end = end_idx
    return slice_list


def _disc_type_to_tok(dt: str, tok_type: t.Literal['CLS'] | t.Literal['END']) -> str:
    return '[' + dt.upper().replace(' ', '_') + f'_{tok_type}]'


def _disc_type_to_cls_tok(dt: str) -> str:
    return _disc_type_to_tok(dt, 'CLS')


def _disc_type_to_end_tok(dt: str) -> str:
    return _disc_type_to_tok(dt, 'END')


def insert_label_tokens(
        essay_text: str,
        disc_type_list: t.List[str],
        disc_slice_list: t.List[t.Tuple[int, int]]) -> t.Tuple[str, t.List[int]]:
    last_end = 0
    new_essay_text = ''
    cls_char_idx_list = []
    for disc_type, (disc_start, disc_end) in zip(disc_type_list, disc_slice_list):
        if disc_start > last_end:
            new_essay_text += essay_text[last_end:disc_start]
        cls_char_idx = len(new_essay_text)
        new_essay_text += _disc_type_to_cls_tok(disc_type)
        new_essay_text += f' {essay_text[disc_start:disc_end]} '
        new_essay_text += f'{_disc_type_to_end_tok(disc_type)}'
        cls_char_idx_list.append(cls_char_idx)
        last_end = disc_end
    return new_essay_text, cls_char_idx_list

In [5]:
train_df = pd.read_csv(TRAIN_CSV_PATH)

In [6]:
train_df

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate
...,...,...,...,...,...
36760,9f63b687e76a,FFA381E58FC6,For many people they don't like only asking on...,Claim,Adequate
36761,9d5bd7d86212,FFA381E58FC6,also people have different views and opinions ...,Claim,Adequate
36762,f1b78becd573,FFA381E58FC6,Advice is something that can impact a persons ...,Position,Adequate
36763,cc184624ca8e,FFA381E58FC6,someone can use everything that many people sa...,Evidence,Ineffective


In [100]:
def _test():
    essay_id = random.choice(train_df['essay_id'].unique())
    print(f'essay_id = {essay_id}')
    with open(TRAIN_ESSAY_TEXT_DIR_PATH / f'{essay_id}.txt') as f:
        essay_text = f.read().strip()
    disc_type_list = train_df[train_df['essay_id'] == essay_id]['discourse_type'].tolist()
    disc_text_list = train_df[train_df['essay_id'] == essay_id]['discourse_text'].tolist()
    disc_slice_list = get_discourse_slice_list(essay_text=essay_text, disc_text_list=disc_text_list)
    new_essay_text, cls_end_char_idx = insert_label_tokens(essay_text, disc_type_list, disc_slice_list)
    print(len(disc_text_list), len(disc_slice_list))
    print([(new_essay_text[cls_idx - 3:cls_idx + 3], new_essay_text[end_idx - 3: end_idx + 3]) for cls_idx, end_idx in cls_end_char_idx])
    print([(new_essay_text[cls_idx], new_essay_text[end_idx]) for cls_idx, end_idx in cls_end_char_idx])
    print()
    print('=== ORIGINAL ===')
    print(essay_text)
    print()
    print('=== PREPROCESSED ===')
    print(new_essay_text)

_test()

essay_id = CC787397F33B
8 8
[(',\n\n[PO', 'g. [PO'), ('D] [CL', 'e. [CL'), (']\n\n[CL', 'le [CL'), (']. [EV', 'g. [EV'), (']\n\n[CO', 'g? [CO'), ('D] [RE', 'n. [RE'), ('e. [EV', 'e. [EV'), (']\n\n[CO', 'e. [CO')]
[('[', '['), ('[', '['), ('[', '['), ('[', '['), ('[', '['), ('[', '['), ('[', '['), ('[', '[')]

=== ORIGINAL ===
Dear Senator,

I along with many other citizens have came to the conclusion that the Electoral College worth keeping. Though, the process itself has loose ends, overall it benefits the state and national election. The system uses general influence and a representative style of voting that makes the ballot more stable and dependable.

As a start, Electoral College electors are decided upon by the people. This means an elector with similar beliefs to the citizens will be chosen and most likely decide their vote in consideration of popular demand. Each state is given representatives in proportion to the poulation of the state. These people can be anyone who does not 