In [1]:
import typing as t
from pathlib import Path

import pandas as pd
from tqdm.notebook import tqdm

In [2]:
# Register tqdm with pandas so that `progress_apply` (used in later cells) is available.
tqdm.pandas()

In [3]:
# Root of all local data; every other path in this notebook is derived from it.
DATA_DIR = Path('/root/data')
# Parent directory of the individual competition datasets.
DATASET_DIR = DATA_DIR / 'datasets'
# NOTE(review): OOF_DIR is not referenced by any cell visible in this notebook.
OOF_DIR = DATA_DIR / 'oof'
# Dataset holding `train_ext.csv`; the final `pl-2021.csv` is also written here.
FP_DATASET_DIR = DATASET_DIR / 'feedback-prize-effectiveness'
# Dataset holding the 2021 `train.csv` and the per-essay `train/<essay_id>.txt` files.
FP_2021_DATASET_DIR = DATASET_DIR / 'feedback-prize-2021'

In [4]:
# Peek at the first entries of the 2021 train directory (files are named `<essay_id>.txt`).
# `head` stops reading after 10 lines, which is what makes `ls` report "write error: Broken pipe".
!ls -la $FP_2021_DATASET_DIR/train | head -n 10

total 68136
drwxrwxr-x 2 root root 528384 Jul 24 12:41 .
drwxrwxr-x 3 root root   4096 Jul 24 13:21 ..
-rw-rw-r-- 1 root root   1343 Dec 10  2021 0000D23A521A.txt
-rw-rw-r-- 1 root root   3590 Dec 10  2021 00066EA9880D.txt
-rw-rw-r-- 1 root root   1527 Dec 10  2021 000E6DE9E817.txt
-rw-rw-r-- 1 root root   2707 Dec 10  2021 001552828BD0.txt
-rw-rw-r-- 1 root root   1395 Dec 10  2021 0016926B079C.txt
-rw-rw-r-- 1 root root   1097 Dec 10  2021 0019E4D09427.txt
-rw-rw-r-- 1 root root   2157 Dec 10  2021 001A03E06F3C.txt
ls: write error: Broken pipe


In [5]:
# Effectiveness training table (has `discourse_effectiveness`, `pos` and `essay_text` columns).
train_df = pd.read_csv(FP_DATASET_DIR / 'train_ext.csv')
# 2021 training table (discourse spans without effectiveness labels).
fp_2021_train_df = pd.read_csv(FP_2021_DATASET_DIR / 'train.csv')

In [6]:
# Drop the three length columns; none of them is used by the cells below.
# Not idempotent: re-running this cell without reloading `train_df` raises KeyError.
train_df = train_df.drop(['discourse_text_token_len', 'discourse_text_len', 'essay_text_len'], axis=1)

In [7]:
# Preview the effectiveness table; its column layout is the target schema for the 2021 frame.
train_df

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,pos,essay_text
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,0,"Hi, i'm Isaac, i'm going to be writing about h..."
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,1,"Hi, i'm Isaac, i'm going to be writing about h..."
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate,2,"Hi, i'm Isaac, i'm going to be writing about h..."
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate,3,"Hi, i'm Isaac, i'm going to be writing about h..."
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate,4,"Hi, i'm Isaac, i'm going to be writing about h..."
...,...,...,...,...,...,...,...
36760,9f63b687e76a,FFA381E58FC6,For many people they don't like only asking on...,Claim,Adequate,0,Some people may ask multiple people for advice...
36761,9d5bd7d86212,FFA381E58FC6,also people have different views and opinions ...,Claim,Adequate,1,Some people may ask multiple people for advice...
36762,f1b78becd573,FFA381E58FC6,Advice is something that can impact a persons ...,Position,Adequate,2,Some people may ask multiple people for advice...
36763,cc184624ca8e,FFA381E58FC6,someone can use everything that many people sa...,Evidence,Ineffective,3,Some people may ask multiple people for advice...


In [8]:
# Preview the raw 2021 table; note `discourse_id` is displayed as a float (e.g. 1.622628e+12).
fp_2021_train_df

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1.622628e+12,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1.622628e+12,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1.622628e+12,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1.622628e+12,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1.622628e+12,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...
...,...,...,...,...,...,...,...,...
144288,4C471936CD75,1.618153e+12,2234.0,3203.0,if I'm not sure what college I want to attend...,Evidence,Evidence 2,386 387 388 389 390 391 392 393 394 395 396 39...
144289,4C471936CD75,1.618153e+12,3221.0,4509.0,seeking multiple opinions before making a har...,Evidence,Evidence 3,576 577 578 579 580 581 582 583 584 585 586 58...
144290,4C471936CD75,1.618025e+12,4510.0,4570.0,it is better to seek multiple opinions instead...,Position,Position 1,828 829 830 831 832 833 834 835 836 837 838
144291,4C471936CD75,1.618025e+12,4570.0,4922.0,The impact of asking people to help you make a...,Evidence,Evidence 4,839 840 841 842 843 844 845 846 847 848 849 85...


In [9]:
# Turn the float-typed `discourse_id` into a plain integer string (e.g. '1622627660524').
# `int(x)` raises on NaN, so this assumes every row has an id.
fp_2021_train_df['discourse_id'] = fp_2021_train_df['discourse_id'].progress_apply(lambda x: str(int(x)))

  0%|          | 0/144293 [00:00<?, ?it/s]

In [11]:
# Use the same key name as `train_df`, where the essay identifier column is `essay_id`.
fp_2021_train_df = fp_2021_train_df.rename({'id': 'essay_id'}, axis=1)

In [10]:
# Spot check: the first `discourse_id` should render as a plain integer string.
# NOTE(review): the execution counter shows this cell ran before the rename cell above it.
str(int(fp_2021_train_df.iloc[0]['discourse_id']))

'1622627660524'

In [12]:
def _get_essay_text(essay_id: str) -> str:
    with open(FP_2021_DATASET_DIR / f'train/{essay_id}.txt') as f:
        return f.read().strip()

# Read every essay file once (one read per distinct essay_id) and then broadcast
# the text to all discourse rows of that essay, instead of re-reading the same
# file for each row.
_essay_text_by_id = {
    essay_id: _get_essay_text(essay_id)
    for essay_id in tqdm(fp_2021_train_df['essay_id'].unique())
}
fp_2021_train_df['essay_text'] = fp_2021_train_df['essay_id'].map(_essay_text_by_id)

  0%|          | 0/144293 [00:00<?, ?it/s]

In [13]:
class _PosGetter:

    def __init__(self):
        self._curr_essay_id: str | None = None
        self._curr_pos: int = 0

    def __call__(self, essay_id: str) -> int:
        if essay_id == self._curr_essay_id:
            self._curr_pos += 1
        else:
            self._curr_essay_id = essay_id
            self._curr_pos = 0
        return self._curr_pos

# Number each discourse within its essay (0, 1, 2, ...), in row order.
# _PosGetter only remembers the previous row's essay_id, so this relies on all
# rows of one essay being contiguous in the frame.
fp_2021_train_df['pos'] = fp_2021_train_df['essay_id'].progress_apply(_PosGetter())

  0%|          | 0/144293 [00:00<?, ?it/s]

In [16]:
# Keep the columns shared with `train_df`, in the same order (the 2021 data has no `discourse_effectiveness`).
fp_2021_train_df = fp_2021_train_df[['discourse_id', 'essay_id', 'discourse_text', 'discourse_type', 'pos', 'essay_text']]

In [17]:
# Preview the reshaped 2021 frame.
fp_2021_train_df

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,pos,essay_text
0,1622627660524,423A1CA112E2,Modern humans today are always on their phone....,Lead,0,Phones\n\nModern humans today are always on th...
1,1622627653021,423A1CA112E2,They are some really bad consequences when stu...,Position,1,Phones\n\nModern humans today are always on th...
2,1622627671020,423A1CA112E2,Some certain areas in the United States ban ph...,Evidence,2,Phones\n\nModern humans today are always on th...
3,1622627696365,423A1CA112E2,"When people have phones, they know about certa...",Evidence,3,Phones\n\nModern humans today are always on th...
4,1622627759780,423A1CA112E2,Driving is one of the way how to get around. P...,Claim,4,Phones\n\nModern humans today are always on th...
...,...,...,...,...,...,...
144288,1618153340639,4C471936CD75,if I'm not sure what college I want to attend...,Evidence,4,"In ancient times, and also still today in some..."
144289,1618153383399,4C471936CD75,seeking multiple opinions before making a har...,Evidence,5,"In ancient times, and also still today in some..."
144290,1618024996127,4C471936CD75,it is better to seek multiple opinions instead...,Position,6,"In ancient times, and also still today in some..."
144291,1618025268756,4C471936CD75,The impact of asking people to help you make a...,Evidence,7,"In ancient times, and also still today in some..."


In [24]:
# Number of distinct essays in the 2021 data.
len(set(fp_2021_train_df['essay_id'].unique()))

15594

In [23]:
# Number of 2021 essays whose essay_id does not occur in the effectiveness training table.
len(set(fp_2021_train_df['essay_id'].unique()) - set(train_df['essay_id'].unique()))

11403

In [25]:
# Essay ids present in the 2021 data but absent from `train_df`.
nooverlap_essay_id_set = set(fp_2021_train_df['essay_id'].unique()) - set(train_df['essay_id'].unique())
# Keep only the discourse rows of those essays; the original row index is preserved.
fp_2021_nooverlap_train_df = fp_2021_train_df[fp_2021_train_df['essay_id'].isin(nooverlap_essay_id_set)]

In [26]:
# Preview the non-overlapping subset.
fp_2021_nooverlap_train_df

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,pos,essay_text
10,1622575848614,A8445CABFECE,Drivers should not be able to use phones while...,Position,0,Phones & Driving\n\nDrivers should not be able...
11,1622575854436,A8445CABFECE,Drivers who used their phone while operating a...,Claim,1,Phones & Driving\n\nDrivers should not be able...
12,1622575912464,A8445CABFECE,According to an article by the Edgar Snyder Fi...,Evidence,2,Phones & Driving\n\nDrivers should not be able...
13,1622575919138,A8445CABFECE,"In conclusion, drivers should not able to work...",Concluding Statement,3,Phones & Driving\n\nDrivers should not be able...
14,1622644274071,6B4F7A0165B9,The ability to stay connected to people we kno...,Lead,0,Cell Phone Operation While Driving\n\nThe abil...
...,...,...,...,...,...,...
144279,1617802754835,AFEC37C2D43F,misdeeds are when the advice-giver is purpose...,Claim,9,There has been at least one point in everyone'...
144280,1617802760391,AFEC37C2D43F,An example of this is when you ask Generic_Nam...,Evidence,10,There has been at least one point in everyone'...
144281,1617802825492,AFEC37C2D43F,"Now, I know what you probably saying ""But what...",Counterclaim,11,There has been at least one point in everyone'...
144282,1617802831534,AFEC37C2D43F,what are the odds that seven of your close fr...,Rebuttal,12,There has been at least one point in everyone'...


In [None]:
# Write the non-overlapping 2021 rows next to `train_ext.csv`, without the index column.
# NOTE(review): "pl" presumably stands for pseudo-labeling — confirm. This cell has no
# execution count, so the saved notebook does not show it having been run.
fp_2021_nooverlap_train_df.to_csv(FP_DATASET_DIR / 'pl-2021.csv', index=False)