In [15]:
import math
import typing as t
from pathlib import Path

import pandas as pd
from tqdm.notebook import tqdm

In [12]:
# Register tqdm's pandas integration so that `DataFrame.progress_apply`
# (used by `preprocess_pred_part` below) is available. This must run before
# any cell that calls `progress_apply`.
tqdm.pandas()

In [2]:
# Directory that holds the input CSVs read by this notebook and receives the
# `train_meta.csv` / `test_meta.csv` files it writes.
# NOTE(review): hard-coded absolute path — adjust when running on another machine.
DATASET_DIR_PATH = Path('/root/data/datasets/fp-0p925')

In [3]:
# List the dataset directory to confirm which input files are present.
!ls -la $DATASET_DIR_PATH

total 353676
drwxrwxr-x 2 root root      4096 Aug 11 14:03  .
drwxrwxr-x 5 root root      4096 Aug 11 14:04  ..
-rw-rw-r-- 1 root root 181093973 Jul 30 12:29  TRAIN_DB.csv
-rw-r--r-- 1 root root  36381617 Aug 11 14:00 'prediction[0].csv'
-rw-r--r-- 1 root root  37668899 Aug 11 14:01 'prediction[1].csv'
-rw-r--r-- 1 root root  36260154 Aug 11 14:01 'prediction[2].csv'
-rw-r--r-- 1 root root  35455520 Aug 11 14:02 'prediction[3].csv'
-rw-r--r-- 1 root root  35278414 Aug 11 14:02 'prediction[4].csv'


In [4]:
# Load the training table.
# NOTE(review): `train_db_df` is not referenced again in the visible cells —
# confirm it is still needed before keeping this (181 MB) read.
train_db_df = pd.read_csv(DATASET_DIR_PATH / 'TRAIN_DB.csv')

In [31]:
class _LogitToProbPeprocessor:
    """Row-wise softmax over the three class-logit columns.

    An instance is bound to one of the columns in ``_COL_LIST`` and, when
    called with a row (anything indexable by column name, e.g. a dict or a
    ``pandas.Series``), returns the softmax probability of that column
    relative to all columns in ``_COL_LIST``.
    """

    # Logit columns that take part in the softmax normalisation.
    _COL_LIST = ['Ineffective', 'Adequate', 'Effective']

    def __init__(self, col: str):
        """Bind the processor to the logit column ``col``."""
        self._col = col

    def __call__(self, row: t.Dict[str, float]) -> float:
        """Return ``softmax(row[_COL_LIST])`` evaluated at the bound column.

        Raises:
            KeyError: if ``row`` lacks the bound column or any of ``_COL_LIST``.
        """
        # Shift every logit by the largest one before exponentiating. The
        # result is mathematically unchanged, but `math.exp` can no longer
        # raise OverflowError on large logits (e.g. exp(1000)).
        peak = max(row[col] for col in self._COL_LIST)
        denominator = sum(math.exp(row[col] - peak) for col in self._COL_LIST)
        return math.exp(row[self._col] - peak) / denominator


def create_pos_col(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a ``pos`` column added.

    ``pos`` is the zero-based ordinal of each row among the rows that share
    its ``essay_id``, in the order the rows appear in ``df``.

    The positions are computed in one vectorised pass with
    ``groupby(...).cumcount()`` instead of re-scanning the whole frame once
    per essay and once per discourse. Positions are assigned per row, so two
    rows that happen to share a ``discourse_id`` no longer overwrite each
    other's position.

    Args:
        df: frame with an ``essay_id`` column. It is not modified.

    Returns:
        A new frame with all original columns plus an integer ``pos`` column
        (replacing any existing ``pos`` column).

    Raises:
        KeyError: if ``df`` has no ``essay_id`` column.
        AssertionError: if any ``essay_id`` is missing (such rows cannot be
            assigned to an essay and therefore have no position).
    """
    assert not df['essay_id'].isna().any(), 'rows with missing `essay_id` have no position'
    df = df.copy()
    # `.to_numpy()` makes the assignment positional, so a non-unique index
    # (e.g. after `pd.concat`) cannot cause misalignment.
    df['pos'] = df.groupby('essay_id', sort=False).cumcount().to_numpy()
    return df


def preprocess_pred_part(df: pd.DataFrame) -> pd.DataFrame:
    """Add softmax score columns and the ``pos`` column to one prediction part.

    For each logit column (``Ineffective``, ``Adequate``, ``Effective``) a
    matching ``score_*`` column is computed row by row with
    ``_LogitToProbPeprocessor``; afterwards ``create_pos_col`` adds ``pos``.

    The input frame is left untouched: the score columns are written to a
    copy rather than to the caller's object.

    Requires ``tqdm.pandas()`` to have been called, because the row-wise pass
    uses ``DataFrame.progress_apply``.

    Args:
        df: frame containing the three logit columns plus the columns
            ``create_pos_col`` needs.

    Returns:
        A new frame with ``score_ineffective``, ``score_adequate``,
        ``score_effective`` and ``pos`` appended, in that order.
    """
    score_col_by_logit_col = {
        'Ineffective': 'score_ineffective',
        'Adequate': 'score_adequate',
        'Effective': 'score_effective',
    }
    # Copy first so the caller's frame does not silently gain columns.
    scored = df.copy()
    for logit_col, score_col in score_col_by_logit_col.items():
        scored[score_col] = scored.progress_apply(_LogitToProbPeprocessor(logit_col), axis=1)
    return create_pos_col(scored)


def create_pred_df(dataset_dir_path: Path, num_folds: int) -> pd.DataFrame:
    """Load, preprocess and stack the per-fold prediction files.

    Reads ``prediction[0].csv`` … ``prediction[num_folds - 1].csv`` from
    ``dataset_dir_path``, runs each through ``preprocess_pred_part`` and
    concatenates the results in fold order.

    The parts are concatenated as-is, so each part keeps its own index
    labels in the returned frame.
    """
    fold_parts = []
    for fold_idx in range(num_folds):
        raw_part = pd.read_csv(dataset_dir_path / f'prediction[{fold_idx}].csv')
        fold_parts.append(preprocess_pred_part(raw_part))
    return pd.concat(fold_parts)

In [21]:
# Build the combined out-of-fold prediction frame. `num_folds=5` matches the
# five `prediction[0..4].csv` files shown in the directory listing above.
pred_df = create_pred_df(dataset_dir_path=DATASET_DIR_PATH, num_folds=5)

  0%|          | 0/6884 [00:00<?, ?it/s]

  0%|          | 0/6884 [00:00<?, ?it/s]

  0%|          | 0/6884 [00:00<?, ?it/s]

Computing `pos` column:   0%|          | 0/773 [00:00<?, ?it/s]

  0%|          | 0/6796 [00:00<?, ?it/s]

  0%|          | 0/6796 [00:00<?, ?it/s]

  0%|          | 0/6796 [00:00<?, ?it/s]

Computing `pos` column:   0%|          | 0/773 [00:00<?, ?it/s]

  0%|          | 0/6827 [00:00<?, ?it/s]

  0%|          | 0/6827 [00:00<?, ?it/s]

  0%|          | 0/6827 [00:00<?, ?it/s]

Computing `pos` column:   0%|          | 0/773 [00:00<?, ?it/s]

  0%|          | 0/6780 [00:00<?, ?it/s]

  0%|          | 0/6780 [00:00<?, ?it/s]

  0%|          | 0/6780 [00:00<?, ?it/s]

Computing `pos` column:   0%|          | 0/774 [00:00<?, ?it/s]

  0%|          | 0/6801 [00:00<?, ?it/s]

  0%|          | 0/6801 [00:00<?, ?it/s]

  0%|          | 0/6801 [00:00<?, ?it/s]

Computing `pos` column:   0%|          | 0/774 [00:00<?, ?it/s]

In [22]:
# Inspect the combined frame. Note that the index labels shown are per-fold
# (they restart for each concatenated part), so they are not unique row ids.
pred_df

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,essay,before,after,label,fold,Ineffective,Adequate,Effective,score_ineffective,score_adequate,score_effective,pos
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,"Hi, i'm Isaac, i'm going to be writing about h...",NO,"On my perspective, I think that the face is a ...",1,0,0.191439,2.382967,-1.575415,0.098817,0.884299,0.016885,0
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,"Hi, i'm Isaac, i'm going to be writing about h...","Hi, i'm Isaac, i'm going to be writing about h...",m\n\nI think that the face is a natural landfo...,1,0,0.267513,2.655187,-2.145237,0.083488,0.909034,0.007478,1
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate,"Hi, i'm Isaac, i'm going to be writing about h...","Hi, i'm Isaac, i'm going to be writing about h...","se next few paragraphs, I'll be talking about ...",1,0,0.504716,2.831645,-2.550461,0.088546,0.907282,0.004172,2
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate,"Hi, i'm Isaac, i'm going to be writing about h...","Hi, i'm Isaac, i'm going to be writing about h...",.\n\nPeople thought that the face was formed b...,1,0,1.570769,2.148782,-3.007700,0.358068,0.638254,0.003678,3
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate,"Hi, i'm Isaac, i'm going to be writing about h...","Hi, i'm Isaac, i'm going to be writing about h...","though some say that life on Mars does exist, ...",1,0,1.441342,2.352720,-3.164890,0.285899,0.711245,0.002856,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6796,9f63b687e76a,FFA381E58FC6,For many people they don't like only asking on...,Claim,Adequate,Some people may ask multiple people for advice...,Some people may ask multiple people for advice...,"e, also people have different views and opinio...",1,4,-1.096351,2.255865,-0.321535,0.031510,0.900108,0.068382,0
6797,9d5bd7d86212,FFA381E58FC6,also people have different views and opinions ...,Claim,Adequate,Some people may ask multiple people for advice...,Some people may ask multiple people for advice...,.\n\nAdvice is something that can impact a per...,1,4,-0.182667,2.381543,-1.295477,0.069837,0.907212,0.022951,1
6798,f1b78becd573,FFA381E58FC6,Advice is something that can impact a persons ...,Position,Adequate,Some people may ask multiple people for advice...,Some people may ask multiple people for advice...,y.\n\nSpeaking to more than one person to get ...,1,4,0.799952,1.667235,-1.870000,0.289881,0.690043,0.020076,2
6799,cc184624ca8e,FFA381E58FC6,someone can use everything that many people sa...,Evidence,Ineffective,Some people may ask multiple people for advice...,Some people may ask multiple people for advice...,y. In conclusion asking for an opinion can be ...,0,4,0.478447,1.551168,-1.065458,0.241729,0.706651,0.051620,3


In [23]:
# Persist the training meta table; `index=False` leaves the (non-unique)
# index labels out of the file.
pred_df.to_csv(DATASET_DIR_PATH / 'train_meta.csv', index=False)

In [36]:
# Load the test table and the test-set predictions.
# NOTE(review): neither file appears in the directory listing printed earlier
# in this notebook — presumably they were added afterwards; confirm both
# exist before a fresh Restart & Run All.
test_db_df = pd.read_csv(DATASET_DIR_PATH / 'TEST_DB.csv')
test_pred_df = pd.read_csv(DATASET_DIR_PATH / 'predicted_test_by_v5DBlS42.csv')

In [37]:
# Attach the predictions to the test table. This is an inner join on
# `discourse_id`, so rows without a match on either side are dropped.
test_meta_df = pd.merge(
    left=test_db_df,
    right=test_pred_df,
    how='inner',
    on='discourse_id',
)

In [38]:
# Add the per-essay `pos` column, then give the prediction columns the same
# `score_*` names used in the training meta table.
test_meta_df = create_pos_col(test_meta_df).rename(
    columns={
        'Ineffective': 'score_ineffective',
        'Adequate': 'score_adequate',
        'Effective': 'score_effective',
    },
)

Computing `pos` column:   0%|          | 0/324 [00:00<?, ?it/s]

In [39]:
# Inspect the merged test meta table.
test_meta_df

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,essay,before,after,score_ineffective,score_adequate,score_effective,pos
0,ba3f708db030,02A3E737A10F,Students shouldn't have to participate in one ...,Position,Adequate,Students shouldn't have to participate in one ...,NO,\nStudents have other things to do at home. Ma...,0.083901,0.879126,0.036973,0
1,efeb7a805421,02A3E737A10F,Students have other things to do at home,Claim,Adequate,Students shouldn't have to participate in one ...,Students shouldn't have to participate in one ...,. Many students have a lot of things to do at ...,0.031914,0.915504,0.052582,1
2,03205305e7bd,02A3E737A10F,Many students have a lot of things to do at ho...,Evidence,Effective,Students shouldn't have to participate in one ...,Students shouldn't have to participate in one ...,I disagree with the principal that every stude...,0.027744,0.464655,0.507601,2
3,e0c8f73f08a4,02A3E737A10F,I disagree with the principal that every stude...,Claim,Adequate,Students shouldn't have to participate in one ...,Students shouldn't have to participate in one ...,Some kids just want to go home and relax a bit...,0.041492,0.835423,0.123085,3
4,0d1e519f6453,02A3E737A10F,Some kids just want to go home and relax a bit...,Claim,Adequate,Students shouldn't have to participate in one ...,Students shouldn't have to participate in one ...,I think that students don't need to participat...,0.030051,0.879841,0.090108,4
...,...,...,...,...,...,...,...,...,...,...,...,...
2664,c351cd6214d7,FAA9FC1B315B,The second reason why I disagree and I really ...,Claim,Adequate,In my personal opinion when some schools offer...,In my personal opinion when some schools offer...,Here in school we do so much activities connec...,0.066995,0.872957,0.060047,4
2665,b6cb9c9a873c,FAA9FC1B315B,Here in school we do so much activities connec...,Evidence,Adequate,In my personal opinion when some schools offer...,In my personal opinion when some schools offer...,on.\n\nMy last and final reason is that you mi...,0.125136,0.830194,0.044670,5
2666,2adbc4226dbc,FAA9FC1B315B,My last and final reason is that you might get...,Claim,Adequate,In my personal opinion when some schools offer...,In my personal opinion when some schools offer...,. That might affect your grade too which is ve...,0.031974,0.934834,0.033193,6
2667,31e16304039d,FAA9FC1B315B,That might affect your grade too which is very...,Evidence,Adequate,In my personal opinion when some schools offer...,In my personal opinion when some schools offer...,y.\n\nThose are my good reason why i don't agr...,0.088147,0.851548,0.060305,7


In [40]:
# Persist the test meta table next to the training one, without the index.
test_meta_df.to_csv(DATASET_DIR_PATH / 'test_meta.csv', index=False)