In [46]:
import os
import random
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from joblib import Parallel, delayed
from sklearn import metrics
from tqdm import tqdm

from transformers import (
    AdamW,
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
)
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
LABEL_MAPPING = {"Ineffective": 0, "Adequate": 1, "Effective": 2}

In [43]:
def prepare_data(indir, tokenizer, df, max_len):
    training_samples = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        idx = row["essay_id"]
        discourse_text = row["discourse_text"]
        discourse_type = row["discourse_type"]

        filename = os.path.join(indir, idx + ".txt")

        with open(filename, "r") as f:
            text = f.read()

        encoding = tokenizer.encode_plus(
            discourse_type + " " + discourse_text,
            text,
            truncation=True,
            max_length=max_len
        )

        sample = {
            "discourse_id": row["discourse_id"],
            "fold": row["kfold"],
            **encoding,
        }

        if "discourse_effectiveness" in row:
            label = row["discourse_effectiveness"]
            sample["label"] = LABEL_MAPPING[label]

        training_samples.append(sample)
    return training_samples
def prepare_data_mp(indir, tokenizer, df, max_len, j=8):
    training_samples = []

    df_splits = np.array_split(df, j)

    results = Parallel(n_jobs=j, backend="multiprocessing")(
        delayed(prepare_data)(indir, tokenizer, df, max_len) for df in df_splits
    )
    for result in results:
        training_samples.extend(result)

    return training_samples

In [44]:
df = pd.read_csv('../data/train_folds.csv')
df

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,kfold
0,267e18642581,28D8A6A8A99D,For example if there is a tie in the Electoral...,Evidence,Effective,3
1,96e13c3a2475,CDBECBAD7FCE,I need to say it is very cool.,Claim,Ineffective,4
2,63025df09e33,43FDEB1E411C,Using this techology more kids will get good g...,Concluding Statement,Adequate,4
3,3b1a87c5cf9a,722F0DECA31C,"The system we have now works,so what will be t...",Claim,Adequate,4
4,4cb9af0970b2,5934838B26AD,Most of the times the choices someone will mak...,Evidence,Effective,2
...,...,...,...,...,...,...
36760,82f7539c3b4f,CFB5BFF82CDE,it was a great idea to join the program becaus...,Claim,Adequate,2
36761,3357ad5ace5a,550B399B17B3,Lucky for us we live in a pretty good town com...,Evidence,Adequate,3
36762,42ca0aee6d7d,7F63036D0C59,Most software for online schooling costs very ...,Evidence,Adequate,3
36763,50be1d88a927,AA4620DAA6CF,"But as i think abut it , the president is not ...",Claim,Adequate,1


In [34]:
tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-large')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [47]:
samples = prepare_data_mp('../data/feedback-prize-effectiveness/train', tokenizer, df, max_len=100000, j=8)
len(samples), list(samples[0].keys())

Process SpawnPoolWorker-49:
Traceback (most recent call last):
  File "/Users/yuchenwang/miniconda3/envs/kaggle/lib/python3.10/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/yuchenwang/miniconda3/envs/kaggle/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/yuchenwang/miniconda3/envs/kaggle/lib/python3.10/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/Users/yuchenwang/miniconda3/envs/kaggle/lib/python3.10/site-packages/joblib/pool.py", line 147, in get
    return recv()
  File "/Users/yuchenwang/miniconda3/envs/kaggle/lib/python3.10/multiprocessing/connection.py", line 256, in recv
    return _ForkingPickler.loads(buf.getbuffer())
AttributeError: Can't get attribute 'prepare_data' on <module '__main__' (built-in)>
Process SpawnPoolWorker-50:
Traceback (most recent call last):
  File "/Users/yuchenwang/miniconda3/envs/kaggle/lib/python3.10/multiprocessing/

KeyboardInterrupt: 

In [37]:
lens = [len(s['input_ids']) for s in training_samples]

In [41]:
np.mean(lens), np.median(lens), np.max(lens), np.quantile(lens, 0.9), np.quantile(lens, 0.95)

(580.8354413164694, 520.0, 2085, 982.0, 1116.0)