In [2]:
import os
import random
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from joblib import Parallel, delayed
from sklearn import metrics
from tqdm import tqdm

from transformers import (
    AdamW,
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
)
# Set the tokenizers library's parallelism env switch to "false" before any
# tokenizer is created (presumably to avoid fork-related warnings when
# multiprocessing is used later -- confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Effectiveness label -> integer class id (Ineffective=0, Adequate=1, Effective=2).
LABEL_MAPPING = {
    name: class_id
    for class_id, name in enumerate(("Ineffective", "Adequate", "Effective"))
}

In [3]:
def prepare_data(indir, tokenizer, df, max_len):
    """Tokenize every discourse element in ``df`` together with its essay.

    Each row is encoded as a text pair: ``"<discourse_type> <discourse_text>"``
    is the first segment and the contents of ``<indir>/<essay_id>.txt`` is the
    second. ``truncation='only_second'`` asks the tokenizer to shorten only the
    essay segment when the pair exceeds ``max_len``.

    Args:
        indir: Directory containing one ``<essay_id>.txt`` file per essay.
        tokenizer: Object exposing
            ``encode_plus(text, text_pair, truncation=..., max_length=...)``
            that returns a mapping (e.g. a Hugging Face tokenizer).
        df: DataFrame with ``discourse_id``, ``essay_id``, ``discourse_text``
            and ``discourse_type`` columns. ``kfold`` and
            ``discourse_effectiveness`` are optional.
        max_len: Forwarded to the tokenizer as ``max_length``.

    Returns:
        A list with one dict per row, in row order. Each dict holds
        ``discourse_id``, every key returned by the tokenizer, ``fold`` (only
        when ``df`` has a ``kfold`` column) and ``label`` (only when ``df`` has
        a ``discourse_effectiveness`` column; mapped through ``LABEL_MAPPING``).

    Raises:
        OSError: If an essay file cannot be opened.
        KeyError: If a label value is not a key of ``LABEL_MAPPING``.
    """
    # Column presence is the same for every row, so check it once up front.
    has_fold = "kfold" in df.columns
    has_label = "discourse_effectiveness" in df.columns

    # Several discourse rows usually share one essay; read each file only once.
    essay_cache = {}

    training_samples = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        id_ = row["essay_id"]
        discourse_text = row["discourse_text"]
        discourse_type = row["discourse_type"]

        if id_ not in essay_cache:
            filename = os.path.join(indir, id_ + ".txt")
            # Explicit encoding: the default depends on the platform locale and
            # can mis-decode or fail on non-ASCII characters in the essays.
            with open(filename, "r", encoding="utf-8") as f:
                essay_cache[id_] = f.read()
        text = essay_cache[id_]

        encoding = tokenizer.encode_plus(
            discourse_type + " " + discourse_text,
            text,
            truncation='only_second',
            max_length=max_len
        )

        sample = {"discourse_id": row["discourse_id"]}
        if has_fold:
            # Unlabeled/test frames may carry no fold assignment.
            sample["fold"] = row["kfold"]
        sample.update(encoding)

        if has_label:
            label = row["discourse_effectiveness"]
            sample["label"] = LABEL_MAPPING[label]

        training_samples.append(sample)
    return training_samples
def prepare_data_mp(indir, tokenizer, df, max_len, j=8):
    """Run ``prepare_data`` over ``df`` in ``j`` parallel worker processes.

    The rows of ``df`` are cut into up to ``j`` contiguous chunks of
    near-equal size, each chunk is handed to ``prepare_data`` through joblib's
    ``multiprocessing`` backend, and the per-chunk lists are concatenated in
    chunk order, so the result keeps the row order of ``df``.

    Args:
        indir: Forwarded to ``prepare_data``.
        tokenizer: Forwarded to ``prepare_data``.
        df: DataFrame to process; see ``prepare_data`` for required columns.
        max_len: Forwarded to ``prepare_data``.
        j: Number of chunks and of joblib workers (``n_jobs``).

    Returns:
        A single list containing the samples of every chunk.
    """
    # Split row *positions* rather than the frame itself: np.array_split on a
    # DataFrame goes through the deprecated DataFrame.swapaxes.
    position_chunks = np.array_split(np.arange(len(df)), j)
    # Skip empty chunks (len(df) < j) so no worker is started for nothing.
    df_splits = [df.iloc[positions] for positions in position_chunks if len(positions)]

    results = Parallel(n_jobs=j, backend="multiprocessing")(
        delayed(prepare_data)(indir, tokenizer, df_split, max_len)
        for df_split in df_splits
    )

    training_samples = []
    for result in results:
        training_samples.extend(result)

    return training_samples

In [4]:
# Load the training table from CSV. prepare_data reads its discourse_id,
# essay_id, discourse_text, discourse_type, discourse_effectiveness and kfold
# columns.
df = pd.read_csv('../data/train_folds.csv')
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,kfold
0,267e18642581,28D8A6A8A99D,For example if there is a tie in the Electoral...,Evidence,Effective,3
1,96e13c3a2475,CDBECBAD7FCE,I need to say it is very cool.,Claim,Ineffective,4
2,63025df09e33,43FDEB1E411C,Using this techology more kids will get good g...,Concluding Statement,Adequate,4
3,3b1a87c5cf9a,722F0DECA31C,"The system we have now works,so what will be t...",Claim,Adequate,4
4,4cb9af0970b2,5934838B26AD,Most of the times the choices someone will mak...,Evidence,Effective,2
...,...,...,...,...,...,...
36760,82f7539c3b4f,CFB5BFF82CDE,it was a great idea to join the program becaus...,Claim,Adequate,2
36761,3357ad5ace5a,550B399B17B3,Lucky for us we live in a pretty good town com...,Evidence,Adequate,3
36762,42ca0aee6d7d,7F63036D0C59,Most software for online schooling costs very ...,Evidence,Adequate,3
36763,50be1d88a927,AA4620DAA6CF,"But as i think abut it , the president is not ...",Claim,Adequate,1


In [5]:
# Load the tokenizer for the microsoft/deberta-v3-large checkpoint; it is the
# tokenizer passed to prepare_data below.
tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-large')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# Tokenize every row in a single process (prepare_data_mp is the parallel
# variant). max_len=1024 is forwarded to the tokenizer as max_length.
samples = prepare_data('../data/feedback-prize-effectiveness/train', tokenizer, df, max_len=1024)
# Sanity check: number of samples and the keys carried by the first one.
len(samples), list(samples[0].keys())

100%|████████████████████████████████████| 36765/36765 [00:43<00:00, 839.12it/s]


(36765,
 ['discourse_id',
  'fold',
  'input_ids',
  'token_type_ids',
  'attention_mask',
  'label'])

In [10]:
# Token count of every encoded sample, used to inspect the length distribution.
lens = [len(sample['input_ids']) for sample in samples]

In [11]:
# Length summary: mean, median, max, 90th and 95th percentile.
(
    np.mean(lens),
    np.median(lens),
    np.max(lens),
    np.quantile(lens, 0.9),
    np.quantile(lens, 0.95),
)

(568.0345709234326, 520.0, 1024, 982.0, 1024.0)

In [12]:
# Decode the first sample back to text to eyeball the encoded layout:
# discourse type + discourse text, then the essay after the separator.
tokenizer.decode(samples[0]['input_ids'])

'[CLS] Evidence For example if there is a tie in the Electoral vote the election would be handed to the House of Representatives, where state delegations vote on the presedent. This wouldnt be fair because since each state casts only one vote, then the single representativ from Wyoming would only represent 500,000 voters. However the single representative from California would represent 35 million voters. That hardly seems fair.[SEP] Dear State Senator, The Electoral College is a process where voters select electors who then vote for the President of the United State. But is it really that reliable? Is it better than an election decided by that of popular vote. The election process should be changed to one decided by popular vote instead of using the process of the Electoral College. First of all source two states that voters can sometimes get confused on the candidate that they actually voted for since they\'re not direclty voting for the President. Another flaw of the Electoral Colle

In [15]:
# discourse_id carried by the first sample, used to look up its source row in df.
samples[0]['discourse_id']

'267e18642581'

In [16]:
# Source row of the first sample, to compare its effectiveness with the encoded label.
df.loc[df['discourse_id'] == samples[0]['discourse_id']]

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,kfold
0,267e18642581,28D8A6A8A99D,For example if there is a tie in the Electoral...,Evidence,Effective,3


In [17]:
# Integer label stored on the first sample (LABEL_MAPPING maps "Effective" to 2).
samples[0]['label']

2