In [1]:
import os
import pandas as pd
import torch
from datasets import list_datasets, load_dataset, Dataset
from transformers import AutoModel, AutoModelForSequenceClassification, AutoTokenizer
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [20]:
class CFG:
    wandb = False
    apex = True #
    model = 'microsoft/deberta-v3-base'
    fast = True
    seed = 42
    n_splits = 5
    max_len = 512
    dropout = 0.1
    target_size = 3
    print_freq = 50
    min_lr = 1e-6
    scheduler = 'cosine'
    batch_size = 8
    num_workers = 0
    lr = 3e-5
    weigth_decay = 0.01
    epochs = 3
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]
    train = True 
    num_warmup_steps = 0 #
    num_cycles=0.5 #
    CVs = []
    debug = False
    debug_ver2 = False
    gradient_checkpointing = True
    AMP = False
    freezing = True
    # after_freezed_parameters = []
    
    n_accumulate= 1

if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0, 1]
    CFG.print_freq = 10

if CFG.debug_ver2:
    CFG.epochs = 1
    CFG.trn_fold = [0, 1]

In [4]:
DATA_PATH = '../data'

In [5]:
all_train = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))

In [8]:
all_train.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate


In [9]:
test.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type
0,a261b6e14276,D72CB1C11673,Making choices in life can be very difficult. ...,Lead
1,5a88900e7dc1,D72CB1C11673,Seeking multiple opinions can help a person ma...,Position
2,9790d835736b,D72CB1C11673,it can decrease stress levels,Claim
3,75ce6d68b67b,D72CB1C11673,a great chance to learn something new,Claim
4,93578d946723,D72CB1C11673,can be very helpful and beneficial.,Claim


In [11]:
all_train.discourse_type.value_counts()

Evidence                12105
Claim                   11977
Position                 4024
Concluding Statement     3351
Lead                     2291
Counterclaim             1773
Rebuttal                 1244
Name: discourse_type, dtype: int64

In [12]:
all_train.discourse_effectiveness.value_counts()

Adequate       20977
Effective       9326
Ineffective     6462
Name: discourse_effectiveness, dtype: int64

In [14]:
all_train['target'] = all_train.discourse_effectiveness.map({
    'Ineffective': 0,
    'Adequate': 1,
    'Effective': 2
})

### Split train > train, val

In [16]:
train, val = train_test_split(all_train, test_size=0.2, random_state=42, stratify=all_train.discourse_type)

In [35]:
train.shape

(29412, 7)

In [37]:
val.shape

(7353, 7)

In [17]:
train.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,target
28397,a1606e6446ba,BAAE75D65C1D,it would be a good thing not to really use a c...,Position,Adequate,1
32844,1c06c6e0e66f,FE4703854EB4,"Personally, I think that the Electoral College...",Position,Adequate,1
15518,74a39aec56bb,0F7934223515,Helping the community can make you feel good a...,Claim,Effective,2
3923,73c62197d0f4,44D318388E8C,But that problem can be easily fixed because t...,Rebuttal,Effective,2
23104,de6209c828da,74540DF965DE,"Yes, the cars we have today are not fully reli...",Counterclaim,Adequate,1


In [18]:
val.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,target
25739,485ae68a3c91,997D5145CB33,there isnt any at proves or gives an idea that...,Evidence,Adequate,1
26797,ef7ae807c71b,A7642F5617E0,"By doing this, you and everyone else who parti...",Claim,Effective,2
18861,a88c86826e33,3BC989105A18,There are alot of people that want the elction...,Concluding Statement,Adequate,1
2596,b233aa20a9b6,2EEE167F8C62,The reasons why NASA would not want to cover o...,Claim,Ineffective,0
35369,00b24149a686,9D0934987831,a doctors opinion,Claim,Adequate,1


### Load model

In [21]:
tokenizer = AutoTokenizer.from_pretrained(CFG.model)

Downloading:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [22]:
model = AutoModelForSequenceClassification.from_pretrained(CFG.model)

Downloading:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a

In [44]:
model = model.to(device)

### Dataset Loader

In [24]:
tokenizer.sep_token

'[SEP]'

In [32]:
train['text'] = train.apply(lambda x: x.discourse_text + tokenizer.sep_token + x.discourse_type, axis=1)

In [33]:
val['text'] = val.apply(lambda x: x.discourse_text + tokenizer.sep_token + x.discourse_type, axis=1)

In [38]:
train_dts = Dataset.from_pandas(train)

In [39]:
val_dts = Dataset.from_pandas(val)

In [40]:
train_dts

Dataset({
    features: ['discourse_id', 'essay_id', 'discourse_text', 'discourse_type', 'discourse_effectiveness', 'target', 'text', '__index_level_0__'],
    num_rows: 29412
})

In [43]:
val_dts

Dataset({
    features: ['discourse_id', 'essay_id', 'discourse_text', 'discourse_type', 'discourse_effectiveness', 'target', 'text', '__index_level_0__'],
    num_rows: 7353
})

In [50]:
def tokenize(batch):
    result = tokenizer(batch['text'], padding=True, truncation=True)
    return result

In [51]:
train_encoded = train_dts.map(tokenize)



  0%|          | 0/29412 [00:00<?, ?ex/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [52]:
val_encoded = val_dts.map(tokenize)

  0%|          | 0/7353 [00:00<?, ?ex/s]

In [53]:
train_encoded

Dataset({
    features: ['discourse_id', 'essay_id', 'discourse_text', 'discourse_type', 'discourse_effectiveness', 'target', 'text', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 29412
})