In [1]:
import os
import pickle
from types import SimpleNamespace
from datetime import datetime
import random
import time
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoModelForSequenceClassification, AutoTokenizer, AutoConfig

from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
# Timestamp (YYYYmmddHHMMSS) identifying this run; used later to name the
# checkpoint directory and the saved files inside it.
training_id = f"{datetime.now():%Y%m%d%H%M%S}"

In [3]:
# Register tqdm's progress-bar integration with pandas.
# NOTE(review): no progress_apply/progress_map call is visible in this section — confirm it is still needed.
tqdm.pandas()

In [4]:
# Directory containing train.csv, test.csv and sample_submission.csv
# (relative to the notebook's working directory).
DATA_DIR = "../data"

In [5]:
# Load the training data, the test data and the sample submission from DATA_DIR.
d_train, d_test, d_submit = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name))
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [6]:
# Preview the training data: one row per discourse element, labelled with
# its discourse_type and its discourse_effectiveness.
d_train.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate


In [7]:
# Preview the test data: same columns as the training data, minus the
# discourse_effectiveness label.
d_test.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type
0,a261b6e14276,D72CB1C11673,Making choices in life can be very difficult. ...,Lead
1,5a88900e7dc1,D72CB1C11673,Seeking multiple opinions can help a person ma...,Position
2,9790d835736b,D72CB1C11673,it can decrease stress levels,Claim
3,75ce6d68b67b,D72CB1C11673,a great chance to learn something new,Claim
4,93578d946723,D72CB1C11673,can be very helpful and beneficial.,Claim


In [8]:
# Preview the submission format: one score column per class, keyed by discourse_id.
# Gotcha: the columns are ordered Ineffective, Adequate, Effective, which is NOT
# the class order used by FeedbackDataset.target_map (Adequate=0, Effective=1,
# Ineffective=2) — model outputs must be reordered before writing a submission.
d_submit.head()

Unnamed: 0,discourse_id,Ineffective,Adequate,Effective
0,a261b6e14276,0.2,0.6,0.4
1,5a88900e7dc1,3.0,6.0,1.0
2,9790d835736b,1.0,2.0,3.0
3,75ce6d68b67b,0.33,0.34,0.33
4,93578d946723,0.01,0.24,0.47


In [9]:
def set_all_seed(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch
    (CPU and all CUDA devices), and configures cuDNN for deterministic
    behaviour.

    Args:
        seed: Integer seed applied to every generator. Defaults to 42.
    """
    random.seed(seed)
    # Exported for any subprocesses; hash randomisation of the already
    # running interpreter is fixed at start-up and is not changed by this.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (safe without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick algorithms per run, which
    # can change results between runs; it must be off for reproducibility.
    torch.backends.cudnn.benchmark = False

In [10]:
# Seed all RNGs with the default seed (42) before the split and the model are created.
set_all_seed()

In [11]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [12]:
# Show which device was selected so the run log records it.
print("device:", device)

device: cpu


In [13]:
# Model settings and hyper-parameters gathered in a single namespace.
CONFIG = SimpleNamespace(
    model_name='microsoft/deberta-v3-base',  # checkpoint for tokenizer and model
    max_len=512,      # tokenizer truncation length
    classes=3,        # number of output labels
    n_folds=5,        # not referenced in the cells shown here
    lr=1e-4,          # optimizer learning rate
    epochs=5,         # not referenced in the cells shown here
    batch_size=10,    # DataLoader batch size
)

### Text Preprocessing

In [14]:
# Load the tokenizer matching the checkpoint named in CONFIG.model_name.
tokenizer = AutoTokenizer.from_pretrained(CONFIG.model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
class FeedbackDataset(Dataset):
    """Dataset yielding ``(text, target)`` pairs for discourse elements.

    The text is ``"<discourse_type> <sep_token> <discourse_text>"`` and the
    target is the integer class id of ``discourse_effectiveness`` according
    to ``target_map``.

    Args:
        df: DataFrame with ``discourse_type``, ``discourse_text`` and
            ``discourse_effectiveness`` columns. It is not modified.
        tokenizer: Object exposing a ``sep_token`` string attribute; only
            that attribute is used here.
    """

    def __init__(self, df, tokenizer):
        # Work on a private copy with a fresh 0..n-1 index: the caller's frame
        # (often a slice of another frame) is left untouched, and the .loc
        # lookups in __getitem__ are valid whatever index `df` carried.
        self.df = df.reset_index(drop=True)
        # Build the input from THIS frame's own text column. Reading the text
        # from a different frame would align rows by index label and pair
        # each discourse type with an unrelated text.
        self.df['inputs'] = (
            self.df.discourse_type + ' ' + tokenizer.sep_token + ' ' + self.df.discourse_text
        )

        # preprocessing: encode the string labels as integer class ids
        self.target_map = {'Adequate': 0, 'Effective': 1, 'Ineffective': 2}
        self.df['target'] = self.df.discourse_effectiveness.map(self.target_map)

    def __len__(self):
        """Return the number of discourse elements."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the ``(text, target)`` pair stored at row ``index``."""
        text = self.df.loc[index, 'inputs']
        target = self.df.loc[index, 'target']

        return text, target

In [16]:
# Preview the training frame again; FeedbackDataset has only been defined at
# this point, not instantiated, so the frame has not changed.
d_train.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate


In [17]:
# Hold out 20% of the rows for validation, stratified on the effectiveness
# label so both parts keep the same class proportions.
train, val = train_test_split(
    d_train,
    test_size=0.2,
    random_state=42,
    stratify=d_train["discourse_effectiveness"],
)

In [18]:
# Renumber rows 0..n-1: FeedbackDataset.__getitem__ looks rows up by label with .loc.
train = train.reset_index(drop=True)

In [19]:
# Renumber rows 0..n-1: FeedbackDataset.__getitem__ looks rows up by label with .loc.
val = val.reset_index(drop=True)

In [20]:
# Keep only the first 100 training rows.
# NOTE(review): presumably a debugging shortcut for quick runs — remove it to train on the full split.
train = train.head(100)

In [22]:
# Confirm the size of the reduced training frame.
train.shape

(100, 5)

In [24]:
# Keep only the first 100 validation rows.
# NOTE(review): presumably a debugging shortcut for quick runs — remove it to validate on the full split.
val = val.head(100)

In [25]:
# Confirm the size of the reduced validation frame.
val.shape

(100, 5)

In [26]:
# Wrap the train/validation frames in Dataset objects that yield (text, target) pairs.
dataset_train = FeedbackDataset(train, tokenizer)
dataset_val = FeedbackDataset(val, tokenizer)

In [27]:
def tokenizer_fn(input_):
    """Collate a list of ``(text, target)`` pairs into one model-ready batch.

    Intended for use as a ``DataLoader`` ``collate_fn``. The texts are
    tokenized together, truncated to ``CONFIG.max_len`` and padded to the
    longest sequence in the batch; all resulting tensors and the targets are
    moved to ``device``.

    Args:
        input_: Sequence of ``(text, target)`` pairs.

    Returns:
        Tuple ``(text_tokenize, target)``: the tokenizer output with its
        tensors on ``device``, and a ``LongTensor`` of targets on ``device``.
    """
    text, target = zip(*input_)
    text_tokenize = tokenizer(
        list(text),
        max_length=CONFIG.max_len,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    # Move whatever tensors the tokenizer produced rather than a fixed list of
    # keys: some tokenizers do not emit `token_type_ids`, and indexing that
    # key unconditionally would raise a KeyError for them.
    for key in list(text_tokenize.keys()):
        text_tokenize[key] = text_tokenize[key].to(device)

    target = torch.LongTensor(target).to(device)

    return text_tokenize, target

In [28]:
class CustomModel(nn.Module):
    """Thin ``nn.Module`` wrapper around a pretrained sequence classifier.

    The wrapped model is loaded from ``CONFIG.model_name`` with
    ``CONFIG.classes`` output labels.
    """

    def __init__(self):
        super().__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained(
            CONFIG.model_name,
            num_labels=CONFIG.classes,
        )

    def forward(self, input_):
        """Unpack the mapping ``input_`` as keyword arguments to the wrapped model and return its output."""
        return self.model(**input_)

In [29]:
# Instantiate the classifier (loads the pretrained weights) and move it to the selected device.
model = CustomModel().to(device)

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a

In [30]:
# Cross-entropy over the 3-class logits; targets are the integer class ids.
criterion = nn.CrossEntropyLoss()
# Adam over all model parameters with the learning rate from CONFIG.lr.
optimizer = optim.Adam(model.parameters(), lr=CONFIG.lr)
# Learning-rate scheduling is currently disabled (kept here for reference).
# scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5)

In [31]:
# Per-epoch mean losses; the training loop appends one value to each list per epoch.
model_history = {metric: [] for metric in ('train_loss', 'val_loss')}

In [None]:
# Training / validation loop.
#
# Each epoch runs one pass over the training set (with optimizer updates) and
# one pass over the validation set (no gradients), records the mean loss of
# both passes in `model_history`, and saves a checkpoint every 5th epoch.
# After the last epoch the loss history is pickled next to the checkpoints.

# The loaders do not shuffle and hold no per-epoch state, so build them once
# instead of once per epoch.
train_gen = DataLoader(dataset_train, batch_size=CONFIG.batch_size, collate_fn=tokenizer_fn)
val_gen = DataLoader(dataset_val, batch_size=CONFIG.batch_size, collate_fn=tokenizer_fn)

# Checkpoints and the history file share one directory per run; create it up
# front so the final history dump cannot fail on a missing directory.
model_dir = f'../model/{training_id}'
os.makedirs(model_dir, exist_ok=True)

# NOTE(review): the epoch count is hard-coded to 50 while CONFIG.epochs is 5 —
# confirm which one is intended.
for epoch in range(1, 51):
    running_loss = 0
    running_loss_val = 0

    start = time.time()

    model.train()
    # Wrap the loader itself (not enumerate) so tqdm knows the batch count.
    for batch_index, (x_train, y_train) in enumerate(tqdm(train_gen), 1):

        optimizer.zero_grad()

        out = model(x_train)

        loss = criterion(out.logits, y_train)
        # Incremental mean of the per-batch losses seen so far this epoch.
        running_loss += (loss.item() - running_loss) / batch_index

        loss.backward()
        optimizer.step()

    # If an LR scheduler is re-enabled, step it here (once per epoch).

    model.eval()
    with torch.no_grad():
        for batch_index, (x_val, y_val) in enumerate(tqdm(val_gen), 1):

            out = model(x_val)

            loss = criterion(out.logits, y_val)
            running_loss_val += (loss.item() - running_loss_val) / batch_index

    duration = time.time() - start

    model_history['train_loss'].append(running_loss)
    model_history['val_loss'].append(running_loss_val)

    # No scheduler object exists (its creation is commented out), so read the
    # learning rate(s) straight from the optimizer's parameter groups.
    current_lr = [group['lr'] for group in optimizer.param_groups]
    print(f"epoch: {epoch} | duration: {duration:.2f}s | lr: {current_lr}")
    print(f"\tTrain loss: {running_loss:.2f} | Val loss: {running_loss_val:.2f}")

    if epoch % 5 == 0:
        PATH = f'{model_dir}/DeBERTa_{training_id}_train_{running_loss:.2f}val_{running_loss_val:.2f}.pth'
        torch.save(model.state_dict(), PATH)

# Use a context manager so the history file is flushed and closed deterministically.
with open(f'{model_dir}/model_history_{training_id}.pkl', 'wb') as history_file:
    pickle.dump(model_history, history_file)

0it [00:00, ?it/s]