In [1]:
import os
from types import SimpleNamespace
import random
import time
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoModelForSequenceClassification, AutoTokenizer, AutoConfig

from tqdm import tqdm

In [2]:
# Register tqdm's pandas integration so progress-bar variants of pandas
# methods (e.g. `progress_apply`) become available on Series/DataFrames.
tqdm.pandas()

In [3]:
# Folder containing the CSV inputs; the path is relative to the notebook's
# working directory.
DATA_DIR = "../data"

In [4]:
# Read the three input tables (training set, test set, sample submission)
# from DATA_DIR in a single pass; the generator is unpacked in file order.
d_train, d_test, d_submit = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name))
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [5]:
# Preview the first rows of the training table (text, discourse type and
# the effectiveness label).
d_train.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate


In [6]:
# Preview the first rows of the test table; it has the same columns as the
# training table minus the effectiveness label.
d_test.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type
0,a261b6e14276,D72CB1C11673,Making choices in life can be very difficult. ...,Lead
1,5a88900e7dc1,D72CB1C11673,Seeking multiple opinions can help a person ma...,Position
2,9790d835736b,D72CB1C11673,it can decrease stress levels,Claim
3,75ce6d68b67b,D72CB1C11673,a great chance to learn something new,Claim
4,93578d946723,D72CB1C11673,can be very helpful and beneficial.,Claim


In [7]:
# Preview the submission layout: one row per discourse_id with one score
# column per class, in the order Ineffective, Adequate, Effective.
d_submit.head()

Unnamed: 0,discourse_id,Ineffective,Adequate,Effective
0,a261b6e14276,0.2,0.6,0.4
1,5a88900e7dc1,3.0,6.0,1.0
2,9790d835736b,1.0,2.0,3.0
3,75ce6d68b67b,0.33,0.34,0.33
4,93578d946723,0.01,0.24,0.47


In [8]:
def set_all_seed(seed=42):
    """Seed every random number generator used in this notebook.

    Covers Python's ``random`` module, NumPy's legacy global RNG and PyTorch
    (CPU and all CUDA devices), and configures cuDNN for deterministic
    behaviour.

    Args:
        seed: Integer seed applied to every generator. Defaults to 42.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker
    # processes); the hash seed of the running kernel is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one. Safe without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and pick kernels per run, which can
    # select different algorithms between runs; it must be off for
    # reproducible results.
    torch.backends.cudnn.benchmark = False

In [9]:
# Apply the default seed (42) before any model or data-loading randomness.
set_all_seed()

In [10]:
# Device name as a plain string: 'cuda' when PyTorch detects a GPU,
# otherwise 'cpu'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [11]:
# Show which device was selected.
device

'cpu'

In [12]:
# Central place for the tunable settings used throughout the notebook.
CONFIG = SimpleNamespace(
    model_name='microsoft/deberta-v3-base',  # checkpoint for tokenizer and model
    max_len=512,    # truncation length passed to the tokenizer
    classes=3,      # number of output labels for the classification head
    n_folds=5,
    # NOTE(review): 1e-3 is unusually high for fine-tuning a pretrained
    # transformer with AdamW - confirm this value is intended.
    lr=1e-3,
    epochs=5,
    batch_size=16,
)

### Text Preprocessing

In [13]:
# Inspect the label distribution of the training set.
d_train.discourse_effectiveness.value_counts()

Adequate       20977
Effective       9326
Ineffective     6462
Name: discourse_effectiveness, dtype: int64

In [14]:
# Label name -> integer class id used as the training target.
# NOTE(review): the sample submission lists its columns as Ineffective,
# Adequate, Effective, which is a different order from these ids; predictions
# will need reordering when a submission is written.
target_map = dict(Adequate=0, Effective=1, Ineffective=2)

In [15]:
# Encode the string labels as integer class ids. Any label that is not a key
# of target_map becomes NaN, so an unexpected label would surface here.
d_train['target'] = d_train.discourse_effectiveness.map(target_map)

In [16]:
# Load the tokenizer that belongs to the checkpoint named in CONFIG.model_name.
tokenizer = AutoTokenizer.from_pretrained(CONFIG.model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [17]:
# Model input text: "<discourse type> <sep token> <discourse text>", so the
# discourse type is visible to the model alongside the text itself.
d_train['inputs'] = (
    d_train['discourse_type']
    + ' ' + tokenizer.sep_token + ' '
    + d_train['discourse_text']
)

In [18]:
# Tokenize the whole training set in one call: sequences are truncated to
# CONFIG.max_len and padded to the longest sequence in the call.
# NOTE(review): X is not referenced by any later cell visible here, and
# batches are tokenized again inside the DataLoader's collate function -
# confirm whether this eager pass is still needed.
X = tokenizer(d_train.inputs.tolist(), max_length=CONFIG.max_len, truncation=True, padding=True)

In [28]:
class FeedbackDataset(Dataset):
    """Map-style dataset that yields ``(text, target)`` pairs from a DataFrame.

    The frame must expose an ``inputs`` column (raw text) and a ``target``
    column (label). No tokenization happens here; items are returned as
    stored.
    """

    def __init__(self, df):
        self.df = df
        # Snapshot both columns as plain lists so item lookup is positional
        # and does not depend on the frame's index.
        self.text = df['inputs'].tolist()
        self.target = df['target'].tolist()

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return self.df.shape[0]

    def __getitem__(self, index):
        """Return the ``(text, target)`` pair stored at position ``index``."""
        return self.text[index], self.target[index]

In [30]:
# Wrap the full training frame; it must already contain the `inputs` and
# `target` columns.
dataset_train = FeedbackDataset(d_train)

In [37]:
# Scratch check of the zip(*pairs) "unzip" idiom: x == (1, 3), y == (2, 4).
# NOTE(review): x and y are not used by any later visible cell - this looks
# like leftover experimentation.
x, y = zip(*[(1,2), (3,4)])

In [57]:
def tokenizer_fn(input_):
    """Collate a list of ``(text, target)`` samples into a single batch.

    Intended as a DataLoader ``collate_fn``. The texts are tokenized
    together, truncated to ``CONFIG.max_len`` and padded to the longest
    sequence in the batch.

    Args:
        input_: Sequence of ``(text, target)`` pairs.

    Returns:
        Tuple ``(encoded, labels)``: the tokenizer output with PyTorch
        tensors, and a ``torch.LongTensor`` of the targets.
    """
    # Unzip the samples into parallel sequences of texts and labels.
    texts, labels = zip(*input_)
    encoded = tokenizer(
        list(texts),
        max_length=CONFIG.max_len,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    return encoded, torch.LongTensor(labels)

In [58]:
# Loader that tokenizes each batch on the fly through tokenizer_fn.
# NOTE(review): batch_size is hard-coded to 2 rather than CONFIG.batch_size
# and shuffling is left at its default (off) - presumably a smoke-test
# loader; confirm before using it for training.
train_gen = DataLoader(dataset_train, batch_size=2, collate_fn=tokenizer_fn)

In [59]:
# Grab exactly one batch for inspection. Unlike a for/break loop, this raises
# StopIteration on an empty loader instead of silently leaving `text` and
# `target` undefined for the cells that follow.
text, target = next(iter(train_gen))

In [61]:
# Load the pretrained checkpoint with a sequence-classification head that has
# CONFIG.classes output labels.
model = AutoModelForSequenceClassification.from_pretrained(CONFIG.model_name, num_labels=CONFIG.classes)

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a

In [62]:
# Write the model to ../model/.
# NOTE(review): no training step precedes this in the visible cells, so this
# appears to store the freshly loaded checkpoint - confirm that is intended.
model.save_pretrained('../model/')

In [64]:
# Smoke-test forward pass on the single sampled batch; the tokenizer output
# is unpacked as keyword arguments. Gradients are tracked here (the result
# carries a grad_fn).
out = model(**text)

In [65]:
# Inspect the output: one row of logits per sample, one column per class.
out

SequenceClassifierOutput(loss=None, logits=tensor([[-0.0727,  0.0227,  0.0079],
        [-0.0741,  0.0169,  0.0119]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
class CustomModel(nn.Module):
    """nn.Module wrapper around the pretrained sequence-classification model.

    The checkpoint named by ``CONFIG.model_name`` is loaded with a
    classification head of ``CONFIG.classes`` labels and registered as the
    ``model`` submodule, so ``parameters()`` exposes its weights to the
    optimizer.
    """

    def __init__(self):
        # Must run first: nn.Module cannot register submodules otherwise.
        super().__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained(
            CONFIG.model_name, num_labels=CONFIG.classes
        )

    def forward(self, **inputs):
        """Run the wrapped model and return its raw logits.

        Args:
            **inputs: Keyword tensors as produced by the tokenizer (for
                example ``input_ids`` and ``attention_mask``).

        Returns:
            The ``logits`` tensor of the model output (unnormalized scores,
            one column per class), suitable for ``nn.CrossEntropyLoss``.
        """
        return self.model(**inputs).logits

In [None]:
# Instantiate the wrapper. This rebinds `model`, replacing the instance that
# was loaded directly from the checkpoint in an earlier cell.
model = CustomModel()

In [None]:
# Multi-class cross-entropy on raw logits against integer class ids.
criterion = nn.CrossEntropyLoss()
# AdamW over every model parameter; the learning rate comes from CONFIG
# (the original referenced the misspelled name `COFIG`, a NameError).
optimizer = optim.AdamW(model.parameters(), lr=CONFIG.lr)