In [2]:
import numpy as np
import pandas as pd
import os
import torch.nn as nn
from torch import optim
from torch.utils.data import Dataset, DataLoader
import torch
import random
from torch.cuda.amp import autocast, GradScaler
import time
from transformers import AutoModel, AutoConfig, get_linear_schedule_with_warmup, AutoTokenizer
from sklearn.metrics import log_loss
import torch.utils.checkpoint
import logging
from contextlib import contextmanager
import sys
import gc


In [3]:
# Experiment identifier.
ex = '001'

# Input CSVs: the training frame carries a 'fold' column, the test frame does not need one.
TRAIN_PATH, TEST_PATH = '../../data/train_folds.csv', '../../data/test.csv'

# Run on the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


In [4]:
# Batch size for the inference loaders and the fixed token length that
# FeedbackDataset pads/truncates every sequence to.
batch_size, max_len = 32, 512

# Local directory the tokenizer is loaded from.
deberta_v3_large_MODEL_PATH = './tokenizer'
deberta_v3_large_tokenizer = AutoTokenizer.from_pretrained(
    deberta_v3_large_MODEL_PATH
)


In [5]:
@contextmanager
def timer(name):
    """Context manager that reports how long the wrapped block took.

    Args:
        name: Label printed alongside the elapsed time.

    The elapsed wall-clock time is printed when the block exits, including
    when it exits through an exception (the exception still propagates).
    """
    t0 = time.time()
    try:
        yield
    finally:
        # Previously t0 was captured but never read, so nothing was timed.
        print(f'[{name}] done in {time.time() - t0:.0f} s')



In [12]:
class FeedbackDataset(Dataset):
    """Tokenizes one text per item for the 3-class classifier.

    Args:
        tokenizer: Callable tokenizer exposing ``sep_token``; it is invoked
            with ``return_tensors='pt'``.
        data_dict: Table with a ``'text'`` column and, optionally, a
            ``'label'`` column of integer class indices in ``[0, 3)``.
        max_len: Length every sequence is padded/truncated to.

    Each item is ``(input_ids, attention_mask, target)`` when labels are
    available, where ``target`` is a length-3 one-hot int64 array, and
    ``(input_ids, attention_mask)`` when ``data_dict`` has no ``'label'``
    column (e.g. unlabeled test data).
    """
    def __init__(self, tokenizer, data_dict, max_len):
        self.tokenizer=tokenizer
        self.max_len=max_len
        self.texts = data_dict['text'].values
        # Labels are optional so the same dataset can serve unlabeled data.
        self.labels = data_dict['label'].values if 'label' in data_dict else None

    def __len__(self):
        # Count texts rather than labels: labels may be absent.
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        sep_token = self.tokenizer.sep_token
        if sep_token is not None:
            # Swap the literal '[SEP]' placeholder for this tokenizer's own
            # separator token.
            text = text.replace('[SEP]', sep_token)
        # Tokenize the substituted text; the original code tokenized
        # self.texts[index] and silently discarded the replacement above.
        tokenized = self.tokenizer(text=text,
                                   add_special_tokens=True,
                                   max_length=self.max_len,
                                   padding='max_length',
                                   truncation=True,
                                   return_tensors='pt')
        input_ids = tokenized['input_ids'].squeeze()
        attention_mask = tokenized['attention_mask'].squeeze()
        if self.labels is None:
            return input_ids, attention_mask
        # One-hot encode the class index.
        target = np.zeros(3, dtype=np.int64)
        target[self.labels[index]] = 1
        return input_ids, attention_mask, target

In [7]:
class MeanPooling(nn.Module):
    """Masked mean over dimension 1 of the encoder's hidden states."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Average hidden states over positions where the mask is non-zero.

        The mask is broadcast to the shape of ``last_hidden_state``; both the
        weighted sum and the mask count are reduced over dimension 1. The
        count is floored at 1e-9 so an all-zero mask yields zeros instead of
        a division by zero.
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        token_counts = mask.sum(1).clamp(min=1e-9)
        weighted_sum = (last_hidden_state * mask).sum(1)
        return weighted_sum / token_counts


class FeedbackModel(nn.Module):
    """Transformer backbone, mean pooling and a multi-sample-dropout linear head.

    Args:
        model_name: Hub name/path of the backbone. Used to load the config
            (when ``config_path`` is None) and pretrained weights (when
            ``pretrained`` is True), and to select how many lower layers to
            freeze. May be None when both come from elsewhere.
        config_path: Optional path to a config object saved with ``torch.save``.
        pretrained: Load pretrained weights if True, otherwise build the
            architecture from the config only.
        num_labels: Size of the output layer.

    ``forward`` returns raw logits (no softmax), averaged over five dropout
    rates applied to the same pooled embedding.
    """

    def __init__(self,model_name=None, config_path=None, pretrained=True, num_labels=3):
        super(FeedbackModel,self).__init__()
        if config_path is None:
            self.config = AutoConfig.from_pretrained(model_name, output_hidden_states=True)
        else:
            # NOTE(review): torch.load unpickles arbitrary objects; only point
            # config_path at files you trust.
            self.config = torch.load(config_path)

        if pretrained:
            self.model = AutoModel.from_pretrained(model_name, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        self.drop_out = nn.Dropout(0.1)  # not used in forward; kept for compatibility
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)
        self.pooler = MeanPooling()
        self.output = nn.Linear(self.config.hidden_size,num_labels)

        # model_name defaults to None; normalise it so the substring checks
        # below cannot raise TypeError.
        self._freeze_lower_layers(model_name or '')
#         self.model.gradient_checkpointing_enable()

    def _freeze_lower_layers(self, model_name):
        """Disable gradients for the embeddings and the lowest encoder layers.

        Attribute lookups are guarded because not every backbone exposes
        ``embeddings`` / ``encoder.layer`` (funnel models use
        ``encoder.blocks``); the previous unconditional access raised
        AttributeError for those backbones.
        """
        embeddings = getattr(self.model, 'embeddings', None)
        if embeddings is not None:
            embeddings.requires_grad_(False)

        encoder = getattr(self.model, 'encoder', None)
        layers = getattr(encoder, 'layer', None)
        if layers is not None:
            num_frozen = 2  # default: freeze the first two layers
            if 'deberta-v2-xxlarge' in model_name:
                num_frozen = 24  # freeze 24 of 48 layers
            if 'deberta-v2-xlarge' in model_name:
                num_frozen = 12  # freeze 12 of 24 layers
            layers[:num_frozen].requires_grad_(False)

        blocks = getattr(encoder, 'blocks', None)
        if blocks is not None and 'funnel-transformer-xlarge' in model_name:
            blocks[:1].requires_grad_(False)  # freeze 1 of 3 blocks

    def forward(self, input_ids, attention_mask):
        """Return logits of shape ``(batch, num_labels)`` for the given batch."""
        if 'gpt' in self.model.name_or_path:
            # GPT-style backbones are called without the attention mask here.
            emb = self.model(input_ids)[0]
        else:
            emb = self.model(input_ids,attention_mask)[0]

        emb = self.pooler(emb, attention_mask)
        # Multi-sample dropout: one shared head, five dropout rates, averaged.
        preds1 = self.output(self.dropout1(emb))
        preds2 = self.output(self.dropout2(emb))
        preds3 = self.output(self.dropout3(emb))
        preds4 = self.output(self.dropout4(emb))
        preds5 = self.output(self.dropout5(emb))

        preds = (preds1 + preds2 + preds3 + preds4 + preds5) / 5
#         preds = torch.softmax(preds, dim=-1)

        return preds

In [8]:
from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler: emit the unencodable span as UTF-8 bytes.

    Returns the replacement bytes and the position at which encoding resumes.
    """
    failed_span = error.object[error.start : error.end]
    return failed_span.encode("utf-8"), error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler: decode the undecodable span as cp1252 instead.

    Returns the replacement text and the position at which decoding resumes.
    """
    failed_span = error.object[error.start : error.end]
    return failed_span.decode("cp1252"), error.end

# Make both fallback handlers available by name to str.encode / bytes.decode
# via their `errors=` argument.
for _handler_name, _handler in (
    ("replace_encoding_with_utf8", replace_encoding_with_utf8),
    ("replace_decoding_with_cp1252", replace_decoding_with_cp1252),
):
    codecs.register_error(_handler_name, _handler)

def resolve_encodings_and_normalize(text: str) -> str:
    """Repair mixed utf-8 / cp1252 encoding damage, then transliterate.

    The text is round-tripped through bytes twice, using the registered
    fallback error handlers whenever a span cannot be handled by the primary
    codec, and the result is passed through ``unidecode``.
    """
    raw_bytes = text.encode("raw_unicode_escape")
    first_pass = raw_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    cp1252_bytes = first_pass.encode("cp1252", errors="replace_encoding_with_utf8")
    repaired = cp1252_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    return unidecode(repaired)

def add_separators(context):
    """Prefix ``context`` with '[SEP]' when it is longer than one element.

    Inputs of length 0 or 1 are returned unchanged.
    """
    return '[SEP]' + context if len(context) > 1 else context

def get_essay(essay_id, is_train=True):
    """Read the full text of one essay from disk.

    Args:
        essay_id: File stem; the essay is read from ``<essay_id>.txt``.
        is_train: Read from ``INPUT_DIR + 'train'`` when True, otherwise from
            ``INPUT_DIR + 'test'``.

    Returns:
        The file contents as a string.

    ``INPUT_DIR`` is a module-level variable and must be defined before the
    first call.
    """
    parent_path = INPUT_DIR + ('train' if is_train else 'test')
    essay_path = os.path.join(parent_path, f"{essay_id}.txt")
    # Use a context manager so the handle is closed deterministically; this
    # function is applied once per row of the data frame.
    with open(essay_path, 'r') as essay_file:
        return essay_file.read()


In [9]:
# Root directory that get_essay() resolves the 'train' / 'test' essay folders from.
INPUT_DIR = '../../data/'

train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

# Attach the raw essay to every test row, then clean both text columns.
test['essay_text'] = test['essay_id'].apply(get_essay, is_train=False)
test['discourse_text'] = test['discourse_text'].apply(resolve_encodings_and_normalize)
test['essay_text'] = test['essay_text'].apply(resolve_encodings_and_normalize)

# Model input: "<discourse_type> <discourse_text>[SEP]<essay_text>".
test['text'] = (
    test['discourse_type']
    + ' '
    + test['discourse_text']
    + '[SEP]'
    + test['essay_text']
)

# Fold id of every training row, used to scatter out-of-fold predictions.
fold_array = train['fold'].values

In [10]:
class AverageMeter(object):
    """Keeps the most recent value and a running weighted average."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all accumulated statistics."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the average."""
        self.count += n
        self.sum += val * n
        self.val = val
        self.avg = self.sum / self.count

def softmax(z):
    """Row-wise softmax of a 2-D array.

    The per-row maximum is subtracted before exponentiating so large logits
    do not overflow.
    """
    assert z.ndim == 2
    row_max = np.max(z, axis=1, keepdims=True)
    exponentials = np.exp(z - row_max)
    return exponentials / np.sum(exponentials, axis=1, keepdims=True)

def get_score(y_true, y_pred):
    """Log loss of raw logits ``y_pred`` against ``y_true``, rounded to 5 places.

    ``y_pred`` is converted to probabilities with the row-wise ``softmax``
    before scoring.
    """
    probabilities = softmax(y_pred)
    return round(log_loss(y_true, probabilities), 5)

In [13]:
# Out-of-fold inference: every training row is scored by the checkpoint of the
# fold it belongs to, and the class probabilities are collected into `oof`.
if len(test) > 1:
    with timer("deberta_v3_large"):
        oof = np.zeros([len(train), 3])
        # Build the loss once instead of once per batch.
        criterion = nn.CrossEntropyLoss()

        for fold in range(5):
            val_losses = AverageMeter()
            valid_df = train.loc[train['fold'] == fold].reset_index(drop=True)
            valid_datagen = FeedbackDataset(
                deberta_v3_large_tokenizer,
                valid_df,
                max_len
            )
            valid_generator = DataLoader(
                dataset=valid_datagen,
                batch_size=batch_size,
                shuffle=False,
                num_workers=4,
                pin_memory=True,
                drop_last=False
            )

            # model
            model = FeedbackModel(model_name = 'microsoft/deberta-v3-large', config_path = './config.pth', pretrained=False)
            # map_location='cpu' lets a checkpoint saved from a GPU load on any
            # machine; the model is moved to `device` right after.
            model.load_state_dict(
                torch.load(f"../output/ex/ex001/ex001_model/ex001_{fold}.pth", map_location='cpu')
            )
            model.to(device)
            model.eval()

            # Collect per-batch probabilities and concatenate once per fold
            # rather than regrowing an array on every step.
            fold_probs = []
            for step, (batch_input_ids, batch_attention_mask, batch_target) in enumerate(valid_generator):
                batch_input_ids = batch_input_ids.to(device)
                batch_attention_mask = batch_attention_mask.to(device)
                # One-hot targets are passed to the loss as float class probabilities.
                batch_target = torch.as_tensor(batch_target).float().to(device)
                with torch.no_grad():
                    logits = model(batch_input_ids, batch_attention_mask)
                    loss = criterion(logits, batch_target)

                val_losses.update(loss.item(), logits.size(0))
                fold_probs.append(softmax(logits.to('cpu').numpy()))
                if step % 50 == 0 or step == (len(valid_generator)-1):
                    print(
                        'EVAL: [{0}/{1}] '
                        'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                        .format(
                            step,
                            len(valid_generator),
                            loss=val_losses,
                        )
                    )

            preds = np.concatenate(fold_probs, axis=0) if fold_probs else np.empty((0, 3))
            # valid_df keeps the row order of `train`, so a boolean mask on the
            # fold ids writes each prediction back to its own row.
            oof[fold_array == fold] = preds
            del model, valid_datagen, valid_generator
            gc.collect()

# The tokenizer is deliberately NOT deleted here: the test-inference cell that
# follows still needs it, and deleting it made a top-to-bottom run fail.

[[4.83198725e-02 9.29174185e-01 2.25058720e-02]
 [6.14187717e-02 7.20364153e-01 2.18217090e-01]
 [6.26790002e-02 8.67462754e-01 6.98581859e-02]
 [7.12173358e-02 8.66039574e-01 6.27431720e-02]
 [1.81827784e-01 8.05118203e-01 1.30539890e-02]
 [4.50502396e-01 5.47653735e-01 1.84386561e-03]
 [6.07365966e-01 3.91420871e-01 1.21316651e-03]
 [6.23846769e-01 3.75299752e-01 8.53457663e-04]
 [4.40039128e-01 5.54079533e-01 5.88134862e-03]
 [2.50553578e-01 7.44796872e-01 4.64951666e-03]]
EVAL: [0/230] Loss: 0.3821(0.3821) 
[[8.67516920e-03 7.08160460e-01 2.83164412e-01]
 [4.35182918e-03 3.47264975e-01 6.48383200e-01]
 [4.48539900e-03 3.34119648e-01 6.61394894e-01]
 [3.27101024e-03 2.62397826e-01 7.34331191e-01]
 [2.99349725e-02 6.28071070e-01 3.41993868e-01]
 [6.95945811e-04 9.02013928e-02 9.09102619e-01]
 [4.26788087e-04 4.77327220e-02 9.51840460e-01]
 [1.41075263e-02 4.66786027e-01 5.19106448e-01]
 [1.77971460e-02 5.73909402e-01 4.08293426e-01]
 [9.19058162e-04 8.61116350e-02 9.12969351e-01]]
EV

0

In [97]:
# Test inference: score the test set with all five fold checkpoints and
# average the class probabilities.
if len(test) > 1:
    with timer("deberta_v3_large"):
        y_test_deberta_v3_large = []
        # Give unlabeled test data a placeholder label column so that
        # FeedbackDataset can be built whether or not it requires one; any
        # targets it returns are discarded below.
        test_inputs = test if 'label' in test.columns else test.assign(label=0)

        # dataset
        valid_datagen = FeedbackDataset(
            deberta_v3_large_tokenizer,
            test_inputs,
            max_len
        )

        # loader
        valid_generator = DataLoader(
            dataset=valid_datagen,
            batch_size=batch_size,
            shuffle=False,
            num_workers=4,
            pin_memory=True,
            drop_last=False
        )

        for fold in range(5):

            # model
            model = FeedbackModel(model_name = 'microsoft/deberta-v3-large', config_path = './config.pth', pretrained=False)
            # map_location='cpu' lets a checkpoint saved from a GPU load on any
            # machine; the model is moved to `device` right after.
            model.load_state_dict(
                torch.load(f"../output/ex/ex001/ex001_model/ex001_{fold}.pth", map_location='cpu')
            )
            model.to(device)
            model.eval()

            # Collect per-batch probabilities and concatenate once per fold.
            fold_probs = []
            for batch in valid_generator:
                # Only the first two items are model inputs. Indexing instead of
                # tuple-unpacking works whether the dataset yields 2 or 3 items
                # (unpacking into two names failed on 3-item batches).
                batch_input_ids = batch[0].to(device)
                batch_attention_mask = batch[1].to(device)

                with torch.no_grad():
                    logits = model(batch_input_ids, batch_attention_mask)
                fold_probs.append(softmax(logits.to('cpu').numpy()))

            test_preds = np.concatenate(fold_probs, axis=0) if fold_probs else np.empty((0, 3))
            y_test_deberta_v3_large.append(test_preds)
            del model
            gc.collect()

        # Average the five folds' probabilities row by row.
        y_test_deberta_v3_large = np.mean(y_test_deberta_v3_large,axis=0)
        del valid_datagen, valid_generator
        gc.collect()

del deberta_v3_large_tokenizer
gc.collect()

0

In [98]:
# Preview only the first rows so the output stays small on a full-size test set.
y_test_deberta_v3_large[:10]

array([[0.01500432, 0.50554253, 0.47945313],
       [0.0358187 , 0.87410471, 0.09007663],
       [0.03873662, 0.81119816, 0.15006521],
       [0.0523075 , 0.8237339 , 0.1239586 ],
       [0.04561032, 0.80198406, 0.15240565],
       [0.02685003, 0.52322471, 0.44992526],
       [0.01977329, 0.43787841, 0.54234828],
       [0.03587363, 0.77802705, 0.18609938],
       [0.02197314, 0.42942826, 0.5485986 ],
       [0.03117581, 0.71913257, 0.24969162]])

In [15]:
# correct solution:
def get_target(x):
    """Return a length-3 integer vector with a 1 at index ``x`` and 0 elsewhere."""
    one_hot = np.zeros(3, dtype=int)
    one_hot[x] = 1
    return one_hot
# Score the out-of-fold probabilities against the true class indices and
# persist them for later use.
y = train['label'].values
print(y.shape)
# Name the classes explicitly so the three columns of `oof` are always matched
# to labels 0/1/2, even if some class is absent from `y`.
val_score = log_loss(y, oof, labels=[0, 1, 2])
print(f'oof_score:{val_score}')
np.save('../output/ex/ex001/ex001_oof.npy', oof)

(36765,)
oof_score:0.6341769603843165


In [1]:
# Preview the first rows only; `oof` exists only after the out-of-fold
# inference cell has been run in this kernel.
oof[:5]

NameError: name 'oof' is not defined