In [None]:
import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from torch.cuda.amp import autocast, GradScaler
# https://github.com/huggingface/transformers/issues/9919
from torch.utils.checkpoint import checkpoint

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

import pickle # For roberta test

In [None]:
# Root folder of the competition data; get_essay() appends 'train' / 'test' to it
# and the test CSV is read from it directly.
INPUT_DIR = '../input/feedback-prize-effectiveness/'

<br>
<h1 style = "font-size:60px; font-family:Garamond ; font-weight : normal; background-color: #f6f5f5 ; color : #fe346e; text-align: center; border-radius: 100px 100px;">Deberta V3 Base</h1>
<br>

# CFG

In [None]:
class CFG:
    """Settings for the DeBERTa-v3-base part of the ensemble.

    Plain namespace of class attributes. Within this notebook only a subset is
    read (model, fast, max_len, dropout, target_size, batch_size, num_workers,
    trn_fold, gradient_checkpointing, freezing, debug, debug_ver2); the
    remaining values look like leftovers from the training notebook.
    """

    # ---- run switches ----
    train = True
    debug = False          # when True: 2 epochs, folds [0, 1], print_freq 10
    debug_ver2 = False     # when True: 1 epoch, folds [0, 1]
    wandb = False
    apex = True
    AMP = False

    # ---- backbone / tokenizer ----
    model = '../input/deberta-v3-base/deberta-v3-base'
    fast = True            # selects the tokenizer-loading branch
    max_len = 512          # truncation length used by the test dataset
    dropout = 0.1          # dropout before the classification head
    target_size = 3        # number of output classes
    gradient_checkpointing = True
    freezing = True        # freeze embeddings + first two encoder layers

    # ---- folds ----
    seed = 42
    n_splits = 5
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]   # fold checkpoints iterated at inference
    CVs = []

    # ---- data loading ----
    batch_size = 8
    num_workers = 0
    print_freq = 50

    # ---- optimiser / schedule (not read in this notebook) ----
    lr = 3e-5
    min_lr = 1e-6
    weigth_decay = 0.01    # NOTE(review): attribute name is misspelled; kept for compatibility
    epochs = 3
    scheduler = 'cosine'
    num_warmup_steps = 0
    num_cycles = 0.5
    n_accumulate = 1
    # after_freezed_parameters is attached dynamically by FeedBackModel.__init__

# Debug overrides: shrink the run by mutating CFG in place.
# Both flags default to False, so neither branch fires unless CFG is edited.
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0, 1]
    CFG.print_freq = 10

# Checked after `debug`, so when both flags are set these values win.
if CFG.debug_ver2:
    CFG.epochs = 1
    CFG.trn_fold = [0, 1]

# Helper Functions

In [None]:
# Loss Func
def criterion(outputs, labels):
    """Cross-entropy loss between `outputs` and `labels`.

    Uses `nn.CrossEntropyLoss` with its default arguments.
    """
    loss_fn = nn.CrossEntropyLoss()
    loss = loss_fn(outputs, labels)
    return loss

In [None]:
def softmax(z):
    """Row-wise softmax of a 2-D NumPy array.

    The per-row maximum is subtracted before exponentiating so that large
    logits do not overflow. Returns an array of the same shape as `z`.
    """
    assert len(z.shape) == 2
    row_max = np.max(z, axis=1, keepdims=True)   # kept 2-D so it broadcasts over columns
    exponentials = np.exp(z - row_max)
    row_total = np.sum(exponentials, axis=1, keepdims=True)
    return exponentials / row_total

In [None]:
def freeze(module):
    """Disable gradient tracking for every parameter of `module`.

    Works on anything exposing `.parameters()`; nothing is returned, the
    parameters are modified in place.
    """
    for weight in module.parameters():
        weight.requires_grad = False
        
def get_freezed_parameters(module):
    """List the names of all parameters in `module` that have `requires_grad == False`.

    Names are the dotted names yielded by `module.named_parameters()`, in
    that iteration order.
    """
    return [
        name
        for name, parameter in module.named_parameters()
        if not parameter.requires_grad
    ]

In [None]:
# append to train/test.csv
def get_essay(essay_id, is_train=True):
    """Read the full text of one essay from the competition folder.

    Args:
        essay_id: Essay identifier; the file read is `<essay_id>.txt`.
        is_train: If True read from `INPUT_DIR + 'train'`, otherwise from
            `INPUT_DIR + 'test'`.

    Returns:
        The file contents as a single string.

    Raises:
        OSError: If the essay file cannot be opened.
    """
    parent_path = INPUT_DIR + 'train' if is_train else INPUT_DIR + 'test'
    essay_path = os.path.join(parent_path, f"{essay_id}.txt")
    # Context manager guarantees the handle is closed; this function is applied
    # once per row, so leaked handles would otherwise accumulate.
    with open(essay_path, 'r') as essay_file:
        essay_text = essay_file.read()
    return essay_text

# Preprocess & Tokenizer

In [None]:
# Testing Data
# Read the test table and attach the full essay text for each row's essay_id
# (essay files are read from the 'test' subfolder via get_essay).
test = pd.read_csv(INPUT_DIR + 'test.csv')
test['essay_text'] = test['essay_id'].apply(lambda x: get_essay(x, is_train=False))
test.head()

In [None]:
# Forward the flag directly. Previously the `else` branch called
# from_pretrained() without `use_fast`, which falls back to the library default
# (fast), so CFG.fast = False silently had no effect.
tokenizer = AutoTokenizer.from_pretrained(CFG.model, use_fast=CFG.fast)
# Stored on CFG so the dataset and collate classes can reach it through `cfg`.
CFG.tokenizer = tokenizer

# Normalize

In [None]:
from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler: encode the offending slice as UTF-8 instead of failing.

    Returns the replacement bytes and the position at which to resume.
    """
    offending = error.object[error.start : error.end]
    replacement = offending.encode("utf-8")
    return replacement, error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler: decode the offending bytes as cp1252 instead of failing.

    Returns the replacement text and the position at which to resume.
    """
    offending = error.object[error.start : error.end]
    replacement = offending.decode("cp1252")
    return replacement, error.end

# Register the encoding and decoding error handlers for `utf-8` and `cp1252`.
# The registered names are what resolve_encodings_and_normalize passes as `errors=`.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Repair mixed utf-8 / cp1252 text and transliterate it with `unidecode`.

    The text is pushed through an encode/decode round trip in which bytes that
    are not valid utf-8 are read as cp1252 and characters that cp1252 cannot
    represent are written as utf-8 (via the two registered error handlers).
    The result is then passed to `unidecode`.
    """
    raw_bytes = text.encode("raw_unicode_escape")
    first_pass = raw_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    cp1252_bytes = first_pass.encode("cp1252", errors="replace_encoding_with_utf8")
    repaired = cp1252_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    return unidecode(repaired)

In [None]:
# Normalise both text columns with the encoding-repair helper.
test['discourse_text'] = test['discourse_text'].apply(lambda x : resolve_encodings_and_normalize(x))
test['essay_text'] = test['essay_text'].apply(lambda x : resolve_encodings_and_normalize(x))
# Build the model input string: "<discourse_type> <discourse_text>[SEP]<essay_text>".
# (Tokenisation itself happens later, inside TestDataset.__getitem__.)
test['text'] = test['discourse_type'] + ' '+ test['discourse_text'] + '[SEP]' + test['essay_text']
# Placeholder label column; the test set has no labels.
test['label'] = np.nan

# Dataset

In [None]:
# Testing Datasets
class TestDataset(Dataset):
    """Tokenises the `text` column of a DataFrame one row at a time.

    Items are plain dicts of Python lists (unpadded); padding and tensor
    conversion are left to the collate function.
    """

    def __init__(self, cfg, df):
        # cfg must expose `tokenizer` and `max_len`.
        self.cfg = cfg
        self.text = df['text'].values

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        encoded = self.cfg.tokenizer.encode_plus(
            self.text[item],
            truncation=True,
            add_special_tokens=True,
            max_length=self.cfg.max_len,
        )

        samples = {key: encoded[key] for key in ('input_ids', 'attention_mask')}
        # token_type_ids are forwarded only when the tokenizer produced them.
        if 'token_type_ids' in encoded:
            samples['token_type_ids'] = encoded['token_type_ids']
        return samples

# Dynamic padding

In [None]:
# Dynamic Padding (Collate)
# collate_fn = DataCollatorWithPadding(tokenizer=CFG.tokenizer)
class Collate:
    """Collate function that pads each batch to its own longest sequence.

    Padding goes on the side given by `tokenizer.padding_side`, using
    `tokenizer.pad_token_id` for ids and 0 for the attention mask. Only
    `input_ids`, `attention_mask` and (when `isTrain`) `target` are emitted;
    any other keys in the samples are ignored.
    """

    def __init__(self, tokenizer, isTrain=True):
        self.tokenizer = tokenizer
        self.isTrain = isTrain

    def __call__(self, batch):
        ids_per_sample = [sample["input_ids"] for sample in batch]
        mask_per_sample = [sample["attention_mask"] for sample in batch]
        targets = [sample["target"] for sample in batch] if self.isTrain else None

        # longest sequence in this batch decides the padded width
        longest = max([len(ids) for ids in ids_per_sample])
        pad_on_right = self.tokenizer.padding_side == "right"
        pad_id = self.tokenizer.pad_token_id

        def _pad(sequence, fill_value):
            filler = (longest - len(sequence)) * [fill_value]
            return sequence + filler if pad_on_right else filler + sequence

        output = dict()
        output["input_ids"] = torch.tensor(
            [_pad(ids, pad_id) for ids in ids_per_sample], dtype=torch.long
        )
        output["attention_mask"] = torch.tensor(
            [_pad(mask, 0) for mask in mask_per_sample], dtype=torch.long
        )
        if self.isTrain:
            output["target"] = torch.tensor(targets, dtype=torch.long)

        return output

# Model

In [None]:
class MeanPooling(nn.Module):
    """Average token embeddings over the sequence axis, ignoring masked positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask to the hidden-state shape so it can zero out padded tokens.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_sum = (last_hidden_state * mask).sum(1)
        # Clamp avoids division by zero for rows whose mask is all zeros.
        token_count = mask.sum(1).clamp(min=1e-9)
        return masked_sum / token_count

In [None]:
class FeedBackModel(nn.Module):
    """Transformer backbone -> masked mean pooling -> dropout -> linear classifier.

    Args:
        model_name: Path or identifier passed to `AutoModel.from_pretrained`
            and `AutoConfig.from_pretrained`.

    Behaviour controlled by the global `CFG`:
        * `gradient_checkpointing` - enables checkpointing on the backbone.
        * `freezing` - freezes the embeddings and the first two encoder layers
          and records the still-trainable parameters on
          `CFG.after_freezed_parameters`.
        * `dropout`, `target_size` - dropout probability and number of classes.
    """

    def __init__(self, model_name):
        super(FeedBackModel, self).__init__()
        # Header (fast or normal)
        self.model = AutoModel.from_pretrained(model_name)

        # Gradient_checkpointing
        if CFG.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if CFG.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze(self.model.embeddings)
            freeze(self.model.encoder.layer[:2])
            # Stored as a list rather than a bare filter(): a filter object is a
            # one-shot iterator and would be empty after its first consumer.
            CFG.after_freezed_parameters = [
                parameter
                for parameter in self.model.parameters()
                if parameter.requires_grad
            ]

        self.config = AutoConfig.from_pretrained(model_name)
        self.drop = nn.Dropout(p=CFG.dropout)
        self.pooler = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, CFG.target_size)

    def forward(self, ids, mask):
        """Return unnormalised class scores, one row per input sequence."""
        out = self.model(input_ids=ids,
                         attention_mask=mask,
                         output_hidden_states=False)
        # The mask is reused so padded positions do not contribute to the mean.
        out = self.pooler(out.last_hidden_state, mask)
        out = self.drop(out)
        outputs = self.fc(out)
        return outputs

# Prediction

In [None]:
# predict the test value result
def inference_fn(test_loader, model, device):
    """Run `model` over `test_loader` and return softmax probabilities.

    The model is switched to eval mode and moved to `device`. Each batch must
    provide `input_ids` and `attention_mask`; the logits are converted to
    NumPy and normalised with the module-level `softmax`. The per-batch
    results are concatenated along the first axis.
    """
    model.eval()
    model.to(device)

    batch_probabilities = []
    for batch in tqdm(test_loader, total=len(test_loader)):
        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        with torch.no_grad():
            logits = model(ids, mask)
        batch_probabilities.append(softmax(logits.to('cpu').numpy()))

    return np.concatenate(batch_probabilities)

In [None]:
# Batches are padded dynamically by Collate; isTrain=False because the test
# samples carry no `target`.
testDataset = TestDataset(CFG, test)
test_loader = DataLoader(testDataset,
                              batch_size = CFG.batch_size,
                              shuffle=False,
                              collate_fn = Collate(CFG.tokenizer, isTrain=False),
                              num_workers = CFG.num_workers,
                              pin_memory = True,
                              drop_last=False)

# One probability array per fold checkpoint.
deberta_predictions = []
for i in CFG.trn_fold:
    model = FeedBackModel(CFG.model)
    # map_location='cpu' lets GPU-saved checkpoints load on any host (and matches
    # the DeBERTa-large loop below); inference_fn moves the model to `device`.
    state_dict = torch.load('../input/dbv3basemodels202279/models-deberta-v3-base-deberta-v3-base_fold' + str(i) +'_best.pth',
                            map_location='cpu')
    model.load_state_dict(state_dict)
    prediction = inference_fn(test_loader, model, device)
    deberta_predictions.append(prediction)
    # Drop the references first: empty_cache() can only release memory that is
    # no longer owned by a live tensor, and the last fold's model would
    # otherwise stay resident while the next models are loaded.
    del model, state_dict, prediction
    gc.collect()
    torch.cuda.empty_cache()

# Split the predictions by class

In [None]:
# Re-arrange the per-fold probability arrays into one DataFrame per class:
# rows = test samples, columns = folds. Column 0/1/2 of each prediction array
# is taken as Ineffective / Adequate / Effective respectively.
deb_ineffective = pd.DataFrame([fold_probs[:, 0] for fold_probs in deberta_predictions]).T
deb_adequate = pd.DataFrame([fold_probs[:, 1] for fold_probs in deberta_predictions]).T
deb_effective = pd.DataFrame([fold_probs[:, 2] for fold_probs in deberta_predictions]).T

<br>
<h1 style = "font-size:60px; font-family:Garamond ; font-weight : normal; background-color: #f6f5f5 ; color : #fe346e; text-align: center; border-radius: 100px 100px;">Roberta Base + Deberta-Large</h1>
<br>

Reference: https://www.kaggle.com/code/renokan/fork-ensemble-deberta-roberta/notebook

# Preprocessing

In [None]:
import gc
import os
import pickle
import glob

from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

import numpy as np
import pandas as pd

from tqdm import tqdm

import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.nn import Parameter
from torch.utils.data import Dataset, DataLoader

from transformers import AutoModel, AutoTokenizer, AutoConfig

import warnings
warnings.simplefilter('ignore')

In [None]:
def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Encode-error handler: emit the unencodable slice as UTF-8 bytes.

    NOTE(review): an identically named function is defined earlier in this
    notebook; this definition replaces it.
    """
    failed_slice = error.object[error.start : error.end]
    return failed_slice.encode("utf-8"), error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Decode-error handler: interpret the undecodable bytes as cp1252 text.

    NOTE(review): an identically named function is defined earlier in this
    notebook; this definition replaces it.
    """
    failed_slice = error.object[error.start : error.end]
    return failed_slice.decode("cp1252"), error.end

# Register the two handlers under the names that resolve_encodings_and_normalize
# passes as `errors=`.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)


def resolve_encodings_and_normalize(text: str) -> str:
    """Repair mixed utf-8 / cp1252 text, then transliterate it with `unidecode`.

    Each encode/decode step falls back to the other encoding through the
    registered error handlers instead of raising.
    """
    as_bytes = text.encode("raw_unicode_escape")
    decoded_once = as_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    reencoded = decoded_once.encode("cp1252", errors="replace_encoding_with_utf8")
    decoded_twice = reencoded.decode("utf-8", errors="replace_decoding_with_cp1252")

    return unidecode(decoded_twice)


def fetch_essay(essay_id: str, txt_dir: str):
    """Read one essay's text from `COMP_DIR + txt_dir`.

    Args:
        essay_id: Essay identifier; the file read is `<essay_id>.txt`.
        txt_dir: Sub-folder name appended to `COMP_DIR` (called with 'train'
            and 'test' in this notebook).

    Returns:
        The file contents as a single string.

    Raises:
        OSError: If the essay file cannot be opened.
    """
    essay_path = os.path.join(COMP_DIR + txt_dir, essay_id + '.txt')
    # Close the handle deterministically; this runs once per DataFrame row.
    with open(essay_path, 'r') as essay_file:
        essay_text = essay_file.read()

    return essay_text


def prepare_input(cfg, text, text_2=None):
    """Tokenise `text` (optionally paired with `text_2`) to fixed-length tensors.

    The tokenizer is asked to pad to `cfg.max_len` and truncate; every value
    in the returned mapping is then replaced in place by a `torch.long` tensor.
    """
    encoded = cfg.tokenizer(
        text,
        text_2,
        padding="max_length",
        add_special_tokens=True,
        max_length=cfg.max_len,
        truncation=True,
    )

    # Snapshot the keys first, then overwrite each value on the same object.
    for key in list(encoded.keys()):
        encoded[key] = torch.tensor(encoded[key], dtype=torch.long)

    return encoded


def inference_fn(test_loader, model, device):
    """Run `model` over `test_loader` and return class probabilities.

    Args:
        test_loader: Iterable of batches; each batch is a mapping of tensors
            that is passed to the model as a single argument.
        model: Module called as `model(inputs)` and returning one row of
            scores per sample.
        device: Device the model and every batch tensor are moved to.

    Returns:
        NumPy array of the softmax outputs of all batches, concatenated along
        the first axis.
    """
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))

    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)

        with torch.no_grad():
            output = model(inputs)

        # Explicit dim: calling F.softmax without `dim` is deprecated and emits
        # a warning. dim=1 is what the implicit choice resolved to for the
        # 2-D (batch, classes) output produced here.
        preds.append(F.softmax(output, dim=1).to('cpu').numpy())

    return np.concatenate(preds)


def show_gradient(df, n_row=None):
    """Return a styled preview of `df` with a row-wise colour gradient.

    Takes the first `n_row` rows (5 when `n_row` is falsy), appends an
    `all_mean` column holding each row's mean, and shades every row using the
    module-level colormap `cm`.
    """
    rows_to_show = n_row if n_row else 5

    preview = df.head(rows_to_show)
    preview = preview.assign(all_mean=lambda frame: frame.mean(axis=1))
    return preview.style.background_gradient(cmap=cm, axis=1)

In [None]:
# Display settings and shared constants for the second half of the notebook.
pd.set_option('display.precision', 4)
# Colormap read by show_gradient() as a module-level global.
cm = sns.light_palette('green', as_cmap=True)
# NOTE(review): props_param is not referenced anywhere in the visible notebook.
props_param = "color:white; font-weight:bold; background-color:green;"

# Number of rows shown in the preview tables.
N_ROW = 10

# COMP_DIR is read by fetch_essay(); DEVICE is passed to inference_fn().
COMP_DIR = "../input/feedback-prize-effectiveness/"
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Load the untouched test table and the sample submission; test_origin is
# copied (not mutated) by each model section below.
test_path = COMP_DIR + "test.csv"
submission_path = COMP_DIR + "sample_submission.csv"

test_origin = pd.read_csv(test_path)
submission_origin = pd.read_csv(submission_path)

In [None]:
# Preview the first rows of the raw test table.
test_origin.head()

In [None]:
# Pull a handful of hand-picked training rows to demonstrate the encoding repair.
# NOTE(review): presumably these indices were chosen because their text contains
# mis-encoded characters - confirm against the data.
data_path = "../input/feedback-prize-effectiveness/train.csv"
cols_list = ['essay_id', 'discourse_text']
idxs_list = [49, 80, 945, 947, 1870]

temp = pd.read_csv(data_path, usecols=cols_list).loc[idxs_list, :]
temp

In [None]:
# Add normalised copies of the discourse text and of the full essay text so the
# original and repaired versions can be compared side by side.
temp['discourse_text_UPD'] = temp['discourse_text'].apply(resolve_encodings_and_normalize)

temp['essay_text'] = temp['essay_id'].transform(fetch_essay, txt_dir='train')
temp['essay_text_UPD'] = temp['essay_text'].apply(resolve_encodings_and_normalize)

temp

In [None]:
# Print each sample before and after normalisation; repr() is used so that
# escape sequences and non-printable characters are visible.
for n, row in enumerate(temp.iterrows()):
    indx, data = row
    disc_text = data.discourse_text
    disc_text_upd = data.discourse_text_UPD

    print(f'\nN{n} === index: {indx} ===')
    print(f'\n>>> origin text:')
    print(repr(disc_text))
    print(f'\n>>> updated text:')
    print(repr(disc_text_upd))

# Deberta Large

In [None]:
class TestDataset(Dataset):
    """Dataset over the `text` column; each item is tokenised by `prepare_input`.

    Replaces the earlier `TestDataset` definition in this notebook.
    """

    def __init__(self, cfg, df):
        self.cfg = cfg
        self.text = df['text'].values

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        # Single-segment input: no second text is passed.
        return prepare_input(self.cfg, self.text[item])

class CustomModel(nn.Module):
    """Backbone + multi-sample-dropout classification head (3 classes).

    The first-token hidden state of the backbone is passed through five
    dropout layers (p = 0.1 ... 0.5); each result goes through the same linear
    layer and the five outputs are averaged.

    Args:
        cfg: Settings object; `cfg.model` names the backbone.
        config_path: If given, the backbone config is loaded with `torch.load`
            from this path; otherwise it is fetched with `AutoConfig`.
        pretrained: If True load pretrained backbone weights, else build the
            backbone from the config only.

    NOTE(review): `bilstm` is constructed but never used in `forward`, and
    `_init_weights` is never called in the visible code - presumably both are
    kept so the module layout matches the saved checkpoints; confirm.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg

        # NOTE(review): torch.load unpickles the config object; only use trusted files.
        self.config = (
            AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            if config_path is None
            else torch.load(config_path)
        )

        self.model = (
            AutoModel.from_pretrained(cfg.model, config=self.config)
            if pretrained
            else AutoModel.from_config(self.config)
        )

        hidden = self.config.hidden_size
        self.bilstm = nn.LSTM(
            hidden,
            hidden // 2,
            num_layers=2,
            dropout=self.config.hidden_dropout_prob,
            batch_first=True,
            bidirectional=True,
        )

        # dropout1 ... dropout5 with increasing drop probability
        for index, probability in enumerate((0.1, 0.2, 0.3, 0.4, 0.5), start=1):
            setattr(self, f"dropout{index}", nn.Dropout(probability))

        self.output = nn.Sequential(
            nn.Linear(hidden, 3)  # self.cfg.target_size
        )

    def _init_weights(self, module):
        """Initialise Linear / Embedding / LayerNorm modules in place."""
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, inputs):
        """Return averaged class scores for a mapping of backbone keyword inputs."""
        first_token = self.model(**inputs)[0][:, 0, :]

        total = None
        for drop in (self.dropout1, self.dropout2, self.dropout3,
                     self.dropout4, self.dropout5):
            branch = self.output(drop(first_token))
            total = branch if total is None else total + branch

        return total / 5

In [None]:
# Settings for the DeBERTa-large section. This rebinds the name CFG, replacing
# the DeBERTa-v3-base settings class defined earlier in the notebook.
class CFG:
    path = "../input/feedback-deberta-large-051/"   # folder with config, tokenizer and fold checkpoints
    config_path = path+'config.pth'                 # loaded with torch.load inside CustomModel
    model = "microsoft/deberta-large"               # also used to build the checkpoint file names
    num_workers = 2
    batch_size = 32
    max_len = 512                                   # pad/truncate length used by prepare_input
    seed = 42
    n_fold = 4                                      # number of fold checkpoints to ensemble
    # trn_fold = [0, 1, 2, 3]
    # fc_dropout = 0.2
    # target_size = 3
    
# Tokenizer is loaded from the dataset folder and attached to CFG for prepare_input.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.path + 'tokenizer')

In [None]:
# Build the DeBERTa-large input text on a fresh copy of the test table.
df = test_origin.copy()
# Use the tokenizer's own separator token between the discourse and the essay.
SEP = CFG.tokenizer.sep_token

df['discourse_text'] = df['discourse_text'].apply(resolve_encodings_and_normalize)
df['essay_text'] = df['essay_id'].transform(fetch_essay, txt_dir='test')
df['essay_text'] = df['essay_text'].apply(resolve_encodings_and_normalize)
# "<discourse_type> <discourse_text><SEP><essay_text>"
df['text'] = df['discourse_type'] + ' ' + df['discourse_text'] + SEP + df['essay_text']

df.head()

In [None]:
# No custom collate_fn here: prepare_input pads every sample to max_len, so the
# default collation can stack them. Order is preserved (shuffle=False) so
# predictions line up with the rows of `df`.
test_dataset = TestDataset(CFG, df)
test_loader = DataLoader(test_dataset,
                         batch_size=CFG.batch_size,
                         shuffle=False,
                         num_workers=CFG.num_workers,
                         pin_memory=True, drop_last=False)

In [None]:
# One probability array per fold checkpoint.
deberta_large_predictions = []

for fold in range(CFG.n_fold):
    # Build the architecture from the saved config only (pretrained=False);
    # the weights come from the fold checkpoint loaded below.
    model = CustomModel(CFG, config_path=CFG.config_path, pretrained=False)
    # Checkpoint file name: "<model with '/' replaced by '-'>_fold<k>_best.pth".
    # Loaded onto the CPU first; inference_fn moves the model to DEVICE.
    state = torch.load(CFG.path+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                       map_location=torch.device('cpu'))
    
    # The checkpoint is a dict; the weights are stored under the 'model' key.
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, DEVICE)
    
    deberta_large_predictions.append(prediction)
    
    # Release references before clearing the CUDA cache for the next fold.
    del model, state, prediction; gc.collect()
    torch.cuda.empty_cache()

In [None]:
# One DataFrame per class (rows = test samples, columns = folds), built from
# column 0 / 1 / 2 of every fold's prediction array.
deb_large_ineffective = pd.DataFrame([fold_probs[:, 0] for fold_probs in deberta_large_predictions]).T
deb_large_adequate = pd.DataFrame([fold_probs[:, 1] for fold_probs in deberta_large_predictions]).T
deb_large_effective = pd.DataFrame([fold_probs[:, 2] for fold_probs in deberta_large_predictions]).T

# Roberta

In [None]:
class TestDataset(Dataset):
    """Dataset yielding (discourse, essay) pairs tokenised as two segments.

    Reads the `discourse` and `essay` columns of `df`; replaces the earlier
    `TestDataset` definitions in this notebook.
    """

    def __init__(self, cfg, df):
        self.cfg = cfg
        self.discourse = df['discourse'].values
        self.essay = df['essay'].values

    def __len__(self):
        return len(self.discourse)

    def __getitem__(self, item):
        # The essay is passed as the second text of the pair.
        return prepare_input(self.cfg, self.discourse[item], self.essay[item])
        
class FeedBackModel(nn.Module):
    """Backbone followed by a 768 -> 3 linear layer on the first token.

    Replaces the earlier `FeedBackModel` definition in this notebook.
    NOTE(review): the pickled model list loaded in the next cell presumably
    contains instances of this class, so the attribute names `model` and
    `linear` should not be renamed - confirm.
    """

    def __init__(self, model_path):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_path)
        self.linear = nn.Linear(768, 3)

    def forward(self, inputs):
        """Return class scores for a mapping of backbone keyword inputs."""
        backbone_output = self.model(**inputs)
        first_token = backbone_output[0][:, 0, :]
        return self.linear(first_token)

In [None]:
# Load the list of RoBERTa fold models.
# SECURITY: pickle.load executes arbitrary code from the file; only load
# pickles from a trusted source.
# NOTE(review): unpickling presumably needs the FeedBackModel class defined in
# the previous cell to be importable from this namespace - confirm.
# The context manager closes the file handle, which the bare open() call leaked.
with open("../input/feedback-roberta-ep1/roberta_modellist_ep2.pkl", "rb") as model_file:
    model_list = pickle.load(model_file)

# Settings for the RoBERTa section. This rebinds the name CFG again, replacing
# the DeBERTa-large settings.
class CFG:
    path = "../input/roberta-base/"   # tokenizer location
    n_fold = 5                        # number of entries of model_list that are used
    batch = 16                        # DataLoader batch size (named `batch`, not `batch_size`, here)
    max_len = 512                     # pad/truncate length used by prepare_input
    num_workers = 2
    
# Tokenizer attached to CFG so prepare_input can reach it through `cfg`.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.path)

In [None]:
# Build the RoBERTa inputs on a fresh copy of the test table.
df = test_origin.copy()

# First segment: lower-cased, stripped "<discourse_type> <discourse_text>".
# Unlike the DeBERTa sections, no encoding normalisation is applied here.
txt_sep = " "
df['discourse'] = df['discourse_type'].str.lower().str.strip() + txt_sep \
                + df['discourse_text'].str.lower().str.strip()

# Second segment: the lower-cased, stripped full essay text.
df['essay'] = df['essay_id'].transform(fetch_essay, txt_dir='test').str.lower().str.strip()
df.head()

In [None]:
# Fixed-length samples (prepare_input pads to max_len), so the default
# collation is used; shuffle=False keeps predictions aligned with `df`.
test_dataset = TestDataset(CFG, df)
test_loader = DataLoader(test_dataset, batch_size=CFG.batch,
                         shuffle=False, num_workers=CFG.num_workers,
                         pin_memory=True, drop_last=False)

In [None]:
# One probability array per RoBERTa fold model.
roberta_predicts = []
for i in range(CFG.n_fold):
    # Models come ready-made from the unpickled list; no weights are loaded here.
    model = model_list[i]
    
    prediction = inference_fn(test_loader, model, DEVICE)
    roberta_predicts.append(prediction)
    
    # NOTE(review): model_list still references this model (inference_fn moved it
    # to DEVICE), so its memory is presumably not released until model_list
    # itself is deleted below - confirm.
    del model, prediction
    torch.cuda.empty_cache()    
    gc.collect()
    
# Drop the last references to all fold models.
del model_list
gc.collect()

In [None]:
# One DataFrame per class (rows = test samples, columns = folds), built from
# column 0 / 1 / 2 of every fold's prediction array.
rob_ineffective = pd.DataFrame([fold_probs[:, 0] for fold_probs in roberta_predicts]).T
rob_adequate = pd.DataFrame([fold_probs[:, 1] for fold_probs in roberta_predicts]).T
rob_effective = pd.DataFrame([fold_probs[:, 2] for fold_probs in roberta_predicts]).T

# Submission

In [None]:
# Calculate the mean prediction probabilities of each folds
# Start from the sample submission; its class columns are overwritten in the
# weighting cell below.
submission = pd.read_csv('../input/feedback-prize-effectiveness/sample_submission.csv')

# Outer column level used to tell the three model families apart.
level_names = ['deberta', 'deberta_large', 'roberta']

# For each class, place the per-fold columns of all three models side by side
# under a two-level column index (model name, fold number).
ineffective_ = pd.concat(
    [deb_ineffective, deb_large_ineffective, rob_ineffective],
    keys=level_names, axis=1
)

adequate_ = pd.concat(
    [deb_adequate,deb_large_adequate, rob_adequate],
    keys=level_names, axis=1
)

effective_ = pd.concat(
    [deb_effective, deb_large_effective, rob_effective],
    keys=level_names, axis=1
)

In [None]:
# A notebook cell renders only the value of its last expression, so the first
# two tables used to be silently discarded. Each one is now shown explicitly.
# `display` is provided as a built-in by the IPython/Jupyter kernel.
display(show_gradient(ineffective_, N_ROW))
display(show_gradient(adequate_, N_ROW))
display(show_gradient(effective_, N_ROW))

In [None]:
# Blend weights, in the same order as level_names.
w_ = [0.25, 0.65, 0.1]  # ['deberta_base', 'deberta_large', 'roberta']
# (submission column name, per-class table of all model/fold probabilities)
d_ = [('Ineffective', ineffective_),
      ('Adequate', adequate_),
      ('Effective', effective_)]

# For every class: average each model's folds, scale the three model means by
# their weights, and sum them into the submission column.
# Note: this loop rebinds the name `df` used by earlier cells.
for x in d_:
    col_name, df = x
    submission[col_name] = pd.DataFrame(
        {col: df[col].mean(axis=1) for col in level_names}
    ).mul(w_).sum(axis=1)    

submission.head(N_ROW)

In [None]:
# Write the blended predictions to the working directory, without the index column.
submission.to_csv('submission.csv',index=False)