In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/debertav3base/rust_model.ot
/kaggle/input/debertav3base/spm.model
/kaggle/input/debertav3base/config.json
/kaggle/input/debertav3base/tf_model.h5
/kaggle/input/debertav3base/tokenizer_config.json
/kaggle/input/debertav3base/pytorch_model.bin
/kaggle/input/commonlit-evaluate-student-summaries/sample_submission.csv
/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv
/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv
/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv
/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv


#### Imports

In [2]:
import numpy as np 
import pandas as pd
import transformers
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from datasets import Dataset
from transformers import TrainingArguments, Trainer
from sklearn.metrics import mean_squared_error
import torch
from sklearn.model_selection import KFold, GroupKFold



In [3]:
import gc
import logging
import os
import random
import shutil
import warnings

from datasets import disable_progress_bar
from tqdm import tqdm
# Keep the notebook output quiet: set the tokenizer / TensorFlow environment
# switches, mute log records up to ERROR and Python warnings, hide the
# `datasets` progress bars and register tqdm's pandas integration.
os.environ.update({
    "TOKENIZERS_PARALLELISM": "false",
    'TF_CPP_MIN_LOG_LEVEL': '3',
})
logging.disable(logging.ERROR)
warnings.simplefilter("ignore")
disable_progress_bar()
tqdm.pandas()

In [4]:
class CFG:
    """Hyper-parameters and run settings shared by the cells of this notebook."""
    model_name="debertav3base"  # directory name of the checkpoint under /kaggle/input
    learning_rate=1.5e-5
    weight_decay=0.02
    hidden_dropout_prob=0.007
    attention_probs_dropout_prob=0.007
    num_train_epochs=5
    n_splits=4  # number of GroupKFold splits
    batch_size=8
    random_seed=42
    save_steps=500
    # The training loop reads CFG.eval_steps; it was missing and raised
    # AttributeError. Kept equal to save_steps so checkpoints and evaluations
    # happen on the same steps (needed to pick the best checkpoint at the end).
    eval_steps=500
    max_length=512  # tokenizer truncation length

In [5]:
# SEED 42
def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Args:
        seed: Integer seed applied to every random number generator.
    """
    random.seed(seed)
    # Only affects Python interpreters started after this point (e.g. workers).
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # seed every visible GPU, not only the current one
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune its kernel choice, which can differ
    # between runs and undo the determinism requested above.
    torch.backends.cudnn.benchmark = False
# Seed all RNGs once, up front, with the configured seed.
seed_everything(CFG.random_seed)

## Load Data

In [6]:
DATA_DIR = "/kaggle/input/commonlit-evaluate-student-summaries/"


def _read_competition_csv(file_name):
    """Read one competition CSV that sits directly under DATA_DIR."""
    return pd.read_csv(DATA_DIR + file_name)


# Prompt metadata, student summaries (train/test) and the submission template.
prompts_train = _read_competition_csv("prompts_train.csv")
prompts_test = _read_competition_csv("prompts_test.csv")
summaries_train = _read_competition_csv("summaries_train.csv")
summaries_test = _read_competition_csv("summaries_test.csv")
sample_submission = _read_competition_csv("sample_submission.csv")

In [7]:
# Attach each prompt's metadata to its student summaries, joining on prompt_id.
train = prompts_train.merge(summaries_train, on='prompt_id')
test = prompts_test.merge(summaries_test, on='prompt_id')

In [8]:
# Preview the first two rows of the merged training frame.
train.head(2)

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,student_id,text,content,wording
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00791789cc1f,1 element of an ideal tragedy is that it shoul...,-0.210614,-0.471415
1,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0086ef22de8f,The three elements of an ideal tragedy are: H...,-0.970237,-0.417058


In [9]:
# Preview the first two rows of the merged test frame.
test.head(2)

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,student_id,text
0,abc123,Summarize...,Example Title 1,Heading\nText...,000000ffffff,Example text 1
1,abc123,Summarize...,Example Title 1,Heading\nText...,222222cccccc,Example text 3


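#### Split train data using GroupKFold on prompt_id
The test set contains prompts that never appear in the training data, so the model has to perform well on new/unseen prompts. Grouping the folds by `prompt_id` keeps each prompt entirely inside one fold, so every validation fold is scored on a prompt the model was not trained on - https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/discussion/425409#2357563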

In [10]:
# 4 splits, since the training set contains 4 prompts.
gkf = GroupKFold(n_splits=CFG.n_splits)
fold_splits = gkf.split(train, groups=train["prompt_id"])

# Tag every row with the index of the fold in which it is held out.
for fold_id, split in enumerate(fold_splits):
    val_index = split[1]
    train.loc[val_index, "fold"] = fold_id

In [21]:
# Row counts per fold, as a sanity check on the split.
train.groupby("fold").count()

Unnamed: 0_level_0,prompt_id,prompt_question,prompt_title,prompt_text,student_id,text,content,wording
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,2057,2057,2057,2057,2057,2057,2057,2057
1.0,2009,2009,2009,2009,2009,2009,2009,2009
2.0,1996,1996,1996,1996,1996,1996,1996,1996
3.0,1103,1103,1103,1103,1103,1103,1103,1103


#### Prepare huggingface dataset
Ref - https://huggingface.co/docs/datasets/v2.14.5/en/tabular_load#pandas-dataframes

In [22]:
# Regression targets, and the column holding the text fed to the tokenizer.
target_cols, text_cols = ['content', 'wording'], ['text']

In [54]:
# Convert only the needed columns of the pandas frames into Hugging Face datasets.
train_columns = text_cols + target_cols + ['fold']
train_dataset = Dataset.from_pandas(train[train_columns])
test_dataset = Dataset.from_pandas(test[text_cols])

In [55]:
# Show the features and row counts of both datasets.
train_dataset, test_dataset

(Dataset({
     features: ['text', 'content', 'wording', 'fold'],
     num_rows: 7165
 }),
 Dataset({
     features: ['text'],
     num_rows: 4
 }))

## Define Model and Metrics

#### Tokenizer

In [25]:
# Load the tokenizer that ships with the local pretrained checkpoint.
tokenizer_path = f'/kaggle/input/{CFG.model_name}'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

In [26]:
# Inspect the tokenizer output for the first training summary.
tokenizer.encode_plus(train_dataset[0]['text'])

{'input_ids': [1, 376, 3036, 265, 299, 1949, 8948, 269, 272, 278, 403, 282, 6128, 277, 266, 1739, 741, 260, 1811, 3036, 265, 299, 1949, 8948, 269, 272, 278, 403, 364, 286, 311, 872, 889, 260, 279, 437, 3036, 265, 299, 1949, 8948, 269, 272, 278, 403, 286, 266, 1664, 3676, 4278, 263, 299, 3680, 21419, 270, 462, 397, 263, 966, 260, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [56]:
# Look at the first raw training example (text, targets and fold).
train_dataset[0]

{'text': '1 element of an ideal tragedy is that it should be arranged on a complex plan.  Another element of an ideal tragedy is that it should only have one main issue. The last element of an ideal tragedy is that it should have a double thread plot and an opposite catastrophe for both good and bad.',
 'content': -0.210613934166593,
 'wording': -0.471414826967448,
 'fold': 0.0}

In [57]:
## Tokenize a batch and build the 2-column label array for it
def generate_tokens(examples, mode='train'):
    """Tokenize a batch of summaries and, outside test mode, attach regression labels.

    Args:
        examples: Column-oriented batch (mapping of column name to a list of
            values) as handed over by `Dataset.map(..., batched=True)`. It must
            contain 'text'; outside test mode it must also contain 'content'
            and 'wording'. A 'fold' column is optional.
        mode: 'test' returns only the tokenizer output; any other value also
            adds the labels.

    Returns:
        The tokenizer output in test mode. Otherwise a dict holding the
        tokenizer fields, "labels" (one row per example, columns ordered
        content then wording) and, when the batch carries it, "fold".
    """
    encodings = tokenizer(examples['text'],truncation=True,max_length=CFG.max_length,return_tensors='np')
    if mode == 'test':
        return encodings
    labels = np.column_stack((examples['content'],examples['wording']))
    batch = {**encodings, "labels": labels}
    # The per-fold datasets built for training may not carry a 'fold' column;
    # copy it through only when present instead of raising KeyError.
    if 'fold' in examples:
        batch["fold"] = examples['fold']
    return batch

# Tokenize the whole training set in batches; labels are added by generate_tokens.
tokenized_train_dataset = train_dataset.map(
    function=generate_tokens,
    batched=True,
)

In [58]:
# Confirm the tokenizer fields and labels were added to the training set.
tokenized_train_dataset

Dataset({
    features: ['text', 'content', 'wording', 'fold', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 7165
})

In [59]:
def _tokenize_for_inference(batch):
    """Tokenize a test batch; no labels are produced in 'test' mode."""
    return generate_tokens(batch, 'test')


tokenized_test_dataset = test_dataset.map(_tokenize_for_inference, batched=True)

In [60]:
# Confirm the tokenizer fields were added to the test set.
tokenized_test_dataset

Dataset({
    features: ['text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 4
})

#### Config

In [32]:
# Read the model configuration stored next to the pretrained weights.
config_path = f'/kaggle/input/{CFG.model_name}'
config = AutoConfig.from_pretrained(config_path)

In [33]:
# Display the configuration as loaded from disk, before any overrides.
config

DebertaV2Config {
  "_name_or_path": "/kaggle/input/debertav3base",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.33.0",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

In [35]:
# Turn the head into a 2-output regressor and apply the dropout rates from CFG.
regression_overrides = dict(
    num_labels=2,
    problem_type='regression',
    hidden_dropout_prob=CFG.hidden_dropout_prob,
    attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
)
config.update(regression_overrides)

In [36]:
# Display the configuration again to verify the overrides took effect.
config

DebertaV2Config {
  "_name_or_path": "/kaggle/input/debertav3base",
  "attention_probs_dropout_prob": 0.007,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.007,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "problem_type": "regression",
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.33.0",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

#### Model

In [38]:
# model = AutoModel.from_pretrained(f'/kaggle/input/{CFG.model_name}',config=config)

In [39]:
# model

In [40]:
# Instantiate the sequence-classification model from the local checkpoint,
# using the customised configuration.
checkpoint_path = f'/kaggle/input/{CFG.model_name}'
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path, config=config)

In [41]:
# Print the module tree to inspect the layers of the loaded model.
model

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine

> We can see that a pooling layer followed by a linear layer with 2 outputs was added on top of the base model.

#### Metrics - MCRMSE

In [42]:
def compute_metrics(eval_pred):
    """Return the RMSE of the predictions, averaged uniformly over output columns.

    Computed with NumPy rather than
    `mean_squared_error(labels, predictions, squared=False)`: the `squared`
    keyword is deprecated in recent scikit-learn releases and later removed,
    so the original call stops working after an upgrade.

    Args:
        eval_pred: Pair `(predictions, labels)` of equally shaped arrays.

    Returns:
        Dict with a single key "rmse".
    """
    predictions, labels = eval_pred
    errors = np.asarray(labels) - np.asarray(predictions)
    # RMSE per output column first, then the plain mean across columns.
    per_output_rmse = np.sqrt(np.mean(errors ** 2, axis=0))
    rmse = np.mean(per_output_rmse)
    return {"rmse": rmse}

def compute_mcrmse(eval_pred):
    """
    Mean columnwise root mean squared error (the competition metric).
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation

    Takes a `(preds, labels)` pair of arrays whose columns are ordered
    content, wording, and returns the per-column RMSE plus their mean.
    """
    preds, labels = eval_pred

    squared_error = np.square(preds - labels)
    col_rmse = np.sqrt(squared_error.mean(axis=0))

    return {
        "content_rmse": col_rmse[0],
        "wording_rmse": col_rmse[1],
        "mcrmse": col_rmse.mean(),
    }

def compt_score(content_true, content_pred, wording_true, wording_pred):
    """Average of the content RMSE and the wording RMSE."""
    target_pairs = (
        (content_true, content_pred),
        (wording_true, wording_pred),
    )
    rmse_per_target = [
        mean_squared_error(truth, pred) ** 0.5 for truth, pred in target_pairs
    ]
    return sum(rmse_per_target) / 2

## Train using GroupKFold for CV

<!-- ### Build Trainer -->

In [43]:
# Collator that pads each batch using the tokenizer's padding settings.
train_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [49]:
# model_fold_dir = './'
# training_args = TrainingArguments(
#     output_dir = model_fold_dir,
#     report_to='none',
#     load_best_model_at_end=True, # select best model
#     learning_rate=CFG.learning_rate,
#     per_device_train_batch_size=CFG.batch_size,
#     per_device_eval_batch_size=CFG.batch_size,
#     num_train_epochs=CFG.num_train_epochs,
#     weight_decay=CFG.weight_decay,
#     greater_is_better=False,
#     metric_for_best_model="mcrmse",
#     save_strategy='no', # "steps",
#     evaluation_strategy='no' #"steps",
# )
## report_to='none' to avoid wandb login - https://discuss.huggingface.co/t/how-to-turn-wandb-off-in-trainer/6237/2
## both save strategy and eval strategy have to match

#### GPU

In [45]:
from torch import cuda

# Release cached GPU memory, then pick the GPU when one is available.
cuda.empty_cache()
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [46]:
# model_gpu = model.to(device)

In [47]:
# model_gpu

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine

In [50]:
# trainer = Trainer(
#     model = model_gpu,
#     train_dataset = tokenized_train_dataset,
#     args = training_args,
#     data_collator = train_collator,
#     tokenizer = tokenizer,
#     compute_metrics = compute_mcrmse    
# )

#### Training loop

In [None]:
# Start every run from an empty output directory so checkpoints left over from a
# previous run are never picked up later.
if os.path.exists(CFG.model_name):
    shutil.rmtree(CFG.model_name)
os.makedirs(CFG.model_name)

# Evaluate on the same schedule as checkpointing unless CFG sets an explicit
# eval_steps; load_best_model_at_end needs the two schedules to line up.
eval_steps = getattr(CFG, "eval_steps", CFG.save_steps)
# Mixed-precision training is only requested when a CUDA device is in use.
use_fp16 = device == 'cuda'
# Keep 'fold' in the per-fold frames: generate_tokens looks it up for training batches.
fold_input_cols = text_cols + target_cols + ['fold']

for fold in range(CFG.n_splits):
    print(f"Fold: {fold}")
    # Hold out the rows tagged with this fold; train on all the others.
    fold_train_data = train[train['fold']!=fold]
    fold_val_data = train[train['fold']==fold]
    fold_train_dataset = Dataset.from_pandas(fold_train_data[fold_input_cols])
    fold_val_dataset = Dataset.from_pandas(fold_val_data[fold_input_cols])
    fold_train_tokenized = fold_train_dataset.map(generate_tokens,batched=True)
    fold_val_tokenized = fold_val_dataset.map(generate_tokens,batched=True)
    print(f"Number of training examples: {fold_train_tokenized.num_rows}")
    print(f"Number of validation examples: {fold_val_tokenized.num_rows}")

    # Every fold starts again from the pretrained weights.
    model = AutoModelForSequenceClassification.from_pretrained(f'/kaggle/input/{CFG.model_name}',config=config)
    model_gpu = model.to(device)

    # model_dir receives the final model for this fold; model_fold_dir holds
    # the intermediate Trainer checkpoints.
    model_dir = f"{CFG.model_name}/fold_{fold}"
    model_fold_dir = os.path.join(model_dir, str(fold))

    training_args = TrainingArguments(
        output_dir = model_fold_dir,
        report_to='none',
        load_best_model_at_end=True, # select best model
        learning_rate=CFG.learning_rate,
        per_device_train_batch_size=CFG.batch_size,
        per_device_eval_batch_size=CFG.batch_size,
        num_train_epochs=CFG.num_train_epochs,
        weight_decay=CFG.weight_decay,
        greater_is_better=False,
        metric_for_best_model="mcrmse",
        save_strategy='steps',
        evaluation_strategy='steps',
        save_total_limit=1,
        fp16=use_fp16,
        save_steps = CFG.save_steps,
        eval_steps = eval_steps
    )

    trainer = Trainer(
        model = model_gpu,
        train_dataset = fold_train_tokenized,
        eval_dataset = fold_val_tokenized,
        args = training_args,
        data_collator = train_collator,
        tokenizer = tokenizer,
        compute_metrics = compute_mcrmse
    )

    # Fine-tune on this fold. Without this call the untouched pretrained
    # weights were what got saved below.
    trainer.train()

    model_gpu.save_pretrained(model_dir)
    tokenizer.save_pretrained(model_dir)
    model_gpu.cpu()
    # Drop the trainer too, so the optimizer state it holds can be collected
    # before the next fold's model is created. `model_gpu` stays bound because
    # later cells still refer to it.
    del model, trainer
    gc.collect()
    torch.cuda.empty_cache()

## Predict

In [None]:
# preds = trainer.predict(tokenized_test_dataset)

In [None]:
# preds

In [None]:
# Prediction-only Trainer settings; output goes to the most recently used fold directory.
inference_settings = dict(
    output_dir=model_fold_dir,
    do_train=False,
    do_predict=True,
    per_device_eval_batch_size=4,
    dataloader_drop_last=False,
)
test_args = TrainingArguments(**inference_settings)

In [None]:
# Trainer used purely for prediction with the model currently bound to `model_gpu`.
infer_components = dict(
    model=model_gpu,
    args=test_args,
    tokenizer=tokenizer,
    data_collator=train_collator,
)
infer = Trainer(**infer_components)

In [None]:
# preds2 = infer.predict(tokenized_test_dataset)

In [None]:
# preds2[0]

In [None]:
# Half-precision inference is only requested when a CUDA device is in use.
use_fp16 = device == 'cuda'

for fold in range(CFG.n_splits):
    # Same directory layout as used when the fold models were saved.
    model_dir = f"{CFG.model_name}/fold_{fold}"
    model_fold_dir = os.path.join(model_dir, str(fold))

    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.eval() # Set default model mode to evaluation - https://huggingface.co/docs/transformers/main_classes/model
    model_gpu = model.to(device)

    test_args = TrainingArguments(
        output_dir=  model_fold_dir,
        do_train = False,
        do_predict = True,
        per_device_eval_batch_size = 4,
        dataloader_drop_last = False,
        fp16=use_fp16
    )

    infer = Trainer(
        model = model_gpu,
        args = test_args,
        tokenizer = tokenizer,
        data_collator = train_collator
    )

    # predict() returns (predictions, label_ids, metrics); keep the predictions,
    # whose two columns follow the order of target_cols.
    preds = infer.predict(tokenized_test_dataset)[0]
    test[f"{target_cols[0]}_{fold}"] = preds[:,0]
    test[f"{target_cols[1]}_{fold}"] = preds[:,1]

    model_gpu.cpu()
    # `model`, `model_gpu` and the Trainer all reference the same network;
    # drop every reference so the memory can be reclaimed before the next fold.
    del infer, model, model_gpu
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
## Take mean of predictions across all folds
# The original lines were a syntax error (a bare generator inside the
# subscript) and read the non-existent CFG.num_splits; select the per-fold
# columns with a list of names and use CFG.n_splits.
for target in target_cols:
    per_fold_cols = [f"{target}_{fold}" for fold in range(CFG.n_splits)]
    test[target] = test[per_fold_cols].mean(axis=1)

## Submission

In [None]:
# Assemble the submission: student ids, zero placeholders for both targets,
# then the fold-averaged predictions written over the placeholders.
df_submission = pd.DataFrame({'student_id': test['student_id']})
df_submission = df_submission.assign(content=0, wording=0)
for target in (target_cols[0], target_cols[1]):
    df_submission[target] = test[target]
df_submission.to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame that was just written to submission.csv.
df_submission