In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Reference Notebooks - 
1. https://www.kaggle.com/code/tsunotsuno/debertav3-lgbm-no-autocorrect for the CV strategy
2. https://www.kaggle.com/code/olegpush/commonlit-tune-hugging-face-model-for-beginners for getting Baseline model ready
3. https://www.kaggle.com/code/chumajin/pytorch-bert-beginner-s-room for General understanding of Transformer Library output

#### Imports

In [None]:
import numpy as np 
import pandas as pd
import transformers
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from datasets import Dataset
from transformers import TrainingArguments, Trainer
from sklearn.metrics import mean_squared_error
import torch
from sklearn.model_selection import KFold, GroupKFold

In [None]:
import random
import os
import warnings
import logging
import shutil
from tqdm import tqdm
from datasets import disable_progress_bar
# Quieten the notebook: the settings below silence warnings, log records and
# progress bars so the training cells produce less output.
# NOTE(review): this also hides genuine problems (deprecation warnings and
# ERROR-level log records), so relax these settings when debugging.
warnings.simplefilter("ignore")  # ignore every Python warning
logging.disable(logging.ERROR)  # drop all log records at ERROR level and below
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # presumably read by the Hugging Face tokenizers library - TODO confirm
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # presumably lowers TensorFlow's native log verbosity - TODO confirm
disable_progress_bar()  # imported from `datasets` above
tqdm.pandas()  # tqdm/pandas integration; no progress_apply call is visible in this notebook

In [None]:
class CFG:
    # Run configuration, read everywhere as class attributes (CFG.<name>).
#     model_name="debertav3base"
    # Used twice: as the sub-path under /kaggle/input/ from which the pretrained
    # tokenizer/config/model are loaded, and as the relative directory prefix under
    # which each fold's fine-tuned model is saved.
    model_name="deberta-v3-large/deberta-v3-large"
    # Passed to TrainingArguments as learning_rate / weight_decay.
    learning_rate=1.5e-5
    weight_decay=0.02
    # Written into the model config before the model is instantiated.
    hidden_dropout_prob=0.007
    attention_probs_dropout_prob=0.007
    num_train_epochs=5
    # Number of GroupKFold splits and therefore of fine-tuned models.
    n_splits=4
    batch_size=4 # per-device train and eval batch size (8 was presumably tried earlier)
    # Passed to seed_everything.
    random_seed=42
    save_steps=100 # used as both save_steps and eval_steps (500 was presumably tried earlier)
    max_length=512 # tokenizer truncation length (1024 was presumably tried earlier)
    # Encoder layers with index below this value are frozen, in addition to the embeddings.
    n_freeze_layers=6

In [None]:
# SEED 42
def seed_everything(seed):
    """Seed every random number generator the notebook relies on.

    Seeds Python's `random`, NumPy's legacy global generator and PyTorch (CPU and
    all CUDA devices), and configures cuDNN for repeatable results.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # Only affects Python processes started after this point; the hash seed of the
    # running interpreter is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one (safe without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN pick kernels by timing them, which can differ from
    # run to run; it must be off for the deterministic flag above to be meaningful.
    torch.backends.cudnn.benchmark = False
seed_everything(CFG.random_seed)

## Load Data

In [None]:
DATA_DIR = "/kaggle/input/commonlit-evaluate-student-summaries/"

# Read the five competition CSV files, in the same order as they are unpacked.
prompts_train, prompts_test, summaries_train, summaries_test, sample_submission = (
    pd.read_csv(f"{DATA_DIR}{stem}.csv")
    for stem in (
        "prompts_train",
        "prompts_test",
        "summaries_train",
        "summaries_test",
        "sample_submission",
    )
)

In [None]:
# Attach each prompt's columns to its student summaries by joining on prompt_id.
train = prompts_train.merge(summaries_train, on='prompt_id')
test = prompts_test.merge(summaries_test, on='prompt_id')

In [None]:
train.head(2)

In [None]:
test.head(2)

#### Split train data using GroupKFolds on prompt_id
Since the test dataset will contain new prompts, the task is to train a model that performs well on new/unseen prompts. Grouping the folds by `prompt_id` makes every validation fold mimic that setting - https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/discussion/425409#2357563

In [None]:
# GroupKFold keeps all rows of a prompt_id in the same fold, so each validation
# fold only contains prompts that were absent from the matching training split.
gkf = GroupKFold(n_splits=CFG.n_splits) # Since 4 prompts in training set

# `split` yields positional row numbers, so they are written through `iloc`
# (position-based) rather than `loc` (label-based); this stays correct even if
# `train` does not carry a default 0..n-1 index. Starting from an integer
# sentinel keeps the column integer-typed and makes the cell safe to re-run.
train["fold"] = -1
fold_position = train.columns.get_loc("fold")
for i, (_, val_index) in enumerate(gkf.split(train, groups=train["prompt_id"])):
    train.iloc[val_index, fold_position] = i

assert (train["fold"] >= 0).all(), "every row must be assigned to a validation fold"

In [None]:
train.groupby("fold").count()

#### Prepare huggingface dataset
Ref - https://huggingface.co/docs/datasets/v2.14.5/en/tabular_load#pandas-dataframes

In [None]:
train_dataset = Dataset.from_pandas(train[['text'] + ['content','wording'] + ['fold']])

In [None]:
train_dataset

In [None]:
import gc
gc.collect()

## Define Model and Metrics

#### Tokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained(f'/kaggle/input/{CFG.model_name}')

In [None]:
tokenizer.encode_plus(train_dataset[0]['text'])

In [None]:
train_dataset[0]

In [None]:
## Figure out generating labels columns for batch of thousand
def generate_tokens(examples: pd.DataFrame,mode='train',text_col='text'):
    """Tokenize a batch of texts and, unless in test mode, attach the regression labels.

    Args:
        examples: Batch of rows indexable by column name. Despite the annotation,
            the notebook calls this through `Dataset.map(..., batched=True)`.
        mode: 'test' returns the tokenizer output only; any other value also
            adds a "labels" entry.
        text_col: Name of the column holding the text to tokenize.

    Returns:
        The tokenizer encodings (truncated to CFG.max_length), plus a "labels"
        array with the `content` and `wording` values as two columns when
        mode is not 'test'.
    """
    tokenized = tokenizer(
        examples[text_col],
        truncation=True,
        max_length=CFG.max_length,
        return_tensors='np',
    )
    if mode != 'test':
        # One row per example: [content, wording].
        targets = np.column_stack((examples['content'], examples['wording']))
        return {**tokenized, "labels": targets}
    return tokenized

# tokenized_train_dataset = train_dataset.map(generate_tokens,batched=True)

In [None]:
# tokenized_train_dataset

In [None]:
sep = tokenizer.sep_token
# Model input: prompt title, prompt question and the student's text, joined by
# the tokenizer's separator token. Applied identically to train and test.
for frame in (train, test):
    frame['full_text'] = frame['prompt_title'] + sep + frame['prompt_question'] + sep + frame['text']

In [None]:
tokenizer(train.loc[0]['full_text'])

In [None]:
tokenizer.decode(tokenizer(train.loc[0]['full_text'])['input_ids'])

In [None]:
train.loc[0]['full_text']

#### Config

In [None]:
config = AutoConfig.from_pretrained(f'/kaggle/input/{CFG.model_name}')

In [None]:
# config

In [None]:
# Overrides applied on top of the pretrained config: a two-output regression
# head (the labels are the `content` and `wording` scores) and the dropout
# probabilities chosen in CFG.
regression_overrides = dict(
    num_labels=2,
    problem_type='regression',
    hidden_dropout_prob=CFG.hidden_dropout_prob,
    attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
)
config.update(regression_overrides)

In [None]:
config

#### Model

In [None]:
# model = AutoModel.from_pretrained(f'/kaggle/input/{CFG.model_name}',config=config)

In [None]:
# model

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(f'/kaggle/input/{CFG.model_name}',config=config)

In [None]:
model

> The printed architecture shows that a pooling layer followed by a linear layer with 2 outputs was added on top of the base model.

In [None]:
# Keep the embedding weights fixed during fine-tuning.
model.base_model.embeddings.requires_grad_(False)

# Also fix every encoder layer whose index is below CFG.n_freeze_layers.
for layer_idx, encoder_layer in enumerate(model.base_model.encoder.layer):
    if layer_idx < CFG.n_freeze_layers:
        encoder_layer.requires_grad_(False)

In [None]:
#You can confirm which layers have been frozen and see the whole layer struct of the model
# for n, p in model.named_parameters():
#     print(n, p.requires_grad)

In [None]:
gc.collect()

#### Metrics - MCRMSE

In [None]:
def compute_metrics(eval_pred):
    """Return the root mean squared error of the predictions.

    Computed with NumPy instead of `mean_squared_error(..., squared=False)`,
    whose `squared` keyword is deprecated and removed in newer scikit-learn
    releases. For multi-column targets the RMSE is taken per column and the
    columns are then averaged uniformly.

    Args:
        eval_pred: Pair `(predictions, labels)` of equally shaped array-likes.

    Returns:
        dict with a single key "rmse" holding a Python float.
    """
    predictions, labels = eval_pred
    errors = np.asarray(predictions, dtype=float) - np.asarray(labels, dtype=float)
    # axis=0 averages over samples: a scalar for 1-D input, one value per target for 2-D input.
    per_column_rmse = np.sqrt(np.mean(errors ** 2, axis=0))
    rmse = float(np.mean(per_column_rmse))
    return {"rmse": rmse}

def compute_mcrmse(eval_pred):
    """
    Mean columnwise root mean squared error (the competition metric).
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation

    `eval_pred` unpacks into `(preds, labels)`. Column 0 is reported as the
    content score and column 1 as the wording score.
    """
    preds, labels = eval_pred

    squared_error = (preds - labels) ** 2
    # Average over the rows first, so there is one RMSE per target column.
    col_rmse = np.sqrt(np.mean(squared_error, axis=0))

    scores = {
        "content_rmse": col_rmse[0],
        "wording_rmse": col_rmse[1],
    }
    scores["mcrmse"] = np.mean(col_rmse)
    return scores

def compt_score(content_true, content_pred, wording_true, wording_pred):
    """Average of the content RMSE and the wording RMSE."""
    target_pairs = ((content_true, content_pred), (wording_true, wording_pred))
    rmse_per_target = [
        mean_squared_error(y_true, y_pred) ** (1/2)
        for y_true, y_pred in target_pairs
    ]
    return sum(rmse_per_target) / 2

## Train using GroupKFold for CV

In [None]:
train_collator = DataCollatorWithPadding(tokenizer)

In [None]:
# model_fold_dir = './'
# training_args = TrainingArguments(
#     output_dir = model_fold_dir,
#     report_to='none',
#     load_best_model_at_end=True, # select best model
#     learning_rate=CFG.learning_rate,
#     per_device_train_batch_size=CFG.batch_size,
#     per_device_eval_batch_size=CFG.batch_size,
#     num_train_epochs=CFG.num_train_epochs,
#     weight_decay=CFG.weight_decay,
#     greater_is_better=False,
#     metric_for_best_model="mcrmse",
#     save_strategy='no', # "steps",
#     evaluation_strategy='no' #"steps",
# )
## report_to='none' to avoid wandb login - https://discuss.huggingface.co/t/how-to-turn-wandb-off-in-trainer/6237/2
## both save strategy and eval strategy have to match

#### GPU

In [None]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
cuda.empty_cache()
print(device)

In [None]:
gc.collect()

In [None]:
torch.cuda.empty_cache()

#### Training loop

### Use prompt title + prompt question + text to generate embeddings

In [None]:
target_cols = ['content','wording']
text_col = 'full_text' #'text'
text_cols = [text_col]

In [None]:
# Delete model files left over from an earlier run and start from an empty directory.
# The directory comes from CFG.model_name - the prefix the training loop saves each
# fold under - instead of a hard-coded name, so changing the model cleans the right place.
if os.path.isdir(CFG.model_name):
    shutil.rmtree(CFG.model_name)
# makedirs, unlike mkdir, also creates the parent directory of a nested model name
# such as "deberta-v3-large/deberta-v3-large".
os.makedirs(CFG.model_name, exist_ok=True)

### GPU training loop

In [None]:
# Fine-tune one model per fold: train on every row whose `fold` differs from the
# current index, validate on the rows of the current fold, then save the model
# and tokenizer under "<CFG.model_name>/fold_<fold>".
for fold in range(CFG.n_splits):
    print(f"Fold: {fold}")
    fold_train_data = train[train['fold']!=fold]
    fold_val_data = train[train['fold']==fold]
    fold_train_dataset = Dataset.from_pandas(fold_train_data[text_cols + target_cols])
    fold_val_dataset = Dataset.from_pandas(fold_val_data[text_cols + target_cols])
    fold_train_tokenized = fold_train_dataset.map(lambda x: generate_tokens(x,text_col=text_col),batched=True)
    fold_val_tokenized = fold_val_dataset.map(lambda x: generate_tokens(x,text_col=text_col),batched=True)
    print(f"Number of training examples: {fold_train_tokenized.num_rows}")
    print(f"Number of validation examples: {fold_val_tokenized.num_rows}")
    gc.collect()

    # Every fold starts again from the pretrained weights.
    model = AutoModelForSequenceClassification.from_pretrained(f'/kaggle/input/{CFG.model_name}',config=config)
    # freezing embeddings layer
    model.base_model.embeddings.requires_grad_(False)

    # freezing the initial N layers
    for k, param in model.base_model.encoder.layer.named_parameters():
        l = int(k.split(".")[0])
        if l < CFG.n_freeze_layers:
            param.requires_grad = False

    model_gpu = model.to(device)

    # model_dir receives the final model; model_fold_dir (a sub-directory) is the
    # Trainer's working directory for intermediate checkpoints.
    model_dir = f"{CFG.model_name}/fold_{fold}"
    model_fold_dir = os.path.join(model_dir, str(fold))

    training_args = TrainingArguments(
        output_dir = model_fold_dir,
        report_to='none',
        load_best_model_at_end=True, # select best model
        learning_rate=CFG.learning_rate,
        per_device_train_batch_size=CFG.batch_size,
        per_device_eval_batch_size=CFG.batch_size,
        num_train_epochs=CFG.num_train_epochs,
        weight_decay=CFG.weight_decay,
        # Lower mcrmse is better.
        greater_is_better=False,
        metric_for_best_model="mcrmse",
        # Saving and evaluating share one schedule (same strategy, same step count).
        save_strategy='steps',
        evaluation_strategy='steps',
        save_total_limit=1,
        # 4 accumulation steps x CFG.batch_size samples per optimizer update and device.
        gradient_accumulation_steps=4,
        gradient_checkpointing=True,
        optim='adafactor',
#         fp16=True,
        save_steps = CFG.save_steps,
        eval_steps = CFG.save_steps
    )

    trainer = Trainer(
        model = model_gpu,
        train_dataset = fold_train_tokenized,
        eval_dataset = fold_val_tokenized,
        args = training_args,
        data_collator = train_collator,
        tokenizer = tokenizer,
        compute_metrics = compute_mcrmse
    )

    trainer.train()

    model_gpu.save_pretrained(model_dir)
    tokenizer.save_pretrained(model_dir)

    # Release everything that still references this fold's model before the next
    # fold loads a fresh one. `model` and `model_gpu` are the same object and the
    # Trainer holds it too (together with its optimizer state), so deleting only
    # `model` would free nothing.
    model_gpu.cpu()
    del trainer, model, model_gpu
    gc.collect()
    torch.cuda.empty_cache()

#### Combine these outputs with numerical text based feats and feed into LGBM

#### Add each fold model predictions to training data

In [None]:
# Out-of-fold predictions: each fold's saved model scores only the rows of its
# own validation fold, and the results are written back into `train`.
for fold in range(CFG.n_splits):
    val_data = train.loc[train['fold'] == fold]
    val_dataset = Dataset.from_pandas(val_data[text_cols+target_cols])
    tokenized_val_dataset = val_dataset.map(
        lambda batch: generate_tokens(batch, text_col=text_col),
        batched=True,
    )

    model_dir = f"{CFG.model_name}/fold_{fold}"
    model_fold_dir = os.path.join(model_dir, str(fold))

    # Load the model saved for this fold and switch it to evaluation mode.
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.eval() # Set default model mode to evaluation - https://huggingface.co/docs/transformers/main_classes/model
    model_gpu = model.to(device)

    # Prediction-only arguments: nothing is trained and no batch is dropped.
    test_args = TrainingArguments(
        output_dir=model_fold_dir,
        do_train=False,
        do_predict=True,
        per_device_eval_batch_size=CFG.batch_size,
        dataloader_drop_last=False,
        fp16=True,
    )

    infer = Trainer(
        model=model_gpu,
        args=test_args,
        tokenizer=tokenizer,
        data_collator=train_collator,
    )

    # Column 0 of the predictions is the content score, column 1 the wording score.
    preds = infer.predict(tokenized_val_dataset).predictions
    for col_idx, pred_col in enumerate(("content_pred", "wording_pred")):
        train.loc[val_data.index, pred_col] = preds[:, col_idx]

    model_gpu.cpu()
    del model_gpu
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
train.head(2)

In [None]:
print(compute_mcrmse((train[['content_pred','wording_pred']].values,train[target_cols].values)))

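### CV Score
> Note: the leakage-free CV score is the out-of-fold score printed in the previous cell. The cells below run every fold model over the full training set and average the results, so most of the predictions for a row come from models that were trained on that row; expect an optimistic number.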

In [None]:
# Score the whole training set with every fold model and store the results in
# per-fold columns (content_pred_<fold>, wording_pred_<fold>).
# NOTE(review): a fold's model was trained on all rows whose `fold` differs from
# its index, so for each row all but one of these predictions are in-sample.
# Scores derived from these columns (or their mean) are optimistic; the
# out-of-fold `content_pred` / `wording_pred` columns are the leakage-free ones.

# The input is the same for every fold, so build and tokenize it once instead of
# once per loop iteration. Only the text/target columns are used, which means the
# prediction columns added to `train` below do not affect it.
val_data = train
val_dataset = Dataset.from_pandas(val_data[text_cols+target_cols])
tokenized_val_dataset = val_dataset.map(lambda x: generate_tokens(x,'test',text_col=text_col),batched=True)

for fold in range(CFG.n_splits):
    model_dir = f"{CFG.model_name}/fold_{fold}"
    model_fold_dir = os.path.join(model_dir, str(fold))

    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.eval() # Set default model mode to evaluation - https://huggingface.co/docs/transformers/main_classes/model
    model_gpu = model.to(device)

    test_args = TrainingArguments(
        output_dir=  model_fold_dir,
        do_train = False,
        do_predict = True,
        per_device_eval_batch_size = CFG.batch_size,
        dataloader_drop_last = False,
        fp16=True
    )

    infer = Trainer(
        model = model_gpu,
        args = test_args,
        tokenizer = tokenizer,
        data_collator = train_collator
    )

    preds = infer.predict(tokenized_val_dataset)[0]
    train.loc[val_data.index,f"content_pred_{fold}"] = preds[:,0]
    train.loc[val_data.index,f"wording_pred_{fold}"] = preds[:,1]

    # `model`, `model_gpu` and the Trainer all reference the same model; drop all
    # of them so it can be reclaimed before the next fold's model is loaded.
    model_gpu.cpu()
    del model, model_gpu, infer
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
train.head(5)

In [None]:
## Average the per-fold prediction columns of each target into <target>_pred_mean
for target_name in (target_cols[0], target_cols[1]):
    per_fold_cols = [f"{target_name}_pred_{k}" for k in range(CFG.n_splits)]
    train[f'{target_name}_pred_mean'] = train[per_fold_cols].mean(axis=1)

In [None]:
print(compute_mcrmse((train[['content_pred_mean','wording_pred_mean']].values,train[target_cols].values)))

In [None]:
# import re
# # Text cleaning function
# def clean_text(text):
#     text = text.lower()
#     text = re.sub(r'\n', ' ', text)
#     text = re.sub(r'\W', ' ', text)
#     text = re.sub(r'\s+', ' ', text)
#     return text

In [None]:
# # Enhanced preprocessing function
# def preprocess_data(data: pd.DataFrame):
#     merged_df = data.copy()
#     text_columns = ['prompt_question', 'prompt_title', 'prompt_text', 'text']
#     for column in text_columns:
#         merged_df[column] = merged_df[column].apply(clean_text)
#     merged_df['prompt_length'] = merged_df['prompt_text'].apply(len)
#     merged_df['summary_length'] = merged_df['text'].apply(len)
# #     merged_df['prompt_unique_words_cnt'] = merged_df['prompt_text'].apply(lambda x: len(set(x.split())))
# #     merged_df['summary_unique_words_cnt'] = merged_df['text'].apply(lambda x: len(set(x.split())))
# #     merged_df['summary_stopwords_cnt'] = merged_df['text'].apply(lambda x: count_stopwords(x))
# #     merged_df['num_typos'] = merged_df['text'].apply(lambda x: get_num_typos(x))
# #     merged_df['length_ratio'] = merged_df['summary_length']/merged_df['prompt_length']
#     return merged_df

In [None]:
# enh_train = preprocess_data(train)

In [None]:
# enh_text_cols = ['prompt_id', 'student_id', 'prompt_question', 'prompt_title', 'prompt_text', 'text']
# cols_to_drop = enh_text_cols + target_cols + ['fold']

In [None]:
# enh_train.head()

In [None]:
# from xgboost import XGBRegressor
# import lightgbm as lgb
# model_dict = {}
# for target in target_cols:
#     models = []
#     for fold in range(CFG.n_splits):
#         enh_train_data = enh_train[enh_train['fold']!=fold]
#         enh_val_data = enh_train[enh_train['fold']==fold]
#         fold_X_train = enh_train_data.drop(columns=cols_to_drop)
#         fold_y_train = enh_train_data[target]
#         fold_X_val = enh_val_data.drop(columns=cols_to_drop)
#         fold_y_val = enh_val_data[target]
        
#         dtrain = lgb.Dataset(fold_X_train, label=fold_y_train)
#         dval = lgb.Dataset(fold_X_val, label=fold_y_val)

#         params = {
#                   'boosting_type': 'gbdt',
#                   'random_state': 42,
#                   'objective': 'regression',
#                   'metric': 'rmse',
#                   'learning_rate': 0.05,
#                   }

#         evaluation_results = {}
#         model = lgb.train(params,
#                           num_boost_round=10000,
#                             #categorical_feature = categorical_features,
#                           valid_names=['train', 'valid'],
#                           train_set=dtrain,
#                           valid_sets=dval,
#                           callbacks=[
#                               lgb.early_stopping(stopping_rounds=30, verbose=True),
#                                lgb.log_evaluation(100),
#                               lgb.callback.record_evaluation(evaluation_results)
#                             ],
#                           )
#         models.append(model)
#     model_dict[target] = models

In [None]:
# # cv
# rmses = []

# for target in target_cols:
#     models = model_dict[target]

#     preds = []
#     trues = []
    
#     for fold, model in enumerate(models):
#         X_eval_cv = enh_train[enh_train["fold"] == fold].drop(columns=cols_to_drop)
#         y_eval_cv = enh_train[enh_train["fold"] == fold][target]

#         pred = model.predict(X_eval_cv)

#         trues.extend(y_eval_cv)
#         preds.extend(pred)
    
#     rmse = np.sqrt(mean_squared_error(trues, preds))
#     print(f"{target}_rmse : {rmse}")
#     rmses = rmses + [rmse]

# print(f"mcrmse : {sum(rmses) / len(rmses)}")

## Predict

In [None]:
test_dataset = Dataset.from_pandas(test[text_cols])

In [None]:
tokenized_test_dataset = test_dataset.map(lambda x: generate_tokens(x,'test','full_text'),batched=True)

In [None]:
tokenized_test_dataset

In [None]:
# Predict the test set with every fold model; each fold writes its own pair of
# columns (<target>_<fold>) into `test`, to be averaged afterwards.
for fold in range(CFG.n_splits):
    model_dir = f"{CFG.model_name}/fold_{fold}"
    model_fold_dir = os.path.join(model_dir, str(fold))

    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.eval() # Set default model mode to evaluation - https://huggingface.co/docs/transformers/main_classes/model
    model_gpu = model.to(device)

    test_args = TrainingArguments(
        output_dir=  model_fold_dir,
        do_train = False,
        do_predict = True,
        # Same setting as the other prediction loops instead of a hard-coded 4.
        per_device_eval_batch_size = CFG.batch_size,
        dataloader_drop_last = False,
        fp16=True
    )

    infer = Trainer(
        model = model_gpu,
        args = test_args,
        tokenizer = tokenizer,
        data_collator = train_collator
    )

    # Column 0 of the predictions belongs to target_cols[0], column 1 to target_cols[1].
    preds = infer.predict(tokenized_test_dataset)[0]
    test[f"{target_cols[0]}_{fold}"] = preds[:,0]
    test[f"{target_cols[1]}_{fold}"] = preds[:,1]

    # `model`, `model_gpu` and the Trainer all reference the same model; drop all
    # of them so it can be reclaimed before the next fold's model is loaded.
    model_gpu.cpu()
    del model, model_gpu, infer
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
test

In [None]:
## Average the per-fold test predictions of each target into the target column
for target_name in (target_cols[0], target_cols[1]):
    per_fold_cols = [f"{target_name}_{k}" for k in range(CFG.n_splits)]
    test[target_name] = test[per_fold_cols].mean(axis=1)

## Submission

In [None]:
# Build the submission directly from the averaged test predictions. The previous
# zero-filled placeholder columns were overwritten straight away, so they are
# dropped; the explicit column list keeps the order student_id, content, wording.
# `content` and `wording` are the names held in target_cols.
submission_columns = ['student_id', 'content', 'wording']
df_submission = test[submission_columns].copy()
df_submission.to_csv('submission.csv', index=False)

In [None]:
df_submission

### Possible Next Steps for model improvement
1. Have used base model. From multiple comments, it might be worth considering using v3large model - https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/discussion/424330
2. Add some numerical features, combine with preds by base model and feed into XGB/LGBM for training
3. Consider not just text, but also other text cols for generating embeddings. (prompt_question + title + text) 
4. Add some more numerical features before using LGBM like semantic similarity - https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/discussion/436187
5. Do some text cleaning before generating embeddings.

| Approach | CV | LB |
| -------- | -- | -- |
| 1. v3large        |    | 0.483 |
| 2.        |    |     |
| 3. mult text cols       |    |   |

### Challenges faced
1. Ran into CUDA out-of-memory errors many times while training DeBERTa-v3-large - had to reduce the batch size and use gradient accumulation, gradient checkpointing, etc. Good ref - https://huggingface.co/docs/transformers/v4.18.0/en/performance