In [None]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Installing offline libraries

In [None]:
# Install the packages listed in the dataset's requirements.txt without contacting a package
# index: --no-index disables index lookups and --find-links points pip at the dataset folder.
!python -m pip install --no-index --find-links=../input/autocorrect-offline-install -r ../input/autocorrect-offline-install/requirements.txt

## Libraries

In [None]:
import numpy as np 
import pandas as pd
import transformers
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from datasets import Dataset
from transformers import TrainingArguments, Trainer
from sklearn.metrics import mean_squared_error
import torch
from sklearn.model_selection import KFold, GroupKFold
import gc

In [None]:
import random
import os
import warnings
import logging
import shutil
from tqdm import tqdm
from datasets import disable_progress_bar
# Silence Python warnings and suppress log records of severity ERROR and below.
warnings.simplefilter("ignore")
logging.disable(logging.ERROR)

# Environment switches: turn off tokenizers parallelism and raise TensorFlow's C++ log threshold.
os.environ.update({
    "TOKENIZERS_PARALLELISM": "false",
    "TF_CPP_MIN_LOG_LEVEL": "3",
})

# Hide the datasets progress bars and register tqdm's `progress_apply` on pandas objects.
disable_progress_bar()
tqdm.pandas()

In [None]:
class CFG:
    """Notebook-wide settings, read as class attributes (the class is never instantiated here)."""

    # --- Backbone / tokenizer -------------------------------------------------
    # Alternative backbone: model_name="debertav3base"
    # Path (relative to BASE_DIR) from which the tokenizer is loaded.
    model_name = "deberta-v3-large/deberta-v3-large"
    # Upper bound on tokens per example passed to the tokenizer (alternative: 1024).
    max_length = 512

    # --- Run control ----------------------------------------------------------
    # When True, the training frame is down-sampled to a handful of rows.
    debug = False
    # Seed handed to seed_everything().
    random_seed = 42
    # Number of GroupKFold splits, and number of fold checkpoints loaded per model.
    n_splits = 4
    # Per-device batch size used for prediction (alternatives tried: 4, 8).
    batch_size = 8

    # --- Settings not referenced in this notebook's visible code --------------
    # NOTE(review): presumably carried over from the training notebook — confirm.
    learning_rate = 1.5e-5
    weight_decay = 0.02
    hidden_dropout_prob = 0.007
    attention_probs_dropout_prob = 0.007
    num_train_epochs = 5
    save_steps = 500  # alternative: 100
    n_freeze_layers = 6  # alternative: 12

In [None]:
# SEED 42
def seed_everything(seed):
    """Seed every random number generator used by the notebook and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to Python's ``random``, ``PYTHONHASHSEED``, NumPy and
            PyTorch (CPU and all CUDA devices).

    Returns:
        None. The function only mutates global RNG / backend state.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also seed every other visible GPU; this is a no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick a different kernel on each run, which defeats the
    # determinism requested on the line above, so it has to be switched off.
    torch.backends.cudnn.benchmark = False
# Seed all random number generators once, using the seed configured in CFG.
seed_everything(CFG.random_seed)

## Loading Data

In [None]:
# Root directory under which every input dataset and pre-trained weight folder is looked up.
BASE_DIR = "/kaggle/input"

In [None]:
# Output directory; the per-model test prediction CSVs are read back from here later on.
WORKING_DIR = "/kaggle/working"

In [None]:
# Create the folder for out-of-fold predictions. `exist_ok=True` keeps this cell re-runnable,
# whereas a plain `mkdir` reports an error when the directory already exists.
os.makedirs(f"{WORKING_DIR}/commonlit-oof-preds", exist_ok=True)

In [None]:
# Competition data folder (the trailing slash is part of the value, so file names are appended directly).
DATA_DIR = f"{BASE_DIR}/commonlit-evaluate-student-summaries/"

# Training prompts, training summaries and the sample submission file.
prompts_train = pd.read_csv(f"{DATA_DIR}prompts_train.csv")
summaries_train = pd.read_csv(f"{DATA_DIR}summaries_train.csv")
sample_submission = pd.read_csv(f"{DATA_DIR}sample_submission.csv")

In [None]:
# Attach the prompt columns to every summary by joining on the shared prompt_id key.
train = prompts_train.merge(summaries_train, on='prompt_id')

In [None]:
# Grouping by prompt_id keeps all summaries of one prompt inside the same fold.
gkf = GroupKFold(n_splits=CFG.n_splits) # Since 4 prompts in training set

# Store, for every row, the number of the fold in which it belongs to the validation split.
# NOTE(review): val_index is used as a label with .loc, which assumes `train` still carries the
# default 0..n-1 index produced by the merge — confirm if the frame is ever re-indexed upstream.
for i, (_, val_index) in enumerate(gkf.split(train, groups=train["prompt_id"])):
    train.loc[val_index, "fold"] = i

In [None]:
# Display the number of non-null values per column within each fold.
train.groupby("fold").count()

In [None]:
# Debug mode only: shrink the training frame to 10 randomly sampled rows (fixed random_state),
# displaying the fold sizes before and after the sampling.
if CFG.debug:
    display(train.groupby('fold').size())
    train = train.sample(n=10, random_state=42).reset_index(drop=True)
    display(train.groupby('fold').size())

## Tokenizer and Collator

In [None]:
# Load the tokenizer from the local model folder configured in CFG.model_name.
tokenizer = AutoTokenizer.from_pretrained(f'{BASE_DIR}/{CFG.model_name}')

In [None]:
# Collator built on the tokenizer; it is handed to every Trainer as `data_collator`.
train_collator = DataCollatorWithPadding(tokenizer)

In [None]:
## Figure out generating labels columns for batch of thousand
def generate_tokens(examples: pd.DataFrame,mode='train',text_col='text'):
    """Tokenize a batch of texts and, outside test mode, attach both targets as labels.

    Args:
        examples: Batch indexable by column name. NOTE(review): annotated as a DataFrame, but
            in this notebook it is called from ``Dataset.map(..., batched=True)`` — confirm the
            actual type passed in.
        mode: ``'test'`` returns the encodings alone; any other value also adds labels.
        text_col: Name of the column holding the text to tokenize.

    Returns:
        The tokenizer output in test mode; otherwise a dict with the tokenizer output plus a
        ``"labels"`` entry stacking the ``content`` and ``wording`` columns side by side.
    """
    tokenized = tokenizer(
        examples[text_col],
        truncation=True,
        max_length=CFG.max_length,
        return_tensors='np',
    )
    if mode != 'test':
        # One row per example, two columns: content first, wording second.
        stacked_targets = np.column_stack((examples['content'],examples['wording']))
        return {**tokenized, "labels": stacked_targets}
    return tokenized

In [None]:
def generate_tokens_for_single_target(examples: pd.DataFrame,target: str,mode='train',text_col='text'):
    """Tokenize a batch of texts and, outside test mode, attach one target column as labels.

    Args:
        examples: Batch indexable by column name. NOTE(review): annotated as a DataFrame —
            confirm the actual type supplied by the caller.
        target: Name of the column to use as labels.
        mode: ``'test'`` returns the encodings alone; any other value also adds labels.
        text_col: Name of the column holding the text to tokenize.

    Returns:
        The tokenizer output in test mode; otherwise a dict with the tokenizer output plus a
        ``"labels"`` entry holding ``examples[target]``.
    """
    tokenized = tokenizer(
        examples[text_col],
        truncation=True,
        max_length=CFG.max_length,
        return_tensors='np',
    )
    if mode != 'test':
        return {**tokenized, "labels": examples[target]}
    return tokenized

## Preparing Text cols

In [None]:
from autocorrect import Speller
# English spell-corrector; it is applied to every summary text further down.
speller = Speller(lang='en')

In [None]:
# Separator token of the loaded tokenizer, used to join the prompt fields and the summary.
sep = tokenizer.sep_token
# Lower-case every string cell of the frame; non-string cells are left untouched.
train = train.applymap(lambda s: s.lower() if type(s)==str else s)
# Spell-correct each summary (progress_apply is the tqdm-enabled variant of apply).
train['corrected_summary_text'] = train["text"].progress_apply(speller)
# Model input: prompt title, prompt question and corrected summary joined by the separator.
train['full_text'] = train['prompt_title'] + sep + train['prompt_question'] + sep + train['corrected_summary_text']

In [None]:
# The two regression targets, in the order used for label stacking and prediction columns.
target_cols = ['content','wording']
# Column fed to the models (the raw summary column 'text' is the alternative).
text_col = 'full_text' #'text'
# List form of the text column, used when selecting columns for a Dataset.
text_cols = [text_col]

## Pre-trained Models

In [None]:
# Weight folders of the models that predict both targets at once. Each folder is expected to
# contain one `fold_<k>` sub-folder per CV fold (that suffix is appended when loading).
CONFIG_MULTI = {
    "debertav3base1" : f"{BASE_DIR}/debertav3large-weights-for-commonlit/weights/weights/debertav3base",
    "debertav3large1" : f"{BASE_DIR}/debertav3large-weights-for-commonlit/deberta-v3-large/deberta-v3-large/deberta-v3-large",
    "debertav3large2" : f"{BASE_DIR}/debertav3large-weights-for-commonlit/deberta-v3-large-freeze-6/deberta-v3-large-freeze-6"
}

In [None]:
# Keys of CONFIG_MULTI to run, in processing order.
multi_models = ["debertav3base1","debertav3large1","debertav3large2"]

In [None]:
# Weight folders of the single-target models. Keys follow the pattern
# "<model family>_<target>" and are rebuilt as f"{ind_model}_{target}" when loading; each folder
# is expected to contain one `fold_<k>` sub-folder per CV fold.
CONFIG_IND = {
    "debertav3base_ind_content" : f"{BASE_DIR}/debertav3large-weights-for-commonlit/v3_small_individual_targets/v3_small_individual_targets/content/debertav3base",
    "debertav3base_ind_wording" : f"{BASE_DIR}/debertav3large-weights-for-commonlit/v3_small_individual_targets/v3_small_individual_targets/wording/debertav3base",
    "debertav3large_ind_content" : f"{BASE_DIR}/debertav3large-weights-for-commonlit/deberta-v3-large-ind-autocorrect/commonlit-deberta-v3-weights/content/deberta-v3-large-ind",
    "debertav3large_ind_wording" : f"{BASE_DIR}/debertav3large-weights-for-commonlit/deberta-v3-large-ind-autocorrect/commonlit-deberta-v3-weights/wording/deberta-v3-large-ind",
    "debertav3large_freeze_18_ind_content" : f"{BASE_DIR}/commonlit-deberta-v3-ind-freeze-18/content/deberta-v3-large-ind" ,
    "debertav3large_freeze_18_ind_wording": f"{BASE_DIR}/commonlit-deberta-v3-ind-freeze-18/wording/deberta-v3-large-ind",
    "debertav3large_1024_ind_content": f"{BASE_DIR}/deberta-v3-weights-ind-seq-len-1024/content/deberta-v3-large-ind",
    "debertav3large_1024_ind_wording": f"{BASE_DIR}/deberta-v3-weights-ind-seq-len-1024/wording/deberta-v3-large-ind",
    "debertav3large_freeze_18_step_eval_ind_content": f"{BASE_DIR}/debertav3-large-freeze-18-steps-eval/content/deberta-v3-large-ind",
    "debertav3large_freeze_18_step_eval_ind_wording": f"{BASE_DIR}/debertav3-large-freeze-18-steps-eval/wording/deberta-v3-large-ind"
}

In [None]:
# Single-target model families to run; each name is combined with a target to index CONFIG_IND.
ind_models = ["debertav3base_ind","debertav3large_ind","debertav3large_freeze_18_ind","debertav3large_1024_ind","debertav3large_freeze_18_step_eval_ind"]

## Metrics

In [None]:
def compute_metrics(eval_pred):
    """Compute the root mean squared error between predictions and labels.

    Implemented with NumPy instead of ``mean_squared_error(..., squared=False)``, because the
    ``squared`` argument is deprecated in recent scikit-learn releases and later removed.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of array-likes with the same number of rows.
            One-dimensional inputs and single-column two-dimensional inputs are treated alike.

    Returns:
        ``{"rmse": value}``. With several output columns the value is the mean of the
        per-column RMSEs.

    Raises:
        ValueError: If the inputs are empty or their shapes do not match.
    """
    predictions, labels = eval_pred
    predictions = np.asarray(predictions, dtype=float)
    labels = np.asarray(labels, dtype=float)
    # Force (n_rows, n_outputs) on both sides so that an (n, 1) prediction array and an (n,)
    # label array are compared element-wise instead of being broadcast to (n, n).
    predictions = predictions.reshape(predictions.shape[0], -1)
    labels = labels.reshape(labels.shape[0], -1)
    if predictions.shape != labels.shape:
        raise ValueError(
            f"predictions and labels have inconsistent shapes: {predictions.shape} vs {labels.shape}"
        )
    if predictions.shape[0] == 0:
        raise ValueError("cannot compute RMSE on empty input")
    per_column_rmse = np.sqrt(np.mean((predictions - labels) ** 2, axis=0))
    rmse = float(np.mean(per_column_rmse))
    return {"rmse": rmse}

def compute_mcrmse(eval_pred):
    """
    Mean columnwise root mean squared error, plus the RMSE of each of the first two columns.
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation

    Args:
        eval_pred: Pair ``(preds, labels)`` supporting element-wise subtraction; column 0 is
            reported as content and column 1 as wording.

    Returns:
        Dict with keys ``content_rmse``, ``wording_rmse`` and ``mcrmse``.
    """
    preds, labels = eval_pred

    squared_error = (preds - labels) ** 2
    # Reduce over rows only, leaving one RMSE per column.
    per_column_rmse = np.sqrt(np.mean(squared_error, axis=0))

    result = {
        "content_rmse": per_column_rmse[0],
        "wording_rmse": per_column_rmse[1],
    }
    result["mcrmse"] = np.mean(per_column_rmse)
    return result

def compt_score(content_true, content_pred, wording_true, wording_pred):
    """Average of the content RMSE and the wording RMSE.

    Args:
        content_true: True content scores.
        content_pred: Predicted content scores.
        wording_true: True wording scores.
        wording_pred: Predicted wording scores.

    Returns:
        The mean of the two root mean squared errors.
    """
    content_mse = mean_squared_error(content_true, content_pred)
    wording_mse = mean_squared_error(wording_true, wording_pred)
    # Square root written as a power, exactly as the two scores were computed before.
    rmse_total = content_mse**(1/2) + wording_mse**(1/2)
    return rmse_total/2

## GPU setup

In [None]:
from torch import cuda
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'
# Release memory cached by PyTorch's CUDA allocator.
cuda.empty_cache()
print(device)

In [None]:
# Collect unreachable Python objects first, then release the CUDA memory they were holding.
gc.collect()
torch.cuda.empty_cache()

## Predictions on train set 

In [None]:
# for multi_model in multi_models:
#     print(f'Model: {multi_model}')
#     oof_preds = pd.DataFrame()
#     for fold in range(CFG.n_splits):
#         val_data = train[train['fold']==fold]
#         val_dataset = Dataset.from_pandas(val_data[text_cols+target_cols])
#         tokenized_val_dataset = val_dataset.map(lambda x: generate_tokens(x,text_col=text_col),batched=True)

#         pretrained_model_dir = f"{CONFIG_MULTI[multi_model]}/fold_{fold}"
#         model_dir = f"{multi_model}/fold_{fold}"
#         model_fold_dir = os.path.join(model_dir, str(fold))

#         model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_dir)
#         model.eval() # Set default model mode to evaluation - https://huggingface.co/docs/transformers/main_classes/model
#         model_gpu = model.to(device)

#         test_args = TrainingArguments(
#             output_dir=  model_fold_dir,
#             do_train = False,
#             do_predict = True,
#             per_device_eval_batch_size = CFG.batch_size,
#             dataloader_drop_last = False,
# #             fp16=True
#         )

#         infer = Trainer(
#             model = model_gpu,
#             args = test_args,
#             tokenizer = tokenizer,
#             data_collator = train_collator
#         )

#         preds = infer.predict(tokenized_val_dataset)[0]
#         train.loc[val_data.index,f"{multi_model}_content_pred"] = preds[:,0]
#         train.loc[val_data.index,f"{multi_model}_wording_pred"] = preds[:,1]

#         model_gpu.cpu()
#         del model_gpu
#         del model
#         gc.collect()
#         torch.cuda.empty_cache()

#     print(f"Saving {multi_model} oof preds in csv file.")
#     for target in target_cols:
#         oof_preds[target] = train[f"{multi_model}_{target}_pred"]
#     oof_preds.to_csv(f"{WORKING_DIR}/commonlit-oof-preds/{multi_model}_oof_preds.csv",index=False)

In [None]:
# for ind_model in ind_models:
#     print(f'Model: {ind_model}')
#     oof_preds=pd.DataFrame()
#     for target in target_cols:
#         for fold in range(CFG.n_splits):
#             val_data = train[train['fold']==fold]
#             val_dataset = Dataset.from_pandas(val_data[text_cols+target_cols])
#             tokenized_val_dataset = val_dataset.map(lambda x: generate_tokens_for_single_target(x,target,text_col=text_col),batched=True)
#             pretrained_model_dir = f"{CONFIG_IND[f'{ind_model}_{target}']}/fold_{fold}"
#             model_dir =  f"{target}/{ind_model}/fold_{fold}"
#             model_fold_dir = os.path.join(model_dir, str(fold))

#             model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_dir)
#             model.eval() # Set default model mode to evaluation - https://huggingface.co/docs/transformers/main_classes/model
#             model_gpu = model.to(device)

#             test_args = TrainingArguments(
#                 output_dir=  model_fold_dir,
#                 do_train = False,
#                 do_predict = True,
#                 per_device_eval_batch_size = CFG.batch_size,
#                 dataloader_drop_last = False,
# #                 fp16=True
#             )

#             infer = Trainer(
#                 model = model_gpu,
#                 args = test_args,
#                 tokenizer = tokenizer,
#                 data_collator = train_collator
#             )

#             preds = infer.predict(tokenized_val_dataset)[0]
#             train.loc[val_data.index,f"{ind_model}_{target}_pred"] = preds

#             model_gpu.cpu()
#             del model_gpu
#             gc.collect()
#             torch.cuda.empty_cache()
#         oof_preds[target] = train[f"{ind_model}_{target}_pred"]
#     print(f"Saving {ind_model} oof preds in csv file.")
#     oof_preds['student_id'] = train['student_id']
#     oof_preds.to_csv(f"{WORKING_DIR}/commonlit-oof-preds/{ind_model}_oof_preds.csv",index=False)

## Predictions on test set

In [None]:
# Load the test prompts and summaries, then attach the prompt columns to every summary.
prompts_test = pd.read_csv(f"{DATA_DIR}prompts_test.csv")
summaries_test = pd.read_csv(f"{DATA_DIR}summaries_test.csv")
test = prompts_test.merge(summaries_test, on='prompt_id')

In [None]:
# Prepare the test text exactly like the training text: lower-case every string cell,
# spell-correct the summary, then join title, question and summary with the separator token.
test = test.applymap(lambda s: s.lower() if type(s)==str else s)
test['corrected_summary_text'] = test["text"].progress_apply(speller)
# Use the spell-corrected summary, as the training frame does. The raw `text` column was used
# here before, which left the correction computed above unused and made the test inputs
# differ from the training inputs.
test['full_text'] = test['prompt_title'] + sep + test['prompt_question'] + sep + test['corrected_summary_text']

In [None]:
# Wrap only the model-input column in a Dataset and tokenize it in test mode (no labels).
test_dataset = Dataset.from_pandas(test[text_cols])
# Tokenize the same column that was selected above (text_col) instead of a hard-coded name,
# so that changing text_col cannot make the two lines disagree.
tokenized_test_dataset = test_dataset.map(lambda x: generate_tokens(x,'test',text_col),batched=True)

In [None]:
# Predict the test set with every multi-target model. Each model has one checkpoint per CV fold;
# the final prediction of a model is the mean over its fold checkpoints.
for multi_model in multi_models:
    print(f'Model: {multi_model}')
    fold_preds = []  # one prediction array per fold; columns are (content, wording)
    for fold in range(CFG.n_splits):
        pretrained_model_dir = f"{CONFIG_MULTI[multi_model]}/fold_{fold}"
        model_dir = f"{multi_model}/fold_{fold}"
        model_fold_dir = os.path.join(model_dir, str(fold))

        model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_dir)
        model.eval() # Set default model mode to evaluation - https://huggingface.co/docs/transformers/main_classes/model
        model_gpu = model.to(device)

        test_args = TrainingArguments(
            output_dir=  model_fold_dir,
            do_train = False,
            do_predict = True,
            per_device_eval_batch_size = CFG.batch_size,
            dataloader_drop_last = False,
            # Half precision is only requested when running on a GPU.
            fp16 = (device == 'cuda')
        )

        infer = Trainer(
            model = model_gpu,
            args = test_args,
            tokenizer = tokenizer,
            data_collator = train_collator
        )

        preds = infer.predict(tokenized_test_dataset)[0]
        # Keep this fold's predictions. Assigning them straight to `test` (as was done before)
        # let every fold overwrite the previous one, so only the last fold was ever used.
        fold_preds.append(np.asarray(preds, dtype=np.float32))

        model_gpu.cpu()
        # The Trainer also references the model, so drop it too before freeing memory.
        del infer
        del model_gpu
        del model
        gc.collect()
        torch.cuda.empty_cache()

    # Average the fold checkpoints element-wise.
    mean_preds = np.mean(fold_preds, axis=0)
    test[f"{multi_model}_content_pred"] = mean_preds[:,0]
    test[f"{multi_model}_wording_pred"] = mean_preds[:,1]

    print(f"Saving {multi_model} test preds in csv file.")
    test_preds = pd.DataFrame()
    for target in target_cols:
        test_preds[target] = test[f"{multi_model}_{target}_pred"]
    test_preds['student_id'] = test['student_id']
    # Written to WORKING_DIR explicitly, because that is where the ensembling cells read from.
    test_preds.to_csv(f"{WORKING_DIR}/{multi_model}_test_preds.csv",index=False)

In [None]:
# Predict the test set with every single-target model family. For each target the family has
# one checkpoint per CV fold; the final prediction is the mean over those fold checkpoints.
for ind_model in ind_models:
    print(f'Model: {ind_model}')
    test_preds=pd.DataFrame()
    for target in target_cols:
        fold_preds = []  # one flat prediction vector per fold
        for fold in range(CFG.n_splits):
            pretrained_model_dir = f"{CONFIG_IND[f'{ind_model}_{target}']}/fold_{fold}"
            model_dir =  f"{target}/{ind_model}/fold_{fold}"
            model_fold_dir = os.path.join(model_dir, str(fold))

            model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_dir)
            model.eval() # Set default model mode to evaluation - https://huggingface.co/docs/transformers/main_classes/model
            model_gpu = model.to(device)

            test_args = TrainingArguments(
                output_dir=  model_fold_dir,
                do_train = False,
                do_predict = True,
                per_device_eval_batch_size = CFG.batch_size,
                dataloader_drop_last = False,
                # Half precision is only requested when running on a GPU.
                fp16 = (device == 'cuda')
            )

            infer = Trainer(
                model = model_gpu,
                args = test_args,
                tokenizer = tokenizer,
                data_collator = train_collator
            )

            preds = infer.predict(tokenized_test_dataset)[0]
            # Keep this fold's predictions as a flat vector. Assigning them straight to `test`
            # (as was done before) let every fold overwrite the previous one.
            fold_preds.append(np.asarray(preds, dtype=np.float32).reshape(-1))

            model_gpu.cpu()
            # Drop every reference to the model (the Trainer holds one too) before freeing memory.
            del infer
            del model_gpu
            del model
            gc.collect()
            torch.cuda.empty_cache()

        # Average the fold checkpoints element-wise.
        test[f"{ind_model}_{target}_pred"] = np.mean(fold_preds, axis=0)
        test_preds[target] = test[f"{ind_model}_{target}_pred"]
    print(f"Saving {ind_model} test preds in csv file.")
    test_preds['student_id'] = test['student_id']
    # Written to WORKING_DIR explicitly, because that is where the ensembling cells read from.
    test_preds.to_csv(f"{WORKING_DIR}/{ind_model}_test_preds.csv",index=False)

## Hill Climbing

In [None]:
# Change BASE_DIR to WORKING_DIR when running debug samples
# Score the saved out-of-fold predictions of every multi-target model against the train targets.
scores = {}
for multi_model in multi_models:
    oof_path = f'{BASE_DIR}/commonlit-oof-preds/{multi_model}_oof_preds.csv'
    oof_values = pd.read_csv(oof_path)[target_cols].values
    scores[multi_model] = compute_mcrmse((oof_values,train[target_cols].values))

In [None]:
# Add the out-of-fold scores of every single-target model family to the same `scores` dict.
for ind_model in ind_models:
    oof_path = f'{BASE_DIR}/commonlit-oof-preds/{ind_model}_oof_preds.csv'
    oof_values = pd.read_csv(oof_path)[target_cols].values
    scores[ind_model] = compute_mcrmse((oof_values,train[target_cols].values))

In [None]:
# Display the per-model metric dicts (content_rmse, wording_rmse, mcrmse).
scores

#### Content

In [None]:
# Models ordered from lowest to highest content RMSE (dicts keep insertion order).
scores_content = dict(sorted(scores.items(), key=lambda entry: entry[1]['content_rmse']))

In [None]:
# Display the models ranked by content RMSE.
scores_content

In [None]:
# One column per model, in ranking order: out-of-fold content predictions (read from the input
# dataset) and test content predictions (read from the working directory).
oof_df_content = pd.DataFrame()
test_df_content = pd.DataFrame()

for model_key in scores_content:
    oof_csv = f'{BASE_DIR}/commonlit-oof-preds/{model_key}_oof_preds.csv'
    test_csv = f'{WORKING_DIR}/{model_key}_test_preds.csv'
    oof_df_content[model_key] = pd.read_csv(oof_csv)['content'].values
    test_df_content[model_key] = pd.read_csv(test_csv)['content'].values

In [None]:
# Initialise: start from the first column, i.e. the model with the lowest content RMSE
# (the columns were inserted in ranking order); every other column is a candidate.
y_content = train['content'].values
current_best_ensemble_content = oof_df_content.iloc[:,0]
current_best_test_preds_content = test_df_content.iloc[:,0]
MODELS = oof_df_content.iloc[:,1:]
# Candidate blend weights, negatives included (positive-only option: np.arange(0.01,0.51,0.01)).
weight_range = np.arange(-0.5,0.51,0.01)
history = [compute_metrics((current_best_ensemble_content,y_content))['rmse']]
i = 0

# Hill climbing: each round blends in the (model, weight) pair that lowers the OOF RMSE most,
# and applies the same blend to the test predictions. Stops when no pair improves the score
# or when no candidate model is left.
while True:
    i += 1
    potential_new_best_cv_score = compute_metrics((current_best_ensemble_content,y_content))['rmse']
    k_best = wgt_best = None
    for k in MODELS:
        candidate_oof = MODELS[k]
        for wgt in weight_range:
            blended = (1-wgt) * current_best_ensemble_content + wgt * candidate_oof
            blended_score = compute_metrics((blended,y_content))['rmse']
            if blended_score < potential_new_best_cv_score:
                potential_new_best_cv_score = blended_score
                k_best, wgt_best = k, wgt

    if k_best is None:
        # No candidate improved the score in this round.
        break

    current_best_ensemble_content = (1-wgt_best) * current_best_ensemble_content + wgt_best * MODELS[k_best]
    current_best_test_preds_content = (1-wgt_best) * current_best_test_preds_content + wgt_best * test_df_content[k_best]
    # A model can be added only once.
    MODELS.drop(k_best, axis=1, inplace=True)
    print(f'Iteration: {i}, Model added: {k_best}, Best weight: {wgt_best:.2f}, Best RMSE: {potential_new_best_cv_score:.5f}')
    history.append(potential_new_best_cv_score)
    if MODELS.shape[1]==0:
        break

In [None]:
# Models ordered from lowest to highest wording RMSE (dicts keep insertion order).
scores_wording = dict(sorted(scores.items(), key=lambda entry: entry[1]['wording_rmse']))

In [None]:
# Display the models ranked by wording RMSE.
scores_wording

In [None]:
# One column per model, in ranking order: out-of-fold wording predictions (read from the input
# dataset) and test wording predictions (read from the working directory).
oof_df_wording = pd.DataFrame()
test_df_wording = pd.DataFrame()
for model_key in scores_wording:
    oof_csv = f'{BASE_DIR}/commonlit-oof-preds/{model_key}_oof_preds.csv'
    test_csv = f'{WORKING_DIR}/{model_key}_test_preds.csv'
    oof_df_wording[model_key] = pd.read_csv(oof_csv)['wording'].values
    test_df_wording[model_key] = pd.read_csv(test_csv)['wording'].values

In [None]:
# Initialise: start from the first column, i.e. the model with the lowest wording RMSE
# (the columns were inserted in ranking order); every other column is a candidate.
y_wording = train['wording'].values
current_best_ensemble_wording = oof_df_wording.iloc[:,0]
current_best_test_preds_wording = test_df_wording.iloc[:,0]
MODELS = oof_df_wording.iloc[:,1:]
# Candidate blend weights, negatives included (positive-only option: np.arange(0.01,0.51,0.01)).
weight_range = np.arange(-0.5,0.51,0.01)
history = [compute_metrics((current_best_ensemble_wording,y_wording))['rmse']]
i = 0

# Hill climbing: each round blends in the (model, weight) pair that lowers the OOF RMSE most,
# and applies the same blend to the test predictions. Stops when no pair improves the score
# or when no candidate model is left.
while True:
    i += 1
    potential_new_best_cv_score = compute_metrics((current_best_ensemble_wording,y_wording))['rmse']
    k_best = wgt_best = None
    for k in MODELS:
        candidate_oof = MODELS[k]
        for wgt in weight_range:
            blended = (1-wgt) * current_best_ensemble_wording + wgt * candidate_oof
            blended_score = compute_metrics((blended,y_wording))['rmse']
            if blended_score < potential_new_best_cv_score:
                potential_new_best_cv_score = blended_score
                k_best, wgt_best = k, wgt

    if k_best is None:
        # No candidate improved the score in this round.
        break

    current_best_ensemble_wording = (1-wgt_best) * current_best_ensemble_wording + wgt_best * MODELS[k_best]
    current_best_test_preds_wording = (1-wgt_best) * current_best_test_preds_wording + wgt_best * test_df_wording[k_best]
    # A model can be added only once.
    MODELS.drop(k_best, axis=1, inplace=True)
    print(f'Iteration: {i}, Model added: {k_best}, Best weight: {wgt_best:.2f}, Best RMSE: {potential_new_best_cv_score:.5f}')
    history.append(potential_new_best_cv_score)
    if MODELS.shape[1]==0:
        break

In [None]:
# Two-column array of the blended out-of-fold predictions: content first, wording second.
best_ensemble_preds_train = np.column_stack((current_best_ensemble_content,current_best_ensemble_wording))

In [None]:
# Save the blended out-of-fold predictions together with the student ids.
ensemble_preds=pd.DataFrame()
ensemble_preds['content'] = current_best_ensemble_content
ensemble_preds['wording'] = current_best_ensemble_wording
ensemble_preds['student_id'] = train['student_id']
ensemble_preds.to_csv('ensemble_oof_preds.csv',index=False)

In [None]:
# Report the per-target RMSEs and the MCRMSE of the blended out-of-fold predictions.
print(f'Best mcrmse: {compute_mcrmse((best_ensemble_preds_train,train[target_cols].values))}')

## Submission

In [None]:
# Build the submission file from the blended test predictions of both targets.
df_submission = pd.DataFrame()
df_submission['student_id'] = test['student_id']
df_submission['content'] = current_best_test_preds_content
df_submission['wording'] = current_best_test_preds_wording
df_submission.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission.
df_submission.head(4)