<a href="https://colab.research.google.com/github/aayu24/Kaggle/blob/master/accelerate_tpu_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [52]:
import os

# Use .get(): indexing os.environ directly raises a bare KeyError when the variable
# is missing, so the helpful assertion message below would never be shown.
assert os.environ.get('COLAB_TPU_ADDR'), 'Make sure to select TPU from Edit > Notebook settings > Hardware accelerator'

In [None]:
!pip install cloud-tpu-client==0.10 torch==2.0.0 torchvision==0.15.1 https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-2.0-cp310-cp310-linux_x86_64.whl

In [None]:
from google.colab import files
files.upload()         # expire any previous token(s) and upload recreated token

In [2]:
!rm -r ~/.kaggle
!mkdir ~/.kaggle
!mv ./kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
# !kaggle datasets list

In [None]:
!kaggle competitions download -c commonlit-evaluate-student-summaries

In [None]:
!mkdir commonlit-evaluate-student-summaries

In [None]:
!unzip commonlit-evaluate-student-summaries.zip -d commonlit-evaluate-student-summaries/

In [None]:
!pip install accelerate

In [None]:
!pip install autocorrect==1.1.0

In [None]:
!pip install transformers datasets

In [None]:
!pip install sentencepiece

## Imports

In [1]:
import accelerate
import numpy as np
import pandas as pd
import transformers
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from datasets import Dataset
from transformers import TrainingArguments, Trainer
from sklearn.metrics import mean_squared_error
import torch
from sklearn.model_selection import KFold, GroupKFold
import gc

In [2]:
import random
import os
import warnings
import logging
import shutil
from tqdm import tqdm
from datasets import disable_progress_bar
warnings.simplefilter("ignore")
logging.disable(logging.ERROR)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
disable_progress_bar()
tqdm.pandas()

In [3]:
BASE_DIR = "/content"

In [4]:
WORKING_DIR = BASE_DIR

In [5]:
class CFG:
    """Hyper-parameters and run settings shared by the notebook cells."""

    # --- model identity -------------------------------------------------
    # model_name = "debertav3base"
    model_name = "deberta-v3-large"      # checkpoint loaded as f"microsoft/{model_name}"
    name = "deberta-v3-large-ind"        # sub-directory name used for saved models

    # --- optimisation ---------------------------------------------------
    learning_rate = 1.5e-5               # previously tried: 1e-5
    weight_decay = 0.03
    num_train_epochs = 5
    batch_size = 4                       # previously tried: 1, 4, 8

    # --- regularisation -------------------------------------------------
    hidden_dropout_prob = 0.0
    attention_probs_dropout_prob = 0.0
    n_freeze_layers = 6                  # encoder layers frozen from the bottom; tried 6, 10, 12

    # --- data / cross-validation ----------------------------------------
    n_splits = 4
    max_length = 512                     # tokenizer truncation length; tried 1024
    random_seed = 42                     # tried 42, 102

    # --- checkpointing --------------------------------------------------
    save_steps = 100                     # only referenced by commented-out step-based saving; tried 500

In [6]:
# SEED 42
def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN behaviour.

    Args:
        seed: Integer seed applied to every random number generator.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark mode lets cuDNN auto-tune and pick kernels per run, which defeats the
    # determinism requested on the line above, so it has to be switched off.
    torch.backends.cudnn.benchmark = False
seed_everything(CFG.random_seed)

## Load Data

In [7]:
# Directory holding the unzipped competition CSVs. The trailing slash is kept
# because later cells build file paths by plain string concatenation.
DATA_DIR = f"{BASE_DIR}/commonlit-evaluate-student-summaries/"

# Prompts, student summaries (with targets) and the submission template.
prompts_train = pd.read_csv(f"{DATA_DIR}prompts_train.csv")
summaries_train = pd.read_csv(f"{DATA_DIR}summaries_train.csv")
sample_submission = pd.read_csv(f"{DATA_DIR}sample_submission.csv")

In [8]:
train = pd.merge(prompts_train,summaries_train,on='prompt_id')

In [9]:
# Group by prompt so that all summaries of a prompt land in the same validation fold
# (4 prompts in the training set, hence CFG.n_splits folds).
gkf = GroupKFold(n_splits=CFG.n_splits)

# Create the column up front as an integer: filling a missing column row-by-row would
# start it as NaN and leave the fold ids stored as floats (0.0, 1.0, ...).
train["fold"] = -1
fold_col_position = train.columns.get_loc("fold")

for i, (_, val_index) in enumerate(gkf.split(train, groups=train["prompt_id"])):
    # split() yields positional row indices, so write through .iloc; .loc would only
    # be correct while the frame happens to carry a default RangeIndex.
    train.iloc[val_index, fold_col_position] = i

In [10]:
train.groupby("fold").count()

Unnamed: 0_level_0,prompt_id,prompt_question,prompt_title,prompt_text,student_id,text,content,wording
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,2057,2057,2057,2057,2057,2057,2057,2057
1.0,2009,2009,2009,2009,2009,2009,2009,2009
2.0,1996,1996,1996,1996,1996,1996,1996,1996
3.0,1103,1103,1103,1103,1103,1103,1103,1103


## Pipeline components

In [11]:
tokenizer = AutoTokenizer.from_pretrained(f'microsoft/{CFG.model_name}')

In [12]:
train_collator = DataCollatorWithPadding(tokenizer)

In [13]:
# Start from the pretrained checkpoint's configuration, then override the head
# (single regression output) and the dropout probabilities taken from CFG.
config = AutoConfig.from_pretrained(f'microsoft/{CFG.model_name}')
config.update(
    dict(
        num_labels=1,  # 2 when predicting both targets with one model
        problem_type='regression',
        hidden_dropout_prob=CFG.hidden_dropout_prob,
        attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
    )
)

In [14]:
config

DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "problem_type": "regression",
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.33.3",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

## Util functions

In [15]:
import torch

In [31]:
# Batched tokenization for the two-target ('content' + 'wording') setup.
def generate_tokens(examples: pd.DataFrame,mode='train',text_col='text'):
    """Tokenize a batch of texts and, unless in test mode, attach both targets as labels.

    Reads the notebook-level `tokenizer` and `CFG.max_length`.

    Args:
        examples: Column-addressable batch (indexed by column name) holding `text_col`
            and, for training, 'content' and 'wording'.
            NOTE(review): annotated as a DataFrame, but the notebook passes this
            function to `Dataset.map(..., batched=True)` — confirm the batch type.
        mode: 'test' returns the encodings only; any other value also adds labels.
        text_col: Name of the column containing the text to tokenize.

    Returns:
        The tokenizer encodings in test mode; otherwise a dict with the encodings plus
        a "labels" tensor with one row per example and columns (content, wording).
    """
    encodings = tokenizer(examples[text_col],truncation=True,max_length=CFG.max_length,return_tensors='pt',padding='max_length')
    if mode == 'test':
        return encodings
    # column_stack on float columns yields float64; cast to float32 so the labels have
    # the same dtype as those built by generate_tokens_for_single_target.
    stacked_targets = np.column_stack((examples['content'],examples['wording'])).astype(np.float32)
    labels = torch.from_numpy(stacked_targets)
    return {**encodings, "labels": labels}

In [40]:
def generate_tokens_for_single_target(examples: pd.DataFrame,target: str,mode='train',text_col='text'):
    """Tokenize a batch of texts and, unless in test mode, attach one target as labels.

    Reads the notebook-level `tokenizer` and `CFG.max_length`.

    Args:
        examples: Column-addressable batch holding `text_col` and, for training, `target`.
        target: Name of the column whose values become the labels.
        mode: 'test' returns the encodings only; any other value also adds labels.
        text_col: Name of the column containing the text to tokenize.

    Returns:
        The tokenizer encodings in test mode; otherwise a dict with the encodings plus
        a "labels" tensor built from `examples[target]`.
    """
    tokenized = tokenizer(
        examples[text_col],
        truncation=True,
        max_length=CFG.max_length,
        return_tensors='pt',
        padding='max_length',
    )
    if mode != 'test':
        target_values = torch.Tensor(examples[target])
        return {**tokenized, "labels": target_values}
    return tokenized

In [18]:
type(tokenizer(train.loc[0]['text'],return_tensors='np'))

transformers.tokenization_utils_base.BatchEncoding

In [19]:
type(tokenizer(train.loc[0]['text'],return_tensors='pt'))

transformers.tokenization_utils_base.BatchEncoding

In [36]:
type(train['text'].to_numpy())

numpy.ndarray

## Preparing text cols

In [20]:
from autocorrect import Speller
speller = Speller(lang='en')

In [21]:
# Separator token placed between the prompt title, the prompt question and the summary.
sep = tokenizer.sep_token

# Lower-case every string cell of the frame; non-string cells are left untouched.
train = train.applymap(lambda cell: cell.lower() if type(cell) is str else cell)

# Spell-correct the (lower-cased) student summaries, showing a progress bar.
train['corrected_summary_text'] = train["text"].progress_apply(speller)

# Model input: title <sep> question <sep> corrected summary.
train['full_text'] = (
    train['prompt_title']
    + sep
    + train['prompt_question']
    + sep
    + train['corrected_summary_text']
)

100%|██████████| 7165/7165 [05:51<00:00, 20.41it/s]


In [22]:
target_cols = ['content','wording']
text_col = 'full_text' #'text'
text_cols = [text_col]

## Metrics

In [23]:
def compute_metrics(eval_pred):
    """Compute the root mean squared error between predictions and labels.

    Accepts NumPy arrays as well as torch tensors, so it can be passed to `Trainer`
    (which supplies arrays) or called by hand.

    Args:
        eval_pred: A `(predictions, labels)` pair. Each item is either 1-D `(n,)` or
            2-D `(n, n_targets)`; 1-D inputs are treated as a single target column.

    Returns:
        `{"rmse": float}`. With several target columns the value is the mean of the
        per-column RMSEs.

    Raises:
        ValueError: If predictions and labels disagree in shape after 1-D inputs
            have been expanded to a column.
    """
    def _as_columns(values):
        # Tensors are detached and moved to host memory first; everything ends up as
        # a float64 array with one column per target.
        if torch.is_tensor(values):
            values = values.detach().cpu().float().numpy()
        values = np.asarray(values, dtype=np.float64)
        return values[:, None] if values.ndim == 1 else values

    predictions, labels = eval_pred
    predictions = _as_columns(predictions)
    labels = _as_columns(labels)
    # Without this check, (n, 1) predictions against (n,) labels would silently
    # broadcast into an (n, n) difference matrix.
    if predictions.shape != labels.shape:
        raise ValueError(
            f"Shape mismatch: predictions {predictions.shape} vs labels {labels.shape}"
        )
    col_rmse = np.sqrt(np.mean((predictions - labels) ** 2, axis=0))
    return {"rmse": float(np.mean(col_rmse))}

def compute_mcrmse(eval_pred):
    """
    Calculates mean columnwise root mean squared error
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation

    Accepts NumPy arrays as well as torch tensors.

    Args:
        eval_pred: A `(preds, labels)` pair, both of shape `(n, 2)` with the columns
            ordered (content, wording).

    Returns:
        Dict of plain floats: "content_rmse", "wording_rmse" and their mean "mcrmse".

    Raises:
        ValueError: If the inputs are not both of shape `(n, 2)`.
    """
    def _to_array(values):
        # Tensors are detached and moved to host memory before conversion.
        if torch.is_tensor(values):
            values = values.detach().cpu().float().numpy()
        return np.asarray(values, dtype=np.float64)

    preds, labels = eval_pred
    preds = _to_array(preds)
    labels = _to_array(labels)
    if preds.shape != labels.shape or preds.ndim != 2 or preds.shape[1] != 2:
        raise ValueError(
            f"Expected two (n, 2) arrays, got preds {preds.shape} and labels {labels.shape}"
        )

    col_rmse = np.sqrt(np.mean((preds - labels) ** 2, axis=0))
    mcrmse = np.mean(col_rmse)

    return {
        "content_rmse": float(col_rmse[0]),
        "wording_rmse": float(col_rmse[1]),
        "mcrmse": float(mcrmse),
    }

def compt_score(content_true, content_pred, wording_true, wording_pred):
    """Return the average of the content RMSE and the wording RMSE.

    Args:
        content_true: Ground-truth content scores.
        content_pred: Predicted content scores.
        wording_true: Ground-truth wording scores.
        wording_pred: Predicted wording scores.
    """
    target_pairs = ((content_true, content_pred), (wording_true, wording_pred))
    # Square root of each target's mean squared error, in (content, wording) order.
    rmse_per_target = [
        mean_squared_error(y_true, y_pred) ** (1/2) for y_true, y_pred in target_pairs
    ]
    return sum(rmse_per_target) / 2

## GPU setup

In [24]:
# from torch import cuda
# device = 'cuda' if cuda.is_available() else 'cpu'
# cuda.empty_cache()
# print(device)

In [25]:
gc.collect()
# torch.cuda.empty_cache()

4

## Training debertav3large on individual columns

In [26]:
# For the different target models
# Start every run with an empty output directory per target (named after the target
# and created relative to the current working directory). An existing directory is
# deleted together with everything inside it before being recreated.
for target in target_cols:
    if os.path.exists(target):
        shutil.rmtree(target)
    os.mkdir(target)

In [27]:
# gc.collect()
# torch.cuda.empty_cache()

In [28]:
from accelerate.utils import find_executable_batch_size

In [41]:
def run(train,CFG,target_cols,text_cols,text_col,generate_tokens_for_single_target,BASE_DIR,config):
    """Train one sequence-classification model per (target, fold) pair and save it.

    For every target in `target_cols` and every fold in `range(CFG.n_splits)` the
    function tokenizes the fold's train/validation rows, loads a fresh
    `microsoft/{CFG.model_name}` model, freezes the embeddings and the first
    `CFG.n_freeze_layers` encoder layers, trains it with a `Trainer`, and saves the
    result to `{target}/{CFG.name}/fold_{fold}`.

    Besides its parameters, the body reads these notebook-level names: `Accelerator`
    (imported in the launcher cell, so it must be imported before `run` is called),
    `find_executable_batch_size`, `Dataset`, `AutoModelForSequenceClassification`,
    `TrainingArguments`, `Trainer`, `train_collator`, `tokenizer`, `compute_metrics`,
    `gc` and `os`.

    Args:
        train: DataFrame with a 'fold' column plus the columns in `text_cols` and
            `target_cols`.
        CFG: Settings object; reads `batch_size`, `n_splits`, `model_name`,
            `n_freeze_layers`, `name`, `learning_rate`, `num_train_epochs` and
            `weight_decay`.
        target_cols: List of target column names; one model family is trained per entry.
        text_cols: List of text column names copied into the datasets.
        text_col: Name of the column that is tokenized.
        generate_tokens_for_single_target: Batch tokenization function, called as
            `f(batch, target, text_col=text_col)`.
        BASE_DIR: Not used inside this function.
        config: Model configuration passed to `from_pretrained`.

    Returns:
        None. Models and checkpoints are written to disk.
    """
    accelerator = Accelerator(mixed_precision="bf16")
    # `batch_size` is supplied by accelerate's find_executable_batch_size, starting at
    # CFG.batch_size. NOTE(review): per accelerate's docs the decorated function is
    # re-invoked with a smaller batch size after an out-of-memory error — if so, a
    # retry restarts the whole target/fold loop from the beginning; confirm.
    @find_executable_batch_size(starting_batch_size=CFG.batch_size)
    def inner_loop(batch_size):
        nonlocal accelerator
        for target in target_cols:
            accelerator.print(f"Target: {target}")
            for fold in range(CFG.n_splits):
                accelerator.print(f"Fold: {fold}")
                # Rows of the current fold are held out for validation; the rest train.
                fold_train_data = train[train['fold']!=fold]
                fold_val_data = train[train['fold']==fold]
                fold_train_dataset = Dataset.from_pandas(fold_train_data[text_cols + target_cols])
                fold_val_dataset = Dataset.from_pandas(fold_val_data[text_cols + target_cols])
                # Tokenize in batches; labels come from the current target column only.
                fold_train_tokenized = fold_train_dataset.map(lambda x: generate_tokens_for_single_target(x,target,text_col=text_col),batched=True)
                fold_val_tokenized = fold_val_dataset.map(lambda x: generate_tokens_for_single_target(x,target,text_col=text_col),batched=True)
                accelerator.print(f"Number of training examples: {fold_train_tokenized.num_rows}")
                accelerator.print(f"Number of validation examples: {fold_val_tokenized.num_rows}")
                gc.collect()
    #             nonlocal accelerator # Ensure they can be used in our context
                accelerator.free_memory() # Free all lingering references

    #             accelerator = Accelerator(mixed_precision='fp16')

                # Every (target, fold) pair starts from the pretrained checkpoint.
                model = AutoModelForSequenceClassification.from_pretrained(f'microsoft/{CFG.model_name}',config=config)
                # freezing embeddings layer
                model.base_model.embeddings.requires_grad_(False)

                # freezing the initial N layers
                # (the first dotted component of each parameter name under
                # encoder.layer is parsed as the layer index)
                for k, param in model.base_model.encoder.layer.named_parameters():
                    l = int(k.split(".")[0])
                    if l < CFG.n_freeze_layers:
                        param.requires_grad = False

                model_gpu = model.to(accelerator.device)

                # model_dir receives the final saved model; model_fold_dir (a
                # sub-directory named after the fold) is the Trainer's output_dir.
                model_dir =  f"{target}/{CFG.name}/fold_{fold}"
                model_fold_dir = os.path.join(model_dir, str(fold))

                training_args = TrainingArguments(
                    output_dir = model_fold_dir,
                    report_to='none',
                    load_best_model_at_end=True, # select best model
                    learning_rate=CFG.learning_rate,
                    per_device_train_batch_size=batch_size,
                    per_device_eval_batch_size=batch_size*2,
                    num_train_epochs=CFG.num_train_epochs,
                    weight_decay=CFG.weight_decay,
                    greater_is_better=False, # lower rmse is better
                    metric_for_best_model="rmse", #mcrmse ("rmse" is the key returned by compute_metrics)
                    save_strategy='epoch', #steps
                    evaluation_strategy='epoch',
                    save_total_limit=1,
                    gradient_accumulation_steps=4,
                    # gradient_checkpointing=True,
                    optim='adafactor',
    #                 fp16=True,
                    # save_steps = CFG.save_steps,
                    # eval_steps = CFG.save_steps
                )

                trainer = Trainer(
                    model = model_gpu,
                    train_dataset = fold_train_tokenized,
                    eval_dataset = fold_val_tokenized,
                    args = training_args,
                    data_collator = train_collator,
                    tokenizer = tokenizer,
                    compute_metrics = compute_metrics  #compute_mcrmse
                )

                trainer.train()
                trainer.save_model(model_dir)

        ##         Not needed since trainer saves everything - https://discuss.huggingface.co/t/what-is-the-purpose-of-save-pretrained/9167/2
        #         model_gpu.save_pretrained(model_dir)
        #         tokenizer.save_pretrained(model_dir)
    #             model_gpu.cpu()
                # Drop the references to this fold's model before the next one is loaded.
                del model_gpu
                del model
                gc.collect()
                accelerator.free_memory()
        #         torch.cuda.empty_cache()
    # Called without arguments: the decorator provides batch_size.
    inner_loop()
    accelerator.free_memory()

In [None]:
from accelerate import Accelerator, notebook_launcher
notebook_launcher(run,args=(train,CFG,target_cols,text_cols,text_col,generate_tokens_for_single_target,BASE_DIR,config))

## CV

In [None]:
# Out-of-fold predictions: for every target, each fold's saved model scores the rows it
# did not train on, and the result is written to train[f"{target}_pred"].
for target in target_cols:
    for fold in range(CFG.n_splits):
        val_data = train[train['fold']==fold]
        val_dataset = Dataset.from_pandas(val_data[text_cols+target_cols])
        tokenized_val_dataset = val_dataset.map(lambda x: generate_tokens_for_single_target(x,target,text_col=text_col),batched=True)

        # Same directory layout as the training cell: the final model lives in
        # model_dir, and the fold-named sub-directory is the Trainer's output_dir.
        model_dir =  f"{target}/{CFG.name}/fold_{fold}"
        model_fold_dir = os.path.join(model_dir, str(fold))

        model = AutoModelForSequenceClassification.from_pretrained(model_dir)
        model.eval() # Set default model mode to evaluation - https://huggingface.co/docs/transformers/main_classes/model
        # No manual model.to(device): `device` is never defined in this notebook (its
        # cell is commented out); the Trainer places the model on its own device.

        test_args = TrainingArguments(
            output_dir=  model_fold_dir,
            do_train = False,
            do_predict = True,
            per_device_eval_batch_size = CFG.batch_size,
            dataloader_drop_last = False,
            # fp16 is only requested when a CUDA device is present, so the cell also
            # runs on the TPU/CPU runtimes this notebook targets.
            fp16 = torch.cuda.is_available()
        )

        infer = Trainer(
            model = model,
            args = test_args,
            tokenizer = tokenizer,
            data_collator = train_collator
        )

        preds = infer.predict(tokenized_val_dataset)[0]
        # Flatten so a single-output (n, 1) prediction array fits one DataFrame column.
        train.loc[val_data.index,f"{target}_pred"] = np.asarray(preds).reshape(-1)

        # Release this fold's model before loading the next one.
        model.cpu()
        del infer
        del model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

In [None]:
print(compute_mcrmse((train[['content_pred','wording_pred']].values,train[target_cols].values)))

#### Save train cols output

In [None]:
# Collect the out-of-fold prediction column of every target (named after the target),
# append the student id, and write the table to CSV without the index.
oof_preds = pd.DataFrame({target: train[f"{target}_pred"] for target in target_cols})
oof_preds['student_id'] = train['student_id']
oof_preds.to_csv('debertav3large_ind_oof_preds.csv', index=False)

## Predict

In [None]:
prompts_test = pd.read_csv(DATA_DIR + "prompts_test.csv")
summaries_test = pd.read_csv(DATA_DIR + "summaries_test.csv")

In [None]:
test=pd.merge(prompts_test,summaries_test,on='prompt_id')

In [None]:
# Mirror the training-time preprocessing (lower-casing followed by spell correction) so
# the models score test text in the same form as the text they were trained on.
# Only the text columns are lower-cased, so identifier columns such as student_id
# stay unchanged.
for text_source_col in ['prompt_title', 'prompt_question', 'text']:
    test[text_source_col] = test[text_source_col].map(lambda s: s.lower() if type(s)==str else s)
test['corrected_summary_text'] = test['text'].progress_apply(speller)
test['full_text'] = test['prompt_title'] + sep + test['prompt_question'] + sep + test['corrected_summary_text']

In [None]:
test_dataset = Dataset.from_pandas(test[text_cols])

In [None]:
tokenized_test_dataset = test_dataset.map(lambda x: generate_tokens(x,'test','full_text'),batched=True)

In [None]:
# Test-set inference: every fold model of every target scores the test set, and the
# predictions are stored in test[f"{target}_{fold}"].
for target in target_cols:
    for fold in range(CFG.n_splits):
        # Same directory layout as the training cell: the final model lives in
        # model_dir, and the fold-named sub-directory is the Trainer's output_dir.
        model_dir =  f"{target}/{CFG.name}/fold_{fold}"
        model_fold_dir = os.path.join(model_dir, str(fold))

        model = AutoModelForSequenceClassification.from_pretrained(model_dir)
        model.eval() # Set default model mode to evaluation - https://huggingface.co/docs/transformers/main_classes/model
        # No manual model.to(device): `device` is never defined in this notebook (its
        # cell is commented out); the Trainer places the model on its own device.

        test_args = TrainingArguments(
            output_dir=  model_fold_dir,
            do_train = False,
            do_predict = True,
            per_device_eval_batch_size = CFG.batch_size,
            dataloader_drop_last = False,
            # fp16 is only requested when a CUDA device is present, so the cell also
            # runs on the TPU/CPU runtimes this notebook targets.
            fp16 = torch.cuda.is_available()
        )

        infer = Trainer(
            model = model,
            args = test_args,
            tokenizer = tokenizer,
            data_collator = train_collator
        )

        preds = infer.predict(tokenized_test_dataset)[0]
        # Flatten so a single-output (n, 1) prediction array fits one DataFrame column.
        test[f"{target}_{fold}"] = np.asarray(preds).reshape(-1)

        # Release this fold's model before loading the next one.
        model.cpu()
        del infer
        del model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

In [None]:
## Ensemble: the final prediction of each target is the row-wise mean over its fold models
for target in target_cols:
    test[target] = test.loc[
        :, [f"{target}_{fold}" for fold in range(CFG.n_splits)]
    ].mean(axis=1)

## Submission

In [None]:
# Submission frame: student id first, then 'content' and 'wording' initialised to 0
# and overwritten with the fold-averaged predictions of the two targets.
df_submission = pd.DataFrame(
    {
        'student_id': test['student_id'],
        'content': 0,
        'wording': 0,
    }
)
df_submission[target_cols[0]] = test[target_cols[0]]
df_submission[target_cols[1]] = test[target_cols[1]]
df_submission.to_csv('submission.csv', index=False)

In [None]:
df_submission.head(4)

## Next steps

*  Change the save strategy to `steps` instead of `epoch`.
*  Set the dropout probabilities to 0.0.
*  `full_text_2` = question + title + text helps.
*  Once the best-scoring model is found, retrain it with different random seeds (42, 102).






