ref: https://www.kaggle.com/competitions/nbme-score-clinical-patient-notes/discussion/323095

In [2]:
# The following is necessary if you want to use the fast tokenizer for deberta v2 or v3
# This must be done before importing transformers
import shutil
from pathlib import Path

# Installed transformers package that gets patched, and the dataset holding the replacement files.
transformers_path = Path("/opt/conda/lib/python3.7/site-packages/transformers")
input_dir = Path("../input/deberta-v2-3-fast-tokenizer")
deberta_v2_path = transformers_path / "models" / "deberta_v2"

convert_file = input_dir / "convert_slow_tokenizer.py"
conversion_path = transformers_path / convert_file.name

# Each entry is (replacement source, stale file to delete first, destination passed to shutil.copy).
# The converter is copied into the package directory itself; the tokenizer modules go to explicit paths.
tokenizer_patches = [(convert_file, conversion_path, transformers_path)]
for filename in ('tokenization_deberta_v2.py', 'tokenization_deberta_v2_fast.py'):
    filepath = deberta_v2_path / filename
    tokenizer_patches.append((input_dir / filename, filepath, filepath))

for source_file, stale_file, destination in tokenizer_patches:
    # Remove the shipped version (if any) before dropping in the replacement.
    if stale_file.exists():
        stale_file.unlink()
    shutil.copy(source_file, destination)

In [3]:
%%writefile mlm.py

import argparse
import os
import json
from pathlib import Path

import pandas as pd
from tqdm.auto import tqdm
import torch
from datasets import load_dataset
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoConfig
from transformers import DataCollatorForLanguageModeling, AutoModelForMaskedLM, Trainer
from transformers import TrainingArguments
from transformers.utils import logging
from IPython import embed  # noqa

# Raise transformers' log verbosity to INFO and emit one record per level.
logging.set_verbosity_info()
logger = logging.get_logger(__name__)
logger.info("INFO")
logger.warning("WARN")

# The presence of KAGGLE_URL_BASE in the environment is treated as "running on Kaggle".
KAGGLE_ENV = 'KAGGLE_URL_BASE' in os.environ

print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# All inputs are read relative to INPUT_DIR. On Kaggle, outputs go to the working
# directory; elsewhere they are written next to the inputs.
INPUT_DIR = Path('../input/')
OUTPUT_DIR = Path('') if KAGGLE_ENV else INPUT_DIR
if KAGGLE_ENV:
    # Turn off the Weights & Biases integration on Kaggle.
    os.environ["WANDB_DISABLED"] = "true"


def get_patient_notes_not_used_train():
    """Split the patient notes by whether their `pn_num` occurs in train.csv.

    Reads `patient_notes.csv` and `train.csv` from the competition folder under
    `INPUT_DIR` and prints the shape of the full table and of both splits.

    Returns:
        A tuple `(train_patient_notes, valid_patient_notes)`:
        the first frame holds the notes whose `pn_num` does NOT appear in
        train.csv, the second holds the notes whose `pn_num` does appear there.
        Both frames have a fresh 0..n-1 index.
    """
    competition_dir = INPUT_DIR / 'nbme-score-clinical-patient-notes'

    patient_notes = pd.read_csv(competition_dir / "patient_notes.csv")
    print(patient_notes.shape)
    train = pd.read_csv(competition_dir / 'train.csv')

    # True for every note whose pn_num is referenced by train.csv.
    appears_in_train = patient_notes['pn_num'].isin(train['pn_num'].unique())

    train_patient_notes = patient_notes[~appears_in_train].reset_index(drop=True)
    valid_patient_notes = patient_notes[appears_in_train].reset_index(drop=True)

    for split in (train_patient_notes, valid_patient_notes):
        print(split.shape)
    return train_patient_notes, valid_patient_notes


def tokenize_function(examples):
    """Tokenize the "text" entry of a batch with the module-level `tokenizer`.

    The global `tokenizer` must be assigned before this is called; the script's
    `__main__` section does so before mapping this function over the datasets.
    """
    texts = examples["text"]
    return tokenizer(texts)


def get_tokenizer(args):
    """Load the tokenizer that matches the model selected on the command line.

    The source is resolved the same way the model itself is resolved in the
    `__main__` section: `args.model_name` (a hub id) wins when it is non-empty,
    otherwise `INPUT_DIR / args.model_path` is used. Only then is the source
    checked for "v3" to decide whether the patched `DebertaV2TokenizerFast`
    is needed.

    Previously the "v3" check looked at `args.model_path` alone; because its
    default contains "v3", passing `--model_name microsoft/deberta-large` still
    loaded the DeBERTa-v3 tokenizer, which does not match the model being trained.

    Args:
        args: parsed CLI namespace providing `model_name` and `model_path`.

    Returns:
        The tokenizer instance, created with `trim_offsets=False`.
    """
    if args.model_name:
        label, shown, source = 'model_name', args.model_name, args.model_name
    else:
        label, shown, source = 'model_path', args.model_path, INPUT_DIR / args.model_path

    if 'v3' in str(source):
        # Imported lazily: this module is the patched fast tokenizer for DeBERTa v2/v3.
        from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast
        print('DebertaV2TokenizerFast')
        return DebertaV2TokenizerFast.from_pretrained(source, trim_offsets=False)

    print(label, shown)
    return AutoTokenizer.from_pretrained(source, trim_offsets=False)


def parse_args():
    """Parse the command-line options of the MLM pre-training script.

    `--exp_num` is the only mandatory option; `--debug` and `--param_freeze`
    are boolean switches; everything else has a default.

    Returns:
        The `argparse.Namespace` produced from `sys.argv`.
    """
    cli = argparse.ArgumentParser()

    # (flag, add_argument keyword options), registered in this order.
    option_specs = (
        ("--model_name", dict(type=str, default="")),
        ("--model_path", dict(type=str, default="../input/deberta-v3-large/deberta-v3-large/")),
        ("--seed", dict(type=int, default=0)),
        ("--debug", dict(action='store_true')),
        ("--exp_num", dict(type=str, required=True)),
        ("--param_freeze", dict(action='store_true')),
        ("--num_train_epochs", dict(type=int, default=5)),
        ("--batch_size", dict(type=int, default=8)),
        ("--lr", dict(type=float, default=2e-5)),
        ("--gradient_accumulation_steps", dict(type=int, default=1)),
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)

    return cli.parse_args()


if __name__ == "__main__":
    # Masked-language-model (MLM) fine-tuning on the patient notes:
    #   1. split the notes, 2. dump them to JSON-lines and load them as a dataset,
    #   3. tokenize, 4. train with the HF Trainer, 5. save the resulting model.

    args = parse_args()
    train, valid = get_patient_notes_not_used_train()

    if args.debug:
        # Debug mode: ten notes per split, batch size 1 and a single epoch.
        train = train.iloc[:10, :]
        valid = valid.iloc[:10, :]
        # Fixed: this used to assign the misspelled `args.batch_seize`, so the
        # override never reached the tokenization / training batch size.
        args.batch_size = 1
        args.num_train_epochs = 1

    def get_text(df):
        """Return the `pn_history` texts of `df` that are at least 30 characters long.

        Non-string cells (e.g. NaN from empty CSV fields) are skipped instead of
        raising on `len()`.
        """
        return [
            text for text in tqdm(df['pn_history'])
            if isinstance(text, str) and len(text) >= 30
        ]

    train_text_list = get_text(train)
    valid_text_list = get_text(valid)

    # The corpora are written as temporary JSON-lines files only so that
    # `load_dataset('json', ...)` can read them; they are removed right after.
    mlm_train_json_path = OUTPUT_DIR / 'train_mlm.json'
    mlm_valid_json_path = OUTPUT_DIR / 'valid_mlm.json'

    try:
        for json_path, list_ in zip([mlm_train_json_path, mlm_valid_json_path],
                                    [train_text_list, valid_text_list]):
            with open(str(json_path), 'w') as f:
                for sentence in list_:
                    json.dump({'text': sentence}, f)
                    f.write('\n')

        datasets = load_dataset(
            'json',
            data_files={'train': str(mlm_train_json_path),
                        'valid': str(mlm_valid_json_path)},
            )
    finally:
        # Clean up even when writing or loading fails.
        for json_path in (mlm_train_json_path, mlm_valid_json_path):
            if json_path.is_file():
                json_path.unlink()
    print(datasets["train"][:2])

    # `tokenizer` is deliberately a module-level name: tokenize_function reads it as a global.
    tokenizer = get_tokenizer(args)

    tokenized_datasets = datasets.map(
        tokenize_function,
        batched=True,
        num_proc=1,
        remove_columns=["text"],
        batch_size=args.batch_size)
    # Dynamic masking: 15% of the tokens are selected for the MLM objective in each batch.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

    # A hub id given via --model_name takes precedence over the local --model_path.
    if args.model_name:
        print('model_name:', args.model_name)
        model_source = args.model_name
    else:
        print('model_path:', args.model_path)
        model_source = INPUT_DIR / args.model_path
    config = AutoConfig.from_pretrained(model_source, output_hidden_states=True)

    if 'v3' in str(model_source):
        # Fixed: `model_source` is already fully resolved. Joining INPUT_DIR onto it
        # again turned hub ids such as "microsoft/deberta-v3-large" into a
        # non-existent local path.
        model = transformers.DebertaV2ForMaskedLM.from_pretrained(model_source, config=config)
    else:
        model = AutoModelForMaskedLM.from_pretrained(model_source, config=config)

    if args.param_freeze:
        # if freeze, Write freeze settings here

        # deberta-v3-large
        # model.deberta.embeddings.requires_grad_(False)
        # model.deberta.encoder.layer[:12].requires_grad_(False)

        # deberta-large
        model.deberta.embeddings.requires_grad_(False)
        model.deberta.encoder.layer[:24].requires_grad_(False)

        # Show which parameters are still trainable.
        for name, p in model.named_parameters():
            print(name, p.requires_grad)

    training_args = TrainingArguments(
        output_dir="output-mlm",
        evaluation_strategy="epoch",
        learning_rate=args.lr,
        weight_decay=0.01,
        save_strategy='no',  # no intermediate checkpoints; the final model is saved explicitly below
        per_device_train_batch_size=args.batch_size,
        num_train_epochs=args.num_train_epochs,
        # report_to="wandb",
        run_name=f'output-mlm-{args.exp_num}',
        # logging_dir='./logs',
        lr_scheduler_type='cosine',
        warmup_ratio=0.2,
        # Mixed precision needs a CUDA device; fall back to fp32 when none is available.
        fp16=torch.cuda.is_available(),
        logging_steps=500,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        # Fixed: --seed used to be parsed but never forwarded to the Trainer.
        seed=args.seed,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets['valid'],
        data_collator=data_collator,
        # optimizers=(optimizer, scheduler)
    )

    trainer.train()

    # Name the output after the last component of the model source, e.g.
    # "microsoft/deberta-large" -> "deberta-large" and
    # ".../deberta-v3-large/deberta-v3-large/" -> "deberta-v3-large".
    # This yields the same names as the previous hand-written list, but also labels
    # unlisted models correctly instead of falling through to "deberta-v3-large" or
    # embedding a raw path in the directory name.
    save_name = Path(str(model_source)).name
    trainer.model.save_pretrained(OUTPUT_DIR / f'{args.exp_num}_mlm_{save_name}')



Writing mlm.py


In [4]:
# Smoke-test the generated script: --debug restricts the run to the first 10 notes of each split and one epoch.
!python mlm.py --debug --exp_num 0

tokenizers.__version__: 0.11.6
transformers.__version__: 4.16.2
(42146, 3)
(41146, 3)
(1000, 3)
100%|████████████████████████████████████████| 10/10 [00:00<00:00, 91379.17it/s]
100%|███████████████████████████████████████| 10/10 [00:00<00:00, 120525.98it/s]
Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-666cc1054dba211c/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...
100%|███████████████████████████████████████████| 2/2 [00:00<00:00, 9788.34it/s]
100%|███████████████████████████████████████████| 2/2 [00:00<00:00, 1608.25it/s]
Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-666cc1054dba211c/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.
100%|████████████████████████████████████████████| 2/2 [00:00<00:00, 955.75it/s]
{'text': ["17-year-old male, has come to the student health clinic complaining of heart pounding

In [5]:
ls 

[0m[01;34m0_mlm_deberta-v3-large[0m/  __notebook_source__.ipynb  mlm.py  [01;34moutput-mlm[0m/
