In [1]:
# Show which GPU the runtime provides, with its driver/CUDA version and memory usage.
! nvidia-smi

Mon Jun 20 06:52:03 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P0    51W / 350W |      0MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# ! gdown --id 1PUSJ56k93B42XW9oszcPDefM8HeJ2BOg
# ! pip -q uninstall -y kaggle
# ! pip -q install --upgrade pip
# ! pip -q install kaggle --upgrade
# ! mkdir ~/.kaggle
# ! cp kaggle.json ~/.kaggle/
# ! chmod 600 ~/.kaggle/kaggle.json
# ! kaggle competitions download us-patent-phrase-to-phrase-matching
# ! kaggle datasets download yasufuminakama/cpc-data
# ! kaggle datasets download fankaixie/pppm-abstract

In [4]:
# ! unzip -q /content/us-patent-phrase-to-phrase-matching.zip -d data
# ! rm /content/us-patent-phrase-to-phrase-matching.zip
# ! unzip -q /content/cpc-data.zip -d cpc_data
# ! rm /content/cpc-data.zip
# ! unzip -q /content/pppm-abstract.zip -d data
# ! rm /content/pppm-abstract.zip

In [5]:
# ! pip -q install sentencepiece
# ! pip -q install transformers --upgrade
# ! pip -q install tokenizers --upgrade

## Import Libraries

In [6]:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author:quincy qiang

import os
import random
import warnings
import pandas as pd

import numpy as np
import torch
from transformers import (AutoModelForMaskedLM,
                          AutoTokenizer, LineByLineTextDataset,
                          DataCollatorForLanguageModeling,
                          Trainer, TrainingArguments)

warnings.filterwarnings('ignore')


def seed_torch(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and every CUDA device), exports ``PYTHONHASHSEED`` and configures
    cuDNN for deterministic kernel selection.

    Args:
        seed: Integer seed applied to all generators. Defaults to 42.
    """
    random.seed(seed)
    # Exported for any Python process started after this point; the hash
    # seed of the already-running kernel was fixed at interpreter start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all visible GPUs, not just the current one (safe without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run, which
    # defeats the deterministic flag above unless it is switched off.
    torch.backends.cudnn.benchmark = False


seed_torch(42)



## Generate Pretraining Corpus

In [7]:
# Read the abstracts CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='/content/data/pppm_abstract.csv')

In [8]:
# Discard rows that contain missing values, then renumber the index from 0.
df = df.dropna()
df = df.reset_index(drop=True)
df

Unnamed: 0,abstract
0,The subject matters of the invention are: a cr...
1,The present invention relates to the treatment...
2,The invention relates to a composition compris...
3,The invention relates to an improved process f...
4,The present invention relates to a new method ...
...,...
93657,Individual measuring sensors each have a diffe...
93658,"Vehicle (10, 100) comprising at least three dr..."
93659,A customer loyalty programme operated by an op...
93660,An inflatable pipe tube fabric assembly for G-...


In [9]:
# Write one abstract per line. LineByLineTextDataset splits the corpus file
# with str.splitlines(), so an abstract that itself contains a line-boundary
# character would be cut into several training examples. Replace such
# characters with spaces so every abstract stays on exactly one line.
with open('corpus.txt', 'w', encoding='utf-8') as f:
    for ab in df['abstract']:
        # str() guards against a non-string cell surviving dropna().
        f.write(' '.join(str(ab).splitlines()) + '\n')

## Training Model

In [10]:


# Continue masked-language-model (MLM) pretraining of DeBERTa-v3-large on the
# abstracts written to ./corpus.txt, then save the adapted model.

model_name = 'microsoft/deberta-v3-large'
# The tokenizer and the final model are written to the same directory so the
# result can be loaded from a single path.
final_model_dir = '/content/drive/MyDrive/pretrained_models/microsoft/deberta-v3-large'
# Intermediate Trainer checkpoints go to a separate directory.
checkpoint_dir = '/content/drive/MyDrive/pretrained_models/microsoft/deberta-v3-large-pretrain'
corpus_path = "./corpus.txt"

model = AutoModelForMaskedLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained(final_model_dir)

# One example per corpus line; block_size is the maximum length in tokens.
train_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=corpus_path,
    block_size=256)

# NOTE(review): validation runs on the SAME corpus as training, so the
# reported validation loss is not a held-out estimate. Point this at a
# separate file if a real validation signal is wanted. Because the inputs
# are identical, the already-tokenized dataset is reused instead of
# tokenizing the whole corpus a second time.
valid_dataset = train_dataset

# Dynamic masking: 15% of tokens are selected for MLM each time a batch is built.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

training_args = TrainingArguments(
    output_dir=checkpoint_dir,
    overwrite_output_dir=True,
    num_train_epochs=8,
    # Effective batch size per device = 16 * 2 accumulation steps = 32.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    evaluation_strategy='steps',
    # save_steps is not set, so the Trainer's default checkpoint interval
    # applies; only the two most recent checkpoints are kept on disk.
    save_total_limit=2,
    eval_steps=7000,
    # NOTE(review): these two settings are meant to be used together with
    # load_best_model_at_end=True; with it disabled the model saved below is
    # simply the state after the last training step.
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    load_best_model_at_end=False,
    prediction_loss_only=True,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset)

trainer.train()
trainer.save_model(final_model_dir)


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2ForMaskedLM: ['lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2ForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification mode

Step,Training Loss,Validation Loss
7000,1.4078,1.331807
14000,1.2182,1.140601
21000,1.1059,1.03818


Saving model checkpoint to /content/drive/MyDrive/pretrained_models/microsoft/deberta-v3-large-pretrain/checkpoint-500
Configuration saved in /content/drive/MyDrive/pretrained_models/microsoft/deberta-v3-large-pretrain/checkpoint-500/config.json
Model weights saved in /content/drive/MyDrive/pretrained_models/microsoft/deberta-v3-large-pretrain/checkpoint-500/pytorch_model.bin
Deleting older checkpoint [/content/drive/MyDrive/pretrained_models/microsoft/deberta-v3-large-pretrain/checkpoint-4000] due to args.save_total_limit
Saving model checkpoint to /content/drive/MyDrive/pretrained_models/microsoft/deberta-v3-large-pretrain/checkpoint-1000
Configuration saved in /content/drive/MyDrive/pretrained_models/microsoft/deberta-v3-large-pretrain/checkpoint-1000/config.json
Model weights saved in /content/drive/MyDrive/pretrained_models/microsoft/deberta-v3-large-pretrain/checkpoint-1000/pytorch_model.bin
Deleting older checkpoint [/content/drive/MyDrive/pretrained_models/microsoft/deberta-v3-