# GPT-2 Fine-Tuning

In [1]:
# Mount Google Drive at /content/drive. Later cells reach it through the
# relative path ./drive/MyDrive, which assumes the working directory is /content.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Step 2. Model Training

In [1]:
!pip install -q transformers

In [2]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [3]:
# Copy the training corpus from the mounted Drive into the working directory.
!cp ./drive/MyDrive/agribrain/corpus/corpus.txt . 

In [4]:
def load_dataset(file_path, tokenizer, block_size=512):
    """Build the training dataset for a plain-text file.

    Args:
        file_path: Path of the text file to read.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Forwarded to ``TextDataset`` as ``block_size`` (512 by default).

    Returns:
        The constructed ``TextDataset`` instance.
    """
    return TextDataset(tokenizer=tokenizer, file_path=file_path, block_size=block_size)


def load_data_collator(tokenizer, mlm=False):
    """Create the batch collator used during training.

    Args:
        tokenizer: Tokenizer handed to ``DataCollatorForLanguageModeling``.
        mlm: Forwarded as the collator's ``mlm`` flag; ``False`` by default.

    Returns:
        The constructed ``DataCollatorForLanguageModeling`` instance.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)


def train(train_file_path, model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps,
          max_steps=500):
    """Fine-tune a pretrained GPT-2 checkpoint on a plain-text corpus.

    Args:
        train_file_path: Path of the text file used as training data.
        model_name: Name or path of the pretrained GPT-2 checkpoint to start from.
        output_dir: Directory that receives the tokenizer, the checkpoints and
            the final model.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``. Only takes effect
            when ``max_steps`` is not positive (see below).
        save_steps: Number of training steps between checkpoint saves.
        max_steps: Total number of training steps. A positive value overrides
            ``num_train_epochs``; pass ``-1`` to train for ``num_train_epochs``
            instead. Defaults to 500, the value this function previously
            hard-coded.

    Returns:
        None. The fine-tuned model is written to ``output_dir``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # The Trainer below is not handed the tokenizer, so persist it explicitly.
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Stores the not-yet-fine-tuned weights; trainer.save_model() at the end
    # writes the trained model to the same directory.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        max_steps=max_steps,
        # Previously accepted by train() but never forwarded.
        save_steps=save_steps,
        # Keep only the newest checkpoints so a small save_steps cannot fill the disk.
        save_total_limit=2,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [5]:
# Training configuration: adjust these values before running the training cell.
train_file_path = "./corpus.txt"   # corpus copied into the working directory
model_name = 'gpt2'                # pretrained checkpoint to start from
output_dir = './agbrain'           # receives the tokenizer, checkpoints and final model
overwrite_output_dir = True
per_device_train_batch_size = 8
num_train_epochs = 2
# Checkpoint interval in training steps. The earlier value of 5 asked for a
# checkpoint every five steps; 500 keeps intermediate snapshots rare.
save_steps = 500

In [6]:
# Launch fine-tuning with the parameters set in the previous cell.
# Training takes about 30 minutes on Colab.
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)



Step,Training Loss
500,3.8955


In [7]:
!mkdir ./drive/MyDrive/agribrain/gpt2-core/

!cp -r ./agbrain ./drive/MyDrive/agribrain/gpt2-core/

mkdir: cannot create directory ‘./drive/MyDrive/agribrain/gpt2-core/’: File exists


In [8]:
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer

In [9]:
# Load the fine-tuned PyTorch weights from ./agbrain into the TensorFlow GPT-2
# class (from_pt=True), along with the tokenizer saved in the same directory.
model = TFGPT2LMHeadModel.from_pretrained("agbrain", from_pt=True)
tokenizer = GPT2Tokenizer.from_pretrained("agbrain")

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFGPT2LMHeadModel: ['transformer.h.6.attn.masked_bias', 'transformer.h.3.attn.masked_bias', 'transformer.h.7.attn.masked_bias', 'transformer.h.5.attn.masked_bias', 'transformer.h.8.attn.masked_bias', 'transformer.h.1.attn.masked_bias', 'lm_head.weight', 'transformer.h.0.attn.masked_bias', 'transformer.h.9.attn.masked_bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.4.attn.masked_bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.11.attn.masked_bias']
- This IS expected if you are initializing TFGPT2LMHeadModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFGPT2LMHeadModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassifica

In [10]:
# Write the TensorFlow version of the model to ./tf-agbrain.
model.save_pretrained("tf-agbrain")
# NOTE(review): save_pretrained above presumably already writes the config
# file; confirm whether this separate config save is still needed.
model.config.save_pretrained("tf-agbrain")

In [11]:
!mkdir ./finetuned/

In [12]:
# Gather the TensorFlow files and the files from ./agbrain into ./finetuned.
# The *.* glob only matches names that contain a dot, so entries without one
# stay behind in ./agbrain; -f lets same-named files overwrite each other.
# NOTE(review): mv removes the files from ./agbrain, so the cell that loads
# "agbrain" cannot be re-run afterwards; confirm that cp is not wanted here.
!mv -f ./tf-agbrain/* ./finetuned/
!mv -f ./agbrain/*.* ./finetuned/

In [17]:
!mkdir ./drive/MyDrive/agribrain/aicore/
!cp -r ./finetuned ./drive/MyDrive/agribrain/aicore/

In [18]:
# Rename the uploaded folder on Drive from "finetuned" to "agbrain".
# NOTE(review): if .../aicore/agbrain already exists, mv places "finetuned"
# inside it instead of renaming; confirm the behaviour wanted on re-runs.
!mv ./drive/MyDrive/agribrain/aicore/finetuned ./drive/MyDrive/agribrain/aicore/agbrain