# GPT-2 Fine-Tuning

In [None]:
# Mount Google Drive at /content/drive so later cells can move files onto it
# (google.colab is only importable inside a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Step 2. Model Training

In [None]:
# Install Hugging Face Transformers with the %pip magic so the package lands in
# the environment of the running kernel rather than whatever `pip` is on PATH.
# TODO: pin the exact version used for this run (transformers==X.Y.Z) so the
# notebook stays reproducible.
%pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m94.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m99.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [None]:
def load_dataset(file_path, tokenizer, block_size = 512):
    """Wrap a text file in a ``TextDataset``.

    Args:
        file_path: Path of the text file, forwarded as ``file_path``.
        tokenizer: Tokenizer object, forwarded as ``tokenizer``.
        block_size: Forwarded as ``block_size``; defaults to 512.

    Returns:
        The ``TextDataset`` built from the three arguments.
    """
    dataset_kwargs = {
        "tokenizer": tokenizer,
        "file_path": file_path,
        "block_size": block_size,
    }
    return TextDataset(**dataset_kwargs)


def load_data_collator(tokenizer, mlm = False):
    """Create a ``DataCollatorForLanguageModeling`` for the given tokenizer.

    Args:
        tokenizer: Tokenizer object, forwarded as ``tokenizer``.
        mlm: Forwarded as the collator's ``mlm`` flag; defaults to ``False``.

    Returns:
        The constructed ``DataCollatorForLanguageModeling`` instance.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)


def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps,
          max_steps=1000):
    """Fine-tune a GPT-2 language model on a text file and save the result.

    Args:
        train_file_path: Path of the training text file, passed to ``load_dataset``.
        model_name: Hub id or local path that the tokenizer and model weights
            are loaded from via ``from_pretrained``.
        output_dir: Directory that receives the tokenizer, the checkpoints and
            the final model.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``. Per the
            ``TrainingArguments`` documentation a positive ``max_steps`` takes
            precedence over this value.
        save_steps: Checkpoint interval in steps, forwarded to
            ``TrainingArguments``. Previously this argument was accepted but
            never passed on.
        max_steps: Total number of optimisation steps. Defaults to 1000, the
            value that used to be hard-coded here; pass ``-1`` to let
            ``num_train_epochs`` decide the training length.

    Returns:
        None. The tokenizer and model are written to ``output_dir``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # The Trainer below is not given the tokenizer, so persist it explicitly;
    # later cells reload it from output_dir.
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Snapshot of the starting weights; trainer.save_model() later writes the
    # fine-tuned weights into the same directory.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=save_steps,
        max_steps=max_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [None]:
# Configure git to keep credentials in its on-disk "store" helper, then log in
# to the Hugging Face Hub (the CLI prompts interactively for an access token).
# NOTE(review): the "store" helper keeps the token on disk in plain text;
# acceptable on a throwaway Colab VM, but confirm before reusing elsewhere.
!git config --global credential.helper store
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) Y
Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Create the `agbrain` model repository on the Hub under the logged-in account
# (the CLI asks for confirmation before creating it).
!huggingface-cli repo create agbrain

[90mgit version 2.25.1[0m
[90mgit-lfs/2.9.2 (GitHub; linux amd64; go 1.13.5)[0m

You are about to create [1mbenkimz/agbrain[0m
Proceed? [Y/n] Y

Your repo now lives at:
  [1mhttps://huggingface.co/benkimz/agbrain[0m

You can clone it locally with the command below, and commit/push as usual.

  git clone https://huggingface.co/benkimz/agbrain



In [None]:
# Clone the Hub repository into ./agbrain so trained files can be committed to it.
!git clone https://huggingface.co/benkimz/agbrain

Cloning into 'agbrain'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (3/3), 420 bytes | 420.00 KiB/s, done.


In [None]:
# Training configuration consumed by the train(...) call in the next cell.
# NOTE(review): train_file_path and output_dir are relative, so they depend on
# the kernel's working directory when training runs. The recorded `%cd ..`
# output further down suggests training ran from inside the cloned ./agbrain
# folder, but no cell in this notebook changes into it — confirm and add that
# step so Restart & Run All works.
train_file_path = "../corpus.txt"
# Hub repo id that the tokenizer and starting weights are loaded from.
model_name = 'benkimz/agbrain'
# Tokenizer, checkpoints and final model are written to the current directory.
output_dir = './'
overwrite_output_dir = True
per_device_train_batch_size = 8
# NOTE(review): train() runs with max_steps=1000, which per the
# TrainingArguments docs takes precedence over the epoch count — confirm which
# of the two is meant to control training length.
num_train_epochs = 2
# Checkpoint interval, in steps, handed to train().
save_steps = 100

In [None]:
# It takes about 30 minutes to train in colab.
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/999k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/722 [00:00<?, ?B/s]



Downloading (…)lve/main/config.json:   0%|          | 0.00/907 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]



Step,Training Loss
500,3.6885
1000,3.6372


In [None]:
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer

In [None]:
# Reload the checkpoint saved in the current directory as a TensorFlow model;
# from_pt=True tells Transformers to read the PyTorch weights file. The
# tokenizer saved alongside it is reloaded from the same directory.
model = TFGPT2LMHeadModel.from_pretrained("./", from_pt=True)
tokenizer = GPT2Tokenizer.from_pretrained("./")

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFGPT2LMHeadModel: ['transformer.h.5.attn.masked_bias', 'transformer.h.3.attn.masked_bias', 'transformer.h.4.attn.masked_bias', 'transformer.h.9.attn.masked_bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.0.attn.masked_bias', 'transformer.h.1.attn.masked_bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.7.attn.masked_bias', 'transformer.h.8.attn.masked_bias', 'transformer.h.6.attn.masked_bias', 'lm_head.weight', 'transformer.h.11.attn.masked_bias']
- This IS expected if you are initializing TFGPT2LMHeadModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFGPT2LMHeadModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassifica

In [None]:
# Write the TensorFlow weights next to the PyTorch ones (the later `!ls`
# output lists tf_model.h5 alongside pytorch_model.bin).
model.save_pretrained("./")

In [None]:
# Step out of the repo folder; the recorded output shows this lands in /content.
%cd ..

/content


In [None]:
# Move the local repo folder onto the mounted Drive under agribrain/
# (-f overwrites existing destination files without prompting).
!mv -f ./agbrain ./drive/MyDrive/agribrain/

In [None]:
# Enter the repo folder that now lives on Drive.
%cd ./drive/MyDrive/agribrain/agbrain/

/content/drive/MyDrive/agribrain/agbrain


In [None]:
# List the folder to check that the tokenizer files and both weight files
# (pytorch_model.bin, tf_model.h5) are present.
!ls

agbrain			merges.txt	   special_tokens_map.json  vocab.json
config.json		pytorch_model.bin  tf_model.h5
generation_config.json	README.md	   tokenizer_config.json


In [None]:
# Clone a fresh copy of the Hub repository into ./agbrain.
# NOTE(review): the preceding `!ls` output already lists an `agbrain`
# subfolder, so this cell appears to have run before that listing was
# captured — confirm the intended execution order.
!git clone https://huggingface.co/benkimz/agbrain

Cloning into 'agbrain'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (3/3), 420 bytes | 15.00 KiB/s, done.


In [None]:
# Step up one directory level.
# NOTE(review): the recorded output is .../agribrain/agbrain, which implies the
# kernel was inside the nested clone when this ran, while the following copy
# cell uses paths relative to /content — a `%cd` step appears to be missing
# from the notebook; confirm.
%cd ..

/content/drive/MyDrive/agribrain/agbrain


In [None]:
# Copy the model and tokenizer artifacts from the Drive copy of the repo into
# the local ./agbrain clone. One data-driven loop replaces nine copy-pasted
# shell commands, and shutil.copy raises on a missing file instead of printing
# an error and carrying on with an incomplete repo.
# Paths are relative to the kernel's working directory.
import shutil
from pathlib import Path

ARTIFACT_SRC_DIR = Path("./drive/MyDrive/agribrain/agbrain/agbrain")
ARTIFACT_DST_DIR = Path("./agbrain")
ARTIFACT_NAMES = [
    "config.json",
    "generation_config.json",
    "merges.txt",
    "pytorch_model.bin",
    "README.md",
    "special_tokens_map.json",
    "tf_model.h5",
    "tokenizer_config.json",
    "vocab.json",
]

for artifact_name in ARTIFACT_NAMES:
    shutil.copy(ARTIFACT_SRC_DIR / artifact_name, ARTIFACT_DST_DIR / artifact_name)

In [None]:
# Switch into the local clone (recorded output: /content/agbrain) so the git
# commands that follow operate on it.
%cd ./agbrain/

/content/agbrain


In [None]:
# Set the committer identity used by the git commit below.
# NOTE(review): a personal e-mail address is hardcoded and gets published with
# the notebook — consider reading it from an environment variable or a prompt.
!git config --global user.email benkim3619@gmail.com
!git config --global user.name benkimz

In [None]:
# Make sure the large weight files (*.h5, *.bin) go through Git LFS; the
# recorded output shows both patterns were already tracked in this repo.
!git lfs track "*.h5"
!git lfs track "*.bin"

"*.h5" already supported
"*.bin" already supported


In [None]:
# Stage every file in the clone for the upcoming commit.
!git add . 

In [None]:
# Record the copied model and tokenizer files as a single commit.
!git commit -m "Initial commit for AgriBrain's AI-core, agbrain"

[main 2f7bb82] Initial commit for AgriBrain's AI-core, agbrain
 9 files changed, 100501 insertions(+)
 create mode 100644 README.md
 create mode 100644 config.json
 create mode 100644 generation_config.json
 create mode 100644 merges.txt
 create mode 100644 pytorch_model.bin
 create mode 100644 special_tokens_map.json
 create mode 100644 tf_model.h5
 create mode 100644 tokenizer_config.json
 create mode 100644 vocab.json


In [None]:
# Publish the commit to the Hub. A plain push is enough: the local branch is a
# fast-forward of the freshly cloned remote, and omitting --force means a
# re-run can never silently overwrite history that changed on the Hub.
!git push origin main

Uploading LFS objects: 100% (2/2), 1.0 GB | 26 MB/s, done.
Enumerating objects: 12, done.
Counting objects: 100% (12/12), done.
Delta compression using up to 2 threads
Compressing objects: 100% (11/11), done.
Writing objects: 100% (11/11), 523.76 KiB | 1.15 MiB/s, done.
Total 11 (delta 0), reused 0 (delta 0)
To https://huggingface.co/benkimz/agbrain
   9382e93..2f7bb82  main -> main
