# **Training**

In [3]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m106.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.0 tokenizers-0.13.2 transformers-4.26.0


In [16]:
!mkdir ./data && mv ./train.json ./data && mv ./valid.json ./data && mkdir checkpoint

In [7]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling ,TextDataset

In [6]:
# The language-model weights and the tokenizer are both fetched from the same
# pretrained checkpoint name, so it is spelled out once.
_pretrained_name = "gpt2-medium"
model = GPT2LMHeadModel.from_pretrained(_pretrained_name)
tokenizer = GPT2Tokenizer.from_pretrained(_pretrained_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [17]:
# Build the training and validation datasets with identical settings; only the
# input file differs between the two splits.
# NOTE(review): the inputs carry a .json extension but are handed to TextDataset
# as-is -- confirm that treating them as plain text is intended.
_dataset_kwargs = dict(tokenizer=tokenizer, block_size=32)
train_dataset = TextDataset(file_path='./data/train.json', **_dataset_kwargs)
valid_dataset = TextDataset(file_path="./data/valid.json", **_dataset_kwargs)



In [19]:
# Training configuration: evaluate, log and checkpoint once per epoch, keeping
# at most two checkpoints on disk.
training_args = TrainingArguments(
    output_dir="./checkpoint",
    overwrite_output_dir=True,
    num_train_epochs=30,
    save_strategy="epoch",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=3e-4,
    save_total_limit=2,
    # Restore the checkpoint with the lowest validation loss when training
    # finishes, so the weights saved afterwards are not simply the last epoch's.
    # Requires save_strategy == evaluation_strategy, which holds here.
    load_best_model_at_end=True,
)
# NOTE(review): in the recorded run the validation loss rises from the first
# epoch on while the training loss keeps falling -- consider a smaller
# learning_rate and/or fewer epochs.

# mlm=False selects causal (next-token) language modelling rather than masked LM.
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# The model is passed as-is: Trainer moves it to the device chosen by
# TrainingArguments (GPU when available), so no hard-coded .to("cuda") is
# needed and the cell no longer crashes on a machine without CUDA.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=collator,
)

In [None]:
# Fine-tune the model with the configured Trainer. Evaluation, logging and
# checkpointing are driven by the TrainingArguments passed to the Trainer.
trainer.train()

***** Running training *****
  Num examples = 4798
  Num Epochs = 30
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 9000
  Number of trainable parameters = 354823168


Epoch,Training Loss,Validation Loss
1,3.585,4.004187
2,2.4311,4.197192
3,1.4345,4.889631


***** Running Evaluation *****
  Num examples = 450
  Batch size = 16
Saving model checkpoint to ./checkpoint/checkpoint-300
Configuration saved in ./checkpoint/checkpoint-300/config.json
Configuration saved in ./checkpoint/checkpoint-300/generation_config.json
Model weights saved in ./checkpoint/checkpoint-300/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 450
  Batch size = 16
Saving model checkpoint to ./checkpoint/checkpoint-600
Configuration saved in ./checkpoint/checkpoint-600/config.json
Configuration saved in ./checkpoint/checkpoint-600/generation_config.json
Model weights saved in ./checkpoint/checkpoint-600/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 450
  Batch size = 16
Saving model checkpoint to ./checkpoint/checkpoint-900
Configuration saved in ./checkpoint/checkpoint-900/config.json
Configuration saved in ./checkpoint/checkpoint-900/generation_config.json
Model weights saved in ./checkpoint/checkpoint-900/pytorch_model.bin
Deletin

In [None]:
!mkdir ./model_weight && mkdir ./tokenizer_weight

In [None]:
# Export the fine-tuned model and its tokenizer into separate directories so
# they can be reloaded later with from_pretrained / pipeline.
model.save_pretrained("./model_weight")
tokenizer.save_pretrained("./tokenizer_weight")

In [None]:
!mv ./model_weight ./drive/MyDrive/ && mv ./tokenizer_weight ./drive/MyDrive 

# **Inference**

In [None]:
from transformers import pipeline

# Text-generation pipeline backed by the fine-tuned artefacts. The paths point
# at the locations the training section exports to (model_weight and
# tokenizer_weight, moved under ./drive/MyDrive); the previous './model/' and
# './tokenizer' directories are never created by this notebook.
GENERATOR = pipeline(
    'text-generation',
    model='./drive/MyDrive/model_weight',
    tokenizer='./drive/MyDrive/tokenizer_weight',
)

In [None]:
# Prompt the model should continue. It was previously undefined in the
# notebook, so this cell raised NameError on a fresh kernel.
# Placeholder text -- replace with a prompt that matches the training data.
txt = "Once upon a time"

# Beam search with 3 beams, returning 3 candidate continuations; 2-grams may not
# repeat within a candidate, and generation is capped by max_length=50.
output = GENERATOR(
    txt,
    max_length=50,
    num_return_sequences=3,
    num_beams=3,
    no_repeat_ngram_size=2,
    early_stopping=True,
)
# Display the generated candidates as the cell result.
output