# Google Colab Notebook - GPT-2 / aitextgen Training

In [None]:
# Freeze versions of dependencies for now
!pip install transformers==2.9.1
!pip install pytorch-lightning==0.7.6

!pip install -q aitextgen

import logging

# Configure the root logger: INFO level, with each record rendered as
# "MM/DD/YYYY HH:MM:SS — LEVEL — logger name — message".
logging.basicConfig(
    level=logging.INFO,
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s — %(levelname)s — %(name)s — %(message)s",
)

from aitextgen import aitextgen
from aitextgen.colab import mount_gdrive, copy_file_from_gdrive
from aitextgen.TokenDataset import TokenDataset, merge_datasets
from aitextgen.utils import build_gpt2_config
from aitextgen.tokenizers import train_tokenizer

Collecting transformers==2.9.1
[?25l  Downloading https://files.pythonhosted.org/packages/22/97/7db72a0beef1825f82188a4b923e62a146271ac2ced7928baa4d47ef2467/transformers-2.9.1-py3-none-any.whl (641kB)
[K     |████████████████████████████████| 645kB 4.7MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 19.6MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 16.8MB/s 
Collecting tokenizers==0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/14/e5/a26eb4716523808bb0a799fcfdceb6ebf77a18169d9591b2f46a9adb87d9/tokenizers-0.7.0-cp36-cp36m-manylinux1_x86_64.whl (3.8M

11/10/2020 16:57:23 — INFO — transformers.file_utils — PyTorch version 1.7.0+cu101 available.
11/10/2020 16:57:24 — INFO — transformers.file_utils — TensorFlow version 2.3.0 available.


In [None]:
# Show the GPU attached to this runtime (model, driver/CUDA versions, memory in use)
!nvidia-smi

Tue Nov 10 16:57:26 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.32.00    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Mount Google Drive using aitextgen's Colab helper
# (the recorded output reports the mount point as /content/drive)
mount_gdrive()

Mounted at /content/drive


In [None]:
# Name of the training text file; it is copied from Google Drive and then
# passed to the tokenizer and model training calls
file_name = 'labeled_seinfeld_dialogue.txt'

In [None]:
# Copy the training file from the mounted Google Drive
# (presumably into the runtime's working directory, since later cells refer to
# it by bare file name — confirm)
copy_file_from_gdrive(file_name)

In [None]:
# Train a custom tokenizer on the training text.
# vocab_size is passed explicitly so the tokenizer always agrees with the
# vocab_size=5000 given to build_gpt2_config, instead of depending on whatever
# default the installed aitextgen release happens to use.
train_tokenizer(file_name, vocab_size=5000)

11/10/2020 16:57:56 — INFO — aitextgen.tokenizers — Saving aitextgen-vocab.json and aitextgen-merges.txt to the current directory. You will need both files to build the GPT2Tokenizer.


In [None]:
# GPT-2 architecture settings; anything not listed here is left to
# build_gpt2_config. In the resulting config, max_length shows up as
# n_ctx / n_positions and dropout as the attn/embd/resid pdrop values.
config = build_gpt2_config(
    n_layer=8,
    n_head=8,
    max_length=100,
    dropout=0.0,
    vocab_size=5000,
)
config  # last expression: display the resulting GPT2Config

GPT2Config {
  "activation_function": "gelu_new",
  "attn_pdrop": 0.0,
  "bos_token_id": 0,
  "embd_pdrop": 0.0,
  "eos_token_id": 0,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 100,
  "n_embd": 768,
  "n_head": 8,
  "n_layer": 8,
  "n_positions": 100,
  "resid_pdrop": 0.0,
  "summary_activation": null,
  "summary_first_dropout": 0.0,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "vocab_size": 5000
}

In [None]:
# Build the aitextgen wrapper: a GPT-2 model constructed from `config`, paired
# with the custom tokenizer files written by train_tokenizer, placed on the GPU
ai = aitextgen(
    to_gpu=True,
    config=config,
    merges_file="aitextgen-merges.txt",
    vocab_file="aitextgen-vocab.json",
)

11/10/2020 16:58:07 — INFO — aitextgen — Constructing GPT-2 model from provided config.
11/10/2020 16:58:09 — INFO — aitextgen — Using a custom tokenizer.


In [None]:
# Train for 5,000 steps; every 1,000 steps the model is saved and sample text
# is generated.
# line_by_line=False: presumably treats the file as continuous text rather than
# one sample per line — confirm against the aitextgen docs.
# NOTE(review): save_gdrive=False — the recorded output shows checkpoints going
# to /trained_model only; confirm that not copying them to Drive is intended.
ai.train(
    file_name,
    batch_size=100,
    learning_rate=1e-4,
    num_steps=5000,
    save_every=1000,
    generate_every=1000,
    save_gdrive=False,
    line_by_line=False,
)

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=327636.0), HTML(value='')), layout=Layout…

11/10/2020 16:58:23 — INFO — aitextgen.TokenDataset — Encoding 327,636 sets of tokens from labeled_seinfeld_dialogue.txt.





GPU available: True, used: True
11/10/2020 16:58:27 — INFO — lightning — GPU available: True, used: True
No environment variable for node rank defined. Set as 0.
CUDA_VISIBLE_DEVICES: [0]
11/10/2020 16:58:27 — INFO — lightning — CUDA_VISIBLE_DEVICES: [0]


HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=5000.0), HTML(value='')), layout=Layout(d…

[1m1,000 steps reached: saving model to /trained_model[0m
[1m1,000 steps reached: generating sample texts.[0m
 to be a man?


JERRY: Oh, yeah.


KRAMER: All right. 


JERRY: So, how's the ticket?


KRAMER: Well, I'm getting married to the airport.


ELAINE: Uh, what's the matter?


KRAMER: Well, I just think I'm a wonderful guy.


ELAINE: What?


KRAMER: I can't go to the bathroom.


[1m2,000 steps reached: saving model to /trained_model[0m
[1m2,000 steps reached: generating sample texts.[0m
 a lot of problems.


GEORGE:  You know, if you think about it, I take it, I want you to do me a favor.


JERRY:  No, I'm not gonna get the Cadillac.


KRAMER:  Hey.


JERRY: Hey.


KRAMER: Hey, Kramer.


JERRY: Hey, hey. What's with you?


KRAMER: Oh, I think I had a
[1m3,000 steps reached: saving model to /trained_model[0m
[1m3,000 steps reached: generating sample texts.[0m
.


GEORGE:  Here.


KRAMER: Hey, hey, hey. I dig.


ELAINE: I just remembered... uh..


KRAMER: What?


ELAINE:

11/10/2020 17:56:56 — INFO — aitextgen — Saving trained model pytorch_model.bin to /trained_model


.


You got mothers fur coat for pist.


Oh, whoa, do you think?


Hey, let me ask you something, you know any women we should call her.


Really?


I know what's wrong with her.


What?


She probably never started the nurse.


Yeah, she's great. She's just like me, if she's sing in


In [None]:
# Generate and print one sample that continues the given prompt
# (temperature 0.4, top_p 0.9)
ai.generate(
    prompt="ALEX: I want to talk to Jerry and Kramer.",
    n=1,
    batch_size=1,
    top_p=0.9,
    temperature=0.4,
)

[1mALEX: I want to talk to Jerry and Kramer.[0m


KRAMER:  Well, Kramer.


KATIE: Okay listen, I need you to help me move my refrigerator.


JERRY: What d'you do?


KATIE: It's your first time I've ever given your number.


JERRY: It's just such a good idea.


KATIE: Okay.





In [None]:
# Generate one sample and capture it as a list instead of printing it
# NOTE(review): the recorded output further down looks longer than 16 tokens,
# so `max_len` may have no effect here — the length argument may need to be
# spelled `max_length`. Confirm against the aitextgen generate() documentation
# before changing it, since a shorter sample affects the cells that follow.
text = ai.generate(
    prompt="ALEX: I don't understand what you are talking about?",
    n=1,
    batch_size=1,
    max_len=16,
    top_p=0.9,
    temperature=0.8,
    return_as_list=True,
)

In [None]:
# Split the first generated sample into segments on the triple-newline
# separator that appears between dialogue lines in the generated text
gen_list = text[0].split('\n\n\n')

In [None]:
# Show the first two segments of the sample (the prompt line and the first
# reply). Joining a slice instead of indexing gen_list[1] avoids an IndexError
# when the sample contains no separator and gen_list has a single element;
# with two or more segments the printed text is unchanged.
print('\n\n\n'.join(gen_list[:2]))

ALEX: I don't understand what you are talking about? We will be making you!


JERRY: Where do you always?
