# Finetuning a GPT2 model to generate YouTube video description data from YouTube title 

## limitation: the data set is quite small so after three epochs some overfitting behaviours start to emerge

In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)
[K     |████████████████████████████████| 573kB 2.7MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 13.0MB/s 
Collecting tokenizers==0.5.2
[?25l  Downloading https://files.pythonhosted.org/packages/d1/3f/73c881ea4723e43c1e9acf317cf407fab3a278daab3a69c98dcac511c04f/tokenizers-0.5.2-cp36-cp36m-manylinux1_x86_64.whl (3.7MB)
[K     |████████████████████████████████| 3.7MB 19.4MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/98/2c/8df20f3ac6c22ac224fff307ebc102818206c53fc454ecd37d8ac2060df5/sentencepiece-0.1.86-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████

In [0]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
import numpy as np
import os
import random

In [0]:
output_dir = "./drive/My Drive/youtube_gpt2/models"
assert os.path.isdir(output_dir)==True

In [0]:
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

In [5]:
device

'cuda'

In [0]:
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# model = GPT2LMHeadModel.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained(output_dir)
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
model = model.to(device)

In [0]:
# tokenizer.pad_token = tokenizer.eos_token

In [0]:
FILE_PATH = './drive/My Drive/youtube_gpt2/description.txt'
assert os.path.exists(FILE_PATH)==True

In [0]:
from lm import DescriptionData_v2

In [0]:
dataset = DescriptionData_v2(tokenizer= tokenizer, file_path= FILE_PATH )
description_loader = DataLoader(dataset,batch_size=4,shuffle=True)

In [18]:
len(dataset.examples)

21714

In [0]:
BATCH_SIZE = 4
EPOCHS = 2
LEARNING_RATE = 0.00002
WARMUP_STEPS = 10000

In [0]:

model = model.to(device)
model.train()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=-1)
script_count = 0
sum_loss = 0.0
batch_count = 0

In [0]:
for epoch in range(EPOCHS):
    print(f"EPOCH {epoch} started" + '=' * 30)
    for idx,script in enumerate(description_loader):
        outputs = model(script.to(device), labels=script.to(device))
        
        loss, logits = outputs[:2]                        
        loss.backward()
        sum_loss = sum_loss + loss.detach().data
                       
        script_count = script_count + 1

        if script_count == BATCH_SIZE:
            script_count = 0    
            batch_count += 1
            optimizer.step()
            scheduler.step() 
            optimizer.zero_grad()
            model.zero_grad()
            
        if batch_count == 200:
            model.eval()
            print(f"sum loss {sum_loss}")
            sample_outputs = model.generate(
                                    bos_token_id=random.randint(1,30000),
                                    do_sample=True,   
                                    top_k=50, 
                                    max_length = 1000,
                                    top_p=0.95, 
                                    num_return_sequences=1
                                )

            print("Output:\n" + 100 * '-')
            for i, sample_output in enumerate(sample_outputs):
                  print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
            
            batch_count = 0
            sum_loss = 0.0
            model.train()



Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


sum loss 2172.811767578125
Output:
----------------------------------------------------------------------------------------------------
0:  conjunction this game, even if the only reason for being in the house is for a fun game with the guys.\n\nI'm not saying that the show is full of bad choices, I'm saying that the show is filled with good choices from the characters. But those people really helped me make the show worth watching.\n\nDon't Forget to Visit The Official Channel: https://www.youtube.com/user/GameOfTheDay\n\nFollow me on Twitter: @gumshout1\nInstagram: @gumshout1\nTwitter: @GumShout2\nFacebook: GumShout2\nTumblr: GumShout\nSnapchat: GumShout2\nMore GumShout Videos at GUMshout YouTube Channel!\n\nGUMSHOUT  HD\nGumShout - HD \nhttp://gumshout.com/videos/11852426\n\nGUMSHOUT\nhttps://youtube.com/gumshout\nhttps://goo.gl/XvY1hc\nhttps://instagram.com/gumshout\nhttps://twitter.com/gumshout\nhttps://www.reddit.com/r/Gumshout/\n\nCheck out more GUMshout videos with some awesome

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


sum loss 2175.030029296875
Output:
----------------------------------------------------------------------------------------------------
0: object with the hashtag #MOVIEINSTAGRAM. #MOVIEINSTAGRAM. #MOVIEINSTAGRAM#MOVIEFACEBOOK. #MOVIEINSTAGRAM#MOVIEINSTAGRAM#MOVIEFACEBOOK. #MOVIEFACEBOOK. #Dumb #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK #Dumb #MOVIEFACEBOOK. #MOVIEFACEBOOK. #Dumb #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVIEFACEBOOK. #MOVI

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


sum loss 2151.1826171875
Output:
----------------------------------------------------------------------------------------------------
0:  pedest, or any other item that I use when I'm not wearing makeup, or I feel more confident around women.
Crazy High Quality Skin Makeup | Cosmetics - Instagram | Chanel - Cosmetics - Instagram | Chanel - Cosmetics - Instagram | Chanel - Cosmetics - Instagram | Chanel - Cosmetics - Instagram | Chanel - Cosmetics - Instagram | Chanel - Cosmetics - Instagram | Chanel - Cosmetics - Instagram | Chanel - Cosmetics - Instagram | Chanel - Cosmetics - Instagram | Chanel - Cosmetics - Instagram | Chanel - Cosmetics - Instagram | Chanel - Cosmetics - Instagram | Chanel - Cosmetics - Instagram | Chanel - Cosmetics - Instagram | Chanel - Cosmetics - Instagram | Chanel - Cosmetics - Instagram | Chanel - Cosmetics - Instagram | Chanel - Cosmetics - Instagram | Chanel - Cosmetics - Instagram | Chanel - Cosmetics - Instagram | Chanel - Cosmetics - Instagram | Chanel 

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


sum loss 2167.7958984375
Output:
----------------------------------------------------------------------------------------------------
0: Live to join the fun and learn about the game! Subscribe to the league here: http://www.youtube.com/playlist?list=PLJZmCJyHXzdZgJWw5Rc_KmV4UqS0zfB\n\nSubscribe! http://bit.ly/2l8nBbj\n\nGet Official Merch! http://bit.ly/2nqO2nOy
Super Smash Bros. for Wii U – Ultimate Edition | Nintendo just introduced the Wii U Ultimate Edition – for the first time ever! It features a brand new controller and improved gamepad layout, including the new Super Smash Bros. Brawl controller!\n\nFor all of you Nintendo fans out there who have been waiting for some awesome video games for years to come, here are some great videos of the Wii U Ultimate Edition:\nhttps://www.youtube.com/watch?v=3uE5P9gPVHk\nhttps://www.youtube.com/watch?v=RtO7rNfW3Y0\nhttps://www.youtube.com/watch?v=rEoIxFh7dYE\nhttps://www.youtube.com/watch?v=6DzBh3RgC5w&list=PLJZmCJyHXzdZgJWw5Rc_KmV4UqS0zfB\

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


sum loss 2174.31787109375
Output:
----------------------------------------------------------------------------------------------------
0:  membrnophila and Ehrlichia\n• Hernia sinensis, an endemic fungus\n• Gheriya magnifica, an endemic tree in northern Asia\n• Tarragonia annuum, an endemic plant in Japan\n• Tarragonia tarduilla\n• Mycobacterium acutis, an endemic green-leaf bug\n• Mitellidae, an endemic tree in India\n• Hypertricis africanum, an endemic tree in eastern Asia\n• Cetacellobacterium praecum\n• Hypertrophylla pescanaria, an endemic tree in northern Greece\n• Sclerotox-disex, an endemic red plant in Africa\n• Clostridia africanum, an endemic tree in China\n• Daphnia mite, an endemic plant in the northern hemisphere\n• Istamylococcus pachydermata, an endemic green-leaf bug\n• Svalbardia sinensis, an endemic tree in northern China\n• Dacoccus aureus, an endemic tree in northern Europe\n• Mycoplasma, an endemic green-leaf beetle\n• Mina corynsis, an endemic yellow plant in Cen

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


sum loss 2169.78173828125
Output:
----------------------------------------------------------------------------------------------------
0:  pants is definitely a popular option! You'll be able to find one on sale in your local store or online! (Just click on the link below to see a full price on the next page.)\nhttp://bit.ly/1j3qj8Y \nhttps://store.brosoftware.com/\n\nShips with PayPal: \nhttp://bit.ly/S7nYHf \nhttp://bit.ly/4dY3Dv \n\nAbout Brosoftware:\n\n\nThe Bosom brand is known for pioneering the use of innovative high-tech products and products by its employees. We deliver innovative products for our customers in a variety of industries. Additionally, we provide products to more than 60 brands around the world!\nWe rely on you to build the highest quality products for us and our customers and drive our growth. Brosoftware makes the products we use the most informed, the most effective and the most innovative in every situation. We rely on your help to make sure that what we beli

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


sum loss 2155.379150390625
Output:
----------------------------------------------------------------------------------------------------
0:  38.981 + 0.7\n\nConnect with YouTube:\nhttp://youtube.com/c/CJLW9B8Jg\nhttp://youtube.com/c/FVHDsPQ\nhttp://youtube.com/c/2RjDp6xN\n\nLike us on Facebook:\nhttps://www.facebook.com/cjlw9b8jg\nhttp://facebook.com/c/CJLW9B8Jg/videos/50372677709526\nhttp://facebook.com/c/CJLW9B8Jg\n\nCheck out more great videos of JLW9B8Jg:\n\nhttp://www.youtube.com/user/TheFiftyBunNY\nhttp://www.youtube.com/user/MrJLW9B8Jg\nhttp://www.youtube.com/c/FVHDsPQ\n\nLike us on Twitter:\nhttp://twitter.com/CJLW9B8Jg\nhttp://twitter.com/CJLW9B8Jg\nhttp://twitter.com/CJLW9B8Jg\nhttp://twitter.com/CJLW9B8Jg\nhttp://twitter.com/CJLW9B8Jg\nhttp://twitter.com/CJLW9B8Jg\n\nCheck out more awesome videos of JLW9B8Jg:\n\nhttp://www.youtube.com/user/TheFiftyBunNY\nhttp://www.youtube.com/user/MrJLW9B8Jg\nhttp://www.youtube.com/c/FVHDsPQ\n\nGet more JLW9B8Jg:\n\n\nCheck out a variety of 

In [24]:
from transformers import WEIGHTS_NAME, CONFIG_NAME
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)

torch.save(model.state_dict(), output_model_file)
model.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(output_dir)

('./drive/My Drive/youtube_gpt2/models/vocab.json',
 './drive/My Drive/youtube_gpt2/models/merges.txt')

In [0]:
model = GPT2LMHeadModel.from_pretrained(output_dir)
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)

In [0]:
input_ids = tokenizer.encode('John Lewis Christmas Ad 2017', return_tensors='pt')

In [28]:
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

In [29]:
sample_outputs = model.generate(
                        input_ids= input_ids,
                        num_beams= 5,
                        max_length = 1000,
                        top_p=0.85, 
                        num_return_sequences=3
                    )

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(sample_outputs):
      print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


Output:
----------------------------------------------------------------------------------------------------
0: John Lewis Christmas Ad 2017\nhttps://www.youtube.com/playlist?list=PLgXQXQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQxQx