In [1]:
# %load_ext autoreload
# %autoreload 2
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

import os
import sys
# Modify path so that we can import local modules into notebook
module_path = os.path.abspath(os.path.join('./utils')) 
sys.path.insert(0, module_path)

from utils.exp_util import load_model, get_tokenizer, init_model, load_config
from utils.data_util import get_datasets
from utils.generate_util import generate_sample, generate_beam_sample

In [2]:
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed(RANDOM_SEED)
torch.random.manual_seed(RANDOM_SEED)
import random
random.seed(RANDOM_SEED)

## Zero-shot Learning

In [8]:
base_model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [13]:
sequences = base_model.generate(do_sample=True,  top_k=50,  max_length = 100,top_p=0.95, num_return_sequences=2)
for i, tokens in enumerate(sequences):

    print(f"\nOUTPUT {i}:\n {tokenizer.decode(tokens, skip_special_tokens=True)}\n")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.



OUTPUT 0:
 The National Park Service has officially closed the West Indian River between Iquique and Champa Bay in the wake of deadly floods which have washed across part of the state.

The National Oceanic and Atmospheric Administration announced that the closure of the river has started and that the flood water from the river has been used to irrigate farmland in Champa Bay.

The agency also said the floodwaters used to irrigate farmland in the Champa Bay area had been removed from the water flow


OUTPUT 1:
 I'll be happy to meet you.


Dear Mr. President,

I hope it's good to hear your opinion. I was going to say that you're in this for a reason. You feel that way when someone's asking for your help, but I hope to hear that from you and my family. I don't know why.

Why are you here? What do you think I need you for? I don't have what you do for, but I understand



## Open-ended generation: Finetuned Model

In [18]:
special_tokens_dict = {'pad_token': '<|pad|>', 'bos_token': '<|startoftext|>', 'eos_token': '<|endoftext|>'}

tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
tokenizer.add_special_tokens(special_tokens_dict)

2

In [20]:
experiment_name = "no_context"
config = load_config(experiment_name)

model = load_model(tokenizer, config, 10)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50259, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50259, bias=False)
)

In [21]:
prompt = torch.tensor(tokenizer.encode(tokenizer.bos_token)).unsqueeze(0)

sequences = model.generate(prompt, do_sample=True,  top_k=50,  max_length = 100,top_p=0.95, num_return_sequences=2)
for i, tokens in enumerate(sequences):

    print(f"\nOUTPUT {i}:\n {tokenizer.decode(tokens, skip_special_tokens=True)}\n")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



OUTPUT 0:
 Pulmonologist and registered dietitian Dr. Pamela Tarrant, MD, discusses the benefits of seafood, along with her two daughters, Zoey and Brianna. Click here to watch.


OUTPUT 1:
 Royal Bank of Canada decreased its stake in shares of UBS Group AG (NYSE:UBS - Get Rating) by 2.1% during the third quarter, according to its most recent disclosure with the SEC. The institutional investor owned 46,334 shares of the financial services provider's stock after selling 4,226 shares during the quarter. Royal Bank [...]



## Generating Descriptions (Given Title)

In [5]:
experiment_name = "desc_target"
desc_config = load_config(experiment_name)

In [7]:
tokenizer = get_tokenizer(desc_config["GPT_SIZE"])
_, _, test_dataset = get_datasets(desc_config, tokenizer)

print(f"{len(test_dataset)} test samples")

10000 articles loaded.
6461 samples after cleaning
646 test samples


In [5]:
model = load_model(tokenizer, desc_config, 5)

In [6]:
device = torch.device("cuda:0") 
model.to(device)
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50259, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50259, bias=False)
)

In [7]:
generate_sample(test_dataset, tokenizer, model, num=3, device=device, top_k=20, top_p=0.8, length=100)

**************************************************
CONTEXT:
 New Indwell affordable housing building 'kind of project we need': Mayor

GENERATED TEXT:
 (CNN) -- The world is watching a man who claims to be God and who says he has the power to stop time and time again. ButWhen I first heard about the new movie, "The Man Who Built the Tower," I had to try it out. It's a very good movie. I like it a lot, but I don't know if it's worth the $20.In his recent book, "The End of Man," philosopher Martin Heidegger
TRUE TARGET:
 There was a time in Shawn Russwurm's life when he was down to his last bit of money and living out of his car. "I was almost facing homelessness and life on the streets," he said. But on Tuesday, the 55-year old Londoner stood before a large group at the grand opening of Indwell's Embassy [...]    

**************************************************
CONTEXT:
 Vinicius es uno de los mejores del mundo en este momento, segun Ancelotti

GENERATED TEXT:
  es el mundo, con uno

## Generating Headlines

In [8]:
experiment_name = "title_target"
title_config = load_config(experiment_name)

_, _, test_title_dataset = get_datasets(title_config, tokenizer)

10000 articles loaded.
6461 samples after cleaning


In [6]:
model = load_model(tokenizer, title_config, 5)

device = torch.device("cuda:0") 
model.to(device)
model.eval()

In [8]:
generate_sample(test_title_dataset, tokenizer, model, num=3, device=device, top_k=20, top_p=0.8, length=15)

**************************************************
CONTEXT:
 There was a time in Shawn Russwurm's life when he was down to his last bit of money and living out of his car. "I was almost facing homelessness and life on the streets," he said. But on Tuesday, the 55-year old Londoner stood before a large group at the grand opening of Indwell's Embassy [...]

GENERATED TEXT:
 A new poll shows that the Canadian public isalmost evenly divided on
TRUE TARGET:
 New Indwell affordable housing building 'kind of project we need': Mayor    

**************************************************
CONTEXT:
 MADRID, 1 mar (Reuters) - El entrenador del Real Madrid, Carlo Ancelotti, dijo que Vinicius Jr es uno de los mejores jugadores del mundo en este momento, anadiendo que no es una preocupacion que el equipo sea tan dependiente de la forma del brasileno esta temporada. Se espera que el futbolista de 22 anos, clave en el doblete Liga-Champions del Real Madrid de la temporada pasada, guie a su equipo a un