In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 5.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 35.6 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 33.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 49.8 MB/s 
[?25hCollecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.5 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  At

In [None]:
from transformers import GPT2LMHeadModel,GPT2TokenizerFast,GPTNeoModel

# Base checkpoint whose tokenizer is loaded below.
MODEL_NAME = 'distilgpt2' # alternatives noted by the author: 'gpt2-medium', 'gpt2-small'

# NOTE(review): the truncation/padding kwargs given here do not appear to enable
# truncation for later encode() calls -- the saved output of the dataset cell still
# shows "Truncation was not explicitly activated". Truncation has to be requested
# on the encode() call itself.
tokenizer = GPT2TokenizerFast.from_pretrained(MODEL_NAME,truncation=True,padding=True)
# Starting from the base weights is disabled; a later cell loads a previously
# saved model from Drive instead:
# model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)

In [None]:
from google.colab import drive
# Mount Google Drive (Colab only); the checkpoints used below are read from and
# written to paths under drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
# To re-train a pre-existing model on more data, load the previously saved model object.
# NOTE: torch.load unpickles the file and can execute arbitrary code while doing so --
# only load checkpoints that come from a trusted source.
model=torch.load('drive/MyDrive/pytorch_hackathon/tagline_generatorv1.pth')

In [None]:
# Display the module tree of the loaded model as the cell output.
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )


In [None]:
# Special tokens: '<pad>' for padding, plus the '<nameinfo>' and '<headline>' markers
# that open the context part and the headline part of every example.
additional_tokens = dict(
    pad_token='<pad>',
    additional_special_tokens=['<nameinfo>', '<headline>'],
)

# Register the tokens with the tokenizer, then resize the model's embedding
# matrix to the enlarged vocabulary:
tokenizer.add_special_tokens(additional_tokens)
model.resize_token_embeddings(len(tokenizer))

# Print the resulting special-token map:
print(tokenizer.special_tokens_map)

{'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<pad>', 'additional_special_tokens': ['<nameinfo>', '<headline>']}


In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset

class DatasetEncoding(Dataset):
  """Fixed-length training examples built from a CSV with 'name' and 'headline' columns.

  Every row is turned into three parallel lists of length `seq_length`:
    tokens   -- context marker + name ids, slogan marker + headline ids + EOS, then padding
    segments -- the context-marker id over the context part, the slogan-marker id elsewhere
    labels   -- the headline ids and EOS; every other position is -100 so that the
                context, the slogan marker and the padding are ignored
  """

  def __init__(self, filename, tokenizer, seq_length=64):
    """Read `filename` and pre-encode all of its rows.

    Args:
      filename: path of the CSV file; it must provide 'name' and 'headline' columns.
      tokenizer: object exposing `encode`, `pad_token_id`, `eos_token_id` and two
        `additional_special_tokens_ids` (context marker first, slogan marker second).
      seq_length: length every encoded example is padded to.
    """
    context_token = tokenizer.additional_special_tokens_ids[0]
    slogan_token = tokenizer.additional_special_tokens_ids[1]
    pad_token = tokenizer.pad_token_id
    eos_token = tokenizer.eos_token_id # signifies the end of string

    # Each half of the sequence gets its own budget; one slot of each half is taken
    # by the marker token and one more slot of the slogan half by EOS.
    context_budget = seq_length // 2 - 1
    slogan_budget = seq_length // 2 - 2

    self.examples = []

    df = pd.read_csv(filename)
    # Iterate over the two columns directly instead of materialising a Series per row.
    for name, headline in zip(df['name'], df['headline']):

      # Truncation is requested explicitly: max_length alone only makes the tokenizer
      # fall back to a default strategy and emit a warning.
      context = [context_token] + tokenizer.encode(str(name), max_length=context_budget, truncation=True)
      slogan = [slogan_token] + tokenizer.encode(str(headline), max_length=slogan_budget, truncation=True) + [eos_token]

      n_pad = seq_length - len(context) - len(slogan)

      # Concatenate the two parts and pad up to seq_length:
      tokens = context + slogan + [pad_token] * n_pad

      # Annotate each position with the segment it belongs to:
      segments = [context_token] * len(context) + [slogan_token] * (seq_length - len(context))

      # Only the headline ids and EOS carry a label; the +1 masks the slogan marker.
      labels = [-100] * (len(context) + 1) + slogan[1:] + [-100] * n_pad

      self.examples.append((tokens, segments, labels))

  def __len__(self):
    """Number of encoded rows."""
    return len(self.examples)

  def __getitem__(self, item):
    """Return example `item` as one tensor stacking (tokens, segments, labels)."""
    return torch.tensor(self.examples[item])


# Encode the CSV data into the (tokens, segments, labels) format used for training:
slogan_dataset = DatasetEncoding('branding_compiled_dataset.csv', tokenizer)
# Sanity check: print the size of the first encoded example.
print(next(iter(slogan_dataset)).size())

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


torch.Size([3, 64])


In [None]:
import math, random
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

# Hold out 10% of the examples for validation. The indices are shuffled with a
# fixed seed, so the split is random but reproducible:

indices = [*range(len(slogan_dataset))]
random.seed(42)
random.shuffle(indices)

split = math.floor(len(slogan_dataset) * 0.1)
val_indices = indices[:split]
train_indices = indices[split:]

train_sampler = SubsetRandomSampler(train_indices)
val_sampler = SubsetRandomSampler(val_indices)

# Both loaders read from the same dataset; the samplers decide which rows each one sees:
train_loader = DataLoader(slogan_dataset, sampler=train_sampler, batch_size=32)
val_loader = DataLoader(slogan_dataset, sampler=val_sampler, batch_size=64)

In [None]:
import numpy as np
from tqdm import tqdm


def fit(model, optimizer, train_dl, val_dl, epochs=1, device=torch.device('cpu')):
  """Train `model` for `epochs` passes over `train_dl`, validating after every pass.

  Each batch is sliced as batch[:, 0, :] (input ids), batch[:, 1, :] (token type
  ids) and batch[:, 2, :] (labels) and moved to `device` before the forward pass.
  The first element of the model output is used as the loss.

  Args:
    model: module called as model(ids, token_type_ids=..., labels=...).
    optimizer: optimizer stepping the model's parameters.
    train_dl: iterable of training batches.
    val_dl: iterable of validation batches.
    epochs: number of passes over `train_dl`.
    device: device every batch is moved to.

  Returns:
    None. Per-epoch training and validation costs are printed.
  """

  def batch_loss(batch):
    # Forward pass for one batch; returns the loss tensor.
    inputs = batch.to(device)
    outputs = model(inputs[:,0,:], token_type_ids=inputs[:,1,:], labels=inputs[:,2,:])
    return outputs[0]

  def weighted_cost(batch_losses, batch_sizes):
    # Mean of the per-batch losses, weighted by the size of each batch.
    return np.sum(np.multiply(batch_losses, batch_sizes)) / sum(batch_sizes)

  for epoch in range(epochs):

    print('--- Starting epoch #{} ---\n\n'.format(epoch))

    # training pass:
    model.train()
    train_losses, train_sizes = [], []

    for xb in tqdm(train_dl, desc="Training"):
      loss = batch_loss(xb)
      train_losses.append(loss.item())
      train_sizes.append(len(xb))

      loss.backward()
      optimizer.step()
      # clear the gradients so they do not accumulate into the next batch
      model.zero_grad()

    train_cost = weighted_cost(train_losses, train_sizes)

    # validation pass (no gradients are tracked):
    model.eval()
    val_losses, val_sizes = [], []

    with torch.no_grad():
      for xb in tqdm(val_dl, desc="Validation"):
        val_losses.append(batch_loss(xb).item())
        val_sizes.append(len(xb))

    val_cost = weighted_cost(val_losses, val_sizes)

    print('\n--- Epoch #{} finished --- Training cost: {} \n Validation cost: {}'.format(epoch, train_cost, val_cost))


In [None]:
from torch.optim import AdamW

# Move the model to the GPU when one is available; otherwise stay on the CPU
# instead of failing on a runtime without CUDA:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Fine-tune GPT2 for 3 epochs.
# NOTE(review): AdamW is used with its default hyper-parameters, and the recorded
# output of this cell shows the validation cost rising every epoch while the
# training cost falls -- consider a smaller learning rate and/or fewer epochs.
optimizer = AdamW(model.parameters())
fit(model, optimizer, train_loader, val_loader, epochs=3, device=device)

--- Starting epoch #0 ---




Training: 100%|██████████| 400/400 [04:16<00:00,  1.56it/s]
Validation: 100%|██████████| 23/23 [00:09<00:00,  2.51it/s]



--- Epoch #0 finished --- Training cost: 1.6407307751933684 
 Validation cost: 3.155403196307975
--- Starting epoch #1 ---




Training: 100%|██████████| 400/400 [04:17<00:00,  1.56it/s]
Validation: 100%|██████████| 23/23 [00:09<00:00,  2.46it/s]



--- Epoch #1 finished --- Training cost: 1.107714400273539 
 Validation cost: 3.526501333881432
--- Starting epoch #2 ---




Training: 100%|██████████| 400/400 [04:17<00:00,  1.55it/s]
Validation: 100%|██████████| 23/23 [00:09<00:00,  2.50it/s]


--- Epoch #2 finished --- Training cost: 0.8498410701154917 
 Validation cost: 3.8414702113245576





In [None]:
# Persist the whole fine-tuned model object to Drive; the inference script further
# down reloads this same file with torch.load.
torch.save(model,'drive/MyDrive/pytorch_hackathon/retrained_tagline_generatorv1_compiled.pth')

In [None]:
# Sampling functions with top k and top p from HuggingFace:

import torch.nn.functional as F
from tqdm import trange


def top_k_top_p_ordering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """ Mask out unlikely tokens with top-k and/or nucleus (top-p) filtering.

        The tensor passed in is modified in place and also returned.

        Args:
            logits: batch size x vocabulary size
            top_k: when > 0, only the k highest-scoring tokens of each row are kept.
            top_p: when > 0.0, each row keeps the highest-scoring tokens up to and
                including the first one whose cumulative probability exceeds top_p.
            filter_value: value written over every removed entry.
        Returns:
            The filtered `logits` tensor.
        Snippet taken from: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
    """
    k = min(top_k, logits.size(-1))  # never ask for more entries than the vocabulary has

    if k > 0:
        # Per-row score of the k-th best token; everything strictly below it is dropped
        kth_best = torch.topk(logits, k).values[..., -1, None]
        logits[logits < kth_best] = filter_value

    if top_p > 0.0:
        ranked_logits, ranked_indices = torch.sort(logits, descending=True)
        cumulative = torch.cumsum(F.softmax(ranked_logits, dim=-1), dim=-1)

        # Mark ranked positions whose cumulative probability is past the threshold
        drop_ranked = cumulative > top_p
        # Shift the mask one step right so the first token past the threshold survives
        drop_ranked[..., 1:] = drop_ranked[..., :-1].clone()
        drop_ranked[..., 0] = 0

        # Map the mask from ranked order back to vocabulary order
        drop_mask = drop_ranked.scatter(dim=1, index=ranked_indices, src=drop_ranked)
        logits[drop_mask] = filter_value
    return logits


# snippet from HuggingFace, adapted to work for contextual separation:
def sample_sequence(model, length, context, segments_tokens=None, num_samples=1, temperature=1, top_k=0, top_p=0.0, repetition_penalty=1.0,):
    """Autoregressively extend `context` by `length` tokens, for `num_samples` samples.

    Args:
        model: callable as model(input_ids=..., token_type_ids=...); the first
            element of its output is read as logits (batch x sequence x vocabulary).
        length: number of tokens to append to the context.
        context: list of token ids used as the prompt for every sample.
        segments_tokens: optional list of segment ids; the first N entries are fed as
            token_type_ids when the running sequence has N tokens, so it should cover
            every position that is fed to the model.
        num_samples: number of sequences generated in parallel from the same prompt.
        temperature: divisor applied to the logits; 0 selects greedy decoding.
        top_k, top_p: forwarded to `top_k_top_p_ordering`.
        repetition_penalty: penalty applied to tokens already present in a sample;
            1.0 disables it.

    Returns:
        Tensor of shape (num_samples, len(context) + length) with prompt and
        generated token ids.
    """
    # Build the inputs on the device the model lives on (CPU when it cannot be determined).
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration, TypeError):
        device = torch.device('cpu')

    context = torch.tensor(context, dtype=torch.long, device=device)
    generated = context.unsqueeze(0).repeat(num_samples, 1)

    with torch.no_grad():
        for _ in trange(length):

            inputs = {'input_ids': generated}
            if segments_tokens is not None:
                segment_ids = torch.tensor(segments_tokens[:generated.shape[1]], dtype=torch.long, device=device)
                inputs['token_type_ids'] = segment_ids.unsqueeze(0).repeat(num_samples, 1)

            outputs = model(**inputs)
            # Only the logits of the last position predict the next token.
            next_token_logits = outputs[0][:, -1, :] / (temperature if temperature > 0 else 1.)

            # repetition penalty from CTRL (https://arxiv.org/abs/1909.05858).
            # Dividing a negative logit would raise it, so negative logits are
            # multiplied instead; either way a seen token becomes less likely.
            for i in range(num_samples):
                for token_id in set(generated[i].tolist()):
                    if next_token_logits[i, token_id] < 0:
                        next_token_logits[i, token_id] *= repetition_penalty
                    else:
                        next_token_logits[i, token_id] /= repetition_penalty

            filtered_logits = top_k_top_p_ordering(next_token_logits, top_k=top_k, top_p=top_p)
            if temperature == 0: # greedy decoding
                next_token = torch.argmax(filtered_logits, dim=-1).unsqueeze(-1)
            else:
                next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
            generated = torch.cat((generated, next_token), dim=1)
    return generated


In [None]:
context = "Tesla, fast luxurious innovation electric cars"

context_tkn = tokenizer.additional_special_tokens_ids[0]
slogan_tkn = tokenizer.additional_special_tokens_ids[1]

# Prompt: the context marker followed by the encoded context text.
input_ids = [context_tkn] + tokenizer.encode(context)

# Segment ids for up to 64 positions: the prompt positions carry the context
# marker id, every later position the slogan marker id.
segments = [slogan_tkn] * 64
segments[:len(input_ids)] = [context_tkn] * len(input_ids)

# Close the prompt with the slogan marker so generation starts at the headline.
input_ids += [slogan_tkn]

# Move the model back to the CPU for inference and switch it to evaluation mode,
# so dropout is disabled no matter which cells ran before this one:
model.to(torch.device('cpu'))
model.eval()

# Generate 5 samples, each extended by 20 tokens
generated = sample_sequence(model, length=20, context=input_ids, segments_tokens=segments, num_samples=5)

print('\n\n--- Generated Marketing statements ---\n')

for g in generated:
  slogan = tokenizer.decode(g.squeeze().tolist())
  # keep only the text between the '<headline>' marker and the first end-of-text token
  slogan = slogan.split('<|endoftext|>')[0].split('<headline>')[1]
  print(slogan)  

100%|██████████| 20/20 [00:06<00:00,  2.99it/s]



--- Generated Marketing statements ---

Who could ask for anything more?
Knowing how you take care of your car.
 7 Star. Carrodisiac.
Talk about business.
The new way of transportation.





inference script:

In [None]:
import torch.nn.functional as F
from tqdm import trange
import torch
from transformers import GPT2TokenizerFast

def top_k_top_p_ordering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """ Mask out unlikely tokens with top-k and/or nucleus (top-p) filtering.

        The tensor passed in is modified in place and also returned.

        Args:
            logits: batch size x vocabulary size
            top_k: when > 0, only the k highest-scoring tokens of each row are kept.
            top_p: when > 0.0, each row keeps the highest-scoring tokens up to and
                including the first one whose cumulative probability exceeds top_p.
            filter_value: value written over every removed entry.
        Returns:
            The filtered `logits` tensor.
        Snippet taken from: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
    """
    k = min(top_k, logits.size(-1))  # never ask for more entries than the vocabulary has

    if k > 0:
        # Per-row score of the k-th best token; everything strictly below it is dropped
        kth_best = torch.topk(logits, k).values[..., -1, None]
        logits[logits < kth_best] = filter_value

    if top_p > 0.0:
        ranked_logits, ranked_indices = torch.sort(logits, descending=True)
        cumulative = torch.cumsum(F.softmax(ranked_logits, dim=-1), dim=-1)

        # Mark ranked positions whose cumulative probability is past the threshold
        drop_ranked = cumulative > top_p
        # Shift the mask one step right so the first token past the threshold survives
        drop_ranked[..., 1:] = drop_ranked[..., :-1].clone()
        drop_ranked[..., 0] = 0

        # Map the mask from ranked order back to vocabulary order
        drop_mask = drop_ranked.scatter(dim=1, index=ranked_indices, src=drop_ranked)
        logits[drop_mask] = filter_value
    return logits


# snippet from HuggingFace, adapted to work for contextual separation:
def sample_sequence(model, length, context, segments_tokens=None, num_samples=1, temperature=1, top_k=0, top_p=0.0, repetition_penalty=1.0,):
    """Autoregressively extend `context` by `length` tokens, for `num_samples` samples.

    Args:
        model: callable as model(input_ids=..., token_type_ids=...); the first
            element of its output is read as logits (batch x sequence x vocabulary).
        length: number of tokens to append to the context.
        context: list of token ids used as the prompt for every sample.
        segments_tokens: optional list of segment ids; the first N entries are fed as
            token_type_ids when the running sequence has N tokens, so it should cover
            every position that is fed to the model.
        num_samples: number of sequences generated in parallel from the same prompt.
        temperature: divisor applied to the logits; 0 selects greedy decoding.
        top_k, top_p: forwarded to `top_k_top_p_ordering`.
        repetition_penalty: penalty applied to tokens already present in a sample;
            1.0 disables it.

    Returns:
        Tensor of shape (num_samples, len(context) + length) with prompt and
        generated token ids.
    """
    # Build the inputs on the device the model lives on (CPU when it cannot be determined).
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration, TypeError):
        device = torch.device('cpu')

    context = torch.tensor(context, dtype=torch.long, device=device)
    generated = context.unsqueeze(0).repeat(num_samples, 1)

    with torch.no_grad():
        for _ in trange(length):

            inputs = {'input_ids': generated}
            if segments_tokens is not None:
                segment_ids = torch.tensor(segments_tokens[:generated.shape[1]], dtype=torch.long, device=device)
                inputs['token_type_ids'] = segment_ids.unsqueeze(0).repeat(num_samples, 1)

            outputs = model(**inputs)
            # Only the logits of the last position predict the next token.
            next_token_logits = outputs[0][:, -1, :] / (temperature if temperature > 0 else 1.)

            # repetition penalty from CTRL (https://arxiv.org/abs/1909.05858).
            # Dividing a negative logit would raise it, so negative logits are
            # multiplied instead; either way a seen token becomes less likely.
            for i in range(num_samples):
                for token_id in set(generated[i].tolist()):
                    if next_token_logits[i, token_id] < 0:
                        next_token_logits[i, token_id] *= repetition_penalty
                    else:
                        next_token_logits[i, token_id] /= repetition_penalty

            filtered_logits = top_k_top_p_ordering(next_token_logits, top_k=top_k, top_p=top_p)
            if temperature == 0: # greedy decoding
                next_token = torch.argmax(filtered_logits, dim=-1).unsqueeze(-1)
            else:
                next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
            generated = torch.cat((generated, next_token), dim=1)
    return generated



# Rebuild the tokenizer from the same base checkpoint that was used for training.
tokenizer = GPT2TokenizerFast.from_pretrained('distilgpt2',truncation=True,padding=True)
# NOTE: torch.load unpickles a complete model object and can execute arbitrary code
# while doing so -- only load checkpoints that come from a trusted source.
model=torch.load('drive/MyDrive/pytorch_hackathon/retrained_tagline_generatorv1_compiled.pth')

# Same padding token and segment markers that were registered before fine-tuning:
extra_tokens = {
    'pad_token': '<pad>',
    'additional_special_tokens': ['<nameinfo>', '<headline>']
}


tokenizer.add_special_tokens(extra_tokens)
# Keep the embedding matrix in step with the tokenizer's vocabulary size.
# presumably a no-op for a checkpoint saved after the training-time resize -- TODO confirm
model.resize_token_embeddings(len(tokenizer))


context = "Tesla, fast luxurious electric cars"

context_tkn = tokenizer.additional_special_tokens_ids[0]
slogan_tkn = tokenizer.additional_special_tokens_ids[1]

# Prompt: the context marker followed by the encoded context text.
input_ids = [context_tkn] + tokenizer.encode(context)

# Segment ids for up to 64 positions: the prompt positions carry the context
# marker id, every later position the slogan marker id.
segments = [slogan_tkn] * 64
segments[:len(input_ids)] = [context_tkn] * len(input_ids)

# Close the prompt with the slogan marker so generation starts at the headline.
input_ids += [slogan_tkn]

# Run inference on the CPU and in evaluation mode, so dropout is disabled
# whatever mode the checkpoint was saved in:
model.to(torch.device('cpu'))
model.eval()

# Generate 5 samples, each extended by 20 tokens
generated = sample_sequence(model, length=20, context=input_ids, segments_tokens=segments, num_samples=5)

for g in generated:
  slogan = tokenizer.decode(g.squeeze().tolist())
  # keep only the text between the '<headline>' marker and the first end-of-text token
  slogan = slogan.split('<|endoftext|>')[0].split('<headline>')[1]
  print(slogan)  

100%|██████████| 20/20 [00:06<00:00,  3.22it/s]

The power of choice.
Giving you the transport you need.
Only 5 Star is the clinic.
Who could ask for anything more?
Emotional luxury.



