# Génération de texte avec GPT-2

Génération conditionnelle de texte à l'aide d'un modèle auto-régressif (GPT/GPT-2) de la bibliothèque Transformers.

## Licence

In [6]:
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## Commençons !

In [None]:
# coding=utf-8

import logging
import random

import numpy as np
import torch
import torch.nn.functional as F
from tqdm import tqdm, trange

!pip install transformers

from transformers import GPT2Config
from transformers import GPT2LMHeadModel, GPT2Tokenizer

Définissons un texte de remplissage pouvant être ajouté devant l'invitation (*prompt*) afin d'aider Transformer-XL et XLNet lorsque l'invitation reçue est courte, tel que mentionné par Aman Rusia dans https://github.com/rusiaaman/XLNet-gen#methodology et dans https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e. Ce notebook n'utilise que GPT-2 : `PADDING_TEXT` n'est lu par aucune cellule et n'est conservé qu'à titre de référence.

In [8]:
# Filler passage intended to be placed in front of short prompts (see the links
# in the markdown cell above).
# NOTE(review): no code visible in this notebook reads PADDING_TEXT;
# generate_text() accepts a `padding_text` argument but does not use it.
PADDING_TEXT = """ In 1991, the remains of Russian Tsar Nicholas II and his family
(except for Alexei and Maria) are discovered.
The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
remainder of the story. 1883 Western Siberia,
a young Grigori Rasputin is asked by his father and a group of men to perform magic.
Rasputin has a vision and denounces one of the men as a horse thief. Although his
father initially slaps him for making such an accusation, Rasputin watches as the
man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
with people, even a bishop, begging for his blessing. <eod> </s> <eos>"""

## Fonctions auxiliaires



In [9]:
def set_seed(seed, n_gpu):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to every generator.
        n_gpu: Number of GPUs in use; the CUDA generators are only seeded
            when this is greater than zero.
    """
    # Same order as always: Python's `random`, then NumPy, then PyTorch (CPU).
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if n_gpu <= 0:
        return
    torch.cuda.manual_seed_all(seed)

La fonction `top_k_top_p_filtering` permet de filtrer une distribution de *logits* en utilisant le filtrage top-k ou le filtrage *nucleus* top-p.

Arguments:
* *logits*: distribution des logits, de forme (taille du lot (*batch size*), taille du vocabulaire)
* top_k > 0 :  garder uniquement les top k jetons ayant la plus forte probabilité (filtrage top-k)
* top_p > 0.0 : garder uniquement les jetons les plus probables, jusqu'à ce que leur probabilité cumulée atteigne top_p (filtrage par *nucleus*)

Le filtrage par *nucleus* est expliqué dans Holtzman et al. (http://arxiv.org/abs/1904.09751)

Pris dans: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317

In [10]:
def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """Filter a distribution of logits with top-k and/or nucleus (top-p) filtering.

    Filtering is done along the last dimension, so both a single distribution
    of shape (vocab_size,) and a batch of shape (batch_size, vocab_size) are
    accepted.

    Args:
        logits: Float tensor of logits. It is modified in place.
        top_k: If > 0, keep only the ``top_k`` largest logits of each
            distribution (ties with the k-th value are kept as well).
        top_p: If > 0.0, keep the most probable tokens until their cumulative
            probability exceeds ``top_p``; the token that crosses the
            threshold is kept too, so at least one token always survives.
        filter_value: Value written over every removed logit.

    Returns:
        The same ``logits`` tensor, with removed entries set to ``filter_value``.
    """
    top_k = min(top_k, logits.size(-1))  # Safety check: cannot keep more tokens than exist
    if top_k > 0:
        # Remove every token whose logit is below the smallest logit of the top-k
        kth_best = torch.topk(logits, top_k)[0][..., -1, None]
        logits[logits < kth_best] = filter_value

    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        # Remove tokens whose cumulative probability is above the threshold...
        sorted_indices_to_remove = cumulative_probs > top_p
        # ...shifted right by one so the first token above the threshold is kept
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = False

        # Map the mask from sorted order back to the original token order.
        # dim=-1 (rather than a hard-coded 1) keeps this valid for 1-D input.
        indices_to_remove = sorted_indices_to_remove.scatter(
            dim=-1, index=sorted_indices, src=sorted_indices_to_remove)
        logits[indices_to_remove] = filter_value
    return logits

## fonction `sample_sequence`

In [12]:
def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=0, top_p=0.0, repetition_penalty=1.0,
                    device='cpu'):
    """Autoregressively generate ``length`` new tokens after ``context``.

    Args:
        model: Language model called as ``model(input_ids=...)``. The first
            element of its output is indexed as ``[:, -1, :]`` to obtain the
            next-token logits (assumed to be (batch, sequence, vocabulary)
            logits, as with GPT2LMHeadModel -- confirm for other models).
        length: Number of tokens to generate.
        context: Sequence of token ids used as the prompt.
        num_samples: Number of independent continuations; the context is
            repeated this many times along the batch dimension.
        temperature: Logits are divided by this value when it is > 0.
            A value of 0 selects greedy (argmax) decoding.
        top_k: Forwarded to ``top_k_top_p_filtering``.
        top_p: Forwarded to ``top_k_top_p_filtering``.
        repetition_penalty: Penalty applied to tokens already present in the
            sequence (CTRL, https://arxiv.org/abs/1909.05858). 1.0 disables it.
        device: Device on which the token tensor is created.

    Returns:
        Long tensor of shape (num_samples, len(context) + length) holding the
        context followed by the generated tokens.
    """
    context = torch.tensor(context, dtype=torch.long, device=device)
    generated = context.unsqueeze(0).repeat(num_samples, 1)
    with torch.no_grad():
        for _ in trange(length):
            # Note: GPT-2 can also reuse 'past' (cached hidden states); the full
            # sequence is re-fed at every step here.
            outputs = model(input_ids=generated)
            next_token_logits = outputs[0][:, -1, :] / (temperature if temperature > 0 else 1.)

            # Repetition penalty from CTRL. Dividing a negative logit would make
            # the token MORE likely, so negative logits are multiplied instead.
            if repetition_penalty != 1.0:
                seen_logits = next_token_logits.gather(1, generated)
                seen_logits = torch.where(seen_logits < 0,
                                          seen_logits * repetition_penalty,
                                          seen_logits / repetition_penalty)
                # Duplicate token ids all carry the same penalized value, so the
                # result does not depend on the scatter write order.
                next_token_logits.scatter_(1, generated, seen_logits)

            filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
            if temperature == 0:  # greedy decoding
                next_token = torch.argmax(filtered_logits, dim=-1).unsqueeze(-1)
            else:
                next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
            generated = torch.cat((generated, next_token), dim=1)
    return generated

# Main function: `generate_text`

In [13]:
def generate_text(raw_text, 
                  prompt="", 
                  padding_text="", 
                  words=50, 
                  num_samples=1, 
                  temperature=1.0, 
                  repetition_penalty=1.0, 
                  top_k=0, 
                  top_p=0.9, 
                  no_cuda=False, 
                  seed=42, 
                  model_path="gpt2-medium"):
    """Load a GPT-2 model and print sampled continuations of ``raw_text``.

    Args:
        raw_text: Text the model continues.
        prompt: Accepted for interface compatibility; currently unused.
        padding_text: Accepted for interface compatibility; currently unused.
        words: Number of tokens to generate (passed as ``length`` to
            ``sample_sequence``).
        num_samples: Number of continuations to generate and print.
        temperature: Sampling temperature (0 means greedy decoding).
        repetition_penalty: Penalty on already generated tokens; 1.0 disables it.
        top_k: Top-k filtering parameter (0 disables it).
        top_p: Nucleus filtering parameter (0.0 disables it).
        no_cuda: When True, run on CPU even if CUDA is available.
        seed: Seed for the random number generators.
        model_path: Model name or directory given to ``from_pretrained``.

    Returns:
        None. The prompt and every generated continuation are printed.
    """
    use_cuda = torch.cuda.is_available() and not no_cuda
    device = torch.device("cuda" if use_cuda else "cpu")
    n_gpu = torch.cuda.device_count() if use_cuda else 0

    # Honor the caller's seed instead of a hard-coded value.
    set_seed(seed, n_gpu)

    tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    model = GPT2LMHeadModel.from_pretrained(model_path)
    model.to(device)
    model.eval()

    context_tokens = tokenizer.encode(raw_text, add_special_tokens=False)
    out = sample_sequence(
        model=model,
        context=context_tokens,
        num_samples=num_samples,
        length=words,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        device=device,
    )
    # Drop the prompt tokens so that only the continuation is decoded.
    out = out[:, len(context_tokens):].tolist()
    print("\n"+ raw_text)
    for o in out:
        # Print inside the loop so every sample is shown, not only the last one.
        print(tokenizer.decode(o, clean_up_tokenization_spaces=True))

In [None]:
# Sample a continuation of this prompt with the default settings (model_path="gpt2-medium").
generate_text("slowly we rounded the corner, exclaiming to the dragon that we could if only we wanted ")

# Fine Tuning

In [15]:
import os
import pickle

from torch.utils.data import DataLoader, Dataset, RandomSampler                                                                                                                                                       

from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,                                                           
                                  GPT2Config, GPT2LMHeadModel, GPT2Tokenizer)


Premièrement, nous allons définir notre propre fonction d'importation de données afin de pouvoir utiliser des textes de notre choix.

In [17]:
class TextDataset(Dataset):
    """Fixed-size blocks of token ids built from a plain-text file.

    The file is tokenized once, cut into consecutive blocks of ``block_size``
    token ids, and the result is pickled next to the source file so that later
    runs can skip tokenization.

    Args:
        tokenizer: Object providing ``tokenize``, ``convert_tokens_to_ids`` and
            ``build_inputs_with_special_tokens``.
        file_path: Path of the UTF-8 text file to load.
        block_size: Number of token ids per example.
    """

    def __init__(self, tokenizer, file_path, block_size):
        assert os.path.isfile(file_path), "Training file not found: {}".format(file_path)
        directory, filename = os.path.split(file_path)
        # The cache key only contains the block size and the file name: delete
        # the cache file when switching tokenizers or editing the text file.
        cached_features_file = os.path.join(directory, 'cached_lm_' + str(block_size) + '_' + filename)

        if os.path.exists(cached_features_file):
            print("Loading features from cached file {}".format(cached_features_file))
            # SECURITY: pickle.load executes arbitrary code from the file; only
            # load cache files that this notebook created itself.
            with open(cached_features_file, 'rb') as handle:
                self.examples = pickle.load(handle)
        else:
            print("Creating features from dataset file at {}".format(directory))

            self.examples = []
            with open(file_path, encoding="utf-8") as f:
                text = f.read()

            tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

            # Cut into blocks of block_size token ids.
            for i in range(0, len(tokenized_text)-block_size+1, block_size):
                self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i:i+block_size]))
            # The trailing partial block is dropped for simplicity (no padding).
            # If your dataset is small, first try to find a bigger one :)
            # Afterwards this can be changed by adding model-specific padding.

            print("Saving features into cached file {}".format(cached_features_file))
            with open(cached_features_file, 'wb') as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        """Return the number of blocks."""
        return len(self.examples)

    def __getitem__(self, item):
        """Return block ``item`` as a tensor of token ids."""
        return torch.tensor(self.examples[item])


def load_and_cache_examples(train_data_file, tokenizer, block_size):
    """Build the ``TextDataset`` for ``train_data_file``.

    Args:
        train_data_file: Path of the text file to load.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Number of token ids per example.

    Returns:
        The ``TextDataset`` built from the file (or from its on-disk cache).
    """
    print("Loading dataset {}".format(train_data_file))
    return TextDataset(tokenizer, file_path=train_data_file, block_size=block_size)


## Routine d'apprentissage

Cette partie est responsable de faire chacune des petites mises à jour du modèle, calculées à partir du jeu de données.

In [21]:
def train(train_dataset, model, tokenizer, batch_size, max_datapoints, gradient_accumulation_steps, learning_rate, 
          weight_decay, adam_epsilon, max_grad_norm, num_train_epochs, warmup_steps, n_gpu, device):
    """Fine-tune the language-model head of ``model`` on ``train_dataset``.

    Args:
        train_dataset: Dataset whose items are tensors of token ids.
        model: Model exposing an ``lm_head`` submodule; called as
            ``model(inputs, labels=labels)`` with the loss as first output.
        tokenizer: Unused; kept so existing callers keep working.
        batch_size: Batch size per GPU.
        max_datapoints: Maximum number of batches processed per epoch.
        gradient_accumulation_steps: Number of batches accumulated per
            optimizer step.
        learning_rate: Peak learning rate for AdamW.
        weight_decay: Weight decay for parameters other than biases and
            LayerNorm weights.
        adam_epsilon: Epsilon for AdamW.
        max_grad_norm: Gradient clipping threshold.
        num_train_epochs: Number of epochs (truncated to an integer).
        warmup_steps: Number of linear warm-up steps of the schedule.
        n_gpu: Number of GPUs; more than one wraps the model in DataParallel.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(global_step, average_loss)``: the number of optimizer steps
        taken and the accumulated loss divided by that number.
    """
    train_batch_size = batch_size * max(1, n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=train_batch_size)

    # Every epoch is capped at max_datapoints batches, so the schedule length
    # must be derived from the capped count, not from the full dataloader.
    batches_per_epoch = min(len(train_dataloader), max_datapoints)
    num_epochs = int(num_train_epochs)
    t_total = batches_per_epoch // gradient_accumulation_steps * num_epochs

    # To save memory and time, train the model head only. This is done on the
    # bare model, before any DataParallel wrapping, because the wrapper does
    # not expose `lm_head`.
    for param in model.parameters():
        param.requires_grad = False

    for param in model.lm_head.parameters():
        param.requires_grad = True

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)

    # Multi-GPU training
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    print("Num examples = ", len(train_dataset), "\nNum Epochs = ", num_train_epochs, 
        "\nInstantaneous batch size per GPU = ", batch_size, 
        "\nTotal train batch size (w. parallel & accumulation) = ", train_batch_size * gradient_accumulation_steps ,
        "\nGradient Accumulation steps = ", gradient_accumulation_steps,
        "\nTotal optimization steps = ", t_total)

    global_step = 0
    tr_loss = 0.0
    model.zero_grad()
    train_iterator = trange(num_epochs, desc="Epoch")
    set_seed(1337, n_gpu)  # seeded here so the training run is reproducible
    model.train()
    for _ in train_iterator:
        # zip() with the range first stops after max_datapoints batches without
        # pulling an extra batch; `total` gives tqdm its length without having
        # to materialize every batch in a list.
        epoch_iterator = tqdm(zip(range(max_datapoints), train_dataloader), total=batches_per_epoch,
                              desc="Iteration", position=0, leave=True)
        for step, batch in epoch_iterator:
            # Language modelling: the inputs are their own labels.
            inputs = batch.to(device)
            labels = inputs
            outputs = model(inputs, labels=labels)
            loss = outputs[0]  # the first output is the loss when labels are given

            if n_gpu > 1:
                loss = loss.mean()  # average the per-GPU losses
            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps

            loss.backward()

            tr_loss += loss.item()
            if (step + 1) % gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

    # Avoid a ZeroDivisionError when there were too few batches for one step.
    return global_step, tr_loss / max(global_step, 1)


## Run the finetuning
This is our main method, like above, but there are two steps, the first being to run the finetuning, and the second to generate the text from that fine-tuned model

In [19]:
def finetune(train_data_file, output_dir, model_path="gpt2-medium", block_size=512, do_lower_case=True,
             batch_size=1, max_datapoints=200, gradient_accumulation_steps=5, learning_rate=3e-4, 
             weight_decay=0.0, adam_epsilon=1e-8, max_grad_norm=1.0, num_train_epochs=1.0, warmup_steps=0):
    """Fine-tune a GPT-2 model on a text file and save it to ``output_dir``.

    Args:
        train_data_file: Path of the training text file.
        output_dir: Directory receiving the model and tokenizer files; it is
            created if missing and existing content may be overwritten.
        model_path: Model name or directory given to ``from_pretrained``.
        block_size: Number of token ids per training example; capped at
            ``tokenizer.max_len_single_sentence``.
        do_lower_case: Forwarded to ``GPT2Tokenizer.from_pretrained``.
        batch_size: Batch size per GPU.
        max_datapoints: Maximum number of batches processed per epoch.
        gradient_accumulation_steps: Batches accumulated per optimizer step.
        learning_rate: Peak learning rate.
        weight_decay: Weight decay.
        adam_epsilon: Epsilon for the optimizer.
        max_grad_norm: Gradient clipping threshold.
        num_train_epochs: Number of training epochs.
        warmup_steps: Number of warm-up steps of the learning-rate schedule.

    Returns:
        None. Progress is printed and the result is written to ``output_dir``.
    """
    if os.path.exists(output_dir) and os.listdir(output_dir):
        print("WARNING: Output directory ({}) already exists and is not empty. Overwriting.".format(output_dir))

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device: {}, n_gpu: {}".format(device, n_gpu))
    set_seed(42, n_gpu)

    config = GPT2Config.from_pretrained(model_path)
    tokenizer = GPT2Tokenizer.from_pretrained(model_path, do_lower_case=do_lower_case)
    block_size = min(block_size, tokenizer.max_len_single_sentence)
    model = GPT2LMHeadModel.from_pretrained(model_path, config=config)
    model.to(device)

    train_dataset = load_and_cache_examples(train_data_file, tokenizer, block_size)
    global_step, tr_loss = train(train_dataset, model, tokenizer, batch_size, max_datapoints, gradient_accumulation_steps, learning_rate, 
                                 weight_decay, adam_epsilon, max_grad_norm, num_train_epochs, warmup_steps, n_gpu, device)
    print(" global_step = {}, average loss = {}".format(global_step, tr_loss))

    # Create the output directory if needed (exist_ok avoids a race between
    # checking for the directory and creating it).
    os.makedirs(output_dir, exist_ok=True)

    print("Saving model checkpoint to {}".format(output_dir))
    # Save the trained model, its configuration and the tokenizer with
    # `save_pretrained()`; they can be reloaded with `from_pretrained()`.
    # Unwrap the model first if it carries a `.module` attribute (e.g. DataParallel).
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)


In [None]:
# Fine-tune on the local text file and write the resulting model and tokenizer to ./output.
finetune(train_data_file="tiny-shakespeare.txt", output_dir="./output")

In [None]:
# Generate from the checkpoint that the finetune() call above saved in ./output.
generate_text("Why, man, he doth bestride the narrow world ", model_path="./output")