In [9]:
# First upload the training and evaluation files to this runtime (Press connect if needed)
!pip install transformers torch



In [10]:
import logging
import math
import os
from dataclasses import dataclass, field
from typing import Optional

from transformers import (
    CONFIG_MAPPING,
    MODEL_WITH_LM_HEAD_MAPPING,
    AutoConfig,
    GPT2LMHeadModel,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    LineByLineTextDataset,
    PreTrainedTokenizer,
    TextDataset,
    Trainer,
    TrainingArguments,
    set_seed,
)

# Logger named after this module, used by the training code below.
logger = logging.getLogger(__name__)

# Config classes known to MODEL_WITH_LM_HEAD_MAPPING, and the `model_type`
# string of each one (used to list the selectable model types, e.g. GPT2).
MODEL_CONFIG_CLASSES = [*MODEL_WITH_LM_HEAD_MAPPING.keys()]
MODEL_TYPES = tuple(
    map(lambda config_cls: config_cls.model_type, MODEL_CONFIG_CLASSES)
)



In [11]:
@dataclass
class ModelArguments:
    """
    Settings that pick the model, config and tokenizer to fine-tune (or to train from scratch).
    """

    # Checkpoint name or path used to initialize the weights; None means "from scratch".
    model_name_or_path: Optional[str] = field(
        default=None,
        metadata=dict(
            help="The model checkpoint for weights initialization. Leave None if you want to train a model from scratch."
        ),
    )

    # Model family to build when no checkpoint is given; one of MODEL_TYPES.
    model_type: Optional[str] = field(
        default=None,
        metadata=dict(
            help="If training from scratch, pass a model type from the list: "
            + ", ".join(MODEL_TYPES)
        ),
    )

    # Directory in which downloaded pretrained files are cached.
    cache_dir: Optional[str] = field(
        default=None,
        metadata=dict(
            help="Where do you want to store the pretrained models downloaded from s3"
        ),
    )


In [12]:
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """

    # Path of the plain-text training file.
    train_data_file: Optional[str] = field(
        default=None, metadata={"help": "The input training data file (a text file)."}
    )
    # Path of the plain-text evaluation file (optional).
    eval_data_file: Optional[str] = field(
        default=None,
        metadata={
            "help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."
        },
    )
    # When True, each line of the file is treated as its own sequence.
    line_by_line: bool = field(
        default=False,
        metadata={
            "help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."
        },
    )

    # When True, use the masked-language-modeling objective.
    mlm: bool = field(
        default=False,
        metadata={
            "help": "Train with masked-language modeling loss instead of language modeling."
        },
    )

    # Sequence length after tokenization; -1 is the "not set" default.
    # The adjacent literals below are concatenated, so each sentence ends with
    # an explicit space to keep the rendered help text readable.
    block_size: int = field(
        default=-1,
        metadata={
            "help": "Optional input sequence length after tokenization. "
            "The training dataset will be truncated in block of this size for training. "
            "Default to the model max input length for single sentence inputs (take into account special tokens)."
        },
    )
    # When True, cached datasets are rebuilt instead of reused.
    overwrite_cache: bool = field(
        default=False,
        metadata={"help": "Overwrite the cached training and evaluation sets"},
    )



In [13]:
# Build the training or evaluation dataset from the configured text file
def get_dataset(
    args: DataTrainingArguments, tokenizer: PreTrainedTokenizer, evaluate=False
):
    """Return a dataset built from the train file, or the eval file when `evaluate` is set.

    A `LineByLineTextDataset` is created when `args.line_by_line` is true;
    otherwise a `TextDataset` is created, which additionally receives
    `args.overwrite_cache`. Both use `args.block_size` as the block size.
    """
    if evaluate:
        source_path = args.eval_data_file
    else:
        source_path = args.train_data_file

    # Guard clause for the contiguous-text variant; line-by-line is handled below.
    if not args.line_by_line:
        return TextDataset(
            tokenizer=tokenizer,
            file_path=source_path,
            block_size=args.block_size,
            overwrite_cache=args.overwrite_cache,
        )

    return LineByLineTextDataset(
        tokenizer=tokenizer, file_path=source_path, block_size=args.block_size
    )


In [14]:
def main():
    """Fine-tune GPT-2 on the genre-tagged story files, save it, and evaluate it.

    The run is configured entirely by the argument objects built at the top of
    the function (model "gpt2", line-by-line datasets, block size 512, output
    directory "story_generator_checkpoint").

    Returns:
        dict: ``{"perplexity": <float>}`` when evaluation ran, an empty dict
        when ``do_eval`` is off, or ``None`` when training was stopped with a
        KeyboardInterrupt (the model and tokenizer are saved before returning).

    Raises:
        ValueError: if evaluation is requested without an evaluation file, or
            if the output directory is non-empty and overwriting is disabled.
    """
    model_args = ModelArguments(model_name_or_path="gpt2", model_type="gpt2")
    data_args = DataTrainingArguments(
        train_data_file="6_genre_clean_training_data.txt",
        eval_data_file="6_genre_eval_data.txt",
        line_by_line=True,
        block_size=512,
        overwrite_cache=True,
    )
    training_args = TrainingArguments(
        output_dir="story_generator_checkpoint",
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        logging_steps=500,
        per_device_train_batch_size=4,
        num_train_epochs=3,
        save_total_limit=1,
        save_steps=1000,
    )

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument."
        )

    # Refuse to clobber an existing, non-empty output directory unless asked to.
    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging: INFO on the main process (local_rank -1 or 0), WARN elsewhere.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed for deterministic training runs
    set_seed(training_args.seed)

    config = AutoConfig.from_pretrained(
        model_args.model_name_or_path, cache_dir=model_args.cache_dir
    )

    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path, cache_dir=model_args.cache_dir
    )

    model = GPT2LMHeadModel.from_pretrained(
        model_args.model_name_or_path,
        from_tf=".ckpt" in model_args.model_name_or_path,
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Sequence-boundary/padding tokens plus one tag token per supported genre.
    special_tokens_dict = {
        "bos_token": "<BOS>",
        "eos_token": "<EOS>",
        "pad_token": "<PAD>",
        "additional_special_tokens": [
            "<superhero>",
            "<action>",
            "<drama>",
            "<thriller>",
            "<horror>",
            "<sci_fi>",
        ],
    }

    tokenizer.add_special_tokens(special_tokens_dict)
    # The embedding matrix must grow to cover the tokens that were just added.
    model.resize_token_embeddings(len(tokenizer))

    # Clamp block_size to the model's position limit; a non-positive value
    # means "use the maximum the model allows".
    max_positions = model.config.max_position_embeddings
    if data_args.block_size <= 0:
        data_args.block_size = max_positions
    else:
        data_args.block_size = min(data_args.block_size, max_positions)

    # Get datasets
    train_dataset = (
        get_dataset(data_args, tokenizer=tokenizer) if training_args.do_train else None
    )
    eval_dataset = (
        get_dataset(data_args, tokenizer=tokenizer, evaluate=True)
        if training_args.do_eval
        else None
    )
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=data_args.mlm,
    )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=None,
    )

    # Training
    try:
        if training_args.do_train:
            # Only a local directory can be resumed from; a hub name such as
            # "gpt2" is not a directory, so this resolves to None.
            checkpoint_dir = (
                model_args.model_name_or_path
                if model_args.model_name_or_path is not None
                and os.path.isdir(model_args.model_name_or_path)
                else None
            )
            # `resume_from_checkpoint` replaces the deprecated `model_path` keyword.
            trainer.train(resume_from_checkpoint=checkpoint_dir)
            trainer.save_model()
            tokenizer.save_pretrained(training_args.output_dir)
    except KeyboardInterrupt:
        # Interrupting the cell stops training; keep whatever was learned so far.
        print("Saving model that was in the middle of training")
        trainer.save_model()
        tokenizer.save_pretrained(training_args.output_dir)
        return

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
        # Only the main process writes the results file.
        if trainer.args.local_rank in [-1, 0]:
            # Make sure the directory exists before opening the file for writing.
            os.makedirs(training_args.output_dir, exist_ok=True)
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results


In [15]:
#!pip uninstall transformers accelerate
#!pip install transformers[torch]


In [16]:
# Press the Run Cell button to the left to start training
# main() builds its own arguments, so no inputs are needed here; its return value is not kept.
if __name__ == "__main__":
    main()
# To stop training and save model, press the same Run Cell button (now, it is the Interrupt Execution button)
# Interrupting raises KeyboardInterrupt, which main() catches to save the model and tokenizer before returning.



Step,Training Loss
500,4.0974


Step,Training Loss
500,4.0974
1000,3.278


In [17]:
# This cell is to style the Google Colab's output properly (Just blindly run this)
from IPython.display import HTML, display

def set_css():
    """Display a <style> block that makes <pre> output wrap instead of overflowing."""
    css_markup = '''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''
    display(HTML(css_markup))


# Re-apply the style before each cell runs.
get_ipython().events.register("pre_run_cell", set_css)

In [31]:
# Run these cells for story generation
from transformers import pipeline, TextGenerationPipeline, GPT2LMHeadModel, AutoTokenizer

# Below, the author's model checkpoint is commented out. You can replace your
# checkpoint with it to test story generation if your checkpoint didn't train
# for long enough.
#checkpoint = "pranavpsv/gpt2-genre-story-generator"
checkpoint = "story_generator_checkpoint"

# Load the weights and the tokenizer from the same checkpoint, then wrap both
# in a text-generation pipeline.
model = GPT2LMHeadModel.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
story_generator = TextGenerationPipeline(
    model=model,
    tokenizer=tokenizer,
)
# The format for input_prompt: "<BOS> <genre> Optional text..."
# Supported genres: superhero, sci_fi, horror, thriller, action, drama

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [39]:
# Horror prompt: only the <BOS> marker and the genre tag, no opening text.
input_prompt = "<BOS> <horror>"
# Sampled generation capped at max_length=100.
story = story_generator(
    input_prompt,
    max_length=100,
    do_sample=True,
    repetition_penalty=1.1,
    temperature=1.2,
    top_p=0.95,
    top_k=50,
)
print("\n Story : \n", story)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



 Story : 
 [{'generated_text': "<BOS> <horror> On a cold winter's evening in February, three strangers are brutally murdered on the street. Soon after their arrival at an abandoned school for gifted children who was once part of another notorious cult; two friends — Drs Nels and Pucket—come upon this as if it has been some kind family reunion event rather than just random shooting deaths that can't be fixed by any sane man (but maybe by Mrs Dummett). One student starts talking to himself about how he did"}]


In [40]:
# Action prompt: only the <BOS> marker and the genre tag, no opening text.
input_prompt = "<BOS> <action>"
# Sampled generation capped at max_length=100.
story = story_generator(
    input_prompt,
    max_length=100,
    do_sample=True,
    repetition_penalty=1.1,
    temperature=1.2,
    top_p=0.95,
    top_k=50,
)
print("\n Story : \n", story)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



 Story : 
 [{'generated_text': '<BOS> <action> In 1960, at their graduation ceremony in Latham County—an affluent rural area of Louisiana with an attractive population as well (including the recently revealed owner), who has run a dairy farm along Big River Highway near New Orleans to raise cattle. As part way from home and not entirely within sight that day on his morning walk across large landings they find what appear like four beautiful women riding horseback over riverbeds into town: Mary Louise McCollum; Mrs., Missy Jones'}]


In [41]:
# Drama prompt: only the <BOS> marker and the genre tag, no opening text.
input_prompt = "<BOS> <drama>"
# Sampled generation capped at max_length=100.
story = story_generator(
    input_prompt,
    max_length=100,
    do_sample=True,
    repetition_penalty=1.1,
    temperature=1.2,
    top_p=0.95,
    top_k=50,
)
print("\n Story : \n", story)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



 Story : 
 [{'generated_text': "<BOS> <drama> In 1942, a newly minted Nazi doctor named Rudolf (Boruch) is trying to help an orphan murdered by German troops after his wife has been raped. In the meantime though there are rumors that Dr Bessie (Ruth Westhoff-Olney), her superior's mistress and also fellow physician who had performed similar services for the wounded widowed woman of death in her home town near Berlin on several occasions when she was pregnant then turned herself over earlier than usual because"}]


In [42]:
# Sci-fi prompt: only the <BOS> marker and the genre tag, no opening text.
input_prompt = "<BOS> <sci_fi>"
# Sampled generation capped at max_length=100.
story = story_generator(
    input_prompt,
    max_length=100,
    do_sample=True,
    repetition_penalty=1.1,
    temperature=1.2,
    top_p=0.95,
    top_k=50,
)
print("\n Story : \n", story)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



 Story : 
 [{'generated_text': "<BOS> <sci_fi> One day in 1970, as his family are preparing to retire before the election night. Joe arrives at a hotel suite and begins thinking about becoming CEO if he succeeds Mr Nixon or not; instead of going forward with another term until there's money running around by him then having nothing more than five seconds after hiring someone otherworldly for such reasons I'm afraid some people will have trouble believing they're in it so good while others try something different! After one person makes out her voice that way"}]


In [43]:
# Thriller prompt: only the <BOS> marker and the genre tag, no opening text.
input_prompt = "<BOS> <thriller>"
# Sampled generation capped at max_length=100.
story = story_generator(
    input_prompt,
    max_length=100,
    do_sample=True,
    repetition_penalty=1.1,
    temperature=1.2,
    top_p=0.95,
    top_k=50,
)
print("\n Story : \n", story)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



 Story : 
 [{'generated_text': '<BOS> <thriller> The story unfolds as both protagonists are faced with the death of their childhood sweetheart, whom they do not know. They must figure out a way to stop this from happening again. While trying in vain for love he learns that some girl has died and was left completely blind by her blindness - leading them back into despair once more along paths they did choose never come across before (in time), culminating at one such night when their car breaks down. After leaving his office on an expressway'}]


In [44]:
# Mount Google Drive at /content/drive so later cells can write under that path.
# The `google.colab` module is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [46]:
# Save the final model after training and evaluation to Google Drive
# Both calls target the same directory so the weights and the tokenizer files stay together.
# NOTE(review): `model` and `tokenizer` are module-level names here; presumably they are the
# objects loaded from `checkpoint` in the story-generation cell — confirm the cell order.
model.save_pretrained("/content/drive/MyDrive/ColabNotebook/Story_generation_model")
tokenizer.save_pretrained("/content/drive/MyDrive/ColabNotebook/Story_generation_model")

('/content/drive/MyDrive/ColabNotebook/Story_generation_model/tokenizer_config.json',
 '/content/drive/MyDrive/ColabNotebook/Story_generation_model/special_tokens_map.json',
 '/content/drive/MyDrive/ColabNotebook/Story_generation_model/vocab.json',
 '/content/drive/MyDrive/ColabNotebook/Story_generation_model/merges.txt',
 '/content/drive/MyDrive/ColabNotebook/Story_generation_model/added_tokens.json',
 '/content/drive/MyDrive/ColabNotebook/Story_generation_model/tokenizer.json')