In [None]:
# First upload the training and evaluation files to this runtime (Press connect if needed)
!pip install transformers torch



In [None]:
import logging
import math
import os
from dataclasses import dataclass, field
from typing import Optional

from transformers import (
    CONFIG_MAPPING,
    MODEL_WITH_LM_HEAD_MAPPING,
    AutoConfig,
    GPT2LMHeadModel,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    LineByLineTextDataset,
    PreTrainedTokenizer,
    TextDataset,
    Trainer,
    TrainingArguments,
    set_seed,
)

# Module-level logger used by main() below
logger = logging.getLogger(__name__)

# Keys of MODEL_WITH_LM_HEAD_MAPPING (the config classes) and the `model_type`
# name each one declares; MODEL_TYPES feeds the `model_type` help text.
MODEL_CONFIG_CLASSES = [*MODEL_WITH_LM_HEAD_MAPPING.keys()]
MODEL_TYPES = tuple(config_cls.model_type for config_cls in MODEL_CONFIG_CLASSES)



In [None]:
@dataclass
class ModelArguments:
    """
    Selects the model, config and tokenizer to fine-tune or to train from scratch.
    """

    # Checkpoint name or path used to initialise the weights (None = from scratch).
    model_name_or_path: Optional[str] = field(
        default=None,
        metadata=dict(
            help="The model checkpoint for weights initialization. Leave None if you want to train a model from scratch."
        ),
    )
    # Model family name; the help text lists every entry of MODEL_TYPES.
    model_type: Optional[str] = field(
        default=None,
        metadata=dict(
            help="If training from scratch, pass a model type from the list: "
            + ", ".join(MODEL_TYPES)
        ),
    )
    # Directory passed as `cache_dir` when loading pretrained files.
    cache_dir: Optional[str] = field(
        default=None,
        metadata=dict(
            help="Where do you want to store the pretrained models downloaded from s3"
        ),
    )


In [None]:
@dataclass
class DataTrainingArguments:
    """
    Arguments describing the data fed to the model for training and evaluation.
    """

    # Path of the plain-text training file.
    train_data_file: Optional[str] = field(
        default=None, metadata={"help": "The input training data file (a text file)."}
    )
    # Path of the plain-text evaluation file (optional).
    eval_data_file: Optional[str] = field(
        default=None,
        metadata={
            "help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."
        },
    )
    # When True, get_dataset() builds a LineByLineTextDataset instead of a TextDataset.
    line_by_line: bool = field(
        default=False,
        metadata={
            "help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."
        },
    )

    # Forwarded to DataCollatorForLanguageModeling(mlm=...).
    mlm: bool = field(
        default=False,
        metadata={
            "help": "Train with masked-language modeling loss instead of language modeling."
        },
    )

    # Values <= 0 are replaced in main() by the model's max_position_embeddings.
    block_size: int = field(
        default=-1,
        metadata={
            # Each fragment ends with a space so the adjacent literals do not
            # run together into "tokenization.The training ...".
            "help": "Optional input sequence length after tokenization. "
            "The training dataset will be truncated in block of this size for training. "
            "Default to the model max input length for single sentence inputs (take into account special tokens)."
        },
    )
    # Only forwarded to TextDataset (the non line-by-line path) in get_dataset().
    overwrite_cache: bool = field(
        default=False,
        metadata={"help": "Overwrite the cached training and evaluation sets"},
    )



In [None]:
# Build the training or evaluation dataset from the configured text file
def get_dataset(
    args: DataTrainingArguments, tokenizer: PreTrainedTokenizer, evaluate=False
):
    """Return the dataset described by ``args``.

    Args:
        args: Data arguments; supplies the file paths, ``line_by_line``,
            ``block_size`` and ``overwrite_cache``.
        tokenizer: Tokenizer handed to the dataset class.
        evaluate: When true, read ``args.eval_data_file`` instead of
            ``args.train_data_file``.

    Returns:
        A ``LineByLineTextDataset`` when ``args.line_by_line`` is set,
        otherwise a ``TextDataset``.
    """
    if evaluate:
        source_path = args.eval_data_file
    else:
        source_path = args.train_data_file

    if not args.line_by_line:
        # Only the block-based dataset receives the cache-overwrite flag.
        return TextDataset(
            tokenizer=tokenizer,
            file_path=source_path,
            block_size=args.block_size,
            overwrite_cache=args.overwrite_cache,
        )

    return LineByLineTextDataset(
        tokenizer=tokenizer, file_path=source_path, block_size=args.block_size
    )


In [None]:
def main():
    """Fine-tune GPT-2 on the genre-tagged story data and report perplexity.

    All arguments are built in code (there is no command-line parsing). The
    function loads the pretrained ``gpt2`` config, tokenizer and weights,
    registers the ``<BOS>``/``<EOS>``/``<PAD>`` and genre special tokens,
    trains with ``Trainer``, saves model and tokenizer to the output
    directory, and finally evaluates on the evaluation file.

    Returns:
        dict: ``{"perplexity": ...}`` when evaluation ran, ``{}`` when
        ``do_eval`` is off, or ``None`` when training was interrupted with
        ``KeyboardInterrupt`` (the partially trained model is saved first).

    Raises:
        ValueError: if evaluation is requested without an evaluation file, or
            if the output directory is non-empty and may not be overwritten.
    """
    model_args = ModelArguments(model_name_or_path="gpt2", model_type="gpt2")
    data_args = DataTrainingArguments(
        train_data_file="6_genre_clean_training_data.txt",
        eval_data_file="6_genre_eval_data.txt",
        line_by_line=True,
        block_size=512,
        overwrite_cache=True,
    )
    training_args = TrainingArguments(
        output_dir="story_generator_checkpoint",
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        logging_steps=500,
        per_device_train_batch_size=4,
        num_train_epochs=3,
        save_total_limit=1,
        save_steps=1000,
    )

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument."
        )

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging: INFO on the main process (local_rank -1 or 0), WARN elsewhere
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed for deterministic training runs
    set_seed(training_args.seed)

    config = AutoConfig.from_pretrained(
        model_args.model_name_or_path, cache_dir=model_args.cache_dir
    )

    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path, cache_dir=model_args.cache_dir
    )

    model = GPT2LMHeadModel.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    special_tokens_dict = {
        "bos_token": "<BOS>",
        "eos_token": "<EOS>",
        "pad_token": "<PAD>",
        "additional_special_tokens": [
            "<superhero>",
            "<action>",
            "<drama>",
            "<thriller>",
            "<horror>",
            "<sci_fi>",
        ],
    }

    # Register the new tokens, then grow the embedding matrix to match the
    # enlarged vocabulary.
    tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    # Keep the model's special-token ids in sync with the tokenizer. Without
    # this the saved checkpoint keeps the pretrained ids, and generation falls
    # back to eos id 50256 (the "Setting `pad_token_id` to `eos_token_id`:50256"
    # message printed by the generation cells) instead of treating <EOS> as
    # the stop token. `generation_config` only exists on newer transformers
    # releases, hence the getattr.
    for token_cfg in (model.config, getattr(model, "generation_config", None)):
        if token_cfg is None:
            continue
        token_cfg.bos_token_id = tokenizer.bos_token_id
        token_cfg.eos_token_id = tokenizer.eos_token_id
        token_cfg.pad_token_id = tokenizer.pad_token_id

    # Clamp block_size to the model's position-embedding limit; a value <= 0
    # means "use the maximum the model allows".
    max_positions = model.config.max_position_embeddings
    if data_args.block_size <= 0:
        data_args.block_size = max_positions
    else:
        data_args.block_size = min(data_args.block_size, max_positions)

    # Get datasets
    train_dataset = (
        get_dataset(data_args, tokenizer=tokenizer) if training_args.do_train else None
    )
    eval_dataset = (
        get_dataset(data_args, tokenizer=tokenizer, evaluate=True)
        if training_args.do_eval
        else None
    )
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=data_args.mlm,
    )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=None,
    )

    # Training
    try:
        if training_args.do_train:
            # Only resume when model_name_or_path is a local directory; a hub
            # name such as "gpt2" yields None and training starts fresh.
            checkpoint_dir = (
                model_args.model_name_or_path
                if model_args.model_name_or_path is not None
                and os.path.isdir(model_args.model_name_or_path)
                else None
            )
            # `resume_from_checkpoint` replaces the deprecated `model_path` keyword.
            trainer.train(resume_from_checkpoint=checkpoint_dir)
            trainer.save_model()
            tokenizer.save_pretrained(training_args.output_dir)
    except KeyboardInterrupt:
        # Interrupting the cell is the documented way to stop early: keep
        # whatever has been trained so far and skip evaluation.
        print("Saving model that was in the middle of training")
        trainer.save_model()
        tokenizer.save_pretrained(training_args.output_dir)
        return

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
        # Only the main process (local_rank -1 or 0) writes the results file.
        if trainer.args.local_rank in [-1, 0]:
            # The directory may not exist yet when training was skipped.
            os.makedirs(training_args.output_dir, exist_ok=True)
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results


In [None]:
#!pip uninstall transformers accelerate
#!pip install transformers[torch]


In [None]:
# Press the Run Cell button to the left to start training.
# main() returns the evaluation results dict; the return value is not captured here.
if __name__ == "__main__":
    main()
# To stop training and save model, press the same Run Cell button (now, it is the Interrupt Execution button).
# main() catches the resulting KeyboardInterrupt and saves the model and tokenizer before returning.



Step,Training Loss
500,4.0232


In [None]:
# This cell is to style the Google Colab's output properly (Just blindly run this)
from IPython.display import HTML, display

def set_css(*_event_args):
    """Emit a <style> block that makes <pre> output wrap long lines.

    Registered below as a ``pre_run_cell`` callback, so it runs before each
    cell. Any positional arguments are accepted and ignored: IPython passes an
    execution-info object to ``pre_run_cell`` callbacks, and a zero-argument
    callback fails on IPython versions that do not adapt the signature.
    Calling ``set_css()`` with no arguments still works.
    """
    display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))


# Run set_css before every cell so each output area gets the wrapping style.
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
# Run these cells for story generation
from transformers import pipeline, TextGenerationPipeline, GPT2LMHeadModel, AutoTokenizer

"""
Below, my model checkpoint is commented out. You can replace your checkpoint
with that to test story generation if your checkpoint didn't train for long enough
"""
#checkpoint = "pranavpsv/gpt2-genre-story-generator"
# Same directory that main() uses as its output_dir, i.e. where it saves the
# fine-tuned model and tokenizer.
checkpoint = "story_generator_checkpoint"

# Load model and tokenizer from the checkpoint and wrap them in a text-generation
# pipeline. These module-level names (`model`, `tokenizer`, `story_generator`)
# are reused by the cells below, including the Google Drive save cell.
model = GPT2LMHeadModel.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
story_generator = TextGenerationPipeline(model=model, tokenizer=tokenizer)
# The format for input_prompt: "<BOS> <genre> Optional text..."
# Supported genres: superhero, sci_fi, horror, thriller, action, drama

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Sample (do_sample=True) a story for the <horror> genre tag; the pipeline's
# return value is kept in `story` and printed by the next cell.
input_prompt = "<BOS> <horror>"
story = story_generator(input_prompt, max_length=400, do_sample=True,
               repetition_penalty=1.1, temperature=1.2,
               top_p=0.95, top_k=50)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
print(story)

[{'generated_text': '<BOS> <horror> The next film in this trilogy, Fafnir and Venerable are locked away on the island of Kamahun by a man called King Jormok. Karmichaelasamy (Vinoy Dutta), who is known only as Sohriyoshi ("The Man"), takes command along with his servants to find out where all their hostages have disappeared off shore just now—but he thinks it\'s nowhere near home: In some corners nearby there remains much mystery about whether they had anything better for them than leaving one person dead after another from malaria; also unknown outside here lurks how long the voyage lasted without someone finding food because life has been lost since then! During an unsuccessful search into that area both families agree but do not know their names or identities until late August through early October 1945 when Rani Gurdonai arrives alone at port while Nariman Singh sets up camp inside the seaway overlooking Kathmandu Bay once again before midnight so everyone can get ready if possible

In [None]:
# Sample (do_sample=True) a story for the <action> genre tag; the pipeline's
# return value is kept in `story` and printed by the next cell.
input_prompt = "<BOS> <action>"
story = story_generator(input_prompt, max_length=400, do_sample=True,
               repetition_penalty=1.1, temperature=1.2,
               top_p=0.95, top_k=50)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
print(story)

[{'generated_text': '<BOS> <action> The film opens in New York City. A young reporter named Mark Thorne (David Crane) goes to a radio broadcast for an expose on how the country revolves, having been forced into political asylum by President John Fink of France after World War I and has lost many friends including members-in prison he helped escape via railroad crossing during this time without any incident since entering US military service as commander from 1935 until 1940 when his family went undersecretary at Rolleo Ghetto Nationalities before returning home unharmed but still lacking funds despite repeated pleas between men not do what they are commanded – write that some illegal immigrants become "the masters". They end up with little power except control over their own destiny within one set system - government while it is owned both privately controlled states like England where everyone was ruled strictly by each other or America through corporate rule which controls everything

In [None]:
# Sample (do_sample=True) a story for the <drama> genre tag; the pipeline's
# return value is kept in `story` and printed by the next cell.
input_prompt = "<BOS> <drama>"
story = story_generator(input_prompt, max_length=400, do_sample=True,
               repetition_penalty=1.1, temperature=1.2,
               top_p=0.95, top_k=50)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
print(story)



In [None]:
# Sample (do_sample=True) a story for the <sci_fi> genre tag; the pipeline's
# return value is kept in `story` and printed by the next cell.
input_prompt = "<BOS> <sci_fi>"
story = story_generator(input_prompt, max_length=400, do_sample=True,
               repetition_penalty=1.1, temperature=1.2,
               top_p=0.95, top_k=50)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
print(story)

[{'generated_text': '<BOS> <sci_fi> Purnima\'s daughter and son Anandaras have two daughters at the same age that are under-led. One of her father has been brought up to marry some wealthy merchant Bhaskaran, who had taken advantage by taking away Amrita Devi (Venu Padtha) without permission from their parents as well; thus she must inherit half NCP funds when they reach this point—in an attempt not only for Chitra but also with hopes in acquiring more rights within Rameshwaram which was on loan till 2014 so he could buy property there if possible through other means." As per instructions given earlier(i), his land would come under control all along after obtaining proper permits first received from Chandniya Vidyarthaja Devsuryayukrishna Pradhaniah (Virishalaya Samani Javed). His new owner should do business in India once a year rather than having it carried out in Bangalore or Delhi daily while receiving such loans from different lenders because no one in Rajaji City can lend anythin

In [None]:
# Sample (do_sample=True) a story for the <thriller> genre tag; the pipeline's
# return value is kept in `story` and printed by the next cell.
input_prompt = "<BOS> <thriller>"
story = story_generator(input_prompt, max_length=400, do_sample=True,
               repetition_penalty=1.1, temperature=1.2,
               top_p=0.95, top_k=50)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
print(story)

[{'generated_text': '<BOS> <thriller> Anastasia is a widower that has to deal with losing her husband, now the leader of Punditry as well. She wants revenge and saves his life when she feels guilty for causing it during The Bride Wars while giving him help. Her first marriage fails after their parents dies in college; but they survive together despite spending time apart (and going alone), having not come out at all before them falling under pressure from friends or even being called gangsters by others along behind bars - leading one girl-friend/puppet whom we eventually fall back into some kind relationship: Sankantu Vadipurramanagalakshavela – also known just as Ranganathacharya Chokshiva ("Rangambam") because he\'s an old friend who got married earlier on instead doing so illegally, becoming involved again with someone else later only months down below due interest." When I think about those two girls coming closer and marrying up close," wrote Maithran Devi in InnaVatni Bhargava S

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Save the final model after training and evaluation to Google Drive
# (the target path is under /content/drive, which the previous cell mounts).
# Note: `model` and `tokenizer` here are the module-level objects loaded from
# `checkpoint` in the story-generation cell; main() keeps its own as locals.
model.save_pretrained("/content/drive/MyDrive/ColabNotebook/Story_generation_model")
tokenizer.save_pretrained("/content/drive/MyDrive/ColabNotebook/Story_generation_model")


('/content/drive/MyDrive/ColabNotebook/Story_generation_model/tokenizer_config.json',
 '/content/drive/MyDrive/ColabNotebook/Story_generation_model/special_tokens_map.json',
 '/content/drive/MyDrive/ColabNotebook/Story_generation_model/vocab.json',
 '/content/drive/MyDrive/ColabNotebook/Story_generation_model/merges.txt',
 '/content/drive/MyDrive/ColabNotebook/Story_generation_model/added_tokens.json',
 '/content/drive/MyDrive/ColabNotebook/Story_generation_model/tokenizer.json')