# Star Trek Script Generator

Based on [script_generation.ipynb](https://github.com/cdpierse/script_buddy_v2/blob/master/script_buddy/script_generation.ipynb)

In [1]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup, AutoTokenizer, AutoModelWithLMHead
import numpy as np
import os
import random
from datetime import datetime
from IPython.display import clear_output
from time import sleep
from zipfile import ZipFile
import subprocess
from utils import *

In [2]:
# Relative paths used later in the notebook (models/, scripts/, samples/,
# diagnostics/ and ../input/...) are resolved from this directory.
os.chdir("/kaggle/working") # Make sure we're in base directory

## User Settings

In [3]:
# How many episodes to train on? If 0, assume all episodes
EP_COUNT = 0

# How many full passes over the dataset to make during training
EPOCHS = 1

# Number of scripts whose gradients are accumulated before each optimizer step.
# (The DataLoader itself always yields one script at a time.)
BATCH_SIZE = 1

# Prompt text. Trekbot is meant to start from this when creating a new script
SAMPLE_STRING = "Picard to Riker"

# Maximum length (passed as max_length) of the samples generated during training
SAMPLE_LENGTH = 500

# What text should we train on? Default is Star Trek episodes
# TRAINING_FILE_PATH = os.path.join("..", "input", "trekbot", "film_text.txt")

## Check Input File

Uncomment the below lines to see the first few lines of your input file

In [4]:
# NOTE: TRAINING_FILE_PATH is itself commented out in the user settings cell;
# uncomment it there as well, otherwise these lines raise NameError.
# with open(TRAINING_FILE_PATH, "r") as file:
#     print(file.read()[0:400])

### End User Settings

In [5]:
# Name and location of the pretrained model that training starts from
MODEL_NAME = 'trekbot'
MODEL_DIR = os.path.join("..", "input", "trekbot-model")

# Today's date as YYYY-MM-DD, used to tag the generated script file
FILENAME_SUFFIX = datetime.now().strftime("%Y-%m-%d")

# Where scripts sampled during training are appended
SCRIPT_DIR = "scripts/"
SCRIPT_FILENAME = "scripts-" + FILENAME_SUFFIX + ".txt"

# Where the trained model is saved
OUTPUT_MODEL_DIR = "models/trekbot"
OUTPUT_MODEL_FILENAME = "model.txt"

# Alias kept because later cells refer to the output directory by this name
output_dir = OUTPUT_MODEL_DIR

In [6]:
# Directories the notebook writes into. SCRIPT_DIR ("scripts/") names the same
# directory as 'scripts', which is why the recorded output reports one entry as skipped.
dirs = ['models', 'scripts', SCRIPT_DIR, 'samples', 'diagnostics']

# Setup is presumably provided by the star-import from the local utils module.
Setup.dir_setup(dirs) # Setup directory structure

Created models
Created scripts
Skipped scripts/. It already exists
Created samples
Created diagnostics


In [7]:
# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [8]:
# Load the GPT-2 language-model weights stored under MODEL_DIR; this is the
# starting point that the training loop below continues from.
print(f"Using model {MODEL_NAME} from {MODEL_DIR}")
model = GPT2LMHeadModel.from_pretrained(MODEL_DIR)

Using model trekbot from ../input/trekbot-model


## Load Model

In [9]:
# Move the model's parameters to the selected device ('cuda' when available, else 'cpu').
# The same call is repeated in the optimizer setup cell further down.
model = model.to(device)

## Load Dataset

It's already pickled for fast loading

In [10]:
import pickle

# Location of the pickled dataset (the .txt extension is misleading: it is a pickle).
pickle_path = os.path.join("..", "input", "trekbot", "gpt2_trekbot.txt")

# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# files from a source you trust.
# The context manager closes the file handle as soon as loading finishes
# instead of leaving it open for the lifetime of the kernel.
with open(pickle_path, "rb") as pickle_file:
    dataset = pickle.load(pickle_file)

print(f"Dataset has {len(dataset)} scripts total")

Dataset has 13051 scripts total


## Set episode count

To test that the whole pipeline works, set `EP_COUNT` to a small number so that only the first few episodes are used. With `EP_COUNT = 0` the full dataset is used.

In [11]:
# Optionally keep only the first EP_COUNT items; EP_COUNT == 0 leaves the dataset untouched.
# NOTE(review): relies on the unpickled dataset object supporting slice indexing — confirm.
if EP_COUNT != 0:
    dataset = dataset[:EP_COUNT]

In [12]:
# Feed the model one script at a time, in a freshly shuffled order each epoch
script_loader = DataLoader(dataset, shuffle=True, batch_size=1)

# Report how much of the dataset is in play
print(
    "Loaded all scripts from dataset"
    if EP_COUNT == 0
    else f"Loaded {EP_COUNT} scripts from dataset"
)

Loaded all scripts from dataset


In [13]:
# Learning rate handed to AdamW (the scheduler scales it during warmup)
LEARNING_RATE = 0.00006
# Number of scheduler steps over which the learning rate ramps up from 0
WARMUP_STEPS = 10000

In [14]:
# Put the model on the training device and enable training-mode behaviour
model = model.to(device)
model.train()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# The training loop takes one optimizer/scheduler step per BATCH_SIZE scripts
# and keeps counting across epochs, so this is the number of scheduler steps
# it will make in total.
# Passing -1 here (as before) makes the linear schedule clamp the learning rate
# to 0 as soon as warmup ends, so every later step would train with LR 0.
total_training_steps = max(1, (len(dataset) * EPOCHS) // BATCH_SIZE)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=WARMUP_STEPS,
    num_training_steps=total_training_steps,
)

# Running counters updated by the training loop
script_count = 0  # scripts seen since the last optimizer step
sum_loss = 0.0    # loss accumulated since the last generated sample
batch_count = 0   # optimizer steps since the last generated sample

## Train Model

In [15]:
# Number of items in the dataset; used for the sample-count estimate and the
# progress display in the training loop.
total = len(dataset) # number of items in dataset

In [16]:
from math import floor

# A sample script is generated every SAMPLE_INTERVAL optimizer steps
SAMPLE_INTERVAL = 2000

# The training loop takes one optimizer step per BATCH_SIZE scripts and does not
# reset its counters between epochs, so the expected number of samples depends
# on EPOCHS and BATCH_SIZE as well as on the dataset size.
total_optimizer_steps = (total * EPOCHS) // BATCH_SIZE
total_sample_count = floor(total_optimizer_steps / SAMPLE_INTERVAL)
print(f"Total samples: {total_sample_count}")

Total samples: 6


In [17]:
# Wall-clock time (HH:MM:SS) at which training begins; reported again once it ends
start_time = f"{datetime.now():%H:%M:%S}"
print("Start: " + start_time)

Start: 00:39:49


In [18]:
# Setup diagnostics
losses = []
loss_log = "loss_rate.csv"

Diag.setup(loss_log)

Creating new log at /kaggle/working/diagnostics/loss_rate.csv


In [19]:
# Tokenizer stored alongside the pretrained model; used to decode generated
# token ids back into text and saved next to the trained model later on.
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_DIR)

In [20]:
# Switch into the scripts directory so the training loop's relative
# SCRIPT_FILENAME is written there. This changes kernel-wide state: re-running
# this cell without first returning to the base directory will fail.
os.chdir(SCRIPT_DIR)

In [21]:
# %%timeit
# Fine-tune the model: one script per forward/backward pass, one optimizer step
# per BATCH_SIZE scripts, and one generated sample every SAMPLE_INTERVAL steps.
#
# NOTE: do not run this cell under %%timeit. The magic executes the cell body as
# a function, which turns the module-level counters reassigned below (sum_loss,
# script_count, batch_count) into locals and raises UnboundLocalError.
for epoch in range(EPOCHS):
    print(f"EPOCH {epoch} started" + '=' * 30)
    for idx, script in enumerate(script_loader):
        # Move the batch to the device once and use it as both input and labels
        script = script.to(device)
        outputs = model(script, labels=script)

        # With labels supplied, the first output is the language-modelling loss
        loss = outputs[0]
        timestamp = datetime.now().strftime("%H:%M:%S")

        loss.backward()

        # .item() gives a plain Python float, so the running total and the
        # `losses` history do not keep tensors alive on the GPU
        loss_value = loss.item()
        sum_loss += loss_value

        # Gradient accumulation: step only after BATCH_SIZE scripts
        script_count += 1
        if script_count == BATCH_SIZE:
            script_count = 0
            batch_count += 1
            optimizer.step()
            scheduler.step()
            # Every model parameter is owned by the optimizer, so this clears all gradients
            optimizer.zero_grad()

        sample_due = batch_count == SAMPLE_INTERVAL
        append_string = f"\t| Appending script to {SCRIPT_FILENAME}..." if sample_due else ''

        # Update output display
        clear_output(wait=True) # Clear and update display, otherwise endless scroll
        position = idx + 1  # 1-based so the final script reads total/total
        percent = round(100 * position / total, 3)  # round last to avoid float noise
        rounded_loss = round(loss_value, 4)
        rounded_sum_loss = round(sum_loss, 2)
        print(f"{timestamp}: Processing {position}/{total} \t {percent}% \tLoss: {rounded_loss} | Sum loss: {rounded_sum_loss} {append_string}")

        if sample_due:
            print("Saving script")
            model.eval()
            losses.append(sum_loss)

            # Start the sample from the user-configured prompt so SAMPLE_STRING
            # actually controls generation (an arbitrary random token id was
            # used before, leaving the setting without effect).
            prompt_ids = torch.tensor([tokenizer.encode(SAMPLE_STRING)], device=device)
            sample_outputs = model.generate(
                input_ids=prompt_ids,
                do_sample=True,
                top_k=50,
                max_length=SAMPLE_LENGTH,
                top_p=0.95,
                num_return_sequences=1,
            )

            # Append every sample to the running script file, separated by a rule
            with open(SCRIPT_FILENAME, "a") as file:
                for sample_output in sample_outputs:
                    file.write("\n\n")
                    file.write("*" * 80)
                    file.write("\n\n")
                    file.write(tokenizer.decode(sample_output, skip_special_tokens=True))
                    print(f"Script appended to {SCRIPT_FILENAME}")

            # Start a fresh sampling window and resume training
            batch_count = 0
            sum_loss = 0.0
            model.train()



UnboundLocalError: local variable 'sum_loss' referenced before assignment

In [None]:
# Record when training finished and show it next to the start time
end_time = f"{datetime.now():%H:%M:%S}"
print(f"Start: {start_time}\nEnd: {end_time}")

## Save Trained Model

In [None]:
from transformers import WEIGHTS_NAME, CONFIG_NAME

# Return to the base directory (the training section changed into scripts/)
os.chdir("/kaggle/working")
# exist_ok lets this cell be re-run without failing on the existing directory
os.makedirs(OUTPUT_MODEL_DIR, exist_ok=True)

# Use the file names from_pretrained() looks for, so the directory can be
# loaded back directly in the test section below
output_model_file = os.path.join(OUTPUT_MODEL_DIR, WEIGHTS_NAME)
output_config_file = os.path.join(OUTPUT_MODEL_DIR, CONFIG_NAME)

torch.save(model.state_dict(), output_model_file)
print(f"Saved {output_model_file} to {OUTPUT_MODEL_DIR}")
model.config.to_json_file(output_config_file)
print(f"Saved {output_config_file} to {OUTPUT_MODEL_DIR}")
tokenizer.save_vocabulary(OUTPUT_MODEL_DIR)
print(f"Saved vocabulary to {OUTPUT_MODEL_DIR}")

## Test Trained Model

In [None]:
# Reload the model and tokenizer from the directory written by the save step,
# replacing the in-memory training objects, to check the saved files are usable.
# NOTE(review): the reloaded model is not moved to `device` here, so the
# generation below presumably runs on the CPU — confirm this is intended.
model = GPT2LMHeadModel.from_pretrained(output_dir)
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
print(f"Loaded model and tokenizer from {output_dir}")

In [None]:
# Start every sample from the user-configured prompt so SAMPLE_STRING actually
# controls how the scripts begin (an arbitrary random token id was used before,
# leaving the setting without effect). Sampling still makes the five results differ.
prompt_ids = torch.tensor(
    [tokenizer.encode(SAMPLE_STRING)],
    device=next(model.parameters()).device,  # keep the prompt on the model's device
)
sample_outputs = model.generate(
    input_ids=prompt_ids,
    do_sample=True,
    top_k=50,
    max_length=1000,
    top_p=0.95,
    num_return_sequences=5,
)

In [None]:
# Generate samples
for i, output in enumerate(sample_outputs):
    filename = f'script_{i+1:03}.txt'
    file_path = f'samples/{filename}'
    content = tokenizer.decode(output, skip_special_tokens=True)
    with open(file_path, 'w') as file:
        file.write(content)
    print(f"{filename} written")
    

In [None]:
# Zip samples
os.chdir('samples')
with ZipFile('samples.zip', 'w') as zipObj:
    for filename in os.listdir():
        if not filename.endswith(".zip"):
            zipObj.write(filename)

print("samples.zip created")

In [None]:
# `losses` only gets an entry each time the training loop generates a sample,
# so it is empty when training ran for fewer than SAMPLE_INTERVAL optimizer
# steps (e.g. with a small EP_COUNT). Guard against indexing an empty list.
if losses:
    print(f"Start loss {losses[0]}")
    print(f"End loss {losses[-1]}")
else:
    print("No losses recorded: training finished before the first sample interval")