# Script Generation

In [None]:
#!g2.mig
import helper
# Path of the raw corpus file handed to the project helper.
data_dir = 'film_text.txt'
# `text` is treated below as one string (it is split on whitespace and on '\n').
text = helper.load_data(data_dir)

In [None]:
#!g2.mig
import numpy as np

# Half-open range (start, stop) of raw lines echoed at the end of this cell.
view_line_range = (0, 10)
first_line, last_line = view_line_range

# Split the corpus once and reuse the result for both the stats and the preview.
lines = text.split('\n')
word_count_line = [len(line.split()) for line in lines]
unique_word_count = len(set(text.split()))

print('Dataset Stats')
print('Roughly the number of unique words: {}'.format(unique_word_count))

print('Number of lines: {}'.format(len(lines)))
print('Average number of words in each line: {}'.format(np.average(word_count_line)))

print()
print('The lines {} to {}:'.format(first_line, last_line))
print('\n'.join(lines[first_line:last_line]))

Dataset Stats
Roughly the number of unique words: 286276
Number of lines: 2081266
Average number of words in each line: 2.922427022783248

The lines 0 to 10:
INT. HOUSE - CLOSED EYES                                                                                

A young man's blue eyes slowly open.  A girl moans from the next room.


EXT. STREET CORNER - A LARGE TIRE

turns the corner and splashes through a puddle from an
earlier rain. TUPAC SHAKUR blares from inside.



---
## Implement Pre-processing Functions


In [None]:
#!g2.mig
import problem_unittests as tests
from collections import Counter

def create_lookup_tables(text):
    """Build the two vocabulary lookup tables for a tokenised corpus.

    Ids are assigned by descending frequency: the most frequent token gets
    id 0. Tokens with equal counts keep their first-seen order.

    :param text: iterable of hashable tokens (each element is counted as one token)
    :return: tuple ``(vocab_to_int, int_to_vocab)`` of mutually inverse dicts
    """
    # most_common() sorts stably by count, highest first.
    ranked_tokens = [token for token, _count in Counter(text).most_common()]

    vocab_to_int = {token: index for index, token in enumerate(ranked_tokens)}
    int_to_vocab = dict(enumerate(ranked_tokens))
    return vocab_to_int, int_to_vocab

tests.test_create_lookup_tables(create_lookup_tables)

Tests Passed


### Tokenize Punctuation


In [None]:
#!g2.mig
def token_lookup():
    """Return the mapping from punctuation symbols to their placeholder tokens.

    Each punctuation character (and the newline) maps to a ``||NAME||``
    placeholder string.

    :return: dict whose keys are the symbols and whose values are the tokens
    """
    return {
        '.': "||PERIOD||",
        ',': "||COMMA||",
        '"': "||QUOTATION_MARK||",
        ';': "||SEMICOLON||",
        '!': "||EXCLAMATION_MARK||",
        '?': "||QUESTION_MARK||",
        '(': "||LEFT_PAREN||",
        ')': "||RIGHT_PAREN||",
        '-': "||DASH||",
        '\n': "||RETURN||",
    }

tests.test_tokenize(token_lookup)

Tests Passed


In [None]:
#!g2.mig
helper.preprocess_and_save_data(data_dir, token_lookup, create_lookup_tables)

In [None]:
#!g2.mig
import helper
import problem_unittests as tests

int_text, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()

## Build the Neural Network


In [None]:
#!g2.mig
import torch
# Notebook-wide flag: True when PyTorch can see a CUDA device.
# Later cells read it to decide whether to call .cuda() on models and tensors.
train_on_gpu = torch.cuda.is_available()
if not train_on_gpu:
    # Only a warning; execution continues without a GPU.
    print('No GPU found. Please use a GPU to train your neural network.')

In [None]:
#!g2.mig
from torch.utils.data import TensorDataset, DataLoader

def batch_data(words, sequence_length, batch_size):
    """Turn a sequence of word ids into a shuffled DataLoader of sliding windows.

    Every sample is a window of ``sequence_length`` consecutive ids (the
    feature) paired with the id that immediately follows it (the target).

    :param words: sliceable sequence of integer word ids (list, range, ndarray, ...)
    :param sequence_length: number of ids in each feature window
    :param batch_size: batch size of the returned DataLoader
    :return: ``DataLoader`` over a ``TensorDataset(features, targets)``, shuffled
    """
    # Keep only as many words as fill whole batches.
    n_batches = len(words) // batch_size
    words = words[:n_batches * batch_size]

    # One sample per start position that still has a following word to predict.
    n_samples = len(words) - sequence_length
    features = [words[start:start + sequence_length] for start in range(n_samples)]
    targets = [words[start + sequence_length] for start in range(n_samples)]

    # Force int64: class-index targets for CrossEntropyLoss must be Long, and
    # NumPy's default integer width is platform dependent.
    feature_tensor = torch.from_numpy(np.asarray(features, dtype=np.int64))
    target_tensor = torch.from_numpy(np.asarray(targets, dtype=np.int64))

    dataset = TensorDataset(feature_tensor, target_tensor)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [None]:
#!g2.mig
# Sanity check of batch_data on a tiny, easy-to-read input: the ids 0..49.
test_text = range(50)
t_loader = batch_data(test_text, sequence_length=5, batch_size=10)

# Pull a single batch from the loader.
data_iter = iter(t_loader)
sample_x, sample_y = next(data_iter)

# Each target should be the id that follows the last id of its feature row.
print(sample_x.shape)
print(sample_x)
print()
print(sample_y.shape)
print(sample_y)

torch.Size([10, 5])
tensor([[33, 34, 35, 36, 37],
        [32, 33, 34, 35, 36],
        [28, 29, 30, 31, 32],
        [ 4,  5,  6,  7,  8],
        [40, 41, 42, 43, 44],
        [ 8,  9, 10, 11, 12],
        [23, 24, 25, 26, 27],
        [10, 11, 12, 13, 14],
        [12, 13, 14, 15, 16],
        [43, 44, 45, 46, 47]])

torch.Size([10])
tensor([38, 37, 33,  9, 45, 13, 28, 15, 17, 48])


In [None]:
#!g2.mig
import torch.nn as nn

class RNN(nn.Module):
    """Word-level language model: embedding -> stacked LSTM -> dropout -> linear layer."""

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5):
        """Create the layers of the network.

        :param vocab_size: number of rows in the embedding table
        :param output_size: number of scores produced per sequence
        :param embedding_dim: size of each word embedding
        :param hidden_dim: size of the LSTM hidden state
        :param n_layers: number of stacked LSTM layers
        :param dropout: dropout probability passed to ``nn.LSTM``
        """
        super().__init__()

        # Sizes are stored because forward() and init_hidden() read them.
        self.vocab_size, self.output_size = vocab_size, output_size
        self.embedding_dim, self.hidden_dim = embedding_dim, hidden_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=dropout,
            batch_first=True,
        )

        # Separate, fixed-rate dropout applied to the LSTM output.
        self.dropout = nn.Dropout(p=0.25)

        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()  # defined but not applied in forward()

    def forward(self, nn_input, hidden):
        """Score the next word for every sequence in the batch.

        :param nn_input: word ids, batch dimension first
        :param hidden: LSTM state tuple handed to ``self.lstm``
        :return: ``(scores, hidden)``; ``scores`` holds the fully-connected
                 output for the last time step of each sequence
        """
        n_sequences = nn_input.size(0)

        embedded = self.embedding(nn_input)
        lstm_out, hidden = self.lstm(embedded, hidden)

        # Collapse batch and time so the linear layer sees one row per step.
        flat = lstm_out.contiguous().view(-1, self.hidden_dim)
        scores = self.fc(self.dropout(flat))

        # Restore the batch dimension and keep only the final step.
        scores = scores.view(n_sequences, -1, self.output_size)
        return scores[:, -1], hidden

    def init_hidden(self, batch_size):
        """Return a zeroed LSTM state tuple of shape (n_layers, batch_size, hidden_dim).

        The tensors take their dtype from the model's first parameter and are
        moved to the GPU when the notebook-level ``train_on_gpu`` flag is set.
        """
        reference = next(self.parameters()).data
        state_shape = (self.n_layers, batch_size, self.hidden_dim)

        hidden_state = reference.new(*state_shape).zero_()
        cell_state = reference.new(*state_shape).zero_()

        if train_on_gpu:
            hidden_state, cell_state = hidden_state.cuda(), cell_state.cuda()

        return (hidden_state, cell_state)

tests.test_rnn(RNN, train_on_gpu)

Tests Passed


In [None]:
#!g2.mig
def forward_back_prop(rnn, optimizer, criterion, inp, target, hidden, clip=5):
    """Run one training step: forward pass, backward pass and optimizer update.

    :param rnn: model called as ``rnn(inp, hidden)`` and returning ``(output, hidden)``
    :param optimizer: optimizer whose ``step()`` applies the update
    :param criterion: loss called as ``criterion(output, target)``
    :param inp: batch of inputs for the model
    :param target: batch of targets for the loss
    :param hidden: iterable of hidden-state tensors from the previous step
    :param clip: maximum gradient norm passed to ``clip_grad_norm_``
                 (defaults to 5, the value that used to be hard-coded)
    :return: ``(loss, hidden)`` - the loss as a Python float and the new hidden state
    """
    if train_on_gpu:
        rnn.cuda()
        inp, target = inp.cuda(), target.cuda()

    # Cut the hidden state loose from the previous batch's graph so that
    # backpropagation stops at the start of this batch.
    h = tuple(each.detach() for each in hidden)

    rnn.zero_grad()
    output, h = rnn(inp, h)
    loss = criterion(output, target)
    loss.backward()

    # Rescale gradients whose total norm exceeds `clip` before the update.
    nn.utils.clip_grad_norm_(rnn.parameters(), clip)
    optimizer.step()

    return loss.item(), h

tests.test_forward_back_prop(RNN, forward_back_prop, train_on_gpu)

Tests Passed


## Neural Network Training

### Hyperparameters

Set and train the neural network with the following parameters:
- Set `sequence_length` to the length of a sequence.
- Set `batch_size` to the batch size.
- Set `num_epochs` to the number of epochs to train for.
- Set `learning_rate` to the learning rate for an Adam optimizer.
- Set `vocab_size` to the number of unique tokens in our vocabulary.
- Set `output_size` to the desired size of the output.
- Set `embedding_dim` to the embedding dimension; smaller than the vocab_size.
- Set `hidden_dim` to the hidden dimension of your RNN.
- Set `n_layers` to the number of layers/cells in your RNN.
- Set `show_every_n_batches` to the number of batches at which the neural network should print progress.

If the network isn't getting the desired results, tweak these parameters and/or the layers in the `RNN` class.

In [None]:
#!g2.mig
def train_rnn(rnn, batch_size, optimizer, criterion, n_epochs, show_every_n_batches=100, data_loader=None):
    """Train ``rnn`` for ``n_epochs`` epochs and return it.

    :param rnn: model exposing ``train()`` and ``init_hidden(batch_size)``
    :param batch_size: batch size used to size the hidden state and to count full batches
    :param optimizer: optimizer forwarded to ``forward_back_prop``
    :param criterion: loss function forwarded to ``forward_back_prop``
    :param n_epochs: number of passes over the data
    :param show_every_n_batches: print the running average loss every this many batches
    :param data_loader: loader yielding ``(inputs, labels)`` batches; when omitted the
                        notebook-level ``train_loader`` is used, as before
    :return: the trained ``rnn`` (same object that was passed in)
    """
    loader = train_loader if data_loader is None else data_loader

    # Only full batches are trained on: the hidden state is created for exactly
    # `batch_size` sequences, so a shorter final batch is skipped.
    n_batches = len(loader.dataset) // batch_size

    batch_losses = []

    rnn.train()

    print("Training for %d epoch(s)..." % n_epochs)
    for epoch_i in range(1, n_epochs + 1):

        # Fresh hidden state at the start of every epoch.
        hidden = rnn.init_hidden(batch_size)

        for batch_i, (inputs, labels) in enumerate(loader, 1):
            if batch_i > n_batches:
                break

            loss, hidden = forward_back_prop(rnn, optimizer, criterion, inputs, labels, hidden)
            batch_losses.append(loss)

            # Report the mean loss since the previous report, then reset the window.
            if batch_i % show_every_n_batches == 0:
                print('Epoch: {:>4}/{:<4}  Loss: {}\n'.format(
                    epoch_i, n_epochs, np.average(batch_losses)))
                batch_losses = []

    return rnn

In [None]:
#!g2.mig
# --- Data parameters ---
# Number of consecutive word ids in each training window.
sequence_length = 12
batch_size = 128
# Built here because it depends on the two values above; train_rnn reads it as a global.
train_loader = batch_data(int_text, sequence_length, batch_size)
# --- Training parameters ---
num_epochs = 2
learning_rate = 0.001
# --- Model parameters ---
vocab_size = len(vocab_to_int)
# One output score per vocabulary entry.
output_size = vocab_size
embedding_dim = 300
hidden_dim = 512
n_layers = 2
# Print the running loss every this many batches.
show_every_n_batches = 1000

### Train

In [None]:
#!g1.1
# Build the model from the hyperparameters set in the previous cell.
rnn = RNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5)
if train_on_gpu:
    rnn.cuda()

# Adam over all model parameters; cross-entropy over the per-word scores.
optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# train_rnn returns the same model object after training.
trained_rnn = train_rnn(rnn, batch_size, optimizer, criterion, num_epochs, show_every_n_batches)

# Persist the trained model through the project helper so a later cell can reload it.
helper.save_model('./save/trained_rnn', trained_rnn)
print('Model Trained and Saved')

Training for 2 epoch(s)...
Epoch:    1/2     Loss: 5.73177996635437

Epoch:    1/2     Loss: 5.160752718687058

Epoch:    1/2     Loss: 5.009104348421097

Epoch:    1/2     Loss: 4.91757381439209

Epoch:    1/2     Loss: 4.828801354408264

Epoch:    1/2     Loss: 4.781493105173111

Epoch:    1/2     Loss: 4.7553358438014985

Epoch:    1/2     Loss: 4.722130950927735

Epoch:    1/2     Loss: 4.661684648275376

Epoch:    1/2     Loss: 4.653225932121277

Epoch:    1/2     Loss: 4.646141780614853

Epoch:    1/2     Loss: 4.620412744760514

Epoch:    1/2     Loss: 4.591969912052154

Epoch:    1/2     Loss: 4.5877769069671634

Epoch:    1/2     Loss: 4.574117929935455

Epoch:    1/2     Loss: 4.537620231628418

Epoch:    1/2     Loss: 4.548403714418411

Epoch:    1/2     Loss: 4.538714349508285

Epoch:    1/2     Loss: 4.5360903034210205

Epoch:    1/2     Loss: 4.524991992950439

Epoch:    1/2     Loss: 4.517101494550705

Epoch:    1/2     Loss: 4.508823357343673

Epoch:    1/2     Loss: 4.

In [None]:
#!g2.mig
import torch
import helper
import problem_unittests as tests

# Reload the preprocessed vocabulary tables; the encoded text itself is discarded here.
# NOTE(review): the cells below also rely on `np`, `sequence_length` and `train_on_gpu`,
# none of which are defined in this cell - confirm they are set before running from here.
_, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()
trained_rnn = helper.load_model('./save/trained_rnn')

## Generate TV Script


In [None]:
#!g2.mig
import torch.nn.functional as F

def generate(rnn, prime_id, int_to_vocab, token_dict, pad_value, predict_len=100, top_k=5):
    """Generate text with the trained network, one word at a time.

    Starting from ``prime_id``, the model is repeatedly fed the last
    ``sequence_length`` word ids (notebook-level setting) and the next word is
    sampled from its ``top_k`` most probable candidates.

    :param rnn: trained model exposing ``eval()``, ``init_hidden()`` and ``__call__``
    :param prime_id: id of the word that starts the generated text
    :param int_to_vocab: dict mapping word ids to words
    :param token_dict: dict mapping punctuation symbols to their placeholder tokens
    :param pad_value: id used to left-pad the initial input window
    :param predict_len: number of words to generate after the prime word
    :param top_k: number of highest-probability candidates to sample from
                  (defaults to 5, the value that used to be hard-coded)
    :return: the generated text with placeholder tokens turned back into punctuation
    """
    rnn.eval()

    # Rolling window of the most recent word ids: all padding except the
    # prime word in the last position.
    current_seq = np.full((1, sequence_length), pad_value)
    current_seq[-1][-1] = prime_id
    predicted = [int_to_vocab[prime_id]]

    # Inference only, so autograd bookkeeping is switched off.
    with torch.no_grad():
        for _ in range(predict_len):
            seq_tensor = torch.LongTensor(current_seq)
            if train_on_gpu:
                seq_tensor = seq_tensor.cuda()

            # Each step starts from a zeroed hidden state; context comes from the window.
            hidden = rnn.init_hidden(seq_tensor.size(0))
            output, _ = rnn(seq_tensor, hidden)

            # Turn scores into probabilities and bring them to the CPU for NumPy.
            p = F.softmax(output, dim=1).cpu()

            # Sample among the k best candidates, never more than there are scores.
            k = min(top_k, p.size(1))
            p, top_i = p.topk(k)
            top_i = top_i.numpy().reshape(-1)
            p = p.numpy().reshape(-1)
            word_i = np.random.choice(top_i, p=p / p.sum())

            predicted.append(int_to_vocab[word_i])

            # Slide the window left by one and append the word just chosen.
            current_seq = np.roll(current_seq, -1, 1)
            current_seq[-1][-1] = word_i

    gen_sentences = ' '.join(predicted)

    # Swap placeholder tokens back to punctuation. The lowercase form is matched;
    # presumably preprocessing lowercases the corpus - confirm against the helper.
    for key, token in token_dict.items():
        gen_sentences = gen_sentences.replace(' ' + token.lower(), key)
    gen_sentences = gen_sentences.replace('\n ', '\n')
    gen_sentences = gen_sentences.replace('( ', '(')

    return gen_sentences

In [None]:
#!g2.mig
# Number of words to generate after the prime word.
gen_length = 2000
prime_word = 'jeff'

# The padding word comes from the project helper's special-word table.
pad_word = helper.SPECIAL_WORDS['PADDING']
# The prime token is looked up with a trailing ':' (i.e. 'jeff:'), so that exact
# token must exist in vocab_to_int or this raises KeyError.
generated_script = generate(trained_rnn, vocab_to_int[prime_word + ':'], int_to_vocab, token_dict, vocab_to_int[pad_word], gen_length)
print(generated_script)

jeff:
hair.

he walks to the door and looks back at his.


cut to:








































































































continued:







int. bathroom- night




int. the house

the two men sit on the edge of the stairs, staring at him.


int. hotel lobby- day










int. the car- night(flashback)

the door is a dark.










(continued)











































int. kitchen- night



int. hotel room, kitchen

as they walk through the crowd of the house of
the door.


(continued)









continued:


int. the room, night

the camera pans on a few feet in the doorway,

he turns to the other. she looks at him,


(continued)










continued:



ext. a house- night

a large black man is standing at the door, and
she sees the woman, and his wife


(more)










ext. gotham street- night

the car is in the dark, the sun is a
little...


ext. highway- day- day- later-










int. dirigible.

the camera pans 

#### Save your favorite scripts

Once you have a script that you like (or find interesting), save it to a text file!

In [None]:
#!g2.mig
# save script to a text file
# The context manager closes the file even if the write raises.
with open("generated_script_1.txt", "w") as f:
    f.write(generated_script)

In [None]:
#!g2.mig
