## Loading Data

In [75]:
import os
import numpy as np
# Read the whole lyrics corpus into memory as one string.
input_file = os.path.join("data/owlcity.txt")
with open(input_file, "r") as lyrics_file:
    lyrics_data = lyrics_file.read()

In [61]:
# Quick corpus statistics: how many lines there are and how long they are.
lines = lyrics_data.split('\n')
print('Number of lines: {}'.format(len(lines)))

# Whitespace-separated word count of every line (empty lines count as 0).
word_count_line = list(map(lambda text_line: len(text_line.split()), lines))
print('Average number of words in each line: {}'.format(np.average(word_count_line)))


Number of lines: 6131
Average number of words in each line: 5.610177785026912


### Lookup tables

In [76]:
def lookup_tables(text):
    """Build word <-> integer-id lookup tables for a tokenised corpus.

    Args:
        text: iterable of hashable tokens (typically a list of words).

    Returns:
        A tuple ``(vocab_to_int, int_to_vocab)`` of two dicts that are exact
        inverses of each other. Ids are assigned in order of first
        appearance in ``text``, starting at 0.
    """
    # dict.fromkeys de-duplicates while keeping first-occurrence order, so the
    # ids are reproducible across kernel restarts. Iterating a set of strings
    # is not, because string hashes are randomised per process.
    vocab = list(dict.fromkeys(text))
    vocab_to_int = {word: i for i, word in enumerate(vocab)}
    int_to_vocab = dict(enumerate(vocab))
    return vocab_to_int, int_to_vocab

In [77]:
def token_lookup():
    """Return the punctuation -> placeholder-token mapping.

    Each key is a punctuation string found in the raw text and each value is
    the word-like token that stands in for it, so that punctuation becomes
    its own vocabulary entry instead of sticking to neighbouring words.

    Returns:
        dict mapping punctuation strings to ``'||Name||'`` tokens.
    """
    return { '.': '||Period||',
             ',': '||Comma||',
             '"': '||Quotation_Mark||',
             ';': '||Semicolon||',
             '!': '||Exclamation_Mark||',
             '?': '||Question_Mark||',
             '(': '||Left_Parentheses||',
             ')': '||Right_Parentheses||',
             '--': '||Dash||',
             '\n': '||Return||',
             '#': '||Hash||',
             '$': '||Dollar||',
             '%': '||Percent||',
             '&': '||Ampersand||',
             "'": '||Single_Quotation_Mark||',
             '*': '||Asterisk||',
             '+': '||Plus||',
             '/': '||Slash||',
             '\\': '||Backslash||',
             ':': '||Colon||',
             '[': '||Left_Bracket||',
             ']': '||Right_Bracket||',
             '{': '||Left_Curly_Braces||',
             # This key used to be a second '{', which silently overwrote the
             # left-brace entry and left '}' unmapped.
             '}': '||Right_Curly_Braces||',
             '<': '||Left_Arrow_Bracket||',
             '>': '||Right_Arrow_Bracket||',
             '=': '||Equal||',
             '@': '||At||',
             '^': '||Caret||',
             '`': '||Accent_Mark||',
             '~': '||Tilda||',
           }

In [78]:
# Extra vocabulary entries that never occur in the raw text. The function
# below reads SPECIAL_WORDS, but nothing in this notebook defined it, so a
# fresh kernel failed with NameError.
# NOTE(review): '<PAD>' is the conventional padding token; confirm it matches
# wherever SPECIAL_WORDS was originally defined.
SPECIAL_WORDS = {'PADDING': '<PAD>'}


def preprocess_data(text, token_lookup, create_lookup_tables):
    """Tokenise raw text and encode it as a list of integer word ids.

    Args:
        text: the raw corpus as a single string.
        token_lookup: zero-argument callable returning a dict that maps
            punctuation strings to placeholder tokens.
        create_lookup_tables: callable taking a list of words and returning
            ``(vocab_to_int, int_to_vocab)``.

    Returns:
        ``(int_text, vocab_to_int, int_to_vocab, token_dict)`` where
        ``int_text`` is the corpus encoded with ``vocab_to_int``.
    """
    token_dict = token_lookup()
    # Surround every punctuation token with spaces so split() isolates it.
    for key, token in token_dict.items():
        text = text.replace(key, ' {} '.format(token))

    # Lower-casing happens after the replacement, so the placeholder tokens
    # are stored in the vocabulary in lower case (e.g. '||period||'), while
    # token_dict keeps their original casing.
    words = text.lower().split()

    # The special words are added to the vocabulary only, not to the corpus.
    vocab_to_int, int_to_vocab = create_lookup_tables(words + list(SPECIAL_WORDS.values()))
    int_text = [vocab_to_int[word] for word in words]
    return int_text, vocab_to_int, int_to_vocab, token_dict

In [79]:
# Encode the lyrics: int_text is the corpus as word ids, vocab_to_int /
# int_to_vocab are the lookup tables, token_dict is the punctuation mapping.
int_text, vocab_to_int, int_to_vocab, token_dict = preprocess_data(lyrics_data, token_lookup, lookup_tables)

# Neural Network

In [67]:
import torch

# Detect CUDA once; later cells read this flag to decide where tensors go.
gpu_missing_message = 'No GPU found. Please use a GPU to train your neural network.'
train_on_gpu = torch.cuda.is_available()
if not train_on_gpu:
    print(gpu_missing_message)

No GPU found. Please use a GPU to train your neural network.


In [96]:
from torch.utils.data import TensorDataset, DataLoader

def batch_data(words, sequence_length, batch_size, shuffle=False):
    """Turn a sequence of word ids into a DataLoader of (window, next-word) pairs.

    Every sample is ``sequence_length`` consecutive ids as the feature and the
    id that immediately follows them as the target.

    Args:
        words: sliceable sequence of integer word ids.
        sequence_length: number of ids in each feature window.
        batch_size: number of samples per batch.
        shuffle: whether the DataLoader reshuffles samples every epoch.
            Defaults to False, which keeps samples in corpus order.

    Returns:
        A ``DataLoader`` yielding ``(features, targets)`` int64 tensors of
        shape ``(batch, sequence_length)`` and ``(batch,)``. The final batch
        may be smaller than ``batch_size``.
    """
    # Drop trailing words so the word count is a multiple of batch_size.
    n_batches = len(words) // batch_size
    words = words[:n_batches * batch_size]
    y_len = len(words) - sequence_length

    x, y = [], []
    for idx in range(0, y_len):
        idx_end = sequence_length + idx
        x.append(words[idx:idx_end])
        y.append(words[idx_end])

    # Force int64: numpy's default integer type is platform-dependent, while
    # embedding indices and CrossEntropyLoss targets are expected to be long.
    features = torch.from_numpy(np.asarray(x, dtype=np.int64))
    targets = torch.from_numpy(np.asarray(y, dtype=np.int64))

    data = TensorDataset(features, targets)
    return DataLoader(data, shuffle=shuffle, batch_size=batch_size)

In [97]:
# Sanity-check batch_data on a tiny sequence where the expected windows are
# easy to read off: row i should be [i .. i+4] with target i+5.
test_text = range(50)
t_loader = batch_data(test_text, sequence_length=5, batch_size=10)

data_iter = iter(t_loader)
# Use the built-in next(); DataLoader iterators in current PyTorch releases
# no longer expose a .next() method.
sample_x, sample_y = next(data_iter)

print(sample_x.shape)
print(sample_x)
print()
print(sample_y.shape)
print(sample_y)

torch.Size([10, 5])
tensor([[ 0,  1,  2,  3,  4],
        [ 1,  2,  3,  4,  5],
        [ 2,  3,  4,  5,  6],
        [ 3,  4,  5,  6,  7],
        [ 4,  5,  6,  7,  8],
        [ 5,  6,  7,  8,  9],
        [ 6,  7,  8,  9, 10],
        [ 7,  8,  9, 10, 11],
        [ 8,  9, 10, 11, 12],
        [ 9, 10, 11, 12, 13]])

torch.Size([10])
tensor([ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14])


In [98]:
import torch
import torch.nn as nn

In [99]:
class RNN(nn.Module):
    """Word-level language model: embedding -> stacked LSTM -> linear layer.

    Given a batch of word-id sequences, the model returns one score vector
    per sequence, computed from the LSTM output at the last time step.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5):
        """Create the layers.

        Args:
            vocab_size: number of distinct word ids the embedding accepts.
            output_size: size of the score vector produced per sequence.
            embedding_dim: size of each word embedding.
            hidden_dim: size of the LSTM hidden state.
            n_layers: number of stacked LSTM layers.
            dropout: dropout probability passed to ``nn.LSTM``. There is no
                separate dropout layer in this model.
        """
        super(RNN, self).__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, nn_input, hidden):
        """Run one forward pass.

        Args:
            nn_input: integer tensor of word ids, indexed (batch, step)
                because the LSTM is built with ``batch_first=True``.
            hidden: ``(h, c)`` tuple as returned by ``init_hidden`` or by a
                previous call.

        Returns:
            ``(out, hidden)`` where ``out`` has shape (batch, output_size)
            and ``hidden`` is the updated ``(h, c)`` tuple.
        """
        embeds = self.embedding(nn_input)
        lstm_out, hidden = self.lstm(embeds, hidden)

        # Only the last time step is returned, so apply the linear layer to
        # that step alone instead of scoring every step and discarding the rest.
        out = self.fc(lstm_out[:, -1])

        return out, hidden

    def init_hidden(self, batch_size):
        """Return zeroed ``(h, c)`` LSTM state for ``batch_size`` sequences.

        The tensors are created from a model parameter, so they share its
        dtype and device. The state therefore always lives where the model
        lives, without consulting any notebook-level GPU flag.
        """
        weight = next(self.parameters())
        state_shape = (self.n_layers, batch_size, self.hidden_dim)
        return (weight.new_zeros(state_shape), weight.new_zeros(state_shape))
    
    

In [100]:
def forward_back_prop(rnn, optimizer, criterion, inputs, target, hidden):
    """Run one optimisation step on a single batch.

    Relies on the notebook-level flag ``train_on_gpu`` to decide whether the
    model and the batch are moved to the GPU.

    Args:
        rnn: the model; called as ``rnn(inputs, hidden)`` and expected to
            return ``(output, hidden)``.
        optimizer: optimiser that updates ``rnn``'s parameters.
        criterion: loss function called as ``criterion(output, target)``.
        inputs: batch of word-id sequences.
        target: batch of target word ids.
        hidden: tuple of hidden-state tensors from the previous step.

    Returns:
        ``(loss_value, hidden)``: the batch loss as a Python float and the
        hidden state produced by this forward pass.
    """
    if train_on_gpu:
        rnn.cuda()
        # Move the batch itself. The previous code built the inputs from an
        # undefined name (`train`) and never placed them on the GPU.
        inputs, target = inputs.cuda(), target.cuda()

    # Word ids are used as embedding indices; make sure they are int64 no
    # matter what integer width the data loader produced.
    inputs = inputs.to(torch.int64)

    # Detach the incoming state so gradients do not flow back through the
    # whole training history.
    h = tuple(each.detach() for each in hidden)

    # zero accumulated gradients
    rnn.zero_grad()

    output, h = rnn(inputs, h)
    loss = criterion(output, target)
    loss.backward()

    # 'clip_grad_norm_' helps prevent the exploding gradient problem in RNNs / LSTMs
    nn.utils.clip_grad_norm_(rnn.parameters(), 5)

    optimizer.step()
    return loss.item(), h

In [101]:
def train_rnn(rnn, batch_size, optimizer, criterion, n_epochs, show_every_n_batches=100, data_loader=None):
    """Train ``rnn`` for ``n_epochs`` epochs and return it.

    Args:
        rnn: the model to train; must provide ``init_hidden(batch_size)``.
        batch_size: batch size the loader was built with.
        optimizer: optimiser passed through to ``forward_back_prop``.
        criterion: loss function passed through to ``forward_back_prop``.
        n_epochs: number of passes over the data.
        show_every_n_batches: print the average loss every this many batches.
        data_loader: loader yielding ``(inputs, labels)`` batches. When
            omitted, the notebook-level ``train_loader`` is used, as before.

    Returns:
        The same ``rnn`` object, after training.
    """
    loader = train_loader if data_loader is None else data_loader

    # Number of completely full batches. It does not change during training,
    # so compute it once instead of on every batch.
    n_batches = len(loader.dataset) // batch_size

    batch_losses = []

    rnn.train()

    print("Training for %d epoch(s)..." % n_epochs)
    for epoch_i in range(1, n_epochs + 1):

        # Start every epoch from a zeroed hidden state.
        hidden = rnn.init_hidden(batch_size)

        for batch_i, (inputs, labels) in enumerate(loader, 1):

            # Skip the trailing partial batch: the hidden state is sized for
            # exactly batch_size sequences.
            if batch_i > n_batches:
                break

            loss, hidden = forward_back_prop(rnn, optimizer, criterion, inputs, labels, hidden)
            batch_losses.append(loss)

            # Report the mean loss since the last report, then reset.
            if batch_i % show_every_n_batches == 0:
                print('Epoch: {:>4}/{:<4}  Loss: {}\n'.format(
                    epoch_i, n_epochs, np.average(batch_losses)))
                batch_losses = []

    # returns a trained rnn
    return rnn

In [102]:
# --- Data hyperparameters ---
sequence_length = 10  # words fed to the network per training sample
batch_size = 128      # training samples per mini-batch

# Build the training DataLoader from the integer-encoded lyrics.
train_loader = batch_data(int_text, sequence_length=sequence_length, batch_size=batch_size)

In [103]:
# --- Optimisation settings ---
num_epochs = 10         # passes over the training data
learning_rate = 0.001   # step size handed to the optimiser

# --- Model dimensions ---
vocab_size = len(vocab_to_int)  # one id per distinct token
output_size = vocab_size        # the model scores every token in the vocabulary
embedding_dim = 200             # size of each word embedding
hidden_dim = 250                # size of the LSTM hidden state
n_layers = 2                    # stacked LSTM layers

# --- Logging ---
show_every_n_batches = 200      # print the running loss every N batches

print(vocab_size)

3319


In [104]:
# Build the model and move it to the GPU when one is available.
rnn = RNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5)
if train_on_gpu:
    rnn.cuda()

# defining loss and optimization functions for training
optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# training the model
trained_rnn = train_rnn(rnn, batch_size, optimizer, criterion, num_epochs, show_every_n_batches)

# Saving the trained model. The earlier call went through a `helper` module
# that this notebook never imports, so it raised NameError only after the
# whole training run had finished. torch.save needs no extra import.
# NOTE(review): this writes the full module object to ./save/trained_rnn.pt;
# confirm the path/format against whatever code loads the model later.
save_path = './save/trained_rnn.pt'
os.makedirs(os.path.dirname(save_path), exist_ok=True)
torch.save(trained_rnn, save_path)
print('Model Trained and Saved')

Training for 10 epoch(s)...


KeyboardInterrupt: 