In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# read in the lyrics text file
with open('./data/lyrics.txt', encoding='utf-8') as f:
    raw_lyrics = f.read()

In [3]:
# data statistics
lyrics_per_line = raw_lyrics.split('\n')
word_count_line = [len(line.split()) for line in lyrics_per_line]

print('Total number of lines:', len(raw_lyrics.split('\n')))
print('Total number of unique words (roughly):', len({word: None for word in raw_lyrics.split()}))
print('Average number of words in a line:', np.average(word_count_line))
print('The least number of words in a line:', np.min(word_count_line))
print('The most number of words in a line:', np.max(word_count_line))
print()

view_range = 20
print('Lyric preview:')
print('\n'.join(raw_lyrics.split('\n')[:view_range]))

Total number of lines: 1792
Total number of unique words (roughly): 1964
Average number of words in a line: 3.763392857142857
The least number of words in a line: 0
The most number of words in a line: 13

Lyric preview:
Have you ever seen anything?
아름다운 색, 아름다운 색, 아름다운 색
Have you ever seen this color?
아름다운 색, 아름다운 다운 다운 다운
Have you ever seen anything?
아름다운 색, 아름다운 색, 아름다운 색
Have you ever seen this color?
아름다운 색, 아름다운 다운 다운 다운

끌리네 그 누구와도 다르게
변하고 싶어 나
너를 바라보면서 yeah
너를 알아가면서 yeah

상상이 내 감정을 더 움직여
열두 가지 색색깔의 무지개
나는 과연 어떤 색일까
우리 더 빛나게 해볼까

천천히 하나 둘 그리는 하얀 종이 위에


In [4]:
from string import punctuation

# check which punctuations do the lyrics have
def check_punctuations(lyrics):
    flag = False
    punct_list = []
    for p in punctuation:
        if raw_lyrics.find(p) != -1:
            flag = True
            punct_list.append(p)

    return (flag, punct_list)

In [5]:
check_punctuations(raw_lyrics)

(True, ['!', "'", '(', ')', ',', '-', '/', '?'])

In [6]:
from collections import Counter

def create_lookup_tables(lyrics):
    word_count = Counter(lyrics)
    sorted_word_count = sorted(word_count, key=word_count.get, reverse=True)
    vocab_to_int = {word: idx for idx, word in enumerate(sorted_word_count)}
    int_to_vocab = {idx: word for word, idx in vocab_to_int.items()}
    
    return (vocab_to_int, int_to_vocab)

def create_token_lookup():
    punctuations = ['!', "'", '(', ')', ',', '-', '/', '?', '\n']
    tokens = ['<EXCLAMATION_MARK>', '<SINGLE_QUOTATION_MARK>', '<LEFT_ROUND_BRACKET>', '<RIGHT_ROUND_BRACKET>',
              '<COMMA>', '<HYPHEN>', '<SLASH>', '<QUESTION_MARK>', '<NEW_LINE>']
    
    punct_token = {}
    for p in range(len(punctuations)):
        punct_token[punctuations[p]] = tokens[p]
        
    return punct_token

In [7]:
# preprocess the data
token_lookup = create_token_lookup()
for symbol, token in token_lookup.items():
    raw_lyrics = raw_lyrics.replace(symbol, ' {} '.format(token))

tokenized_lyrics = raw_lyrics.lower()
tokenized_lyrics = tokenized_lyrics.split()
    
vocab_to_int, int_to_vocab = create_lookup_tables(tokenized_lyrics)
encoded_lyrics = [vocab_to_int[word] for word in tokenized_lyrics]

In [8]:
# check GPU availability
import torch

gpu_availability = torch.cuda.is_available()

if gpu_availability:
    print('GPU Available! Training on:', torch.cuda.get_device_name(0))
else:
    print('No GPU found! Training on CPU...')

GPU Available! Training on: GeForce MX150


In [9]:
# batching
from torch.utils.data import TensorDataset, DataLoader

def batch_lyric(lyrics, sequence_length, batch_size):
    features = []
    labels = []
    
    for w in range(len(lyrics)):
        if w+sequence_length < len(lyrics):
            features.append(lyrics[w:w+sequence_length])
            labels.append(lyrics[w+sequence_length])
    
    features = np.array(features)
    labels = np.array(labels)
    
    dataset = TensorDataset(torch.from_numpy(features), torch.from_numpy(labels))
    loader = DataLoader(dataset, shuffle=True, batch_size=batch_size)
    
    return loader

In [10]:
train_loader = batch_lyric(encoded_lyrics, sequence_length=5, batch_size=10)

train_iter = iter(train_loader)
f, l = train_iter.next()
print(f)
print(l)

tensor([[ 233, 1675,   89,    0,  688],
        [ 143,  808,  108,    0,    4],
        [   1,  761,   24,   99,    0],
        [  86,   60,  106,  198,    0],
        [   0,  483,  685,  136,   42],
        [  52,    0,   28,  390,  199],
        [   1,   13,    0,  646, 1219],
        [   0,  883,   77,  884,  885],
        [ 116,   38,   15,  372,  373],
        [   0,  289,  267,  352,  232]], dtype=torch.int32)
tensor([ 563,    1,  257,    0,  101,  262, 1220,    0,    0,  353],
       dtype=torch.int32)


In [12]:
import torch.nn as nn

class Model(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, num_layers, dropout):
        super(Model, self).__init__()
        
        # model hyperparameters
        self.vocab_size = vocab_size
        self.output_size = output_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        
        # model layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=num_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_size)
        
    def forward(self, n_input, hidden):
        batch = n_input.size(0)
        
        embed = self.embedding(n_input)
        l, hidden = self.lstm(embed, hidden)
        l = l.contiguous().view(-1, self.hidden_dim)
        
        out = self.fc(l)
        out = out.view(batch, -1, self.output_size)
        out = out[:,-1]
        
        return out, hidden
        
    def init_hidden(self, batch_size):
        w = next(self.parameters()).data
        
        if train_on_gpu:
            hidden = (w.new(self.num_layers, batch_size, self.hidden_dim).zero_().cuda(),
                      w.new(self.num_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (w.new(self.num_layers, batch_size, self.hidden_dim).zero_(),
                      w.new(self.num_layers, batch_size, self.hidden_dim).zero_())
            
        return hidden
        

In [14]:
vocab_size = len(vocab_to_int)
output_size = vocab_size
embedding_dim = 300
hidden_dim = 512
num_layers = 2
dropout = 0.5

model = Model(vocab_size, output_size, embedding_dim, hidden_dim, num_layers, dropout)
model.