In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [11]:
# read in the lyrics text file
with open('./data/lyrics.txt', encoding='utf-8') as f:
    raw_lyrics = f.read()

In [12]:
# data statistics
lyrics_per_line = raw_lyrics.split('\n')
word_count_line = [len(line.split()) for line in lyrics_per_line]

print('Total number of lines:', len(raw_lyrics.split('\n')))
print('Total number of unique words (roughly):', len({word: None for word in raw_lyrics.split()}))
print('Average number of words in a line:', np.average(word_count_line))
print('The least number of words in a line:', np.min(word_count_line))
print('The most number of words in a line:', np.max(word_count_line))
print()

view_range = 20
print('Lyric preview:')
print('\n'.join(raw_lyrics.split('\n')[:view_range]))

Total number of lines: 1792
Total number of unique words (roughly): 1964
Average number of words in a line: 3.763392857142857
The least number of words in a line: 0
The most number of words in a line: 13

Lyric preview:
Have you ever seen anything?
아름다운 색, 아름다운 색, 아름다운 색
Have you ever seen this color?
아름다운 색, 아름다운 다운 다운 다운
Have you ever seen anything?
아름다운 색, 아름다운 색, 아름다운 색
Have you ever seen this color?
아름다운 색, 아름다운 다운 다운 다운

끌리네 그 누구와도 다르게
변하고 싶어 나
너를 바라보면서 yeah
너를 알아가면서 yeah

상상이 내 감정을 더 움직여
열두 가지 색색깔의 무지개
나는 과연 어떤 색일까
우리 더 빛나게 해볼까

천천히 하나 둘 그리는 하얀 종이 위에


In [13]:
from string import punctuation

# check which punctuations do the lyrics have
def check_punctuations(lyrics):
    flag = False
    punct_list = []
    for p in punctuation:
        if raw_lyrics.find(p) != -1:
            flag = True
            punct_list.append(p)

    return (flag, punct_list)

In [14]:
check_punctuations(raw_lyrics)

(True, ['!', "'", '(', ')', ',', '-', '/', '?'])

In [62]:
from collections import Counter

def create_lookup_tables(lyrics):
    word_count = Counter(lyrics)
    sorted_word_count = sorted(word_count, key=word_count.get, reverse=True)
    vocab_to_int = {word: idx for idx, word in enumerate(sorted_word_count)}
    int_to_vocab = {idx: word for word, idx in vocab_to_int.items()}
    
    return (vocab_to_int, int_to_vocab)

def create_token_lookup():
    punctuations = ['!', "'", '(', ')', ',', '-', '/', '?', '\n']
    tokens = ['<EXCLAMATION_MARK>', '<SINGLE_QUOTATION_MARK>', '<LEFT_ROUND_BRACKET>', '<RIGHT_ROUND_BRACKET>',
              '<COMMA>', '<HYPHEN>', '<SLASH>', '<QUESTION_MARK>', '<NEW_LINE>']
    
    punct_token = {}
    for p in range(len(punctuations)):
        punct_token[punctuations[p]] = tokens[p]
        
    return punct_token

In [71]:
# preprocess the data
token_lookup = create_token_lookup()
for symbol, token in token_lookup.items():
    raw_lyrics = raw_lyrics.replace(symbol, ' {} '.format(token))

tokenized_lyrics = raw_lyrics.lower()
tokenized_lyrics = tokenized_lyrics.split()
    
vocab_to_int, int_to_vocab = create_lookup_tables(tokenized_lyrics)
encoded_lyrics = [vocab_to_int[word] for word in tokenized_lyrics]

In [64]:
# check GPU availability
import torch

gpu_availability = torch.cuda.is_available()

if gpu_availability:
    print('GPU Available! Training on:', torch.cuda.get_device_name(0))
else:
    print('No GPU found! Training on CPU...')

GPU Available! Training on: GeForce MX150


In [65]:
# batching
from torch.utils.data import TensorDataset, DataLoader

def batch_lyric(lyrics, sequence_length, batch_size):
    features = []
    labels = []
    
    for w in range(len(lyrics)):
        if w+sequence_length < len(lyrics):
            features.append(lyrics[w:w+sequence_length])
            labels.append(lyrics[w+sequence_length])
    
    features = np.array(features)
    labels = np.array(labels)
    
    dataset = TensorDataset(torch.from_numpy(features), torch.from_numpy(labels))
    loader = DataLoader(dataset, shuffle=True, batch_size=batch_size)
    
    return loader

In [76]:
train_loader = batch_lyric(encoded_lyrics, sequence_length=5, batch_size=10)

train_iter = iter(train_loader)
f, l = train_iter.next()
print(f)
print(l)

tensor([[1186,    0,  684,  218,  165],
        [ 115,  178,  296,    0,   92],
        [   0,    0,   10,   96,   10],
        [ 368, 1401, 1402, 1403,  215],
        [1265,  716,   43,    0, 1266],
        [  82,   23,    7,    0,   45],
        [  80,    0,  154,  102,  112],
        [ 193,  475,  136,    0,  231],
        [1207, 1208,    2,   13,    3],
        [   2,  331,   19,    3,    0]], dtype=torch.int32)
tensor([313, 560,  10, 108,  45, 335,   0,  55,   0,  20], dtype=torch.int32)
