In [1]:
import pandas as pd
import numpy as np
import re
import random
import sys

from keras import layers
from keras.models import Sequential
from keras import optimizers

Using TensorFlow backend.


# Preview

In [2]:
# Check for GPU acceleration and open a TensorFlow session.
import tensorflow as tf

# Prints the name of the GPU device TensorFlow can see (empty string if none).
print(tf.test.gpu_device_name())

# Build ONE config and actually hand it to the session. Previously a config with
# allow_growth=True was created and then discarded, because the session was
# constructed from a second, fresh ConfigProto.
config = tf.ConfigProto(log_device_placement=True)
config.gpu_options.allow_growth = True  # allocate GPU memory on demand

# NOTE(review): this session is not registered with Keras anywhere in this
# notebook, so Keras may still create its own default session -- confirm
# whether keras.backend.set_session(sess) is wanted.
sess = tf.Session(config=config)

/device:GPU:0


In [3]:
# Load the pre-cleaned dataset from disk.
import json

# NOTE(review): the cells below index arrays with the elements of
# training_data and call .append on its slices, so it is presumably a flat
# list of small ints rather than a string -- confirm against the JSON file.
with open('clean_dataset.json', 'r') as f:
    training_data = json.load(f)

In [4]:
# print(training_data)

# Training

In [14]:
num_vals = 14 # we use 0 to 13
def vectorizing_seq (text, maxlen, step, n_vals=None):
    """
    Slice a token sequence into fixed-length windows and one-hot encode them.

    Window ``i`` is ``text[i*step : i*step + maxlen]`` and its target is the
    single token that immediately follows it.

    :param text: sequence of integer tokens, each used directly as a one-hot
                 index (so it must lie in ``[0, n_vals)``)
    :type  text: list
    :param maxlen: the length of a sequence to extract as train
    :type  maxlen: int
    :param step: sample a new sequence every n steps
    :type  step: int
    :param n_vals: number of distinct token values (width of the one-hot
                   axis); defaults to the module-level ``num_vals``
    :type  n_vals: int or None
    :returns: (Numpy boolean array of shape
                    (Number of sequences, maxlen, n_vals),
               Numpy boolean array of shape
                    (Number of sequences, n_vals))
    :rtype:   (numpy.ndarray,
               numpy.ndarray)
    """
    if n_vals is None:
        # Fall back to the notebook-wide constant so existing callers keep working.
        n_vals = num_vals

    sentences = [] # hold extracted sequences
    next_chars = [] # hold the next token for each corresponding sequence

    for i in range(0, len(text) - maxlen, step):
        sentences.append(text[i: i + maxlen])
        next_chars.append(text[i + maxlen])

    print('Number of sequences:', len(sentences))
    print('Vectorization on ({}, {}, {}) dims...'.format(len(sentences), maxlen, n_vals))

    # One-hot encode the tokens into boolean arrays. The builtin ``bool`` is
    # used because the ``np.bool`` alias was removed from NumPy (1.24+).
    x = np.zeros((len(sentences), maxlen, n_vals), dtype=bool)
    y = np.zeros((len(sentences), n_vals), dtype=bool)
    for i, sentence in enumerate(sentences):
        for t, val in enumerate(sentence):
            x[i, t, val] = True
        y[i, next_chars[i]] = True

    return x, y

In [6]:
def create_model(x, y, maxlen, epochs):
    """
    Creates and trains a model.

    The network is GRU(256, returning sequences) -> Dropout(0.5) -> GRU(128)
    -> Dropout(0.5) -> Dense(softmax), compiled with categorical
    cross-entropy and RMSprop, then fitted on ``(x, y)`` with batch size 128.

    :param x: Numpy boolean array of shape
                    (Number of sequences, maxlen, number of distinct values)
    :type  x: numpy.ndarray
    :param y: Numpy boolean array of shape
                    (Number of sequences, number of distinct values)
    :type  y: numpy.ndarray
    :param maxlen: the length of a sequence to extract as train
    :type  maxlen: int
    :param epochs: number of training iterations
    :type  epochs: int
    :returns: trained keras model
    :rtype:   keras.engine.sequential.Sequential
    """
    # Take the layer widths from the data itself instead of a notebook global,
    # so the model always matches whatever encoding produced x and y.
    n_features = np.shape(x)[-1]
    n_classes = np.shape(y)[-1]

    model = Sequential()
    model.add(layers.GRU(
        256,
        return_sequences=True,
        input_shape=(maxlen, n_features)
    ))
    model.add(layers.Dropout(0.5))
    model.add(layers.GRU(128))
    model.add(layers.Dropout(0.5))
    # softmax (not relu): categorical_crossentropy and the downstream sampler
    # both expect the output to be a probability distribution over classes.
    model.add(layers.Dense(
        n_classes,
        activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

    # summary() prints the table itself and returns None, so it is not wrapped
    # in print() (which only added a stray "None" line to the output).
    model.summary()
    model.fit(x, y, batch_size=128, epochs=epochs)

    return model

In [7]:
def train_model_from_text(text, maxlen=6, step=12, epochs=10):
    """
    Vectorize a token sequence and train a model on it.

    Thin convenience wrapper: the sequence is windowed and one-hot encoded by
    ``vectorizing_seq`` and the resulting arrays are passed to
    ``create_model``.

    :param text: the full training sequence
    :param maxlen: the length of a sequence to extract as train
    :type  maxlen: int
    :param step: sample a new sequence every n steps
    :type  step: int
    :param epochs: number of training iterations
    :type  epochs: int
    :returns: trained keras model
    :rtype:   keras.engine.sequential.Sequential
    """
    features, targets = vectorizing_seq(text, maxlen, step)
    return create_model(features, targets, maxlen, epochs)

In [8]:
def sample(preds, temperature=1.0):
    """
    Compute new probability distribution based on the temperature and draw
    one index from it. Higher temperature creates more randomness.

    :param preds: numpy array of shape (unique values,) holding non-negative
                  weights; they are re-normalized here, so they need not sum
                  to exactly 1
    :type  preds: numpy.ndarray
    :param temperature: characterizes the entropy of probability distribution;
                        must be > 0
    :type  temperature: float
    :returns: a number 0 to the length of preds - 1
    :rtype:   int
    :raises ValueError: if ``temperature`` is not positive or ``preds`` has no
                        positive entry
    """
    if temperature <= 0:
        raise ValueError('temperature must be positive, got {}'.format(temperature))

    preds = np.asarray(preds).astype('float64')
    if not np.any(preds > 0):
        raise ValueError('preds must contain at least one positive entry')

    # log(0) = -inf is fine here (it becomes probability 0 after exp), so the
    # divide-by-zero warning is silenced rather than leaked to the notebook.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    # Shift by the max before exponentiating. The result is mathematically
    # unchanged, but at low temperatures it stops every term underflowing to 0
    # (which previously produced 0/0 = NaN probabilities).
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    probs = exp_preds / np.sum(exp_preds)

    probas = np.random.multinomial(1, probs, 1)
    return np.argmax(probas)

In [9]:
#The real money generating part of this generator
def text_generate(model, text, maxlen=4, temperature=1.0, textlen=10):
    """
    Generate a token sequence based on a model.

    A random window of ``maxlen`` tokens is taken from ``text`` as the seed.
    The model is then asked ``textlen`` times for the next-token
    distribution, one token is drawn via ``sample`` and the window slides
    forward by one.

    :param model: trained keras model
    :type  model: keras.engine.sequential.Sequential
    :param text: source sequence of integer tokens to pick the seed from; must
                 be longer than ``maxlen``
    :type  text: list
    :param maxlen: length of the window fed to the model
    :type  maxlen: int
    :param temperature: sampling temperature forwarded to ``sample``
    :type  temperature: float
    :param textlen: number of tokens to generate
    :type  textlen: int
    :returns: the seed followed by the generated tokens
              (``maxlen + textlen`` items)
    :rtype:   list
    """

    start_index = random.randint(0, len(text) - maxlen - 1)
    # Keep the sliding window and the output as two SEPARATE lists. They used
    # to alias the same object, so the first generated token ended up in the
    # output twice.
    window = list(text[start_index: start_index + maxlen])
    outp = list(window)
    print('--- Generating with temperature {}'.format(temperature))
    print(outp)

    for _ in range(textlen):
        # One-hot encode the current window as a batch of size 1.
        sampled = np.zeros((1, maxlen, num_vals))
        for t, val in enumerate(window):
            # Values above the last valid index are clamped onto it.
            sampled[0, t, min(num_vals - 1, val)] = 1
        preds = model.predict(sampled, verbose=0)[0]
        # Cast to a plain int so the result prints/serializes cleanly.
        next_val = int(sample(preds, temperature))
        window = window[1:] + [next_val]
        outp.append(next_val)

    return outp


## Sample Training and Text Generation

In [10]:
# Length of each training window, in tokens; reused by the generation cell below.
maxlen = 60

# A new window starts every 6 tokens and training runs for 15 epochs.
model = train_model_from_text(training_data, maxlen=maxlen, step=6, epochs=15)

Number of sequences: 27243
Vectorization on (27243, 60, 14) dims...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_1 (GRU)                  (None, 60, 256)           208128    
_________________________________________________________________
dropout_1 (Dropout)          (None, 60, 256)           0         
_________________________________________________________________
gru_2 (GRU)                  (None, 128)               147840    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 14)                1806      
Total params: 357,774
Trainable params: 357,774
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/

In [11]:
def splitz(seq, cut):
    """
    Split a sequence into groups on a delimiter value.

    Items equal to ``cut`` are dropped and act as separators. Empty groups
    (consecutive, leading or trailing delimiters) are skipped.

    :param seq: iterable of items to split
    :param cut: delimiter value
    :returns: generator yielding each non-empty group as a list
    """
    group = []
    for num in seq:
        if num != cut:
            group.append(num)
        elif group:
            yield group
            group = []
    # Emit whatever is left after the last delimiter; without this the final
    # group was silently lost whenever seq did not end with ``cut``.
    if group:
        yield group

In [12]:
# Generate 60 new tokens from a random seed window of the training data.
outp = text_generate(model, training_data, maxlen=maxlen, temperature=.9, textlen=60)

# Display the result grouped on the value 12
# (presumably the chord-separator token -- confirm against the dataset encoding).
list(splitz(outp, 12))

--- Generating with temperature 0.9
[3, 7, 10, 12, 3, 7, 10, 12, 8, 0, 3, 12, 7, 11, 2, 12, 0, 3, 7, 12, 3, 7, 10, 12, 8, 0, 3, 12, 7, 11, 2, 12, 0, 3, 7, 12, 3, 7, 10, 12, 8, 0, 3, 12, 7, 11, 2, 12, 0, 3, 7, 12, 3, 7, 10, 12, 3, 7, 10, 12]


  from ipykernel import kernelapp as app


[[3, 7, 10],
 [3, 7, 10],
 [8, 0, 3],
 [7, 11, 2],
 [0, 3, 7],
 [3, 7, 10],
 [8, 0, 3],
 [7, 11, 2],
 [0, 3, 7],
 [3, 7, 10],
 [8, 0, 3],
 [7, 11, 2],
 [0, 3, 7],
 [3, 7, 10],
 [3, 7, 10],
 [0, 0, 3, 7],
 [8, 0, 3],
 [10, 2, 5],
 [7, 10, 2],
 [7, 10, 2],
 [3, 7, 10],
 [9, 0, 4],
 [4, 7, 11],
 [7, 11, 2],
 [7, 11, 2],
 [0, 4, 7],
 [0, 4, 7],
 [5, 9, 0],
 [0, 4, 7],
 [7, 11, 2]]

In [13]:
import time

# Put a timestamp in the filename so successive runs do not overwrite each other.
model.save(
    'run_{}.h5'.format(time.strftime("%Y%m%d-%H%M%S"))
)