## Predictive Modeling - Neural network

In [50]:
import re
import numpy as np
import pandas as pd
import json
import sys
from collections import Counter
from keras import layers
from keras.models import Sequential, load_model
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import RMSprop
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
%matplotlib inline

In [51]:
library = "C:\\Users\\liblabs-user\\Desktop\\song-authorship\\data"
laptop = "not yet"
desktop = "C:\\Users\\Sam\\Desktop\\song authorship\\data"

Since lyrics do not have sentences, we will delimit the sentences using the new line character.

In [52]:
def clean_text(text, names):
    remove = ["_", "*", "--", ",", "!", "?"]
    repl = {"\<u\+0092\>": "\'"}
    text = text.lower()
    for x in remove:
        text = text.replace(x, "")
    for x in repl:
        text = re.sub(x, repl[x], text)
    return [x.strip() for x in re.split("\n", text) if x != ""]

def data_generator(robot, nonrobot, s_min, s_max, a_min, a_max,
                   lookback, batch_size=128):

    if s_max is None:
        s_max = len(robot)
    if a_max is None:
        a_max = len(nonrobot)

    si = s_min
    ai = a_min

    while 1:

        follow = np.random.randint(0, 2, batch_size)
        sind = []
        aind = []

        for x in follow:
            # robotic is 0
            if x == 0:
                if si >= s_max:
                    si = s_min
                sind.append(si)
                aind.append(-1)
                si += 1
            else:
                if ai >= a_max:
                    ai = a_min
                aind.append(ai)
                sind.append(-1)
                ai += 1

        samples = []

        # now pad sentences and yield.
        for x in range(len(sind)):
            # sampled non-robotic here
            if sind[x] == -1:
                samples.append(nonrobot[aind[x]])
            else:
                samples.append(robot[sind[x]])

        yield np.array(samples), follow
        
def combine_phrases(n, song):
    """
    n: Number of phrases to combine together
    song: list of phrases to combine
    
    returns: list of combined sentences.
    """
    
    add = []
    for i in range(0, len(song) - n, n):
        add.append(" ".join(song[i: i + n]))
    add.append(" ".join(song[i + n:]))
    print(add)

In [61]:
combine_phrases(3, robotic[214])

["walking on air tonight tonight tonight i'm walking on air tonight tonight tonight i'm walking on air", "you're giving me sweet sweet ecstasy yeah you take me to utopia you're reading me like erotica", "boy you make me feel exotic yeah just when i think i can't take anymore we go deeper and hotter than ever before", "we go higher and higher i feel like i'm already there i'm walking on air tonight", "i'm walking on air i'm walking i'm walking on air tonight i'm walking on air", "i'm walking on air tonight i'm walking on air i'm walking i'm walking on air tonight", "i'm walking on air i'm walking on air this is pure paradise", 'even heaven is jealous of our love yes we make angels cry raining down on us from up above', "just when i think i can't take anymore we go deeper and hotter than ever before we go higher and higher", "i feel like i'm already there i'm walking on air tonight i'm walking on air", "i'm walking i'm walking on air tonight i'm walking on air i'm walking on air tonight"

In [53]:
split = 0.8
number_of_words = 10000

In [59]:
df = pd.read_csv(desktop + "\\Weekly_data_top_week.csv")
#names = "(?<![A-Z])(?<![A-Z][A-z])(?<![A-Z][A-z][A-z])\."
names = "\s"

robotic = [clean_text(x, names) for x in df[df.Songwriter == True].Lyrics]
#for i in range(10):
#    print(robotic[i])
nonrobotic = [clean_text(x, names) for x in df[df.Songwriter == False].Lyrics]

# test train split
rt = np.random.choice(len(robotic),
                      round(len(robotic) * split),
                      replace=False)
rtrain = [y.split(" ") for x in rt for y in robotic[x]]
#print(rtrain[0:10])
rtest = [y.split(" ") for x in set(range(len(robotic))).difference(rt) for y in robotic[x]]

nt = np.random.choice(len(nonrobotic),
                      round(len(nonrobotic) * split),
                      replace=False)
ntrain = [y.split(" ") for x in nt for y in nonrobotic[x]]
ntest = [y.split(" ") for x in set(range(len(nonrobotic))).difference(nt) for y in nonrobotic[x]]

# vectorize sentences
tokenizer = Tokenizer(num_words=number_of_words,
                      filters="'!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'",
                      lower=True, split=' ')
tokenizer.fit_on_texts(rtrain)
tokenizer.fit_on_texts(ntrain)
tokenizer.fit_on_texts(rtest)
tokenizer.fit_on_texts(ntest)

rtrain = pad_sequences(tokenizer.texts_to_sequences(rtrain), 10)
#print()
#print(rtrain[0:10])
rtest = pad_sequences(tokenizer.texts_to_sequences(rtest), 10)
ntrain = pad_sequences(tokenizer.texts_to_sequences(ntrain), 10)
ntest = pad_sequences(tokenizer.texts_to_sequences(ntest), 10)

In [6]:
max(tokenizer.word_index.values())

34027

In [10]:
lookback = 10
batch_size = 64

#stoker = pad_sequences(np.array(json.load(open(sys.argv[1].strip()))), 60)
#austen = pad_sequences(np.array(json.load(open(sys.argv[2].strip()))), 60)

#stoker_train = stoker[:round(len(stoker) * 0.9)]
#stoker_val = stoker[round(len(stoker) * 0.9):]

#austen_train = austen[:round(len(austen) * 0.9)]
#austen_val = austen[round(len(austen) * 0.9):]

train_gen = data_generator(robot=rtrain,
                           nonrobot=ntrain,
                           s_min=0,
                           s_max=len(rtrain),
                           a_min=0,
                           a_max=len(ntrain),
                           lookback=lookback,
                           batch_size=batch_size)

val_gen = data_generator(robot=rtest,
                         nonrobot=ntest,
                         s_min=0,
                         s_max=len(rtest),
                         a_min=0,
                         a_max=len(ntest),
                         lookback=lookback,
                         batch_size=batch_size)

model = Sequential()
model.add(layers.Embedding(number_of_words, 128))
model.add(layers.LSTM(16,
                        activation="relu",
                        recurrent_dropout=0.1,
                        return_sequences=True))
model.add(layers.LSTM(16,
                        activation="relu",
                        recurrent_dropout=0.1,
                        return_sequences=True))
model.add(layers.LSTM(16,
                     activation="relu",
                     recurrent_dropout=0.1))
model.add(layers.Dense(1, activation="sigmoid"))

model.compile(optimizer=RMSprop(),
              loss="binary_crossentropy",
                metrics=["acc"])

filepath = "weights-improvement-{epoch:02d}-{val_acc:.4f}.h5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc',
                             verbose=1, save_best_only=True,
                             mode='max')
callbacks_list = [checkpoint]

history = model.fit_generator(train_gen,
                              steps_per_epoch=200,
                              epochs=100,
                              callbacks=callbacks_list,
                              validation_data=val_gen,
                              validation_steps=200)

#with open("Embedding.dat", "w") as f:
#    porv = load_model("RoboticorNot.h5")
#    json.dump(list([list(x) for x in
#                    porv.layers[0].get_weights()[0].astype(float)]), f)

Epoch 1/100

Epoch 00001: val_acc improved from -inf to 0.50742, saving model to weights-improvement-01-0.5074.h5
Epoch 2/100

Epoch 00002: val_acc improved from 0.50742 to 0.52023, saving model to weights-improvement-02-0.5202.h5
Epoch 3/100

Epoch 00003: val_acc did not improve
Epoch 4/100

Epoch 00004: val_acc improved from 0.52023 to 0.54602, saving model to weights-improvement-04-0.5460.h5
Epoch 5/100

Epoch 00005: val_acc did not improve
Epoch 6/100

Epoch 00006: val_acc did not improve
Epoch 7/100

Epoch 00007: val_acc did not improve
Epoch 8/100

Epoch 00008: val_acc improved from 0.54602 to 0.55320, saving model to weights-improvement-08-0.5532.h5
Epoch 9/100

Epoch 00009: val_acc did not improve
Epoch 10/100

Epoch 00010: val_acc improved from 0.55320 to 0.55422, saving model to weights-improvement-10-0.5542.h5
Epoch 11/100

Epoch 00011: val_acc did not improve
Epoch 12/100

Epoch 00012: val_acc did not improve
Epoch 13/100

Epoch 00013: val_acc did not improve
Epoch 14/100
 

KeyboardInterrupt: 

In [None]:
h = history.history
plt.plot(h["acc"], c = "b")
plt.plot(h["val_acc"], c = "r")