## Predictive Modeling - Neural network

In [1]:
import re
import numpy as np
import pandas as pd
import json
import sys
from keras import layers
from keras.models import Sequential, load_model
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import RMSprop
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [2]:
# Candidate locations of the data directory (presumably one per machine the
# notebook has been run on -- TODO confirm). Only `desktop` is referenced by
# the cells visible in this notebook.
# NOTE(review): these are hard-coded absolute Windows paths; a single
# configurable, relative data directory would make the notebook portable.
library = "C:\\Users\\liblabs-user\\Desktop\\song-authorship\\data"
# Placeholder value: no path has been filled in for this entry yet.
laptop = "not yet"
desktop = "C:\\Users\\Sam\\Desktop\\song authorship\\data"

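Lyrics are not written as conventional sentences, so each song is broken into sentence-like segments by a delimiter. The stated intent is to delimit on the newline character; note, however, that `clean_text` below actually splits on the period-based `names` pattern and collapses newlines into spaces.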

In [3]:
# Fraction of each class's lyrics entries drawn (without replacement) into the
# training set; the remaining entries form the test set.
split = 0.8

In [4]:
def clean_text(text, names):
    """Split a lyrics string into cleaned, non-empty segments.

    Parameters
    ----------
    text : str
        Raw lyrics text.
    names : str or compiled pattern
        Regular expression handed to ``re.split`` as the segment delimiter.

    Returns
    -------
    list of str
        One entry per segment, with the characters ``_ * -- , !`` removed,
        every run of newlines collapsed to a single space, and surrounding
        whitespace stripped. Segments that are empty *after* cleaning are
        dropped, so the result never contains ``""``.
    """
    # Removal order is significant: deleting "_" or "*" first can bring two
    # hyphens together, which the later "--" pass then removes.
    for junk in ("_", "*", "--", ",", "!"):
        text = text.replace(junk, "")

    segments = []
    for piece in re.split(names, text):
        # re.split yields "" for adjacent delimiters and None for capture
        # groups that did not participate in the match; skip both.
        if not piece:
            continue
        cleaned = re.sub("\n+", " ", piece).strip()
        # Filter after cleaning: a whitespace-only piece (e.g. the "\n" that
        # follows a final period) would otherwise survive as "".
        if cleaned:
            segments.append(cleaned)
    return segments

def data_generator(robot, nonrobot, s_min, s_max, a_min, a_max,
                   lookback, batch_size=128):
    """Endlessly yield randomly mixed batches drawn from two sample pools.

    For every slot of a batch a label is drawn uniformly from {0, 1}:
    label 0 takes the next row of ``robot``, label 1 the next row of
    ``nonrobot``. Each pool is walked in order with its own cursor, which
    wraps back to its start index once it reaches its stop index.

    Parameters
    ----------
    robot, nonrobot : indexable
        Sample pools for label 0 and label 1 respectively.
    s_min, s_max : int, int or None
        Start (inclusive) and stop (exclusive) indices used for ``robot``;
        ``s_max=None`` means ``len(robot)``.
    a_min, a_max : int, int or None
        Same as above, for ``nonrobot``.
    lookback : any
        Accepted for interface compatibility; not used by the body.
    batch_size : int
        Number of samples per yielded batch.

    Yields
    ------
    (numpy.ndarray, numpy.ndarray)
        ``np.array`` of the selected samples and the array of 0/1 labels.
    """
    robot_stop = len(robot) if s_max is None else s_max
    nonrobot_stop = len(nonrobot) if a_max is None else a_max

    # Cursors persist across batches so each pool is cycled through in order.
    robot_pos = s_min
    nonrobot_pos = a_min

    while True:
        labels = np.random.randint(0, 2, batch_size)
        batch = []

        for label in labels:
            if label == 0:
                # Label 0 -> take from the robot pool.
                if robot_pos >= robot_stop:
                    robot_pos = s_min
                batch.append(robot[robot_pos])
                robot_pos += 1
            else:
                # Label 1 -> take from the non-robot pool.
                if nonrobot_pos >= nonrobot_stop:
                    nonrobot_pos = a_min
                batch.append(nonrobot[nonrobot_pos])
                nonrobot_pos += 1

        yield np.array(batch), labels

In [15]:
def _split_train_test(items, train_fraction):
    """Randomly partition `items` into (train, test) lists without overlap."""
    train_idx = np.random.choice(len(items),
                                 round(len(items) * train_fraction),
                                 replace=False)
    # Sorted so the held-out order does not depend on set iteration order.
    test_idx = sorted(set(range(len(items))).difference(train_idx))
    return [items[i] for i in train_idx], [items[i] for i in test_idx]


# Fix the global NumPy seed so the train/test partition below (and the batch
# labels later drawn by data_generator via np.random) are repeatable.
seed = 42
np.random.seed(seed)

df = pd.read_csv(desktop + "\\Weekly_data_top_week.csv")

# Segment delimiter: a period, unless it is directly preceded by a capital
# letter optionally followed by one or two letters (abbreviations such as
# "Mr." / "Mrs."). Written as a raw string so that `\.` reaches the regex
# engine without Python's invalid-escape-sequence warning, and with
# [A-Za-z] instead of [A-z], which also matched the characters [ \ ] ^ _ `.
names = r"(?<![A-Z])(?<![A-Z][A-Za-z])(?<![A-Z][A-Za-z][A-Za-z])\."

# One list of cleaned segments per lyrics entry, grouped by the Songwriter flag.
robotic = [clean_text(x, names) for x in df[df.Songwriter == True].Lyrics]
nonrobotic = [clean_text(x, names) for x in df[df.Songwriter == False].Lyrics]

# test train split (robotic first, then nonrobotic, so the RNG is consumed in
# a fixed order)
rtrain_text, rtest_text = _split_train_test(robotic, split)
ntrain_text, ntest_text = _split_train_test(nonrobotic, split)

# vectorize sentences
# NOTE(review): every element handed to the tokenizer is a *list* of segments
# (one list per lyrics entry), not a string. Depending on the Keras version,
# Tokenizer either rejects list elements or treats each list item (a whole
# segment) as a single token -- confirm this is the intended granularity.
# NOTE(review): the vocabulary is fitted on all entries, including those held
# out for testing.
tokenizer = Tokenizer(num_words=5000,
                      filters="'!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'",
                      lower=True, split=' ')
tokenizer.fit_on_texts(robotic)
tokenizer.fit_on_texts(nonrobotic)

# Every sequence is padded/truncated to this many tokens.
max_len = 20

rtrain = pad_sequences(tokenizer.texts_to_sequences(rtrain_text), max_len)
rtest = pad_sequences(tokenizer.texts_to_sequences(rtest_text), max_len)
ntrain = pad_sequences(tokenizer.texts_to_sequences(ntrain_text), max_len)
ntest = pad_sequences(tokenizer.texts_to_sequences(ntest_text), max_len)

In [16]:
# Largest integer index the tokenizer assigned to any token. The Embedding
# layer built further down must have an input dimension of at least this
# value + 1.
max(tokenizer.word_index.values())

2308

In [18]:
# --- Training configuration ---------------------------------------------------
lookback = 10          # forwarded to data_generator, which does not use it
batch_size = 64
steps_per_epoch = 200  # training batches drawn per epoch
validation_steps = 200 # validation batches drawn per epoch
epochs = 50

# Endless batch generators over the padded train / test sequences.
train_gen = data_generator(robot=rtrain,
                           nonrobot=ntrain,
                           s_min=0,
                           s_max=len(rtrain),
                           a_min=0,
                           a_max=len(ntrain),
                           lookback=lookback,
                           batch_size=batch_size)

val_gen = data_generator(robot=rtest,
                         nonrobot=ntest,
                         s_min=0,
                         s_max=len(rtest),
                         a_min=0,
                         a_max=len(ntest),
                         lookback=lookback,
                         batch_size=batch_size)

# Embedding input dimension = largest token index + 1, derived from the
# tokenizer instead of a number copied by hand from an earlier cell's output.
# When the tokenizer caps its vocabulary (num_words), it never emits an index
# >= num_words, so that cap is a sufficient upper bound.
vocab_size = max(tokenizer.word_index.values()) + 1
num_words = getattr(tokenizer, "num_words", None)
if num_words is not None:
    vocab_size = min(vocab_size, num_words)

# Settings shared by all three recurrent layers.
lstm_kwargs = dict(activation="relu", dropout=0.2, recurrent_dropout=0.2)

model = Sequential()
model.add(layers.Embedding(vocab_size, 16))
# The first two LSTM layers return full sequences so the next LSTM can consume
# them; the last one returns only its final output for the classifier.
model.add(layers.LSTM(32, return_sequences=True, **lstm_kwargs))
model.add(layers.LSTM(32, return_sequences=True, **lstm_kwargs))
model.add(layers.LSTM(32, **lstm_kwargs))
# Single sigmoid unit: binary label as produced by data_generator
# (0 = robot pool, 1 = non-robot pool).
model.add(layers.Dense(1, activation="sigmoid"))

model.compile(optimizer=RMSprop(),
              loss="binary_crossentropy",
              metrics=["acc"])

# Save a checkpoint whenever validation accuracy reaches a new maximum.
filepath = "weights-improvement-{epoch:02d}-{val_acc:.4f}.h5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc',
                             verbose=1, save_best_only=True,
                             mode='max')
callbacks_list = [checkpoint]

history = model.fit_generator(train_gen,
                              steps_per_epoch=steps_per_epoch,
                              epochs=epochs,
                              callbacks=callbacks_list,
                              validation_data=val_gen,
                              validation_steps=validation_steps)

# NOTE(review): disabled export of the learned embedding matrix. It loads
# "RoboticorNot.h5", which is not a file name produced by the checkpoint
# callback above -- confirm the intended model file before re-enabling.
#with open("Embedding.dat", "w") as f:
#    porv = load_model("RoboticorNot.h5")
#    json.dump(list([list(x) for x in
#                    porv.layers[0].get_weights()[0].astype(float)]), f)

Epoch 1/50

Epoch 00001: val_acc improved from -inf to 0.49727, saving model to weights-improvement-01-0.4973.h5
Epoch 2/50

Epoch 00002: val_acc improved from 0.49727 to 0.52594, saving model to weights-improvement-02-0.5259.h5
Epoch 3/50

Epoch 00003: val_acc did not improve
Epoch 4/50

Epoch 00004: val_acc did not improve
Epoch 5/50

Epoch 00005: val_acc did not improve
Epoch 6/50

Epoch 00006: val_acc improved from 0.52594 to 0.52867, saving model to weights-improvement-06-0.5287.h5
Epoch 7/50

Epoch 00007: val_acc did not improve
Epoch 8/50

Epoch 00008: val_acc improved from 0.52867 to 0.53914, saving model to weights-improvement-08-0.5391.h5
Epoch 9/50

Epoch 00009: val_acc did not improve
Epoch 10/50

Epoch 00010: val_acc did not improve
Epoch 11/50

Epoch 00011: val_acc did not improve
Epoch 12/50

Epoch 00012: val_acc did not improve
Epoch 13/50

Epoch 00013: val_acc did not improve
Epoch 14/50

Epoch 00014: val_acc did not improve
Epoch 15/50

Epoch 00015: val_acc did not im

KeyboardInterrupt: 