In [103]:
import pandas as pd
import csv
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Lambda, LSTM, Dense, Dropout, Input, Bidirectional
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec
from gensim.test.utils import simple_preprocess
import os
import re
import pickle
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard
from numba import cuda 

In [2]:
# tf.test.is_gpu_available() is deprecated; derive the same boolean from the
# list of physical GPU devices so no deprecation warning is emitted.
gpu_devices = tf.config.list_physical_devices('GPU')
print(bool(gpu_devices))
print(gpu_devices)

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
True
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
def clear_GPU():
    """Reset the CUDA device that numba currently has selected."""
    cuda.get_current_device().reset()

In [3]:
# Switch for the guarded cells in this notebook: when True they rebuild their
# data from the raw inputs and pickle it; when False the cached pickles are loaded.
MODE_PDUMP = False

## Data Loading from lyrics_l.pkl

In [4]:
# Song table; its 'id' column names the lyric files and its 'genre' column
# supplies the classification labels used further down.
fin_fil_df = pd.read_pickle('pickles/fin.pkl')

In [5]:
if MODE_PDUMP:
    # Read one '<id>.txt' lyric file per song id and cache the texts as a pickle.
    directory_path = 'data/music4all_subset/lyrics'
    file_prefixes = fin_fil_df['id'].tolist()
    lyrics_d = {}
    # The list must be created here: on a fresh kernel nothing else defines it
    # before the first append, which would raise NameError.
    lyrics_l = []
    for prefix in file_prefixes:
        file_pattern = prefix + '.txt'
        file_path = os.path.join(directory_path, file_pattern)
        # Ids without a lyric file are skipped, so lyrics_l can end up shorter
        # than fin_fil_df; the later train/test split then fails on the length mismatch.
        if os.path.exists(file_path):
            with open(file_path, 'r', encoding="utf8") as file:
                content = file.read()
                lyrics_d[prefix] = content
                lyrics_l.append(content)

    with open('pickles/lyrics_l.pkl', 'wb') as f:
        pickle.dump(lyrics_l, f)

In [6]:
# Load the cached list of raw lyric strings.
# NOTE: pickle.load can execute code embedded in the file; only load pickles
# that were produced by this project.
with open('pickles/lyrics_l.pkl', 'rb') as f:
    lyrics_l = pickle.load(f)

## Preprocessing

In [7]:
def preprocess_lyrics(lyli):
    """Normalise a collection of lyric strings.

    For every string: characters that are neither word characters nor
    whitespace are removed, the text is lower-cased, newlines become spaces
    and runs of spaces are collapsed into a single space.

    Args:
        lyli: iterable of lyric strings.

    Returns:
        A list with the cleaned strings, in input order.
    """
    def _clean(raw):
        without_punct = re.sub(r'[^\w\s]', '', raw)
        lowered = without_punct.lower()
        single_line = lowered.replace('\n', ' ')
        return re.sub(' +', ' ', single_line)

    return list(map(_clean, lyli))

In [69]:
# Cleaned lyric strings, in the same order as lyrics_l.
prep_lyrics = preprocess_lyrics(lyrics_l)

In [154]:
# Encode the genre strings as integer class ids and hold out 20% of the songs.
encoder = LabelEncoder()
labels = fin_fil_df['genre']
genre_labels = encoder.fit_transform(labels)
print(encoder.classes_)
# A fixed random_state makes the split identical on every run; without it a
# re-run shuffles differently and the reported metrics cannot be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    prep_lyrics, genre_labels, test_size=0.2, random_state=42
)

['alternative rock' 'ambient' 'classic rock' 'electronic' 'folk'
 'indie rock' 'metal' 'pop' 'rap' 'singer-songwriter' 'soul']


In [155]:
if MODE_PDUMP:
    with open('pickles/lyrics_ttsdata.pkl', 'wb') as f:
        pickle.dump((X_train, X_test, y_train, y_test), f)
else:
    # In cached mode the BERT token pickles loaded later were written from the
    # X_train / X_test of the MODE_PDUMP run. Reuse the split saved in that run
    # so features and labels refer to the same rows instead of a fresh shuffle.
    with open('pickles/lyrics_ttsdata.pkl', 'rb') as f:
        X_train, X_test, y_train, y_test = pickle.load(f)

## Lyric Models

In [83]:
def getModelCheckpoint(name):
    """Build a checkpoint callback that keeps only the best model.

    The model is written to 'models/<name>.h5' whenever the monitored
    validation accuracy reaches a new maximum.

    Args:
        name: file stem of the .h5 file inside the 'models' directory.

    Returns:
        A configured ModelCheckpoint callback.
    """
    checkpoint_path = f'models/{name}.h5'
    return ModelCheckpoint(
        filepath=checkpoint_path,
        monitor='val_accuracy',
        mode='max',
        save_best_only=True,
        verbose=1,
    )


# Shared early-stopping callback: watches validation loss with a patience of 3 epochs.
es_cb = EarlyStopping(monitor='val_loss', patience=3, verbose=1)

### ELMo with Dense Net

In [31]:
# Load the ELMo v3 module from TF Hub and keep its 'default' signature as a callable.
elmo = hub.load('https://tfhub.dev/google/elmo/3').signatures['default']

In [12]:
def elmo_vectors(x):
    """Embed a batch of texts with ELMo and return one vector per text.

    Args:
        x: sequence of strings forming one batch.

    Returns:
        The module's "default" output: a mean-pooled sentence vector per
        input string (documented shape [batch_size, 1024]).
    """
    # The "elmo" output is padded to the longest text of the batch, so a plain
    # reduce_mean over the token axis divides every text by that padded length
    # and makes a vector depend on its batch mates. The "default" output is the
    # module's own per-sentence mean pooling and has the same output shape.
    return elmo(tf.constant(x))["default"]

In [None]:
if MODE_PDUMP:
    # Embed the lyrics with ELMo in slices of 100 texts and stack the per-slice
    # results into one matrix per split.
    list_train = [X_train[i:i + 100] for i in range(0, len(X_train), 100)]
    list_test = [X_test[i:i + 100] for i in range(0, len(X_test), 100)]
    elmo_train = [elmo_vectors(x) for x in list_train]
    elmo_test = [elmo_vectors(x) for x in list_test]
    elmo_train_new = np.concatenate(elmo_train, axis = 0)
    elmo_test_new = np.concatenate(elmo_test, axis = 0)
    # NOTE(review): nothing computed above is written to disk in this cell, while
    # the non-PDUMP path reads pickles/elmo0..3.pkl and elmo_test.pkl -- confirm
    # where those files are produced.

    # NOTE(review): this pushes the entire corpus through ELMo in a single call
    # and `embeddings` is not used anywhere else in the visible notebook --
    # confirm whether this pass is still needed.
    with tf.device('/GPU:0'):
        gpu_lyrics = tf.constant(prep_lyrics)
        embeddings = elmo(gpu_lyrics)['elmo']

In [84]:
if not MODE_PDUMP:
    def _read_elmo_pickle(path):
        """Unpickle `path` and concatenate its contents along axis 0."""
        with open(path, 'rb') as f:
            return tf.concat(pickle.load(f), axis=0)

    # Cached ELMo features: four training pieces plus one piece named "test".
    e0 = _read_elmo_pickle('pickles/elmo0.pkl')
    e1 = _read_elmo_pickle('pickles/elmo1.pkl')
    e2 = _read_elmo_pickle('pickles/elmo2.pkl')
    e3 = _read_elmo_pickle('pickles/elmo3.pkl')
    e_test = _read_elmo_pickle('pickles/elmo_test.pkl')

    # NOTE(review): e3 is loaded but is not part of e_train -- confirm this is intended.
    e_train = tf.concat([e0, e1, e2], axis=0)

In [85]:
# Display the assembled ELMo training matrix (shape and dtype are part of the repr).
e_train

<tf.Tensor: shape=(26850, 1024), dtype=float32, numpy=
array([[ 0.10733027, -0.13069776, -0.00867425, ..., -0.1278605 ,
         0.45776153, -0.02518587],
       [-0.04765201, -0.1638587 ,  0.05751887, ...,  0.04533917,
         0.27689737,  0.03030533],
       [ 0.07918552, -0.1052879 ,  0.09017083, ...,  0.01571165,
         0.35622203, -0.05648162],
       ...,
       [-0.03044983, -0.06164612,  0.03256677, ...,  0.03354283,
         0.03826979, -0.0157944 ],
       [ 0.11266568, -0.17386876,  0.07004549, ...,  0.0156843 ,
         0.36772186,  0.00108926],
       [-0.00505059, -0.0863753 ,  0.07504102, ...,  0.05298683,
         0.13353385, -0.05152311]], dtype=float32)>

In [86]:
num_classes = 11

# Feed-forward genre classifier for the ELMo vectors: ReLU layers of
# 1024 -> 512 -> 256 units with dropout after the first two, softmax output.
elmo_dense = tf.keras.Sequential()
elmo_dense.add(Dense(1024, activation='relu'))
elmo_dense.add(Dropout(.5))
elmo_dense.add(Dense(512, activation='relu'))
elmo_dense.add(Dropout(.5))
elmo_dense.add(Dense(256, activation='relu'))
elmo_dense.add(Dense(num_classes, activation='softmax'))

# Integer class ids are used as targets, hence the sparse cross-entropy loss.
chosen_opt = 'adam'
elmo_dense.compile(
    optimizer=chosen_opt,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [87]:
# Train on the first 26850 label rows (the row count of e_train shown above).
# NOTE(review): validation pairs `e_test` with the tail of `y_train`, whereas every
# other model in this notebook validates against `y_test` -- confirm which rows
# `e_test` holds; mismatched labels would explain a validation accuracy that never moves.
elmo_dense.fit(e_train, y_train[:26850], validation_data=(e_test, y_train[26850:]), epochs=10, batch_size=32,
             callbacks=[es_cb, getModelCheckpoint("elmo_dense")])

Epoch 1/10
Epoch 1: val_accuracy improved from -inf to 0.27727, saving model to models\elmo_dense.h5
Epoch 2/10
Epoch 2: val_accuracy did not improve from 0.27727
Epoch 3/10
Epoch 3: val_accuracy did not improve from 0.27727
Epoch 4/10
Epoch 4: val_accuracy did not improve from 0.27727
Epoch 5/10
Epoch 5: val_accuracy did not improve from 0.27727
Epoch 6/10
Epoch 6: val_accuracy did not improve from 0.27727
Epoch 7/10
Epoch 7: val_accuracy did not improve from 0.27727
Epoch 8/10
Epoch 8: val_accuracy did not improve from 0.27727
Epoch 9/10
Epoch 9: val_accuracy did not improve from 0.27727
Epoch 10/10
Epoch 10: val_accuracy did not improve from 0.27727


<keras.callbacks.History at 0x2973e12d910>

### Word2Vec with Dense Net

In [104]:
# Tokenise every training lyric into a list of tokens for Word2Vec.
p_sen = [simple_preprocess(s) for s in X_train]

In [105]:
# Create the model without a corpus: passing sentences to the constructor
# already runs a full training pass with the default epoch count, so the
# explicit train() call below would come on top of it. Building the vocabulary
# separately keeps training to exactly the 30 epochs requested.
w2v = Word2Vec(vector_size=1000, window=20, min_count=5, sg=0)
w2v.build_vocab(p_sen)
w2v.train(p_sen, total_examples=len(p_sen), epochs=30)



(194474675, 265219020)

In [107]:
# One vector per training lyric: the mean of the Word2Vec vectors of its tokens.
w2v_emb_train = []
for s in X_train:
    # Tokens that are not in the Word2Vec vocabulary are skipped.
    wb = [w2v.wv[word] for word in simple_preprocess(s) if word in w2v.wv]
    if wb:
        w2v_emb_train.append(np.mean(wb, axis=0))
    else:
        # With no usable token np.mean returns a scalar NaN, which has the wrong
        # shape for the later np.vstack; fall back to an all-zero vector.
        w2v_emb_train.append(np.zeros(w2v.vector_size, dtype=np.float32))

In [117]:
# One vector per test lyric: the mean of the Word2Vec vectors of its tokens.
w2v_emb_test = []
for s in X_test:
    # Tokens that are not in the Word2Vec vocabulary are skipped.
    wb = [w2v.wv[word] for word in simple_preprocess(s) if word in w2v.wv]
    if wb:
        w2v_emb_test.append(np.mean(wb, axis=0))
    else:
        # With no usable token np.mean returns a scalar NaN, which has the wrong
        # shape for the later np.vstack; fall back to an all-zero vector.
        w2v_emb_test.append(np.zeros(w2v.vector_size, dtype=np.float32))

In [118]:
# Stack the per-lyric vectors into 2-D feature matrices (one row per lyric).
w2v_train = np.vstack(w2v_emb_train)
w2v_test = np.vstack(w2v_emb_test)

In [127]:
num_classes = 11

# Feed-forward genre classifier for the averaged Word2Vec vectors: ReLU layers
# of 1024 -> 512 -> 256 units with dropout after the first two, softmax output.
w2v_dense = tf.keras.Sequential()
w2v_dense.add(Dense(1024, activation='relu', input_dim=w2v.vector_size))
w2v_dense.add(Dropout(.5))
w2v_dense.add(Dense(512, activation='relu'))
w2v_dense.add(Dropout(.5))
w2v_dense.add(Dense(256, activation='relu'))
w2v_dense.add(Dense(num_classes, activation='softmax'))

# Integer class ids are used as targets, hence the sparse cross-entropy loss.
chosen_opt = 'adam'
w2v_dense.compile(
    optimizer=chosen_opt,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [128]:
# Train for at most 30 epochs; es_cb stops early on val_loss and the checkpoint
# callback keeps the weights with the best val_accuracy in models/w2v_dense.h5.
# The held-out test split doubles as the validation set here.
w2v_dense.fit(w2v_train, y_train, validation_data=(w2v_test, y_test), epochs=30, batch_size=32,
             callbacks=[es_cb, getModelCheckpoint("w2v_dense")])

Epoch 1/30
Epoch 1: val_accuracy improved from -inf to 0.38498, saving model to models\w2v_dense.h5
Epoch 2/30
Epoch 2: val_accuracy improved from 0.38498 to 0.39813, saving model to models\w2v_dense.h5
Epoch 3/30
Epoch 3: val_accuracy improved from 0.39813 to 0.40002, saving model to models\w2v_dense.h5
Epoch 4/30
Epoch 4: val_accuracy did not improve from 0.40002
Epoch 5/30
Epoch 5: val_accuracy improved from 0.40002 to 0.40169, saving model to models\w2v_dense.h5
Epoch 6/30
Epoch 6: val_accuracy improved from 0.40169 to 0.40994, saving model to models\w2v_dense.h5
Epoch 7/30
Epoch 7: val_accuracy improved from 0.40994 to 0.41117, saving model to models\w2v_dense.h5
Epoch 8/30
Epoch 8: val_accuracy did not improve from 0.41117
Epoch 9/30
Epoch 9: val_accuracy improved from 0.41117 to 0.41195, saving model to models\w2v_dense.h5
Epoch 10/30
Epoch 10: val_accuracy did not improve from 0.41195
Epoch 11/30
Epoch 11: val_accuracy did not improve from 0.41195
Epoch 12/30
Epoch 12: val_accu

<keras.callbacks.History at 0x29b04f9d760>

In [150]:
# Print the layer-by-layer architecture and parameter counts.
w2v_dense.summary()

Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_60 (Dense)            (None, 1024)              1025024   
                                                                 
 dropout_44 (Dropout)        (None, 1024)              0         
                                                                 
 dense_61 (Dense)            (None, 512)               524800    
                                                                 
 dropout_45 (Dropout)        (None, 512)               0         
                                                                 
 dense_62 (Dense)            (None, 256)               131328    
                                                                 
 dense_63 (Dense)            (None, 11)                2827      
                                                                 
Total params: 1,683,979
Trainable params: 1,683,979
N

### GloVe with Dense Net

In [131]:
# Parse the GloVe text file: each line is a token followed by its vector components.
embed_dict = {}
with open('glove/glove.6B/glove.6B.300d.txt','r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        # First field is the token, the remaining fields are the float32 vector.
        embed_dict[values[0]] = np.asarray(values[1:],'float32')

In [133]:
# One vector per training lyric: the mean of the GloVe vectors of its words.
glove_dim = len(next(iter(embed_dict.values())))
glove_emb_train = []
for s in X_train:
    # Words without a GloVe entry are skipped.
    wb = [embed_dict[word] for word in s.split() if word in embed_dict]
    if wb:
        glove_emb_train.append(np.mean(wb, axis=0))
    else:
        # With no known word np.mean returns a scalar NaN, which has the wrong
        # shape for the later np.vstack; fall back to an all-zero vector.
        glove_emb_train.append(np.zeros(glove_dim, dtype=np.float32))

In [134]:
# One vector per test lyric: the mean of the GloVe vectors of its words.
glove_dim = len(next(iter(embed_dict.values())))
glove_emb_test = []
for s in X_test:
    # Words without a GloVe entry are skipped.
    wb = [embed_dict[word] for word in s.split() if word in embed_dict]
    if wb:
        glove_emb_test.append(np.mean(wb, axis=0))
    else:
        # With no known word np.mean returns a scalar NaN, which has the wrong
        # shape for the later np.vstack; fall back to an all-zero vector.
        glove_emb_test.append(np.zeros(glove_dim, dtype=np.float32))

In [139]:
# Stack the per-lyric vectors into 2-D feature matrices (one row per lyric).
glove_train = np.vstack(glove_emb_train)
glove_test = np.vstack(glove_emb_test)

In [142]:
num_classes = 11

# Feed-forward genre classifier for the averaged 300-d GloVe vectors: ReLU layers
# of 1024 -> 512 -> 256 units with dropout after the first two, softmax output.
glove_dense = tf.keras.Sequential()
glove_dense.add(Dense(1024, activation='relu', input_dim=300))
glove_dense.add(Dropout(.5))
glove_dense.add(Dense(512, activation='relu'))
glove_dense.add(Dropout(.5))
glove_dense.add(Dense(256, activation='relu'))
glove_dense.add(Dense(num_classes, activation='softmax'))

# Integer class ids are used as targets, hence the sparse cross-entropy loss.
chosen_opt = 'adam'
glove_dense.compile(
    optimizer=chosen_opt,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [143]:
# Train for at most 30 epochs; es_cb stops early on val_loss and the checkpoint
# callback keeps the weights with the best val_accuracy in models/glove_dense.h5.
glove_dense.fit(glove_train, y_train, validation_data=(glove_test, y_test), epochs=30, batch_size=32,
             callbacks=[es_cb, getModelCheckpoint("glove_dense")])

Epoch 1/30
Epoch 1: val_accuracy improved from -inf to 0.35366, saving model to models\glove_dense.h5
Epoch 2/30
Epoch 2: val_accuracy improved from 0.35366 to 0.35923, saving model to models\glove_dense.h5
Epoch 3/30
Epoch 3: val_accuracy improved from 0.35923 to 0.37338, saving model to models\glove_dense.h5
Epoch 4/30
Epoch 4: val_accuracy improved from 0.37338 to 0.37394, saving model to models\glove_dense.h5
Epoch 5/30
Epoch 5: val_accuracy did not improve from 0.37394
Epoch 6/30
Epoch 6: val_accuracy improved from 0.37394 to 0.37963, saving model to models\glove_dense.h5
Epoch 7/30
Epoch 7: val_accuracy did not improve from 0.37963
Epoch 8/30
Epoch 8: val_accuracy improved from 0.37963 to 0.38085, saving model to models\glove_dense.h5
Epoch 9/30
Epoch 9: val_accuracy did not improve from 0.38085
Epoch 10/30
Epoch 10: val_accuracy did not improve from 0.38085
Epoch 11/30
Epoch 11: val_accuracy did not improve from 0.38085
Epoch 11: early stopping


<keras.callbacks.History at 0x29b4e6d40a0>

### BERT Tokenizer with Dense Net

In [72]:
if not MODE_PDUMP:
    # Cached path: read the token-id matrices written by an earlier MODE_PDUMP run.
    with open('pickles/bert_tok_train.pkl', 'rb') as f:
        train_tokenized_sentences = pickle.load(f)
    with open('pickles/bert_tok_test.pkl', 'rb') as f:
        test_tokenized_sentences = pickle.load(f)
else:
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # Only the 'input_ids' of the tokenizer output are kept. padding=True pads
    # each call to its own longest sequence, so the two matrices can differ in width.
    train_tokenized_sentences = tokenizer(X_train, padding=True, return_tensors="tf")['input_ids']
    test_tokenized_sentences = tokenizer(X_test, padding=True, return_tensors="tf")['input_ids']

    print(train_tokenized_sentences.shape)

    with open('pickles/bert_tok_train.pkl', 'wb') as f:
        pickle.dump(train_tokenized_sentences, f)
    with open('pickles/bert_tok_test.pkl', 'wb') as f:
        pickle.dump(test_tokenized_sentences, f)

Token indices sequence length is longer than the specified maximum sequence length for this model (822 > 512). Running this sequence through the model will result in indexing errors


(35888, 5184)


In [75]:
# Shape of the test token-id matrix; its width is compared with the training matrix next.
test_tokenized_sentences.shape

TensorShape([8972, 2713])

In [74]:
# Bring the test token-id matrix to the column count of the training matrix so
# both can be fed to the same network.
desired_shape = (test_tokenized_sentences.shape[0], train_tokenized_sentences.shape[1])
print(desired_shape)
num_columns_to_add = desired_shape[1] - test_tokenized_sentences.shape[1]
print(num_columns_to_add)
# A negative difference means the test matrix is wider than the training one;
# tf.zeros cannot build a negative-width block, so in that case the surplus
# columns are cut off instead and no padding is appended.
zeros_to_add = tf.zeros((desired_shape[0], max(num_columns_to_add, 0)), dtype=test_tokenized_sentences.dtype)
resulting_tensor_x_test = tf.concat(
    [test_tokenized_sentences[:, :desired_shape[1]], zeros_to_add], axis=1
)

(8972, 5184)
2471


In [77]:
# Display the width-aligned test matrix to check its shape and trailing zero padding.
resulting_tensor_x_test

<tf.Tensor: shape=(8972, 5184), dtype=int32, numpy=
array([[  101,  1045, 10587, ...,     0,     0,     0],
       [  101,  3280,  8814, ...,     0,     0,     0],
       [  101,  2040,  2097, ...,     0,     0,     0],
       ...,
       [  101,  3564,  2006, ...,     0,     0,     0],
       [  101,  2292,  2033, ...,     0,     0,     0],
       [  101,  2129,  2079, ...,     0,     0,     0]])>

In [80]:
num_classes = 11

# Deep feed-forward classifier on the padded token-id matrix: five ReLU layers
# (2048 -> 1024 -> 512 -> 256 -> 128), each followed by batch normalisation and
# 50% dropout, then a softmax over the genres.
bert_dense = tf.keras.Sequential()
bert_dense.add(Dense(2048, activation='relu'))
bert_dense.add(tf.keras.layers.BatchNormalization())
bert_dense.add(Dropout(.5))
bert_dense.add(Dense(1024, activation='relu'))
bert_dense.add(tf.keras.layers.BatchNormalization())
bert_dense.add(Dropout(.5))
bert_dense.add(Dense(512, activation='relu'))
bert_dense.add(tf.keras.layers.BatchNormalization())
bert_dense.add(Dropout(.5))
bert_dense.add(Dense(256, activation='relu'))
bert_dense.add(tf.keras.layers.BatchNormalization())
bert_dense.add(Dropout(.5))
bert_dense.add(Dense(128, activation='relu'))
bert_dense.add(tf.keras.layers.BatchNormalization())
bert_dense.add(Dropout(.5))
bert_dense.add(Dense(num_classes, activation='softmax'))

# Adam with an explicit learning rate of 1e-4 instead of the optimizer default.
chosen_opt = tf.keras.optimizers.Adam(1e-4)
bert_dense.compile(
    optimizer=chosen_opt,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [81]:
# Train for at most 10 epochs with early stopping and best-val_accuracy checkpointing.
# NOTE(review): the inputs are raw integer token ids fed straight into Dense layers
# (there is no embedding layer or BERT encoder in between) -- confirm this is intended.
bert_dense.fit(train_tokenized_sentences, y_train, validation_data=(resulting_tensor_x_test, y_test), epochs=10, batch_size=32,
               callbacks=[es_cb, getModelCheckpoint("bert_dense")])

Epoch 1/10
Epoch 1: val_accuracy improved from -inf to 0.30105, saving model to models\bert_dense.h5
Epoch 2/10
Epoch 2: val_accuracy improved from 0.30105 to 0.30261, saving model to models\bert_dense.h5
Epoch 3/10
Epoch 3: val_accuracy improved from 0.30261 to 0.30461, saving model to models\bert_dense.h5
Epoch 4/10
Epoch 4: val_accuracy improved from 0.30461 to 0.30885, saving model to models\bert_dense.h5
Epoch 5/10
Epoch 5: val_accuracy improved from 0.30885 to 0.31242, saving model to models\bert_dense.h5
Epoch 6/10
Epoch 6: val_accuracy did not improve from 0.31242
Epoch 7/10
Epoch 7: val_accuracy did not improve from 0.31242
Epoch 8/10
Epoch 8: val_accuracy did not improve from 0.31242
Epoch 9/10
Epoch 9: val_accuracy improved from 0.31242 to 0.31587, saving model to models\bert_dense.h5
Epoch 10/10
Epoch 10: val_accuracy did not improve from 0.31587


<keras.callbacks.History at 0x29933211dc0>

In [65]:
# Loss (l) and accuracy (a) of bert_dense on the width-aligned test matrix.
l, a = bert_dense.evaluate(resulting_tensor_x_test, y_test, verbose=2)
print(f'Test Accuracy: {a}')

281/281 - 1s - loss: 2.0283 - accuracy: 0.3182 - 1s/epoch - 4ms/step
Test Accuracy: 0.31821221113204956


### LSTM Network with Tokenizer

In [17]:
# Word-level vocabulary fitted on the training lyrics only; words not seen
# during fitting are mapped to the '<UNK>' token.
t = Tokenizer(oov_token='<UNK>')
t.fit_on_texts(X_train)
# Register index 0 explicitly; pad_sequences pads with 0 by default.
t.word_index['<PAD>'] = 0

X_train_tok_lstm, X_test_tok_lstm = (
    t.texts_to_sequences(texts) for texts in (X_train, X_test)
)

vocab_size = len(t.word_index)
# Every sequence is padded to the length of the longest training lyric.
maxlen = max(map(len, X_train_tok_lstm))
emb_dim = 300

X_train_tok_lstm = pad_sequences(X_train_tok_lstm, maxlen=maxlen)
X_test_tok_lstm = pad_sequences(X_test_tok_lstm, maxlen=maxlen)

In [21]:
num_classes = 11

# Trainable embedding followed by three stacked LSTMs (256 -> 128 -> 64); only
# the last LSTM returns a single vector, which feeds a small dense head.
tok_lstm = tf.keras.Sequential()
tok_lstm.add(Embedding(vocab_size, emb_dim, input_length=maxlen))
tok_lstm.add(LSTM(256, return_sequences=True, dropout=0.3))
tok_lstm.add(Dropout(0.5))
tok_lstm.add(LSTM(128, return_sequences=True, dropout=0.3))
tok_lstm.add(Dropout(0.5))
tok_lstm.add(LSTM(64, return_sequences=False, dropout=0.3))
tok_lstm.add(Dropout(0.2))
tok_lstm.add(Dense(64, activation='relu'))
tok_lstm.add(Dropout(0.2))
tok_lstm.add(Dense(num_classes, activation='softmax'))

# Integer class ids are used as targets, hence the sparse cross-entropy loss.
chosen_opt = 'adam'
tok_lstm.compile(
    optimizer=chosen_opt,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [22]:
# Train for at most 10 epochs; es_cb stops early on val_loss and the checkpoint
# callback keeps the weights with the best val_accuracy in models/tok_lstm.h5.
tok_lstm.fit(X_train_tok_lstm, y_train, validation_data=(X_test_tok_lstm, y_test), epochs=10, batch_size=32,
             callbacks=[es_cb, getModelCheckpoint("tok_lstm")])

Epoch 1/10
Epoch 1: val_accuracy improved from -inf to 0.27809, saving model to models\tok_lstm.h5
Epoch 2/10
Epoch 2: val_accuracy did not improve from 0.27809
Epoch 3/10
Epoch 3: val_accuracy did not improve from 0.27809
Epoch 4/10
Epoch 4: val_accuracy improved from 0.27809 to 0.28790, saving model to models\tok_lstm.h5
Epoch 5/10
Epoch 5: val_accuracy improved from 0.28790 to 0.34641, saving model to models\tok_lstm.h5
Epoch 6/10
Epoch 6: val_accuracy improved from 0.34641 to 0.35990, saving model to models\tok_lstm.h5
Epoch 7/10
Epoch 7: val_accuracy improved from 0.35990 to 0.36469, saving model to models\tok_lstm.h5
Epoch 8/10
Epoch 8: val_accuracy did not improve from 0.36469
Epoch 9/10
Epoch 9: val_accuracy did not improve from 0.36469
Epoch 10/10
Epoch 10: val_accuracy did not improve from 0.36469
Epoch 10: early stopping


<keras.callbacks.History at 0x21c856941c0>

In [23]:
num_classes = 11

# Same stack as the plain LSTM model, except that the first recurrent layer
# reads the sequence in both directions and concatenates the two outputs.
tok_lstm_bi = tf.keras.Sequential()
tok_lstm_bi.add(Embedding(vocab_size, emb_dim, input_length=maxlen))
tok_lstm_bi.add(Bidirectional(LSTM(256, return_sequences=True, dropout=0.3), merge_mode='concat'))
tok_lstm_bi.add(Dropout(0.5))
tok_lstm_bi.add(LSTM(128, return_sequences=True, dropout=0.3))
tok_lstm_bi.add(Dropout(0.5))
tok_lstm_bi.add(LSTM(64, return_sequences=False, dropout=0.3))
tok_lstm_bi.add(Dropout(0.2))
tok_lstm_bi.add(Dense(64, activation='relu'))
tok_lstm_bi.add(Dropout(0.2))
tok_lstm_bi.add(Dense(num_classes, activation='softmax'))

# Integer class ids are used as targets, hence the sparse cross-entropy loss.
chosen_opt = 'adam'
tok_lstm_bi.compile(
    optimizer=chosen_opt,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [24]:
# Train for at most 10 epochs; es_cb stops early on val_loss and the checkpoint
# callback keeps the weights with the best val_accuracy in models/tok_lstm_bi.h5.
tok_lstm_bi.fit(X_train_tok_lstm, y_train, validation_data=(X_test_tok_lstm, y_test), epochs=10, batch_size=32,
             callbacks=[es_cb, getModelCheckpoint("tok_lstm_bi")])

Epoch 1/10
Epoch 1: val_accuracy improved from -inf to 0.28890, saving model to models\tok_lstm_bi.h5
Epoch 2/10
Epoch 2: val_accuracy improved from 0.28890 to 0.31765, saving model to models\tok_lstm_bi.h5
Epoch 3/10
Epoch 3: val_accuracy improved from 0.31765 to 0.31799, saving model to models\tok_lstm_bi.h5
Epoch 4/10
Epoch 4: val_accuracy improved from 0.31799 to 0.34318, saving model to models\tok_lstm_bi.h5
Epoch 5/10
Epoch 5: val_accuracy did not improve from 0.34318
Epoch 6/10
Epoch 6: val_accuracy improved from 0.34318 to 0.35354, saving model to models\tok_lstm_bi.h5
Epoch 7/10
Epoch 7: val_accuracy did not improve from 0.35354
Epoch 8/10
Epoch 8: val_accuracy did not improve from 0.35354
Epoch 9/10
Epoch 9: val_accuracy did not improve from 0.35354
Epoch 9: early stopping


<keras.callbacks.History at 0x21c8573ac10>