## 1.- Read the files from Google Drive

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [None]:
# Authenticate the Colab user, then build a PyDrive client (`drive`) from the
# application-default credentials; later cells use `drive` to download files.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
# Download the train and test CSVs (addressed by Drive file id) to local files.
for drive_file_id, local_csv in (
    ("1JBkradqUL3LC5AFIYHyqJqUWxTz4WwBF", 'train_new.csv'),
    ("1MJkjBEICSZcXnFjXvZQx1LgOuJCsVokI", 'test_new.csv'),
):
  downloaded = drive.CreateFile({'id': drive_file_id})
  downloaded.GetContentFile(local_csv)

## 2.- Prepare Data

In [None]:
import pandas as pd
import numpy as np

# Load the two CSVs written by the download cell above. The `lyric_token`
# column holds each token list as text; it is parsed into lists further down.
df_train = pd.read_csv('train_new.csv')
df_test  = pd.read_csv('test_new.csv')

In [None]:
df_train.head()

Unnamed: 0,n_token,n_words,genre,lyric_token
0,124,88,0,"[182, 10911, 80592, 10113, 10614, 10437, 11675..."
1,90,65,0,"[110694, 26561, 50172, 10121, 14444, 110611, 1..."
2,142,102,0,"[186, 24109, 10317, 10104, 10614, 12238, 25190..."
3,86,60,0,"[14444, 59599, 10133, 10690, 90880, 10129, 522..."
4,171,134,0,"[10183, 12058, 50339, 10499, 17025, 67099, 101..."


In [None]:
df_test.head()

Unnamed: 0,n_token,n_words,genre,lyric_token
0,80,39,0,"[164, 75151, 10731, 166, 80592, 10113, 12556, ..."
1,58,28,0,"[164, 75151, 10731, 166, 110611, 14541, 10280,..."
2,82,39,0,"[164, 75151, 10731, 166, 24109, 10317, 25190, ..."
3,50,25,0,"[164, 75151, 10731, 166, 10690, 90880, 10129, ..."
4,89,50,0,"[164, 75151, 10731, 166, 50339, 10499, 17025, ..."


In [None]:
df_train.groupby('genre').count()

Unnamed: 0_level_0,n_token,n_words,lyric_token
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,8373,8373,8373
1,8373,8373,8373
2,8373,8373,8373
3,8373,8373,8373
4,8373,8373,8373
5,8373,8373,8373


In [None]:
def str2list(lyrics):
  """Parse stringified token lists into real Python lists.

  Args:
    lyrics: iterable of strings, each the literal text of a list of token
      ids (for example "[182, 10911]"), as stored in the `lyric_token` column.

  Returns:
    A list with one parsed value per input string, in input order.

  Raises:
    ValueError or SyntaxError: if an entry is not a valid Python literal.
  """
  # Local import keeps this cell runnable on its own.
  import ast

  # literal_eval only accepts Python literals, so a crafted CSV cell cannot
  # execute arbitrary code the way a plain eval() would.
  return [ast.literal_eval(lyric) for lyric in lyrics]

In [None]:
# Add a `tokens` column holding the parsed (list) form of `lyric_token`.
# Note: this mutates df_test / df_train in place.
df_test['tokens']  = str2list(df_test['lyric_token'].values)
df_train['tokens'] = str2list(df_train['lyric_token'].values)

In [None]:
df_train.shape, df_test.shape

((50238, 5), (4982, 5))

In [None]:
df_test['tokens']

1500

# Prepare Data for TensorFlow (padded batches)

In [None]:
import tensorflow as tf

def token2data(lyric_token, y, batch_size=32):
  """Build a length-sorted, padded and batched dataset of (tokens, label) pairs.

  Args:
    lyric_token: sequence of token-id sequences, one per lyric.
    y: labels indexable by position, aligned with `lyric_token`.
    batch_size: number of examples per batch. Defaults to 32, the value that
      was previously hard-coded.

  Returns:
    A tf.data.Dataset yielding (tokens, labels) batches. Within each batch the
    token sequences are padded to a common length; labels are scalars.

  Note:
    Examples are emitted in ascending length order and are not shuffled.
  """
  labelled_lyrics = [(lyric, y[i]) for i, lyric in enumerate(lyric_token)]
  # Grouping lyrics of similar length keeps per-batch padding small.
  # sorted() is stable, so equal-length lyrics keep their original order.
  sorted_lyrics_labels = sorted(labelled_lyrics, key=lambda pair: len(pair[0]))
  processed_dataset = tf.data.Dataset.from_generator(lambda: sorted_lyrics_labels, output_types=(tf.int64, tf.int64))
  return processed_dataset.padded_batch(batch_size, padded_shapes=((None, ), ()))

In [None]:
# Build the batched datasets: parsed token lists as inputs, integer `genre` as labels.
train_data = token2data(df_train['tokens'].values, df_train['genre'].values)
test_data  = token2data(df_test['tokens'].values,  df_test['genre'].values)

In [None]:
train_data, test_data

(<PaddedBatchDataset shapes: ((None, None), (None,)), types: (tf.int64, tf.int64)>,
 <PaddedBatchDataset shapes: ((None, None), (None,)), types: (tf.int64, tf.int64)>)

In [None]:
next(iter(train_data))

NameError: ignored

# CNN Model


In [None]:
from tensorflow.keras import layers

class TEXT_MODEL(tf.keras.Model):
    """Multi-kernel 1D-CNN text classifier operating on token ids.

    Token ids are embedded, passed through three parallel Conv1D branches
    (kernel sizes 2, 3 and 4), each branch is global-max-pooled, and the pooled
    features are concatenated and fed through a dense layer, dropout and an
    output layer.

    Args:
        vocabulary_size: number of distinct token ids the embedding accepts.
        embedding_dimensions: width of each embedding vector.
        cnn_filters: number of filters in each of the three Conv1D branches.
        dnn_units: units in the hidden dense layer.
        model_output_classes: number of target classes. With 2 the output is a
            single sigmoid unit; otherwise a softmax over that many units.
        dropout_rate: dropout rate applied after the hidden dense layer.
        training: unused by the constructor; kept only so existing callers
            that pass it keep working.
        name: Keras model name.
    """

    def __init__(self,
                 vocabulary_size,
                 embedding_dimensions=128,
                 cnn_filters=50,
                 dnn_units=512,
                 model_output_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="text_model"):
        super(TEXT_MODEL, self).__init__(name=name)

        # NOTE(review): input_length=1500 is kept as-is; the tf.data pipeline in
        # this notebook pads per batch, so inputs are not necessarily 1500 long.
        self.embedding = layers.Embedding(vocabulary_size,
                                          embedding_dimensions,
                                          input_length=1500)
        # Three parallel branches that differ only in kernel size (2, 3, 4).
        # With padding="valid", inputs shorter than the kernel are not supported.
        self.cnn_layer1 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=2,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer2 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=3,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer3 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=4,
                                        padding="valid",
                                        activation="relu")
        # A single stateless pooling layer is shared by all three branches.
        self.pool = layers.GlobalMaxPool1D()

        self.dense_1 = layers.Dense(units=dnn_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if model_output_classes == 2:
            self.last_dense = layers.Dense(units=1, activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=model_output_classes, activation="softmax")

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: batch of token-id sequences.
            training: whether dropout should be active. Defaults to None so
                Keras can decide (fit vs. evaluate/predict), and so the model
                can also be called directly without passing the flag.

        Returns:
            The output of the final dense layer for each example in the batch.
        """
        embedded = self.embedding(inputs)
        pooled = [
            self.pool(conv(embedded))
            for conv in (self.cnn_layer1, self.cnn_layer2, self.cnn_layer3)
        ]

        concatenated = tf.concat(pooled, axis=-1)  # (batch_size, 3 * cnn_filters)
        hidden = self.dense_1(concatenated)
        hidden = self.dropout(hidden, training=training)
        return self.last_dense(hidden)

In [None]:
# Instantiate the CNN with a 6-way softmax output (model_output_classes=6).
# vocabulary_size matches the MAX_NB_WORDS constant used in the RNN section.
# NOTE(review): embedding_dimensions=758 is an unusual width — confirm it is
# intentional and not a typo (e.g. for 768).
text_model = TEXT_MODEL(vocabulary_size      = 119547, 
                        embedding_dimensions = 758,
                        cnn_filters          = 100,
                        dnn_units            = 256,
                        model_output_classes = 6,
                        dropout_rate         = 0.2)

In [None]:
# "Sparse" loss/metric: the datasets supply integer class ids (the `genre`
# column) as labels rather than one-hot vectors.
text_model.compile(loss="sparse_categorical_crossentropy",
                   optimizer="adam",
                   metrics=["sparse_categorical_accuracy"])

In [None]:
# NOTE(review): token2data sorts examples by length and never shuffles, so
# batches arrive in the same length-ordered sequence every epoch — confirm
# this is intended, since early-epoch metrics may not be representative.
text_model.fit(train_data, epochs=5)

Epoch 1/5
     82/Unknown - 458s 6s/step - loss: 0.2751 - sparse_categorical_accuracy: 0.9601

KeyboardInterrupt: ignored

In [None]:
text_model.summary()

Model: "text_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  23909400  
_________________________________________________________________
conv1d (Conv1D)              multiple                  40100     
_________________________________________________________________
conv1d_1 (Conv1D)            multiple                  60100     
_________________________________________________________________
conv1d_2 (Conv1D)            multiple                  80100     
_________________________________________________________________
global_max_pooling1d (Global multiple                  0         
_________________________________________________________________
dense (Dense)                multiple                  77056     
_________________________________________________________________
dropout (Dropout)            multiple                  0

In [None]:
# Evaluate the CNN on the held-out test batches; evaluate() returns the loss
# followed by the compiled metric (sparse categorical accuracy).
loss, accuracy = text_model.evaluate(test_data)
print(f'Loss:     {loss}')
print(f'Accuracy: {accuracy}')

Loss:     21.57598876953125
Accuracy: 0.28281813859939575


In [None]:
# Same evaluation as the previous cell, repeated; the result reflects
# whatever state `text_model` is in when this cell is run.
loss, accuracy = text_model.evaluate(test_data)
print(f'Loss:     {loss}')
print(f'Accuracy: {accuracy}')

Loss:     2.2644307613372803
Accuracy: 0.4191087782382965


In [None]:
# Export the CNN to the `cnn_task1_4/` directory (saved_model.pb + variables/).
text_model.save('cnn_task1_4')

INFO:tensorflow:Assets written to: cnn_task1_4/assets


In [None]:
ls -R cnn_task1_4/

cnn_task1_4/:
[0m[01;34massets[0m/  saved_model.pb  [01;34mvariables[0m/

cnn_task1_4/assets:

cnn_task1_4/variables:
variables.data-00000-of-00001  variables.index


In [None]:
#!mkdir -p saved_model
from google.colab import files
# Trigger a browser download for each file of the exported CNN model.
for saved_model_part in (
    'cnn_task1_4/saved_model.pb',
    'cnn_task1_4/variables/variables.index',
    'cnn_task1_4/variables/variables.data-00000-of-00001',
):
  files.download(saved_model_part)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# RNN

In [None]:
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, add, concatenate
from tensorflow.keras.layers import LSTM
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.preprocessing.sequence import pad_sequences

In [None]:
MAX_NB_WORDS = 119547       # Vocabulary size passed to the Embedding layer (number of distinct token ids).
MAX_SEQUENCE_LENGTH = 250   # Every lyric is padded/truncated to this many tokens by pad_sequences.
EMBEDDING_DIM = 100         # Width of each learned embedding vector.

In [None]:
def getXseq(df):
  """Turn a frame's `lyric_token` column into a fixed-width integer matrix.

  Args:
    df: DataFrame whose `lyric_token` column holds each token list as text
      (for example "[182, 10911]").

  Returns:
    The result of pad_sequences over the parsed lists: one row per lyric,
    each padded or truncated to MAX_SEQUENCE_LENGTH using pad_sequences'
    default padding/truncation settings.

  Raises:
    ValueError or SyntaxError: if a cell is not a valid Python literal.
  """
  # Local import keeps this cell runnable on its own.
  import ast

  # literal_eval parses literals only, so CSV content cannot execute code
  # the way a plain eval() would.
  lyrics_data = [ast.literal_eval(lyrics) for lyrics in df['lyric_token']]
  return pad_sequences(lyrics_data, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
# Training inputs (padded token matrix) and one-hot genre labels, as needed
# by the categorical_crossentropy loss used for text_model2.
X = getXseq(df_train)
y = pd.get_dummies(df_train['genre']).values
X.shape, y.shape

((50202, 250), (50202, 6))

In [None]:
# Test inputs and one-hot labels, built the same way as the training arrays.
# NOTE(review): get_dummies is applied to the test frame independently, so its
# columns line up with the training encoding only if both frames contain the
# same set of genre values — confirm.
X_test = getXseq(df_test)
y_test = pd.get_dummies(df_test['genre']).values
X_test.shape, y_test.shape

((4982, 250), (4982, 6))

In [None]:
# LSTM genre classifier: embedding -> spatial dropout -> LSTM -> 6-way softmax.
text_model2 = Sequential([
    Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(6, activation='softmax'),
])

# Labels are one-hot encoded, hence the non-sparse categorical loss.
text_model2.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# No validation data is passed to fit(), so a 'val_loss' metric never exists
# and an EarlyStopping callback monitoring it could never trigger. Monitor the
# training loss instead so the callback is actually effective.
# NOTE(review): validation_split is left disabled on purpose. Keras takes the
# split from the END of X/y before shuffling, and the training rows look
# ordered by genre (presumably — verify), which would give a skewed split.
text_model2.fit(X, y,
                epochs=5,
                batch_size=64,
                callbacks=[EarlyStopping(monitor='loss', patience=3, min_delta=0.0001)])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f52f9366ed0>

In [None]:
text_model2.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 250, 100)          11954700  
_________________________________________________________________
spatial_dropout1d_3 (Spatial (None, 250, 100)          0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 606       
Total params: 12,035,706
Trainable params: 12,035,706
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Evaluate the LSTM on the padded test matrix; evaluate() returns the loss
# followed by the compiled 'accuracy' metric.
loss, accuracy = text_model2.evaluate(X_test, y_test)
print(f'Loss:     {loss}')
print(f'Accuracy: {accuracy}')

Loss:     1.6244009733200073
Accuracy: 0.3727418780326843


In [None]:
# Save the LSTM model to a single file, `rnn_task1.h5`, which the next cell downloads.
text_model2.save('rnn_task1.h5')

In [None]:
from google.colab import files
files.download('rnn_task1.h5')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>