In [1]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [2]:
# Authenticate the Colab user first; the application-default credentials
# obtained afterwards are handed to PyDrive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# Drive client used by the following cell to fetch the training CSV.
drive = GoogleDrive(gauth)

# Generate the Corpus

In [3]:
# Reference the Drive file by its id and save its contents locally as
# train.csv (read by the next cell).
downloaded = drive.CreateFile({'id':"13H3uAYY6DpKO9zJkniPAckyUwzxGubL-"})
downloaded.GetContentFile('train.csv')

In [4]:
import pandas as pd
# Load the downloaded CSV and display its column names.
data = pd.read_csv('train.csv')
data.columns

Index(['Artist', 'Song', 'Genre', 'Language', 'Lyrics'], dtype='object')

In [5]:
# Keep only English songs that have lyrics and belong to one of the three
# target genres, and retain just the columns used downstream.
# A single boolean mask replaces the chain of successive reassignments
# (isin() is already False for a missing Genre, so no separate null check is
# needed), and the explicit .copy() makes the result an independent frame so
# the later `data['Lyrics'] = ...` assignment does not write into a slice.
data = data.loc[
    data['Lyrics'].notnull()
    & (data['Language'] == 'en')
    & data['Genre'].isin(['Rock', 'Pop', 'Metal']),
    ['Genre', 'Lyrics']
].copy()

In [6]:
import re

def preprocess_text(txt):
  """Normalise one lyric to lowercase text over letters, ',', '!' and '?'.

  The value is stringified and lowercased, then three substitutions are
  applied in order, each replacing its match with a single space:
    1. any character outside the kept alphabet,
    2. a single kept character flanked by whitespace (matches do not
       overlap, so one of two adjacent lone characters can survive),
    3. any run of whitespace.
  Leading/trailing spaces are not stripped.
  """
  substitutions = (
      '[^a-zA-Z,!?]',
      r"\s+[a-zA-Z,!?]\s+",
      r'\s+',
  )
  cleaned = str(txt).lower()
  for pattern in substitutions:
    cleaned = re.sub(pattern, ' ', cleaned)
  return cleaned

def getXs(df):
  """Return preprocess_text applied to every value of df['Lyrics'], in row order, as a list."""
  return [preprocess_text(lyric) for lyric in df['Lyrics'].values]

def clean_df(df, min_words=50, max_words=100):
  """Return the rows of `df` whose lyrics have a word count strictly between the bounds.

  Args:
    df: frame with a 'Lyrics' column of strings.
    min_words: exclusive lower bound on the whitespace-separated word count.
    max_words: exclusive upper bound on the word count.

  Returns:
    A new frame (original index labels preserved) with an added 'n_words'
    column. The input frame is left untouched.
  """
  # Work on a copy: writing the helper column into the caller's frame mutates
  # it and, when that frame is itself a filtered slice, triggers pandas'
  # SettingWithCopyWarning.
  counted = df.copy()
  # .str.len() yields NaN for missing lyrics instead of raising like
  # .apply(len); NaN fails both comparisons, so such rows are dropped.
  counted['n_words'] = counted['Lyrics'].str.split().str.len()
  in_range = (counted['n_words'] > min_words) & (counted['n_words'] < max_words)
  return counted[in_range]

In [7]:
# Replace every raw lyric with its normalised form (getXs applies preprocess_text row by row).
data['Lyrics'] = getXs(data)

In [8]:
# Keep only songs whose cleaned lyrics have more than 50 and fewer than 100 words.
data = clean_df(data)

In [9]:
# Number of remaining songs per genre, smallest class first.
data.groupby('Genre').count().sort_values('Lyrics')

Unnamed: 0_level_0,Lyrics,n_words
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1
Metal,1832,1832
Pop,3270,3270
Rock,9231,9231


In [10]:
# Work on a random half of the songs. The fixed seed makes the draw, and every
# corpus length and dataset shape derived from it, repeatable after a kernel
# restart; without it each run trains on a different corpus.
samp = data.sample(frac =.5, random_state=42)
samp.groupby('Genre').count().sort_values('Lyrics')

Unnamed: 0_level_0,Lyrics,n_words
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1
Metal,922,922
Pop,1595,1595
Rock,4649,4649


In [11]:
from collections import Counter

def getText(df, sep=''):
  """Concatenate every value of df['Lyrics'] into one lowercase string.

  Args:
    df: frame with a 'Lyrics' column; each value is passed through str().
    sep: string inserted between consecutive lyrics. The default '' keeps
      the original behaviour of joining songs back to back; pass ' ' to stop
      the last word of one song fusing with the first word of the next.

  Returns:
    The joined, lowercased text ('' for an empty frame).
  """
  # str.join builds the result in one pass instead of growing a string with
  # repeated += inside a loop.
  return sep.join(str(lyric).lower() for lyric in df['Lyrics'].values)

def enumLetters(text):
  """Map each distinct character of `text` to its rank in sorted order (0-based)."""
  alphabet = sorted(set(text))
  return {symbol: position for position, symbol in enumerate(alphabet)}

In [12]:
# Build one lowercase corpus string per genre from the sampled songs.
metal = getText(samp[ samp['Genre']=='Metal' ])
pop   = getText(samp[ samp['Genre']=='Pop' ])
rock  = getText(samp[ samp['Genre']=='Rock' ])
# Corpus sizes in characters.
len(metal), len(pop), len(rock)

(394076, 646907, 1897114)

In [None]:
# Build the character -> index vocabulary from all three corpora. It is used
# to encode pop and rock as well as metal, so deriving it from metal alone
# would raise KeyError in getDataSet for any character metal happens to lack.
hash_letter = enumLetters(metal + pop + rock)
hash_letter

# Create Data Set

In [14]:
# Vocabulary size: number of distinct characters in the mapping.
len(hash_letter)

30

In [16]:
import numpy as np

def getDataSet(text, seq, char_to_int=None):
  """Slice `text` into sliding windows of `seq` characters with their next character.

  Args:
    text: corpus string.
    seq: window length in characters.
    char_to_int: mapping from character to integer code. When omitted, the
      notebook-level `hash_letter` vocabulary is used, as before.

  Returns:
    (X, y): X is a list of len(text) - seq windows, each a list of `seq`
    integer codes; y is the list of codes of the character following each
    window. Both are empty when the text is not longer than `seq`.

  Raises:
    KeyError: if a character of `text` is missing from the mapping.
  """
  if char_to_int is None:
    char_to_int = hash_letter
  n_windows = len(text) - seq
  if n_windows <= 0:
    return [], []
  # Encode every character once, then slice; the windows overlap, so encoding
  # inside the loop would look each character up `seq` times.
  codes = [char_to_int[c] for c in text]
  X = [codes[start:start + seq] for start in range(n_windows)]
  # The target of the window starting at i is the character at i + seq.
  y = codes[seq:]
  return X, y

def transform(dataX, dataY, seq, n_vocab=30):
  """Turn encoded windows and targets into network-ready inputs.

  Args:
    dataX: list of windows, each a list of `seq` integer character codes.
    dataY: list of integer target codes, one per window.
    seq: window length.
    n_vocab: vocabulary size; used both to scale the inputs and as the
      width of the one-hot targets. Defaults to 30, the previously
      hard-coded value.

  Returns:
    (X, y): X is a float array of shape (len(dataX), seq, 1) holding
    code / n_vocab; y is a DataFrame with exactly `n_vocab` one-hot columns
    labelled 0..n_vocab-1.

  Raises:
    ValueError: if a target code lies outside 0..n_vocab-1.
  """
  n = len(dataX)
  X = np.reshape(dataX, (n, seq, 1))
  X = X / float(n_vocab)

  targets = np.asarray(dataY)
  if targets.size and (targets.min() < 0 or targets.max() >= n_vocab):
    raise ValueError("target codes must lie in [0, %d)" % n_vocab)
  # Fix the category set so every class gets a column even when it never
  # occurs as a target; plain get_dummies(dataY) only emits columns for the
  # values present, which would make y narrower than the softmax layer.
  y = pd.get_dummies(pd.Categorical(dataY, categories=np.arange(n_vocab)))
  return X, y

In [None]:
# Metal: sliding windows of 100 characters, each paired with the next character.
# NOTE(review): pop and rock use 128-character windows — confirm the difference is intended.
X1, y1 = getDataSet(metal, 100)

In [None]:
# Reshape and scale the metal windows, one-hot encode the targets, then show both shapes.
X1, y1 = transform(X1, y1, 100)
X1.shape, y1.shape

((385790, 100, 1), (385790, 30))

In [17]:
# Pop: sliding windows of 128 characters, each paired with the next character.
X2, y2 = getDataSet(pop, 128)

In [18]:
# Reshape and scale the pop windows, one-hot encode the targets, then show both shapes.
X2, y2 = transform(X2, y2, 128)
X2.shape, y2.shape

((646779, 128, 1), (646779, 30))

In [19]:
# Rock: sliding windows of 128 characters, each paired with the next character.
X3, y3 = getDataSet(rock, 128)

In [20]:
# Reshape and scale the rock windows, one-hot encode the targets, then show both shapes.
X3, y3 = transform(X3, y3, 128)
X3.shape, y3.shape

((1896986, 128, 1), (1896986, 30))

# General Functions

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM

def create_model(shape1, shape2, shape3):
  """Build and compile a single-layer LSTM next-character classifier.

  Args:
    shape1: number of time steps in each input window.
    shape2: number of features per time step.
    shape3: number of output classes (width of the softmax layer).

  Returns:
    A compiled Sequential model: LSTM(256) -> Dropout(0.2) -> Dense softmax,
    trained with categorical cross-entropy and the Adam optimizer.
  """
  model = Sequential()
  model.add(LSTM(256, input_shape=(shape1, shape2)))
  model.add(Dropout(0.2))
  # The output width now follows shape3, as in create_model_complex; it was
  # hard-coded to 30, which silently ignored the argument. Every call in this
  # notebook passes y.shape[1], so a 30-column target gives the same model.
  model.add(Dense(shape3, activation='softmax'))
  model.compile(loss='categorical_crossentropy', optimizer='adam')
  return model

def create_model_complex(shape1, shape2, shape3):
  """Build and compile a two-layer stacked LSTM next-character classifier.

  Args:
    shape1: number of time steps in each input window.
    shape2: number of features per time step.
    shape3: number of output classes (width of the softmax layer).

  Returns:
    A compiled Sequential model using categorical cross-entropy and Adam.
  """
  layers = [
      # The first LSTM returns the full sequence so the second can consume it.
      LSTM(256, input_shape=(shape1, shape2), return_sequences=True),
      Dropout(0.2),
      LSTM(256),
      Dropout(0.2),
      Dense(shape3, activation='softmax'),
  ]
  stacked = Sequential(layers)
  stacked.compile(loss='categorical_crossentropy', optimizer='adam')
  return stacked

# Lyrics Training: Metal

In [None]:
# Window length, features per step, and number of target columns for the metal set.
X1.shape[1], X1.shape[2], y1.shape[1]

(100, 1, 30)

In [None]:
# Build the single-LSTM model sized from the metal arrays and print its layer summary.
model1 = create_model(X1.shape[1], X1.shape[2], y1.shape[1])
model1.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_9 (LSTM)                (None, 256)               264192    
_________________________________________________________________
dropout_9 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 30)                7710      
Total params: 271,902
Trainable params: 271,902
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train on the metal windows for 10 epochs with batches of 100 samples.
model1.fit(X1, y1, epochs=10, batch_size=100)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f6a5bdb6150>

In [None]:
# Save the trained metal model to metal.h5 in the runtime's working directory.
model1.save('metal.h5')

In [None]:
from google.colab import files
# Send the saved metal model from the Colab runtime to the browser as a download.
files.download('metal.h5')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Lyrics Training: Pop

In [22]:
# Window length, features per step, and number of target columns for the pop set.
X2.shape[1], X2.shape[2], y2.shape[1]

(128, 1, 30)

In [23]:
# Build the single-LSTM model sized from the pop arrays and print its layer summary.
model2 = create_model(X2.shape[1], X2.shape[2], y2.shape[1])
model2.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 256)               264192    
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 30)                7710      
Total params: 271,902
Trainable params: 271,902
Non-trainable params: 0
_________________________________________________________________


In [24]:
# Train on the pop windows for 10 epochs with batches of 100 samples.
model2.fit(X2, y2, epochs=10, batch_size=100)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f5ee2459d50>

In [25]:
# Save the trained pop model to pop.h5 in the runtime's working directory.
model2.save('pop.h5')

In [26]:
from google.colab import files
# Send the saved pop model from the Colab runtime to the browser as a download.
files.download('pop.h5')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Lyrics Training: Rock

In [None]:
# Window length, features per step, and number of target columns for the rock set.
X3.shape[1], X3.shape[2], y3.shape[1]

In [None]:
# Build the single-LSTM model sized from the rock arrays and print its layer summary.
model3 = create_model(X3.shape[1], X3.shape[2], y3.shape[1])
model3.summary()

In [None]:
# Train on the rock windows for 10 epochs with batches of 100 samples.
model3.fit(X3, y3, epochs=10, batch_size=100)

In [None]:
# Save the trained rock model to rock.h5 in the runtime's working directory.
model3.save('rock.h5')

In [None]:
from google.colab import files
# Send the saved rock model from the Colab runtime to the browser as a download.
files.download('rock.h5')