In [1]:
import pickle
import numpy as np
import collections

from sklearn.model_selection import train_test_split

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD

Using TensorFlow backend.


# Let's explore the data!

In [2]:
# Load the scraped lyrics. Later cells treat the result as a dict keyed by composer
# name whose values are collections.Counter objects of words.
# NOTE(review): pickle.load can execute arbitrary code from the file -- only load a
# pickle that you produced yourself.
with open("composer_lyrics.pkl", "rb") as lyrics_file:
    krithi_words = pickle.load(lyrics_file)

In [3]:
# List the composers, then show for each one how many distinct words were scraped
print(krithi_words.keys())
for composer, scraped_words in krithi_words.items():
    print(composer)
    print(len(list(scraped_words)))

['Thyagaraja', 'Ramadasu', 'ShyamaShaastri', 'Annamayya']
Thyagaraja
2366
Ramadasu
259
ShyamaShaastri
337
Annamayya
165


In [4]:
# Re-count every composer's words case-insensitively (each occurrence is lower-cased
# before counting), then show the ten most frequent words per composer.
grouped_lyrics = {
    "Thyagaraja": collections.Counter(),
    "ShyamaShaastri": collections.Counter(),
    "Ramadasu": collections.Counter(),
    "Annamayya": collections.Counter(),
}

for composer, scraped_counts in krithi_words.items():
    # elements() repeats each word as many times as it was counted
    grouped_lyrics[composer].update(word.lower() for word in scraped_counts.elements())

for composer, lowered_counts in grouped_lyrics.items():
    print("Composer:  %s" % composer)
    print(lowered_counts.most_common(10))

Composer:  Thyagaraja
[(u'rama', 230), (u'shri', 101), (u'ni', 69), (u'manasa', 60), (u'o', 58), (u'na', 33), (u'ne', 33), (u'pahi', 32), (u'mam', 23), (u'daya', 20)]
Composer:  Ramadasu
[(u'rama', 35), (u'rara', 8), (u'shri', 8), (u'o', 8), (u'ni', 8), (u'ho', 7), (u'na', 7), (u'pahi', 6), (u'sitarama', 5), (u'nanu', 5)]
Composer:  Annamayya
[(u'namo', 6), (u'tandanana', 4), (u'vijayeebhava', 3), (u'narayana', 3), (u'bhala', 2), (u'govinda', 2), (u'brahma', 2), (u'twameva', 2), (u'shriman', 2), (u'sharanu', 2)]
Composer:  ShyamaShaastri
[(u'amba', 16), (u'nannu', 13), (u'shri', 11), (u'kamaksi', 11), (u'brova', 10), (u'devi', 9), (u'ninnu', 8), (u'ni', 7), (u'na', 6), (u'brovu', 5)]


# Create data matrix

(See separate script)

In [5]:
# Build the vocabulary: the union of every composer's `top_num` most common
# (lower-cased) words, each mapped to an integer index.
top_num = 20

vocab_list = []
for composer in grouped_lyrics.keys():
    vocab_list.extend(word for (word, count) in grouped_lyrics[composer].most_common(top_num))
    print(composer)
    # running total, duplicates across composers still included
    print(len(vocab_list))

# Words shared between composers collapse into one entry here. The index of each word
# follows the set's iteration order.
# NOTE(review): presumably the separate matrix-building script reads this pickle to lay
# out the columns of X -- regenerate X whenever this mapping changes.
vocab_dict = dict((word, index) for index, word in enumerate(set(vocab_list)))

print(len(vocab_dict))
# Context manager so the pickle is flushed and the handle closed right away
with open("vocab_dict.pkl", "wb") as vocab_file:
    pickle.dump(vocab_dict, vocab_file)

Thyagaraja
20
Ramadasu
40
Annamayya
60
ShyamaShaastri
80
62


In [6]:
# Read the stored feature matrix (X) and label array (Y) from disk
X, Y = np.load("X_62.dat"), np.load("Y_62.dat")

# Number of songs per class label, in ascending label order
unique, counts = np.unique(Y, return_counts=True)
counts

array([710,  79,  56,  35])

In [7]:
# Experiment: discard every song whose feature row is all zeros, i.e. a song that
# contains none of the vocabulary words.

# Row indices (as a plain list) where every entry of X equals zero
row_idxs_to_delete = np.flatnonzero((X == 0).all(axis=1)).tolist()

# Remove the same rows from features and labels so they stay aligned
X_cleaned = np.delete(X, row_idxs_to_delete, axis=0)
Y_cleaned = np.delete(Y, row_idxs_to_delete, axis=0)

unique, counts = np.unique(Y_cleaned, return_counts=True)
print(counts)
# Unpacking requires exactly four classes to remain; counts are in ascending label order
num_T, num_S, num_R, num_A = counts


[449  62  49  17]


In [8]:
# Down-sample the dominant class (Thyagaraja, label 0) to at most 80 songs and keep every
# song of the other composers. The classes are still imbalanced afterwards, only less so.

# Locate the Thyagaraja rows by their label instead of assuming that they occupy the
# first num_T rows of the cleaned matrices.
thyagaraja_rows = np.flatnonzero(np.ravel(Y_cleaned) == 0)

# Never request a negative number of deletions when fewer than 80 rows are available
num_excess = max(thyagaraja_rows.size - 80, 0)

# Dedicated seeded generator: the same subset is drawn on every run and NumPy's global
# random state is left untouched.
subsample_rng = np.random.RandomState(42)
del_idxs = subsample_rng.choice(thyagaraja_rows, num_excess, replace=False)

# np.delete returns new arrays, so X_cleaned / Y_cleaned are not modified
X_balance = np.delete(X_cleaned, del_idxs, axis=0)
Y_balance = np.delete(Y_cleaned, del_idxs, axis=0)
unique, counts = np.unique(Y_balance, return_counts=True)
counts

array([80, 62, 49, 17])

In [9]:
# One-hot encode the integer composer labels in Y_balance: one column per class, 4 in total
Y_balance_encoded = keras.utils.to_categorical(
    Y_balance,
    num_classes=4,
)

X_balance now has shape (example_songs, vocab_size): each row holds one song's word frequencies over the vocabulary dict, which is built from the top 20 words of each composer.

Y_balance_encoded now has shape (example_songs, 4): the one-hot encoded class (composer) labels.

In [21]:
# Hold out 10% of the balanced songs as a test set. The rows are shuffled first, and the
# fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_balance,
    Y_balance_encoded,
    test_size=0.10,
    random_state=42,
    shuffle=True,
)

In [22]:
# Shapes of the splits: train features, train labels, test features, test labels
for split_array in (X_train, Y_train, X_test, Y_test):
    print(split_array.shape)

(187, 62)
(187, 4)
(21, 62)
(21, 4)


In [23]:
# Fully-connected classifier: three ReLU hidden layers followed by a softmax layer that
# outputs one probability per composer.
# The input and output widths are read from the training arrays so the network follows
# the vocabulary size and the number of classes instead of hard-coding them.
num_features = X_train.shape[1]
num_classes = Y_train.shape[1]

model = Sequential()
model.add(Dense(128, input_dim=num_features, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Keep the returned History object (per-epoch metrics) instead of letting its repr
# become the cell output.
history = model.fit(X_train, Y_train, epochs=200, batch_size=1)
              

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epo

Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<keras.callbacks.History at 0x7f3d64fb9d10>

In [24]:
# Score the held-out split; the model was compiled with metrics=['accuracy'], so the
# result is [loss, accuracy]
model.evaluate(x=X_test, y=Y_test)



[3.6913652420043945, 0.61904764175415039]

In [25]:
# Softmax outputs for every test song: one row per song, one column per composer
Y_predictions = model.predict(x=X_test)

In [26]:
# Show the word -> index mapping that is used below to decode feature columns
print(vocab_dict)

{u'pahimam': 0, u'shriman': 1, u'mayamma': 2, u'bhala': 3, u'vandita': 5, u'sita': 6, u'ramayya': 7, u'sitarama': 8, u'shri': 9, u'govinda': 10, u'sharanu': 59, u'dayaradura': 15, u'shringara': 12, u'muddugare': 13, u'brahma': 14, u'purusa': 11, u'vijayeebhava': 16, u'namo': 56, u'pahi': 17, u'brovu': 18, u'kadigina': 30, u'appuleni': 20, u'ni': 21, u'manasa': 22, u'rada': 23, u'ramamayam': 24, u'bhavani': 4, u'velalanu': 19, u'ne': 27, u'ramacandra': 28, u'gati': 37, u'twameva': 31, u'bhavamulona': 29, u'vegame': 33, u'nanu': 34, u'amba': 35, u'dayalo': 36, u'ninnu': 38, u'sharanam': 42, u'narayana': 40, u'ceppave': 41, u'niku': 32, u'nera': 43, u'prabho': 39, u'nannu': 46, u'mam': 47, u'brovumu': 48, u'rara': 49, u'nama': 50, u'brova': 51, u'kamaksi': 52, u'devi': 53, u'tandanana': 57, u'emi': 54, u'o': 55, u'janani': 44, u'nammiti': 25, u'dem': 58, u'na': 26, u'daya': 61, u'rama': 60, u'ho': 45}


In [27]:
# Class index assigned to each composer
composer_id = {
    "Thyagaraja":0,
    "ShyamaShaastri":1,
    "Ramadasu":2,
    "Annamayya":3
    }

# Reverse lookups: feature column -> word, class index -> composer name
flipped_vocab_dict = dict((v, k) for k, v in vocab_dict.items())
flipped_composer_dict = dict((v, k) for k, v in composer_id.items())

# For every test song, list its vocabulary words next to the true and predicted composer
for i in range(X_test.shape[0]):
    words_present = [flipped_vocab_dict[m] for m in range(X_test.shape[1]) if X_test[i, m] > 0]

    # The predicted class is the one with the highest probability. Take argmax on the raw
    # softmax output: rounding first turns any prediction with no probability above 0.5
    # into all zeros, which argmax would then always report as class 0.
    true_class = np.argmax(Y_test[i, :])
    predicted_class = np.argmax(Y_predictions[i, :])

    print("~~~~~~~~~~~~~~~~~~~~~~~~")
    print("Words in Vocabulary were:")
    print(" ".join(words_present))
    print("Correct composer is: %s" % flipped_composer_dict[true_class])
    print("Prediction was: %s" % flipped_composer_dict[predicted_class])
    print("~~~~~~~~~~~~~~~~~~~~~~~~")
    

~~~~~~~~~~~~~~~~~~~~~~~~
Words in Vocabulary were:
pahimam
Correct composer is: ShyamaShaastri
Prediction was: ShyamaShaastri
~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~
Words in Vocabulary were:
ne
Correct composer is: Thyagaraja
Prediction was: Thyagaraja
~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~
Words in Vocabulary were:
manasa
Correct composer is: Thyagaraja
Prediction was: Thyagaraja
~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~
Words in Vocabulary were:
ramayya na gati
Correct composer is: Ramadasu
Prediction was: Thyagaraja
~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~
Words in Vocabulary were:
kamaksi
Correct composer is: ShyamaShaastri
Prediction was: ShyamaShaastri
~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~
Words in Vocabulary were:
rama
Correct composer is: Thyagaraja
Prediction was: Ramadasu
~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~
Words in Vocabulary were:
na o rama
Correct composer is: Ramadasu
Prediction was: Ramadasu
~~~~~~~~~~~~~~~