In [35]:
from sklearn.datasets import load_diabetes
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import torch
import nltk
import torch.nn as nn 
import json
import numpy as np
from torch.nn.utils.rnn import pad_sequence
import os

# Fetch the 'punkt' tokenizer data used by nltk.word_tokenize below; a no-op when the
# package is already present.
# NOTE(review): newer NLTK releases may additionally need the 'punkt_tab' resource — confirm
# against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/fjordrunner/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [36]:
# Load the song corpus. Later cells treat it as a dict whose values carry at least
# a 'lyrics' string and a 'tag_list' entry.
# The encoding is pinned to UTF-8 (the JSON standard) so that decoding does not depend on
# the platform's default locale encoding.
with open('songs_training_data.json', 'r', encoding='utf-8') as fp:
    songs_training_data = json.load(fp)

In [37]:
# Sanity check: number of songs loaded from the JSON file.
print(len(songs_training_data))

32028


In [38]:
# Count how often every token occurs across all lyrics.
# Counter.update adds counts in place; the previous `word_counts + Counter(tokens)` built a
# brand-new Counter holding the entire vocabulary for every song, which is quadratic in
# corpus size. Insertion order (and therefore tie-breaking in most_common) is unchanged.
word_counts = Counter()

for song in songs_training_data.values():
    word_counts.update(nltk.word_tokenize(song['lyrics']))

# Keep the 5000 most frequent tokens as (token, count) pairs, most frequent first.
shortened_word_count_info = word_counts.most_common(5000)

In [39]:
# Show only the head of the list: printing all 5000 (token, count) pairs floods the
# notebook output and bloats the saved file.
print(shortened_word_count_info[:20])



In [40]:
# Token -> index lookup: a token's index is its rank in the frequency-sorted vocabulary.
word_to_idx = {
    token: rank for rank, (token, _count) in enumerate(shortened_word_count_info)
}
# Out-of-vocabulary tokens all share one extra index placed right after the vocabulary.
# NOTE(review): the padding value and the embedding padding_idx used further down are also
# len(shortened_word_count_info), so "<UNK>" and padding share an index — confirm intended.
word_to_idx["<UNK>"] = len(shortened_word_count_info)

# Reverse lookup (index -> token); the "<UNK>" entry is carried over by the inversion itself.
idx_to_word = {index: token for token, index in word_to_idx.items()}

In [41]:
# Sanity check: vocabulary size including the "<UNK>" entry.
print(len(word_to_idx))

5001


In [42]:
# Show a handful of entries instead of dumping the whole token -> index mapping,
# which would flood the notebook output.
print(dict(list(word_to_idx.items())[:20]))



In [43]:
# Encode each song's lyrics as a 1-D tensor of vocabulary indices, stored under the song's
# 'tensor' key, and track the longest encoded sequence in max_length.
max_length = 0
unk_idx = word_to_idx["<UNK>"]

for k, v in songs_training_data.items():
    tokens = nltk.word_tokenize(v['lyrics'])
    # Tokens outside the vocabulary map to the shared "<UNK>" index.
    numerical_form = [word_to_idx.get(word, unk_idx) for word in tokens]

    # dtype is explicit because torch.tensor([]) would otherwise infer float32 for a song
    # with no tokens, producing a tensor whose dtype differs from every other song's.
    v['tensor'] = torch.tensor(numerical_form, dtype=torch.long)
    max_length = max(max_length, len(numerical_form))

In [44]:
# Sanity check: length of the truncated vocabulary list (does not include "<UNK>").
print(len(shortened_word_count_info))

5000


In [45]:
# Sanity check: the song count after the encoding step.
print(len(songs_training_data))

32028


In [46]:
# Collect the per-song index tensors in dictionary iteration order.
lyrics = [entry['tensor'] for entry in songs_training_data.values()]
print(len(lyrics))

# Right-pad every sequence to the longest one, producing a (num_songs, max_len) tensor.
# The fill value is len(shortened_word_count_info), the same index assigned to "<UNK>".
pad_index = len(shortened_word_count_info)
lyrics = pad_sequence(lyrics, batch_first=True, padding_value=pad_index)

# One 'tag_list' per song, stacked in the same song order as `lyrics`.
tags = np.array([entry['tag_list'] for entry in songs_training_data.values()])

32028


In [47]:
# Reduce each row of `tags` to the column index of its largest entry.
argmax_labels = np.argmax(tags, axis=1)
# Every index above 12 is shifted down by one.
# NOTE(review): presumably column 12 never wins the argmax, so this closes a gap in the
# label ids — confirm against the tag schema.
tags_updated = np.where(argmax_labels > 12, argmax_labels - 1, argmax_labels)
tags_updated.shape

(32028,)

In [48]:
class trainingdataset(Dataset):
    """Map-style dataset pairing each row of `lyrics` with the matching entry of `tags`.

    Args:
        lyrics: tensor or array-like; indexed along its first dimension.
        tags: tensor or array-like with one entry per row of `lyrics`.

    Both inputs are copied into tensors owned by the dataset. Indexing returns an
    ``(x, y)`` tuple; ``len()`` is the size of the first dimension of `lyrics`.
    """

    def __init__(self, lyrics, tags):
        self.x = self._as_owned_tensor(lyrics)
        self.y = self._as_owned_tensor(tags)
        self.length = self.x.shape[0]

    @staticmethod
    def _as_owned_tensor(data):
        """Return an independent tensor copy of `data`."""
        # torch.tensor(<Tensor>) works but emits a copy-construct UserWarning;
        # detach().clone() is the supported way to copy an existing tensor.
        if isinstance(data, torch.Tensor):
            return data.detach().clone()
        return torch.tensor(data)

    def __getitem__(self, index):
        return (self.x[index], self.y[index])

    def __len__(self):
        return self.length


# Labeled dataset: the padded lyric index tensor paired with the integer class labels.
training_dataset = trainingdataset(lyrics, tags_updated)

  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
class lyric_based_training_dataset(Dataset):
    """Map-style dataset over lyric rows only (no labels).

    Args:
        lyrics: tensor or array-like; indexed along its first dimension.

    The input is copied into a tensor owned by the dataset. Indexing returns a single
    row; ``len()`` is the size of the first dimension of `lyrics`.
    """

    def __init__(self, lyrics):
        # torch.tensor(<Tensor>) works but emits a copy-construct UserWarning;
        # detach().clone() is the supported way to copy an existing tensor.
        if isinstance(lyrics, torch.Tensor):
            self.x = lyrics.detach().clone()
        else:
            self.x = torch.tensor(lyrics)
        self.length = self.x.shape[0]

    def __getitem__(self, index):
        return self.x[index]

    def __len__(self):
        return self.length

# Bound to its own name on purpose: rebinding `training_dataset` here would replace the
# labeled (x, y) dataset, and the later `for x_train, y_train in dataloader` loop would
# then fail on a top-to-bottom run because this dataset yields lyrics only.
lyric_only_training_dataset = lyric_based_training_dataset(lyrics)

In [49]:
# Inspect the dataset: its size and the first (lyrics, label) sample.
len(training_dataset), training_dataset[0]

(32028, (tensor([ 893,    0,  265,  ..., 5000, 5000, 5000]), tensor(6)))

In [50]:
# One song per batch, in dataset order. Shuffling is disabled because the loader is used
# to extract per-song embeddings: with shuffle=True the rows collected in mat_500 / mat_250
# come out in random order and can no longer be matched back to their songs.
dataloader = DataLoader(dataset=training_dataset, shuffle=False, batch_size=1)

In [51]:
from itertools import chain
from sklearn.utils import class_weight

# Labels as a Python list, and the sorted set of distinct label ids.
train_labels = [*tags_updated]
label_ids = np.unique(train_labels)
print(label_ids.size)

# 'balanced' class weights from scikit-learn, converted straight to a float32 tensor.
class_weights = torch.tensor(
    class_weight.compute_class_weight('balanced', classes=label_ids, y=train_labels),
    dtype=torch.float,
)
(class_weights, len(class_weights))

13


(tensor([0.4856, 0.9490, 6.8246, 0.7632, 1.0806, 0.8715, 1.4859, 1.6414, 0.6530,
         1.4200, 2.2541, 1.3310, 0.6080]), 13)

In [52]:
# Show the distinct label ids found in tags_updated.
print(label_ids)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12]


In [53]:
# Sanity check before defining the model: this length is used below as both the padding
# index and (plus one) the embedding table size.
print(len(shortened_word_count_info))

5000


In [54]:
class net(nn.Module):
  """Averaged-embedding classifier: embedding -> mean over dim 1 -> linear + GELU -> linear.

  Args:
    embedding_dim: width of the token embeddings.
    hidden_dim: width of the hidden linear layer.
    padding_idx: embedding row that is zeroed at construction and passed to nn.Embedding
      as its padding_idx.
    output_size: number of output scores.

  The default padding_idx and output_size are read from notebook globals when the class
  is defined. The class and attribute names (l1, l2, relu, l3) are kept unchanged.
  NOTE(review): the model loaded from 'model.pt' is presumably a pickled instance of
  this class, in which case these names must keep matching the saved object — confirm.
  """

  def __init__(self, embedding_dim=500, hidden_dim=250, padding_idx=len(shortened_word_count_info),
               output_size=len(class_weights)):
    super().__init__()

    # Embedding table: one row per vocabulary entry plus the extra row at padding_idx.
    vocab_rows = len(shortened_word_count_info) + 1
    self.l1 = nn.Embedding(num_embeddings=vocab_rows,
                           embedding_dim=embedding_dim,
                           padding_idx=padding_idx)
    nn.init.xavier_uniform_(self.l1.weight)
    # Xavier init overwrote the padding row as well; reset it to zeros.
    with torch.no_grad():
      self.l1.weight[padding_idx].zero_()

    self.l2 = nn.Linear(embedding_dim, hidden_dim)
    nn.init.xavier_uniform_(self.l2.weight)

    # The attribute is called `relu`, but the activation is GELU.
    self.relu = nn.GELU()
    self.l3 = nn.Linear(hidden_dim, output_size)

  def forward(self, x):
    """Return (scores, pooled_embedding, hidden_activation) for a batch of index sequences.

    x is indexed as (batch, seq_len); the three outputs have last dimensions
    output_size, embedding_dim and hidden_dim respectively.
    """
    # (batch, seq_len) -> (batch, seq_len, embedding_dim)
    token_vectors = self.l1(x)
    # Average over dim 1; padded positions are included in the average.
    pooled = token_vectors.mean(dim=1)
    # (batch, embedding_dim) -> (batch, hidden_dim)
    hidden = self.relu(self.l2(pooled))
    # (batch, hidden_dim) -> (batch, output_size)
    scores = self.l3(hidden)

    return scores, pooled, hidden

In [55]:
# Load the saved model onto the CPU.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code — only load
# 'model.pt' from a trusted source. The result is called directly as a module below, so the
# file presumably holds a whole pickled `net` instance; that requires the `net` class to be
# defined before this cell runs, and recent PyTorch versions (weights_only=True by default)
# may refuse such files — confirm against the installed version.
model = torch.load('model.pt', map_location=torch.device('cpu'))

In [56]:
# Run every batch through the model and collect its two intermediate representations:
# mat_500 gets the second return value (mean-pooled embedding) and mat_250 the third
# (hidden activation). The first return value (class scores) is discarded.
mat_500 = []
mat_250 = []

# Inference only: eval mode plus no_grad, so the stored outputs do not each keep an
# autograd graph alive for the lifetime of the lists.
model.eval()
with torch.no_grad():
  for x_train, _y_train in dataloader:
    _, _500_output, _250_output = model(x_train.cpu())
    mat_500.append(_500_output)
    mat_250.append(_250_output)

In [62]:
# Convert the collected tensors to nested Python lists (JSON-serializable).
# Entries that are already lists are left alone, so re-running this cell is harmless
# instead of raising AttributeError on the second run.
mat_500 = [tens.tolist() if torch.is_tensor(tens) else tens for tens in mat_500]

In [65]:
# Convert the collected tensors to nested Python lists (JSON-serializable).
# Entries that are already lists are left alone, so re-running this cell is harmless
# instead of raising AttributeError on the second run.
mat_250 = [tens.tolist() if torch.is_tensor(tens) else tens for tens in mat_250]

In [63]:
# with open('./500_lyric_embeddngs.json', 'w') as fp:
#     json.dump(mat_500, fp)

In [66]:
# with open('./250_lyric_embeddngs.json', 'w') as fp:
#     json.dump(mat_250, fp)

In [58]:
# Sanity check: one collected entry per batch (batch_size is 1, so one per song).
print(len(mat_500))

32028


In [67]:
# Sanity check: one collected entry per batch (batch_size is 1, so one per song).
print(len(mat_250))

32028
