# CSE-6242 - Team 157 - Group Project

__TODO:__  
1. Remove custom stop words like "CHORUS:" or "INTRO:"  
2. Remove other stop words.

In [1]:
# global assumption panel: every tunable knob for the run lives in this cell

## data gathering
N_DATA_ROWS = 10_000  # row cap for the lyrics reader; -1 retrieves all rows

## embedding
EMBED_STRATEGY = 'DistilBERT'  # one of: 'DistilBERT', 'GloVe'

## modeling - preprocessing
# fraction of the data withheld from training
# NOTE(review): the recorded split sizes suggest this fraction is applied to
# both the validation and the test split -- confirm in preprocessing.create_datasets
VAL_PCT = 0.15
BATCH_SIZE = 32  # larger batches train faster but use more memory

## modeling - architecture
HIDDEN_SIZE = 256
NUM_LAYERS = 2
DROPOUT = 0.2

## modeling - training
LEARNING_RATE = 1e-3
NUM_EPOCHS = 50
PATIENCE = 5  # early-stopping patience handed to the training routine

In [2]:
# packages

## standard library
from collections import Counter

## third-party
import numpy as np
import torch

## project code
from project_code import data_gathering
from embedding import distilbert, glove
from modeling import preprocessing, training
from architectures import simple_rnn

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# more global assumptions

## reproducibility: seed torch's random number generators so weight
## initialisation and dropout are repeatable between runs.
## NOTE(review): numpy / random used inside project modules are not seeded here.
SEED = 42
torch.manual_seed(SEED)

## run on the GPU when one is available, otherwise fall back to the CPU
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device = {DEVICE}')

Device = cpu


## Data Gathering

In [None]:
# load the lyrics corpus through the dask-based reader, with non-English
# filtering, genre resampling and saving all switched on
# NOTE(review): N_DATA_ROWS is not passed to this call (only the retired
# data_gathering.read_lyrics(n_rows = N_DATA_ROWS) variant used it) -- confirm
# how the row count is capped.
lyrics = data_gathering.read_lyrics_dask(exclude_non_english=True, resample_genres=True, save_data=True)

Lyrics Dataset: Shape = (10000, 13)
	Columns = ['title', 'tag', 'artist', 'year', 'views', 'features', 'lyrics', 'id', 'language_cld3', 'language_ft', 'language', 'cleaned_lyrics', 'tokenized_text']


In [24]:
# count of songs per detected language (Counter comes from the packages cell,
# so this cell also runs on a fresh kernel)
# NOTE(review): the recorded output lists non-English rows although the reader
# was called with exclude_non_english = True -- likely stale output; re-run to confirm.
Counter(lyrics['language'])

Counter({'en': 9778, 'fr': 176, nan: 22, 'pt': 15, 'de': 4, 'es': 4, 'zh': 1})

## Embedding

In [5]:
# generate embeddings with the strategy selected by EMBED_STRATEGY
if EMBED_STRATEGY == 'GloVe':
    # GloVe works from the cleaned lyrics column
    lyrics_embed = glove.create_glove_matrix(data = lyrics, target_col = 'cleaned_lyrics')
elif EMBED_STRATEGY == 'DistilBERT':
    # DistilBERT works from the raw lyrics column; embedding is run at twice
    # the training batch size
    lyrics_embed = distilbert.embed_all_lyrics_v2(
        data = lyrics,
        target_col = 'lyrics',
        batch_size = BATCH_SIZE * 2
    )
else:
    # fail fast: without this branch lyrics_embed would stay undefined (or keep
    # a stale value from an earlier run) and only break further down
    raise ValueError(
        f"Unknown EMBED_STRATEGY {EMBED_STRATEGY!r}; expected 'DistilBERT' or 'GloVe'"
    )

100%|██████████| 157/157 [54:08<00:00, 20.69s/it]


DistilBERT Embedded Lyrics: torch.Size([10000, 768])


## Modeling

### Preprocessing

In [23]:
# class-balance check: median number of songs per genre tag
# (Counter and np are imported once in the packages cell at the top)
genres = dict(Counter(lyrics['tag']))
np.median(list(genres.values()))

143.5

In [7]:
# split the embedded lyrics into train / val data loaders and a held-out test
# data set, labelled by genre tag
lyrics_train, lyrics_val, lyrics_test = preprocessing.create_datasets(
    data_embed=lyrics_embed, labels=lyrics['tag'],
    val_pct=VAL_PCT, batch_size=BATCH_SIZE,
)

Train: 219 Batches of Size 32 For Training
Val: 47 Batches of Size 32 For Training
Test: 1500 Records Withheld for Model Evaluation


  X_tensor = torch.tensor(data_embed)


### Training

In [8]:
# build the baseline recurrent classifier (SimpleRNN wrapper configured as a GRU)
n_songs, embed_dim = lyrics_embed.shape

# one output unit per distinct genre tag
n_classes = len(lyrics['tag'].unique())

base_model = simple_rnn.SimpleRNN(
    input_dim=embed_dim, hidden_dim=HIDDEN_SIZE, output_dim=n_classes,
    type='GRU', num_layers=NUM_LAYERS, dropout=DROPOUT,
)
base_model = base_model.to(DEVICE)

# show the module summary as the cell output
base_model

SimpleRNN(
  (rnn): GRU(768, 256, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=512, out_features=6, bias=True)
  (softmax): Softmax(dim=-1)
)

In [9]:
# fit base_model on the training loader, using the validation loader with the
# early-stopping patience from the config cell; progress is printed every epoch
# NOTE(review): the recorded losses sit at ~1.09 from the first epoch onwards.
# The model ends in a Softmax layer, so if nn_training uses CrossEntropyLoss
# (which expects raw logits) the softmax would be applied twice -- verify.
training.nn_training(
    model=base_model,
    train_loader=lyrics_train, val_loader=lyrics_val,
    learning_rate=LEARNING_RATE, num_epochs=NUM_EPOCHS, patience=PATIENCE,
    verbose=True, print_every=1,
)

100%|██████████| 219/219 [00:06<00:00, 33.50it/s]


[1 / 50] Train Loss = 1.1025, Val Loss = 1.0921


100%|██████████| 219/219 [00:04<00:00, 47.68it/s]


[2 / 50] Train Loss = 1.0948, Val Loss = 1.0922


100%|██████████| 219/219 [00:03<00:00, 56.58it/s]


[3 / 50] Train Loss = 1.0948, Val Loss = 1.0923


100%|██████████| 219/219 [00:04<00:00, 48.15it/s]


[4 / 50] Train Loss = 1.0947, Val Loss = 1.0922


100%|██████████| 219/219 [00:04<00:00, 54.13it/s]


[5 / 50] Train Loss = 1.0947, Val Loss = 1.0922


100%|██████████| 219/219 [00:03<00:00, 56.15it/s]


[6 / 50] Train Loss = 1.0947, Val Loss = 1.0922
Early Stopping Triggered. Training Stopped.
	Best Epoch = 0, Best Val Loss = 1.092128852580456
