# CSE-6242 - Team 157 - Group Project

__TODO:__  
1. Remove lyric-specific section markers such as "CHORUS:" or "INTRO:" (custom stop words).  
2. Remove general-purpose stop words.

In [1]:
# Global assumption panel: the tunable settings read by the cells below.

# --- data gathering ---
N_DATA_ROWS = 10_000  # number of rows to retrieve; -1 retrieves all rows

# --- embedding ---
EMBED_STRATEGY = 'DistilBERT'  # one of: 'DistilBERT', 'GloVe'

# --- modeling: preprocessing ---
VAL_PCT = 0.15   # share of the data withheld from training (passed to create_datasets as val_pct)
BATCH_SIZE = 32  # larger values train faster but use more memory

# --- modeling: architecture ---
HIDDEN_SIZE = 256
NUM_LAYERS = 2
DROPOUT = 0.2

# --- modeling: training ---
LEARNING_RATE = 1e-3
NUM_EPOCHS = 50
PATIENCE = 5

In [2]:
# packages

## standard library
from collections import Counter

## numerical
import numpy as np

## torch
import torch

## project code
from project_code import data_gathering
from embedding import distilbert, glove
from modeling import preprocessing, training
from architectures import simple_rnn

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Compute device: use CUDA when torch reports it is available, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
print(f'Device = {DEVICE}')

Device = cpu


## Data Gathering

In [4]:
# Load the lyrics and apply the basic cleaning steps via read_lyrics_dask:
# 250MB blocks, non-English exclusion on, genre resampling on, save_data on.
# Alternative reader kept for reference:
# lyrics = data_gathering.read_lyrics(n_rows=N_DATA_ROWS)
# NOTE(review): the saved output of this cell is a ValueError raised by read_csv
# ("Integer column has NA values in column 3"). Presumably the reader needs a
# float or nullable-integer dtype for that column; confirm in data_gathering.
lyrics = data_gathering.read_lyrics_dask(
    block_size='250MB',
    exclude_non_english=True,
    resample_genres=True,
    save_data=True,
)

ValueError: An error occurred while calling the read_csv method registered to the pandas backend.
Original Message: Integer column has NA values in column 3

## Embedding

In [None]:
# Generate embeddings with the strategy selected by EMBED_STRATEGY.
# Every supported branch binds `lyrics_embed`, which the preprocessing and
# model-definition cells read.
if EMBED_STRATEGY == 'GloVe':
    lyrics_embed = glove.create_glove_matrix(data = lyrics, target_col = 'cleaned_lyrics')
elif EMBED_STRATEGY == 'DistilBERT':
    lyrics_embed = distilbert.embed_all_lyrics_v2(
        data = lyrics,
        target_col = 'lyrics',
        batch_size = BATCH_SIZE * 2  # embedding runs at twice the training batch size
    )
else:
    # Fail here instead of leaving `lyrics_embed` undefined (NameError in a later
    # cell) or silently reusing a stale value from an earlier kernel run.
    raise ValueError(
        f"Unsupported EMBED_STRATEGY {EMBED_STRATEGY!r}; expected 'GloVe' or 'DistilBERT'"
    )

## Modeling

### Preprocessing

In [None]:
# Count the songs in each genre, then display the median of those counts.
genres = dict(Counter(lyrics.genre))
np.median(list(genres.values()))

In [None]:
# Split the embedded lyrics and their genre labels into three objects:
# train and val data loaders, plus a test data set.
lyrics_train, lyrics_val, lyrics_test = preprocessing.create_datasets(
    data_embed=lyrics_embed,
    labels=lyrics['genre'],
    val_pct=VAL_PCT,
    batch_size=BATCH_SIZE,
)

### Training

In [None]:
# Build the baseline model: a SimpleRNN configured with type='GRU', moved to DEVICE.
# Assumes lyrics_embed is 2-D (the shape unpack below fails otherwise); its second
# dimension is the model input size. Output size is the number of distinct genres.
n_songs, embed_dim = lyrics_embed.shape
base_model = simple_rnn.SimpleRNN(
    input_dim=embed_dim,
    hidden_dim=HIDDEN_SIZE,
    output_dim=len(lyrics['genre'].unique()),
    type='GRU',
    num_layers=NUM_LAYERS,
    dropout=DROPOUT,
).to(DEVICE)
base_model

In [None]:
# Fit base_model on the train loader and validate against the val loader,
# using the learning rate, epoch count and patience from the assumption panel.
# Verbose output is on, with print_every set to 1.
training.nn_training(
    model=base_model,
    train_loader=lyrics_train,
    val_loader=lyrics_val,
    learning_rate=LEARNING_RATE,
    num_epochs=NUM_EPOCHS,
    patience=PATIENCE,
    verbose=True,
    print_every=1,
)