# CSE-6242 - Team 157 - Group Project

__TODO:__  
1. Remove custom stop words like "CHORUS:" or "INTRO:"  
2. Remove other stop words.

In [10]:
# global assumption panel: tunable settings read by the cells below

## data gathering
N_DATA_ROWS = 10_000  # row cap passed to read_lyrics; use -1 to retrieve all rows

## embedding
EMBED_STRATEGY = 'DistilBERT'  # one of: 'DistilBERT', 'GloVe'

## modeling - preprocessing
VAL_PCT = 0.15   # share of the data withheld from training (val_pct of create_datasets)
BATCH_SIZE = 32  # larger batches train faster but use more memory

## modeling - architecture
HIDDEN_SIZE = 256  # passed as hidden_dim to the model
NUM_LAYERS = 2     # passed as num_layers to the model
DROPOUT = 0.2      # passed as dropout to the model

## modeling - training
LEARNING_RATE = 1e-3  # passed as learning_rate to nn_training
NUM_EPOCHS = 50       # passed as num_epochs to nn_training
PATIENCE = 5          # passed as patience to nn_training

In [11]:
# packages

## standard library
from collections import Counter

## torch
import torch

## project code
from project_code import data_gathering
from embedding import distilbert, glove
from modeling import preprocessing, training
from architectures import simple_rnn

In [12]:
# more global assumptions

## reproducibility: seed torch's global RNG so torch-driven randomness
## repeats across Restart & Run All
## NOTE(review): this seeds torch only -- confirm whether the project modules
## draw from numpy / random (e.g. for the train/val/test split) and seed those too
SEED = 42
torch.manual_seed(SEED)

## compute device: use the GPU when torch can see one, otherwise fall back to CPU
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device = {DEVICE}')

Device = cpu


## Data Gathering

In [13]:
# load the lyrics table (read_lyrics also applies the basic cleaning steps),
# capped at N_DATA_ROWS rows
lyrics = data_gathering.read_lyrics(n_rows=N_DATA_ROWS)

Lyrics Dataset: Shape = (10000, 13)
	Columns = ['title', 'tag', 'artist', 'year', 'views', 'features', 'lyrics', 'id', 'language_cld3', 'language_ft', 'language', 'cleaned_lyrics', 'tokenized_text']


## Embedding

In [None]:
# generate embeddings for the lyrics with the strategy selected by EMBED_STRATEGY
if EMBED_STRATEGY == 'GloVe':
    # GloVe reads the pre-cleaned text column
    lyrics_embed = glove.create_glove_matrix(data=lyrics, target_col='cleaned_lyrics')
elif EMBED_STRATEGY == 'DistilBERT':
    # DistilBERT reads the raw lyrics column; embedding batch size is 2x BATCH_SIZE
    lyrics_embed = distilbert.embed_all_lyrics_v2(
        data=lyrics,
        target_col='lyrics',
        batch_size=BATCH_SIZE * 2,
    )
else:
    # fail fast: without this branch a typo in EMBED_STRATEGY would leave
    # lyrics_embed undefined (or stale from an earlier run) and only surface
    # as a confusing error in a later cell
    raise ValueError(
        f"Unsupported EMBED_STRATEGY {EMBED_STRATEGY!r}; expected 'GloVe' or 'DistilBERT'"
    )

  2%|▏         | 3/157 [00:59<51:49, 20.19s/it]

## Modeling

### Preprocessing

In [None]:
# class-balance check: number of rows per `tag` label
# (Counter is imported in the packages cell at the top of the notebook)
Counter(lyrics['tag'])

Counter({'rap': 149, 'rb': 1})

In [None]:
# split the embeddings and their tags into train / val data loaders
# plus a withheld test data set
(lyrics_train, lyrics_val, lyrics_test) = preprocessing.create_datasets(
    data_embed=lyrics_embed,
    labels=lyrics['tag'],
    val_pct=VAL_PCT,
    batch_size=BATCH_SIZE,
)

Train: 4 Batches of Size 32 For Training
Val: 1 Batches of Size 32 For Training
Test: 22 Records Withheld for Model Evaluation


  X_tensor = torch.tensor(data_embed)


### Training

In [None]:
# define the baseline classifier: a GRU-based SimpleRNN over the lyric embeddings
n_songs, embed_dim = lyrics_embed.shape  # unpacking fails loudly unless the embeddings are 2-D
n_classes = len(lyrics['tag'].unique())  # one output unit per distinct tag
base_model = simple_rnn.SimpleRNN(
    input_dim=embed_dim,
    hidden_dim=HIDDEN_SIZE,
    output_dim=n_classes,
    type='GRU',
    num_layers=NUM_LAYERS,
    dropout=DROPOUT,
).to(DEVICE)
base_model  # display the architecture

SimpleRNN(
  (rnn): GRU(768, 256, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=512, out_features=2, bias=True)
  (softmax): Softmax(dim=-1)
)

In [None]:
# fit base_model on the train loader, monitoring the val loader;
# PATIENCE is forwarded as the early-stopping patience
# NOTE(review): the model repr ends in a Softmax layer. If nn_training uses
# nn.CrossEntropyLoss (which applies log-softmax itself), the 2-class loss can
# never fall below ln(1 + e^-1) ~= 0.3133 -- confirm which loss is used; a loss
# stuck at that value would mean the model should return raw logits instead.
training.nn_training(
    model=base_model,
    train_loader=lyrics_train,
    val_loader=lyrics_val,
    learning_rate=LEARNING_RATE,
    num_epochs=NUM_EPOCHS,
    patience=PATIENCE,
    verbose=True,
    print_every=1,
)

  0%|          | 0/4 [00:00<?, ?it/s]

100%|██████████| 4/4 [00:00<00:00, 49.27it/s]


[1 / 50] Train Loss = 0.4600, Val Loss = 0.3150


100%|██████████| 4/4 [00:00<00:00, 58.32it/s]


[2 / 50] Train Loss = 0.3217, Val Loss = 0.3133


100%|██████████| 4/4 [00:00<00:00, 58.49it/s]


[3 / 50] Train Loss = 0.3211, Val Loss = 0.3133


100%|██████████| 4/4 [00:00<00:00, 58.20it/s]


[4 / 50] Train Loss = 0.3211, Val Loss = 0.3133


100%|██████████| 4/4 [00:00<00:00, 53.60it/s]


[5 / 50] Train Loss = 0.3211, Val Loss = 0.3133


100%|██████████| 4/4 [00:00<00:00, 28.19it/s]


[6 / 50] Train Loss = 0.3211, Val Loss = 0.3133


100%|██████████| 4/4 [00:00<00:00, 57.88it/s]


[7 / 50] Train Loss = 0.3211, Val Loss = 0.3133


100%|██████████| 4/4 [00:00<00:00, 59.15it/s]


[8 / 50] Train Loss = 0.3211, Val Loss = 0.3133


100%|██████████| 4/4 [00:00<00:00, 55.85it/s]


[9 / 50] Train Loss = 0.3383, Val Loss = 0.3133


100%|██████████| 4/4 [00:00<00:00, 50.58it/s]


[10 / 50] Train Loss = 0.3211, Val Loss = 0.3133


100%|██████████| 4/4 [00:00<00:00, 58.29it/s]


[11 / 50] Train Loss = 0.3211, Val Loss = 0.3133


100%|██████████| 4/4 [00:00<00:00, 59.62it/s]

[12 / 50] Train Loss = 0.3211, Val Loss = 0.3133
Early Stopping Triggered. Training Stopped.
	Best Epoch = 6, Best Val Loss = 0.31326180696487427



