# CSE-6242 - Team 157 - Group Project

__TODO:__  
1. Remove other stop words.

In [1]:
# global assumption panel
# Every tunable knob for the notebook lives in this cell; later cells only read these names.

## general
TRAIN_MODELS = True # if True we embed + train (adds a lot of time); if False later cells load saved weights

## data gathering
READ_CLEANED_OR_RAW = 'Clean' # ['Clean', 'Raw']
N_DATA_ROWS_PER_GENRE = 1000 # ['All', int]
assert READ_CLEANED_OR_RAW in ('Clean', 'Raw'), \
    f'Unsupported READ_CLEANED_OR_RAW = {READ_CLEANED_OR_RAW!r}'

## embedding
EMBED_STRATEGY = 'DistilBERT' # ['DistilBERT', 'GloVe']
MAX_GLOVE_LENGTH = 200 # [None, int]
EMBED_DIM_BY_STRATEGY = {'GloVe': 300, 'DistilBERT': 768} # embedding width per strategy
# fail here, with a readable message, instead of several cells later
assert EMBED_STRATEGY in EMBED_DIM_BY_STRATEGY, \
    f'Unsupported EMBED_STRATEGY = {EMBED_STRATEGY!r}'

## modeling - preprocessing
VAL_PCT = 0.15 # the percent of data we want to withhold for testing
BATCH_SIZE = 32 # bigger means faster training, but more memory use

## modeling - architecture
HIDDEN_SIZE = 256
NUM_LAYERS = 2
DROPOUT = 0.2
NUM_HEADS = 8 if EMBED_STRATEGY == 'DistilBERT' else 6
# the checks below require the head count to divide the embedding width evenly
if EMBED_STRATEGY == 'GloVe':
    assert 300 % NUM_HEADS == 0, 'NUM_HEADS must evenly divide the GloVe embedding dim (300)'
elif EMBED_STRATEGY == 'DistilBERT':
    assert 768 % NUM_HEADS == 0, 'NUM_HEADS must evenly divide the DistilBERT embedding dim (768)'

## modeling - training
LEARNING_RATE = 0.001
NUM_EPOCHS = 50
PATIENCE = 5

In [2]:
# packages

## general use
from datetime import datetime

## torch
import torch

## project code
from project_code import data_gathering, genre_classification
from embedding import distilbert, glove
from modeling import preprocessing, training
from architectures import nn_clf

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# more global assumptions: wall-clock start of the run and the torch compute device

# timestamp taken once here; the wrap-up cell subtracts it to report the duration
START_TIME = datetime.now()
print(f'Script Start Time = {START_TIME.strftime("%Y-%m-%d %H:%M:%S")}')

# use the GPU when torch can see one, otherwise stay on the CPU
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Device = {DEVICE}')

Script Start Time = 2025-03-27 13:52:50
Device = cpu


## Data Gathering

In [4]:
# read data
# Training runs load the lyrics; non-training runs only define the
# idx -> genre mapping that later cells pass as `label_mapping`.
if TRAIN_MODELS:
    if READ_CLEANED_OR_RAW == 'Raw':
        # NOTE(review): this branch does not assign `genre_map`, which later cells
        # read, and it passes n_rows = 'All' rather than N_DATA_ROWS_PER_GENRE --
        # confirm what read_and_clean_raw_lyrics returns before relying on 'Raw'.
        lyrics = data_gathering.read_and_clean_raw_lyrics(
            n_rows = 'All',
            exclude_non_english = True,
            resample_genres = True,
            save_data = True
        )
    elif READ_CLEANED_OR_RAW == 'Clean':
        lyrics, genre_map = data_gathering.read_cleaned_lyrics(
            n_rows_per_genre = N_DATA_ROWS_PER_GENRE
        )
    else:
        # previously an unknown value fell through silently and `lyrics` was never defined
        raise ValueError(
            f"READ_CLEANED_OR_RAW must be 'Clean' or 'Raw', got {READ_CLEANED_OR_RAW!r}"
        )
else:
    genre_map = {0: 'country', 1: 'misc', 2: 'pop', 3: 'rap', 4: 'rb', 5: 'rock'}

Genre Counts Before Resampling:
	pop: 394195
	rap: 394195
	rock: 394195
	rb: 155082
	misc: 140986
	country: 86658

Genre Counts After Resampling:
	country: 1000
	misc: 1000
	pop: 1000
	rap: 1000
	rb: 1000
	rock: 1000

Cleaned Lyrics: Shape = (6000, 2)
	Columns = ['lyrics', 'genre']
Genre Mapping = {0: 'country', 1: 'misc', 2: 'pop', 3: 'rap', 4: 'rb', 5: 'rock'}


## Embedding

In [5]:
# generate embeddings with the configured strategy (GloVe or DistilBERT)
if not TRAIN_MODELS:
    # no training data to embed; GloVe still needs its index for the custom prediction cell
    if EMBED_STRATEGY == "GloVe":
        glove_index = glove.read_glove_embedding_index()
elif EMBED_STRATEGY == 'GloVe':
    lyrics_embed, glove_index = glove.embed_all_lyrics(
        data = lyrics, target_col = 'lyrics', custom_max_seq_len = MAX_GLOVE_LENGTH
    )
elif EMBED_STRATEGY == 'DistilBERT':
    # embedding is run with twice the training batch size
    lyrics_embed = distilbert.embed_all_lyrics_v2(
        data = lyrics, target_col = 'lyrics', batch_size = BATCH_SIZE * 2
    )

100%|██████████| 94/94 [31:38<00:00, 20.20s/it]


DistilBERT Embedded Lyrics: Shape = (n_songs, distilbert_embed_len) = torch.Size([6000, 768])


## Modeling

### Preprocessing

In [6]:
# split the embedded lyrics into train / val / test sets for modeling
if TRAIN_MODELS:
    splits = preprocessing.create_datasets(
        data_embed = lyrics_embed,
        labels = lyrics['genre'],
        label_mapping = genre_map,
        val_pct = VAL_PCT,
        batch_size = BATCH_SIZE
    )
    # unpack in the order the helper returns them
    lyrics_train, lyrics_val, lyrics_test = splits

Train: 132 Batches of Size 32 For Training
Val: 29 Batches of Size 32 For Training
Test: 29 Batches of Size 32 For Final Eval


### Train - RNN Model

In [7]:
# define the baseline RNN Model
# The model's input width must equal the embedding width: read it off the
# embedded tensor when we have one, otherwise fall back to the known width
# for the chosen strategy.
if TRAIN_MODELS:
    if EMBED_STRATEGY == 'GloVe':
        # GloVe embeddings unpack as three dimensions
        n_songs, max_seq_len, embed_dim = lyrics_embed.shape
    elif EMBED_STRATEGY == 'DistilBERT':
        # DistilBERT embeddings unpack as two dimensions
        n_songs, embed_dim = lyrics_embed.shape
else:
    if EMBED_STRATEGY == 'GloVe':
        embed_dim = 300
    elif EMBED_STRATEGY == 'DistilBERT':
        # was spelled 'DistilBert', which never matched and left embed_dim undefined
        embed_dim = 768

base_model = nn_clf.BaseRNN(
    input_dim = embed_dim,
    hidden_dim = HIDDEN_SIZE,
    output_dim = len(genre_map), # one output per genre
    type = 'GRU',
    num_layers = NUM_LAYERS,
    dropout = DROPOUT
).to(DEVICE)
base_model

BaseRNN(
  (rnn): GRU(768, 256, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=512, out_features=6, bias=True)
  (softmax): Softmax(dim=-1)
)

In [8]:
# baseline: score the RNN on the test set before any training,
# so the post-training cell can report the improvement
if TRAIN_MODELS:
    pre_train_acc = training.evaluate_nn_model_against_test_set(model = base_model, test_dataset = lyrics_test)

Model Accuracy = 15.33%


In [9]:
# train the model (or, when training is switched off, load previously saved weights)
if TRAIN_MODELS:
    training.nn_training(
        model = base_model,
        train_loader = lyrics_train,
        val_loader = lyrics_val,
        embed_strategy = EMBED_STRATEGY,
        learning_rate = LEARNING_RATE,
        num_epochs = NUM_EPOCHS,
        patience = PATIENCE,
        verbose = True,
        print_every = 1
    )
else:
    # DEVICE is already a torch.device, so it can be passed to map_location directly
    state_dict = torch.load(f'models/{EMBED_STRATEGY}_BaseRNN_Trained.pth', map_location = DEVICE)
    base_model.load_state_dict(state_dict)
    # loaded weights are only used for inference here: switch off dropout
    base_model.eval()

100%|██████████| 132/132 [00:04<00:00, 28.85it/s]


[1 / 50] Train Loss = 1.3531, Val Loss = 1.3235 **New Best Model**


100%|██████████| 132/132 [00:03<00:00, 35.48it/s]


[2 / 50] Train Loss = 1.1806, Val Loss = 1.2733 **New Best Model**


100%|██████████| 132/132 [00:02<00:00, 46.90it/s]


[3 / 50] Train Loss = 1.1326, Val Loss = 1.1914 **New Best Model**


100%|██████████| 132/132 [00:02<00:00, 48.39it/s]


[4 / 50] Train Loss = 1.1042, Val Loss = 1.2147


100%|██████████| 132/132 [00:02<00:00, 53.95it/s]


[5 / 50] Train Loss = 1.0878, Val Loss = 1.1832 **New Best Model**


100%|██████████| 132/132 [00:02<00:00, 55.79it/s]


[6 / 50] Train Loss = 1.0732, Val Loss = 1.1697 **New Best Model**


100%|██████████| 132/132 [00:02<00:00, 60.06it/s]


[7 / 50] Train Loss = 1.0567, Val Loss = 1.1119 **New Best Model**


100%|██████████| 132/132 [00:02<00:00, 58.94it/s]


[8 / 50] Train Loss = 1.0415, Val Loss = 1.1613


100%|██████████| 132/132 [00:02<00:00, 54.86it/s]


[9 / 50] Train Loss = 1.0109, Val Loss = 1.1491


100%|██████████| 132/132 [00:02<00:00, 49.99it/s]


[10 / 50] Train Loss = 0.9914, Val Loss = 1.2187


100%|██████████| 132/132 [00:02<00:00, 54.99it/s]


[11 / 50] Train Loss = 0.9884, Val Loss = 1.1425


100%|██████████| 132/132 [00:02<00:00, 59.92it/s]


[12 / 50] Train Loss = 0.9664, Val Loss = 1.2277
Early Stopping Triggered. Training Stopped.
	Best Epoch = 6, Best Val Loss = 1.1118628223394524


In [10]:
# evaluate model performance - post training
if TRAIN_MODELS:
    post_train_acc = training.evaluate_nn_model_against_test_set(
        model = base_model,
        test_dataset = lyrics_test
    )

    # ':+' lets the formatter choose the sign, so a drop prints as '-x.xx%' instead of '+-x.xx%'
    print(f'Training Improvement On Accuracy = {(post_train_acc - pre_train_acc) * 100:+.2f}%')

Model Accuracy = 57.89%
Training Improvement On Accuracy = +42.56%


### Train - Homemade Transformer

In [11]:
# define the transformer model
# The model's input width must equal the embedding width: read it off the
# embedded tensor when we have one, otherwise fall back to the known width
# for the chosen strategy.
if TRAIN_MODELS:
    if EMBED_STRATEGY == 'GloVe':
        # GloVe embeddings unpack as three dimensions
        n_songs, max_seq_len, embed_dim = lyrics_embed.shape
    elif EMBED_STRATEGY == 'DistilBERT':
        # DistilBERT embeddings unpack as two dimensions
        n_songs, embed_dim = lyrics_embed.shape
else:
    if EMBED_STRATEGY == 'GloVe':
        # was spelled 'Glove', which never matched and left embed_dim undefined
        embed_dim = 300
    elif EMBED_STRATEGY == 'DistilBERT':
        embed_dim = 768

transformer_model = nn_clf.DeepTransformer(
    input_dim = embed_dim,
    num_heads = NUM_HEADS,
    hidden_dim = HIDDEN_SIZE,
    output_dim = len(genre_map), # one output per genre
    num_layers = NUM_LAYERS,
    dropout = DROPOUT
).to(DEVICE)
transformer_model

DeepTransformer(
  (attention_layers): ModuleList(
    (0-1): 2 x MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
    )
  )
  (ffn_layers): ModuleList(
    (0-1): 2 x Sequential(
      (0): Linear(in_features=768, out_features=256, bias=True)
      (1): ReLU()
      (2): Linear(in_features=256, out_features=768, bias=True)
      (3): Dropout(p=0.5, inplace=False)
    )
  )
  (norm_layers_attn): ModuleList(
    (0-1): 2 x LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (norm_layers_ffn): ModuleList(
    (0-1): 2 x LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (fc): Linear(in_features=768, out_features=6, bias=True)
)

In [12]:
# baseline: score the transformer on the test set before any training,
# so the post-training cell can report the improvement
if TRAIN_MODELS:
    pre_train_acc = training.evaluate_nn_model_against_test_set(model = transformer_model, test_dataset = lyrics_test)

Model Accuracy = 14.22%


In [13]:
# train the model (or, when training is switched off, load previously saved weights)
if TRAIN_MODELS:
    training.nn_training(
        model = transformer_model,
        train_loader = lyrics_train,
        val_loader = lyrics_val,
        embed_strategy = EMBED_STRATEGY,
        learning_rate = LEARNING_RATE,
        num_epochs = NUM_EPOCHS,
        patience = PATIENCE,
        verbose = True,
        print_every = 1
    )
else:
    # DEVICE is already a torch.device, so it can be passed to map_location directly
    state_dict = torch.load(f'models/{EMBED_STRATEGY}_DeepTransformer_Trained.pth', map_location = DEVICE)
    transformer_model.load_state_dict(state_dict)
    # loaded weights are only used for inference here: switch off dropout
    transformer_model.eval()

100%|██████████| 132/132 [00:04<00:00, 27.26it/s]


[1 / 50] Train Loss = 1.6377, Val Loss = 1.4292 **New Best Model**


100%|██████████| 132/132 [00:05<00:00, 25.19it/s]


[2 / 50] Train Loss = 1.2864, Val Loss = 1.3281 **New Best Model**


100%|██████████| 132/132 [00:05<00:00, 26.13it/s]


[3 / 50] Train Loss = 1.2624, Val Loss = 1.3525


100%|██████████| 132/132 [00:05<00:00, 26.09it/s]


[4 / 50] Train Loss = 1.1947, Val Loss = 1.4848


100%|██████████| 132/132 [00:05<00:00, 25.65it/s]


[5 / 50] Train Loss = 1.1973, Val Loss = 1.3189 **New Best Model**


100%|██████████| 132/132 [00:05<00:00, 26.11it/s]


[6 / 50] Train Loss = 1.2262, Val Loss = 1.3393


100%|██████████| 132/132 [00:04<00:00, 27.31it/s]


[7 / 50] Train Loss = 1.1413, Val Loss = 1.2904 **New Best Model**


100%|██████████| 132/132 [00:04<00:00, 26.47it/s]


[8 / 50] Train Loss = 1.1669, Val Loss = 1.3966


100%|██████████| 132/132 [00:05<00:00, 25.54it/s]


[9 / 50] Train Loss = 1.1247, Val Loss = 1.3619


100%|██████████| 132/132 [00:05<00:00, 25.93it/s]


[10 / 50] Train Loss = 1.1237, Val Loss = 1.3185


100%|██████████| 132/132 [00:05<00:00, 23.71it/s]


[11 / 50] Train Loss = 1.0951, Val Loss = 1.3024


100%|██████████| 132/132 [00:06<00:00, 21.37it/s]


[12 / 50] Train Loss = 1.1034, Val Loss = 1.2904 **New Best Model**


100%|██████████| 132/132 [00:04<00:00, 27.25it/s]


[13 / 50] Train Loss = 1.0904, Val Loss = 1.3268


100%|██████████| 132/132 [00:05<00:00, 25.38it/s]


[14 / 50] Train Loss = 1.1054, Val Loss = 1.4793


100%|██████████| 132/132 [00:04<00:00, 26.49it/s]


[15 / 50] Train Loss = 1.0618, Val Loss = 1.3699


100%|██████████| 132/132 [00:05<00:00, 26.21it/s]


[16 / 50] Train Loss = 1.0715, Val Loss = 1.3390


100%|██████████| 132/132 [00:07<00:00, 18.38it/s]


[17 / 50] Train Loss = 1.1030, Val Loss = 1.3846
Early Stopping Triggered. Training Stopped.
	Best Epoch = 11, Best Val Loss = 1.290380128498735


In [14]:
# evaluate model performance - post training
if TRAIN_MODELS:
    post_train_acc = training.evaluate_nn_model_against_test_set(
        model = transformer_model,
        test_dataset = lyrics_test
    )

    # ':+' lets the formatter choose the sign, so a drop prints as '-x.xx%' instead of '+-x.xx%'
    print(f'Training Improvement On Accuracy = {(post_train_acc - pre_train_acc) * 100:+.2f}%')

Model Accuracy = 51.78%
Training Improvement On Accuracy = +37.56%


## Custom Lyric Genre Prediction

In [15]:
# test lyrics
# Short excerpts used by the custom prediction cell to sanity-check the classifiers.

# the opening delimiter used to be four quotes, which put a stray '"' at the start of the lyrics
pop_test = """
One. Don't pick up the phone. You know he's only calling cause he's drunk and alone.
Two. Don't let him in. You'll have to kick him out again.
Three. Don't be a friend. Cause you know you'll only wake up in his bed in the morning.
Cause if you're under him. You're not getting over him.
"""

rock_test = """
We come from the land of the ice and snow
From the midnight sun where the hot springs blow
The hammer of the gods will drive our ships to new lands
To fight the horde, singing and crying: Valhalla, I am coming!
On we sweep with threshing oar
Our only goal will be the western shore
"""

In [16]:
# make a prediction for a custom song
# To try other combinations, swap `lyrics` between pop_test / rock_test and
# `clf_model` between base_model / transformer_model.
if EMBED_STRATEGY == 'GloVe':
    # the GloVe path also needs the embedding index and the max sequence length
    genre_classification.glove_clf_prediction(
        lyrics = rock_test,
        clf_model = transformer_model,
        glove_index = glove_index,
        label_mapping = genre_map,
        max_seq_len = MAX_GLOVE_LENGTH,
        device = DEVICE
    )
elif EMBED_STRATEGY == 'DistilBERT':
    genre_classification.distilerbert_clf_prediction(
        lyrics = rock_test,
        clf_model = transformer_model,
        label_mapping = genre_map,
        device = DEVICE
    )

Lyrics:

We come from the land of the ice and snow
From the midnight sun where the hot springs blow
The hammer of the gods will drive our ships to new lands
To fight the horde, singing and crying: Valhalla, I am coming!
On we sweep with threshing oar
Our only goal will be the western shore


Predicted Genre: rock (idx = 5)


In [17]:
# wrap up: report when the run finished and how long it took
END_TIME = datetime.now()
# total_seconds() covers the whole interval; `.seconds` drops whole days,
# so a run longer than 24h would be under-reported
SCRIPT_TIME = (END_TIME - START_TIME).total_seconds()
print(f'Script End Time = {END_TIME.strftime("%Y-%m-%d %H:%M:%S")}')
print(f'Duration = {SCRIPT_TIME / 60:.2f}min')

Script End Time = 2025-03-27 14:27:01
Duration = 34.17min
