# CSE-6242 - Team 157 - Group Project

__TODO:__  
1. Remove other stop words.

In [3]:
# global assumption panel
# Every tunable setting for the notebook lives in this cell; later cells only read these names.

## general
TRAIN_MODELS = True # if true, we add a lot of time; if false, later cells load saved weights from models/ instead

## data gathering
READ_CLEANED_OR_RAW = 'Clean' # ['Clean', 'Raw']
N_DATA_ROWS_PER_GENRE = 50 # ['All', int]

## embedding
EMBED_STRATEGY = 'GloVe' # ['DistilBERT', 'GloVe']
MAX_GLOVE_LENGTH = 200 # [None, int] - passed to the GloVe embedder as custom_max_seq_len

# Fail fast on a typo: an unrecognised value would otherwise slip through every
# `if / elif` on these two settings (here and in later cells) without any error.
assert READ_CLEANED_OR_RAW in ('Clean', 'Raw'), \
    f"READ_CLEANED_OR_RAW must be 'Clean' or 'Raw', got {READ_CLEANED_OR_RAW!r}"
assert EMBED_STRATEGY in ('DistilBERT', 'GloVe'), \
    f"EMBED_STRATEGY must be 'DistilBERT' or 'GloVe', got {EMBED_STRATEGY!r}"

## modeling - preprocessing
VAL_PCT = 0.15 # hold-out fraction handed to the dataset builders as val_pct
BATCH_SIZE = 32 # bigger means faster training, but more memory use

## modeling - architecture
HIDDEN_SIZE = 256
NUM_LAYERS = 2
DROPOUT = 0.2
NUM_HEADS = 8 if EMBED_STRATEGY == 'DistilBERT' else 6
# The attention head count has to divide the embedding width evenly
# (300 for GloVe, 768 for DistilBERT - the same widths the model cells fall back to).
if EMBED_STRATEGY == 'GloVe':
    assert 300 % NUM_HEADS == 0, f'NUM_HEADS = {NUM_HEADS} does not divide the GloVe width (300)'
elif EMBED_STRATEGY == 'DistilBERT':
    assert 768 % NUM_HEADS == 0, f'NUM_HEADS = {NUM_HEADS} does not divide the DistilBERT width (768)'

## modeling - training
LEARNING_RATE = 0.001
NUM_EPOCHS = 50
PATIENCE = 5 # handed to training.nn_training as `patience`

In [4]:
# packages

## general use
from datetime import datetime

## torch
import torch

## project code
from project_code import data_gathering, genre_classification
from embedding import distilbert, glove
from modeling import preprocessing, training
from architectures import nn_clf

  from .autonotebook import tqdm as notebook_tqdm





In [5]:
# more global assumptions
# Run-level state that needs the imports above: the start timestamp (read again
# by the wrap-up cell) and the torch device the models are moved to.
START_TIME = datetime.now()

# prefer the GPU when torch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

print(f'Script Start Time = {START_TIME.strftime("%Y-%m-%d %H:%M:%S")}')
print(f'Device = {DEVICE}')

Script Start Time = 2025-03-30 13:46:25
Device = cpu


## Data Gathering

In [6]:
# read data
# Inference-only runs skip the lyrics entirely and just restore the label lookup;
# training runs read either the raw dump or the previously cleaned data.
if not TRAIN_MODELS:
    genre_map = {0: 'country', 1: 'misc', 2: 'pop', 3: 'rap', 4: 'rb', 5: 'rock'}
elif READ_CLEANED_OR_RAW == 'Raw':
    # NOTE(review): this branch binds only `lyrics`, while later cells also read
    # `genre_map` - confirm what read_and_clean_raw_lyrics returns.
    lyrics = data_gathering.read_and_clean_raw_lyrics(
        n_rows='All',
        exclude_non_english=True,
        resample_genres=True,
        save_data=True,
    )
elif READ_CLEANED_OR_RAW == 'Clean':
    lyrics, genre_map = data_gathering.read_cleaned_lyrics(n_rows_per_genre=N_DATA_ROWS_PER_GENRE)

Genre Counts Before Resampling:
	pop: 394195
	rap: 394195
	rock: 394195
	rb: 155082
	misc: 140986
	country: 86658

Genre Counts After Resampling:
	country: 50
	misc: 50
	pop: 50
	rap: 50
	rb: 50
	rock: 50

Cleaned Lyrics: Shape = (300, 2)
	Columns = ['lyrics', 'genre']
Genre Mapping = {0: 'country', 1: 'misc', 2: 'pop', 3: 'rap', 4: 'rb', 5: 'rock'}


## Embed + Neural Network

### Embedding

In [7]:
# generate embeddings with the configured strategy (GloVe or DistilBERT)
# Training runs embed every lyric. Inference-only GloVe runs just load the
# embedding index, which the custom-prediction cell passes on as glove_index.
if TRAIN_MODELS and EMBED_STRATEGY == 'GloVe':
    lyrics_embed, glove_index = glove.embed_all_lyrics(
        data=lyrics,
        target_col='lyrics',
        custom_max_seq_len=MAX_GLOVE_LENGTH,
    )
elif TRAIN_MODELS and EMBED_STRATEGY == 'DistilBERT':
    # an earlier variant called distilbert.distilbert_embed_all_docs(data, target_col) here
    lyrics_embed = distilbert.embed_all_lyrics_v2(
        data=lyrics,
        target_col='lyrics',
        batch_size=BATCH_SIZE * 2,
    )
elif not TRAIN_MODELS and EMBED_STRATEGY == 'GloVe':
    glove_index = glove.read_glove_embedding_index()

Extracting GloVe Embedding Index...


1917494it [01:51, 17182.31it/s]



Converting Word Indices to GloVe Vectors...


100%|██████████| 300/300 [00:00<00:00, 3758.00it/s]



GloVe Embedded Lyrics: Shape = (n_songs, max_seq_len, embed_len) = torch.Size([300, 200, 300])
	Padded Sequences: Shape = (n_songs, max_seq_len) = (300, 200)


### Preprocessing

In [8]:
# create data loaders (train, val) and data sets (test)
# Only needed when training; the three results feed the training and evaluation cells below.
if TRAIN_MODELS:
    lyrics_train, lyrics_val, lyrics_test = preprocessing.create_datasets(
        data_embed=lyrics_embed,
        labels=lyrics['genre'],
        label_mapping=genre_map,
        val_pct=VAL_PCT,
        batch_size=BATCH_SIZE,
    )

Train: 7 Batches of Size 32 For Training
Val: 2 Batches of Size 32 For Training
Test: 2 Batches of Size 32 For Final Eval


### Train - RNN Model

In [9]:
# define the baseline RNN Model
if TRAIN_MODELS:
    # take the feature width from the embedded data itself:
    # GloVe unpacks as (n_songs, max_seq_len, embed_dim), DistilBERT as (n_songs, embed_dim)
    if EMBED_STRATEGY == 'GloVe':
        n_songs, max_seq_len, embed_dim = lyrics_embed.shape
    elif EMBED_STRATEGY == 'DistilBERT':
        n_songs, embed_dim = lyrics_embed.shape
else:
    # nothing was embedded in this mode, so use the fixed widths (the same
    # 300 / 768 the config cell checks NUM_HEADS against)
    if EMBED_STRATEGY == 'GloVe':
        embed_dim = 300
    elif EMBED_STRATEGY == 'DistilBERT':
        # fixed: this compared against 'DistilBert', which never equals the
        # configured 'DistilBERT', leaving embed_dim unbound in this mode
        embed_dim = 768

# one output unit per genre in the label mapping
base_model = nn_clf.BaseRNN(
    input_dim = embed_dim,
    hidden_dim = HIDDEN_SIZE,
    output_dim = len(genre_map),
    type = 'GRU',
    num_layers = NUM_LAYERS,
    dropout = DROPOUT
).to(DEVICE)
base_model

BaseRNN(
  (rnn): GRU(300, 256, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=512, out_features=6, bias=True)
  (softmax): Softmax(dim=-1)
)

In [10]:
# evaluate model performance - pre training
# Baseline for the untrained RNN; the post-training cell subtracts it to report the gain.
if TRAIN_MODELS:
    pre_train_acc = training.evaluate_nn_model_against_test_set(model=base_model, test_dataset=lyrics_test)

Model Accuracy = 17.78%


In [11]:
# train the model
# With training switched off, restore the saved RNN weights for the active
# embedding strategy instead of fitting from scratch.
if not TRAIN_MODELS:
    state_dict = torch.load(f'models/{EMBED_STRATEGY}_BaseRNN_Trained.pth', map_location=torch.device(DEVICE))
    base_model.load_state_dict(state_dict)
else:
    training.nn_training(
        model=base_model,
        train_loader=lyrics_train,
        val_loader=lyrics_val,
        embed_strategy=EMBED_STRATEGY,
        learning_rate=LEARNING_RATE,
        num_epochs=NUM_EPOCHS,
        patience=PATIENCE,
        verbose=True,
        print_every=1,
    )

100%|██████████| 7/7 [00:23<00:00,  3.41s/it]


[1 / 50] Train Loss = 1.7737, Val Loss = 1.6382 **New Best Model**


100%|██████████| 7/7 [00:12<00:00,  1.77s/it]


[2 / 50] Train Loss = 1.5255, Val Loss = 1.5224 **New Best Model**


100%|██████████| 7/7 [00:05<00:00,  1.19it/s]


[3 / 50] Train Loss = 1.3447, Val Loss = 1.4277 **New Best Model**


100%|██████████| 7/7 [00:06<00:00,  1.04it/s]


[4 / 50] Train Loss = 1.1169, Val Loss = 1.5077


100%|██████████| 7/7 [00:05<00:00,  1.23it/s]


[5 / 50] Train Loss = 0.7902, Val Loss = 1.5509


100%|██████████| 7/7 [00:05<00:00,  1.34it/s]


[6 / 50] Train Loss = 0.5466, Val Loss = 1.8882


100%|██████████| 7/7 [00:05<00:00,  1.25it/s]


[7 / 50] Train Loss = 0.4289, Val Loss = 2.3384


100%|██████████| 7/7 [00:06<00:00,  1.14it/s]


[8 / 50] Train Loss = 0.2944, Val Loss = 2.3542
Early Stopping Triggered. Training Stopped.
	Best Epoch = 2, Best Val Loss = 1.4277328848838806


In [12]:
# evaluate model performance - post training
if TRAIN_MODELS:
    post_train_acc = training.evaluate_nn_model_against_test_set(
        model = base_model,
        test_dataset = lyrics_test
    )

    # The '+' inside the format spec prints the real sign of the change, so a drop
    # in accuracy reads "-x.xx%" rather than the "+-x.xx%" a hard-coded '+' gave.
    print(f'Training Improvement On Accuracy = {(post_train_acc - pre_train_acc) * 100:+.2f}%')

Model Accuracy = 22.22%
Training Improvement On Accuracy = +4.44%


### Train - Homemade Transformer

In [13]:
# define the transformer model
if TRAIN_MODELS:
    # take the feature width from the embedded data itself:
    # GloVe unpacks as (n_songs, max_seq_len, embed_dim), DistilBERT as (n_songs, embed_dim)
    if EMBED_STRATEGY == 'GloVe':
        n_songs, max_seq_len, embed_dim = lyrics_embed.shape
    elif EMBED_STRATEGY == 'DistilBERT':
        n_songs, embed_dim = lyrics_embed.shape
else:
    # nothing was embedded in this mode, so use the fixed widths (the same
    # 300 / 768 the config cell checks NUM_HEADS against)
    if EMBED_STRATEGY == 'GloVe':
        # fixed: this compared against 'Glove', which never equals the configured
        # 'GloVe', so embed_dim was not set here and any earlier value was reused
        embed_dim = 300
    elif EMBED_STRATEGY == 'DistilBERT':
        embed_dim = 768

# one output unit per genre in the label mapping
transformer_model = nn_clf.DeepTransformer(
    input_dim = embed_dim,
    num_heads = NUM_HEADS,
    hidden_dim = HIDDEN_SIZE,
    output_dim = len(genre_map),
    num_layers = NUM_LAYERS,
    dropout = DROPOUT
).to(DEVICE)
transformer_model

DeepTransformer(
  (attention_layers): ModuleList(
    (0-1): 2 x MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=300, out_features=300, bias=True)
    )
  )
  (ffn_layers): ModuleList(
    (0-1): 2 x Sequential(
      (0): Linear(in_features=300, out_features=256, bias=True)
      (1): ReLU()
      (2): Linear(in_features=256, out_features=300, bias=True)
      (3): Dropout(p=0.5, inplace=False)
    )
  )
  (norm_layers_attn): ModuleList(
    (0-1): 2 x LayerNorm((300,), eps=1e-05, elementwise_affine=True)
  )
  (norm_layers_ffn): ModuleList(
    (0-1): 2 x LayerNorm((300,), eps=1e-05, elementwise_affine=True)
  )
  (fc): Linear(in_features=300, out_features=6, bias=True)
)

In [14]:
# evaluate model performance - pre training
# Baseline for the untrained transformer; this rebinds pre_train_acc, replacing the RNN's value.
if TRAIN_MODELS:
    pre_train_acc = training.evaluate_nn_model_against_test_set(model=transformer_model, test_dataset=lyrics_test)

Model Accuracy = 13.33%


In [15]:
# train the model
# With training switched off, restore the saved transformer weights for the
# active embedding strategy instead of fitting from scratch.
if not TRAIN_MODELS:
    state_dict = torch.load(f'models/{EMBED_STRATEGY}_DeepTransformer_Trained.pth', map_location=torch.device(DEVICE))
    transformer_model.load_state_dict(state_dict)
else:
    training.nn_training(
        model=transformer_model,
        train_loader=lyrics_train,
        val_loader=lyrics_val,
        embed_strategy=EMBED_STRATEGY,
        learning_rate=LEARNING_RATE,
        num_epochs=NUM_EPOCHS,
        patience=PATIENCE,
        verbose=True,
        print_every=1,
    )

100%|██████████| 7/7 [00:03<00:00,  2.24it/s]


[1 / 50] Train Loss = 2.2021, Val Loss = 1.9200 **New Best Model**


100%|██████████| 7/7 [00:02<00:00,  2.38it/s]


[2 / 50] Train Loss = 1.7065, Val Loss = 1.6690 **New Best Model**


100%|██████████| 7/7 [00:02<00:00,  2.36it/s]


[3 / 50] Train Loss = 1.4169, Val Loss = 1.4308 **New Best Model**


100%|██████████| 7/7 [00:02<00:00,  2.43it/s]


[4 / 50] Train Loss = 1.1635, Val Loss = 1.6385


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]


[5 / 50] Train Loss = 0.8595, Val Loss = 1.3617 **New Best Model**


100%|██████████| 7/7 [00:02<00:00,  2.58it/s]


[6 / 50] Train Loss = 0.6144, Val Loss = 1.4164


100%|██████████| 7/7 [00:02<00:00,  2.61it/s]


[7 / 50] Train Loss = 0.3350, Val Loss = 1.8297


100%|██████████| 7/7 [00:02<00:00,  2.37it/s]


[8 / 50] Train Loss = 0.1675, Val Loss = 1.9894


100%|██████████| 7/7 [00:03<00:00,  2.30it/s]


[9 / 50] Train Loss = 0.0919, Val Loss = 1.8917


100%|██████████| 7/7 [00:03<00:00,  2.29it/s]


[10 / 50] Train Loss = 0.0538, Val Loss = 2.2654
Early Stopping Triggered. Training Stopped.
	Best Epoch = 4, Best Val Loss = 1.3617021441459656


In [16]:
# evaluate model performance - post training
if TRAIN_MODELS:
    post_train_acc = training.evaluate_nn_model_against_test_set(
        model = transformer_model,
        test_dataset = lyrics_test
    )

    # The '+' inside the format spec prints the real sign of the change, so a drop
    # in accuracy reads "-x.xx%" rather than the "+-x.xx%" a hard-coded '+' gave.
    print(f'Training Improvement On Accuracy = {(post_train_acc - pre_train_acc) * 100:+.2f}%')

Model Accuracy = 57.78%
Training Improvement On Accuracy = +44.44%


## Fine Tune GPT2

In [17]:
# create GPT2 Fine Tuning Datasets
# Produces the train / val / test splits plus the tokenizer that the fine-tuning cell consumes.
# NOTE(review): this cell is not guarded by TRAIN_MODELS, yet `lyrics` is only
# bound by the data-gathering cell when TRAIN_MODELS is True - confirm intended.
lyrics_gpt_train, lyrics_gpt_val, lyrics_gpt_test, gpt_tokenizer = preprocessing.gpt2_create_datasets(
    data=lyrics,
    label_mapping=genre_map,
    input_col='lyrics',
    label_col='genre',
    val_pct=VAL_PCT,
)

Map: 100%|██████████| 300/300 [00:01<00:00, 239.60 examples/s]

Train: Length = 210
Val:   Length = 45
Test:  Length = 45





In [18]:
# fine tune the GPT2 model
# Uses its own literal batch size and epoch count instead of the shared
# BATCH_SIZE / NUM_EPOCHS settings, but reuses the shared LEARNING_RATE.
# NOTE(review): this cell always runs, regardless of TRAIN_MODELS - confirm intended.
gpt2_model_fine_tuned = training.gpt2_fine_tuning(
    train_dataset=lyrics_gpt_train,
    val_dataset=lyrics_gpt_val,
    test_dataset=lyrics_gpt_test,
    input_tokenizer=gpt_tokenizer,
    num_labels=len(genre_map),
    batch_size=4,  # in place of BATCH_SIZE
    num_epochs=3,  # in place of NUM_EPOCHS
    learning_rate=LEARNING_RATE,
)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=6, bias=False)
)
GPT2 Fine Tuning: Start Time = 2025-03-30 13:

Epoch,Training Loss,Validation Loss
1,1.8271,1.777803
2,1.6554,1.588272
3,1.2989,1.54112


GPT2 Fine Tuning: Trained. Check the models subfolder for the trained model.
GPT2 Fine Tuning: End Time = 2025-03-30 13:50:35, Duration = 28.22min


GPT2 Fine Tuning: Test Performance...
	eval_loss: 1.537161111831665
	eval_runtime: 30.4086
	eval_samples_per_second: 1.48
	eval_steps_per_second: 0.395
	epoch: 3.0


## Custom Lyric Genre Prediction

In [19]:
# test lyrics
# Short excerpts for the custom-prediction cell below. Each literal starts and
# ends with a newline. (pop_test used to open with four quote characters, which
# put a stray '"' at the front of the text.)
pop_test = """
One. Don't pick up the phone. You know he's only calling cause he's drunk and alone.
Two. Don't let him in. You'll have to kick him out again.
Three. Don't be a friend. Cause you know you'll only wake up in his bed in the morning.
Cause if you're under him. You're not getting over him.
"""

rock_test = """
We come from the land of the ice and snow
From the midnight sun where the hot springs blow
The hammer of the gods will drive our ships to new lands
To fight the horde, singing and crying: Valhalla, I am coming!
On we sweep with threshing oar
Our only goal will be the western shore
"""

In [20]:
# make a prediction for a custom song
# Sends one of the test lyrics through the classifier that matches the active
# embedding strategy; edit `lyrics` / `clf_model` to try other combinations.
if EMBED_STRATEGY == 'GloVe':
    genre_classification.glove_clf_prediction(
        lyrics=rock_test,  # pop_test, rock_test
        clf_model=transformer_model,
        glove_index=glove_index,
        label_mapping=genre_map,
        max_seq_len=MAX_GLOVE_LENGTH,
        device=DEVICE,
    )
elif EMBED_STRATEGY == 'DistilBERT':
    genre_classification.distilerbert_clf_prediction(
        lyrics=rock_test,
        clf_model=transformer_model,  # base_model, transformer_model
        label_mapping=genre_map,
        device=DEVICE,
    )

Lyrics:

We come from the land of the ice and snow
From the midnight sun where the hot springs blow
The hammer of the gods will drive our ships to new lands
To fight the horde, singing and crying: Valhalla, I am coming!
On we sweep with threshing oar
Our only goal will be the western shore


Predicted Genre: misc (idx = 1)


In [21]:
# wrap up
# Report when the run finished and how long it took since START_TIME.
END_TIME = datetime.now()
# total_seconds() covers the whole elapsed span; timedelta.seconds is only the
# sub-day component and would under-report a run lasting a day or longer.
# int() keeps SCRIPT_TIME a whole number of seconds, as it was before.
SCRIPT_TIME = int((END_TIME - START_TIME).total_seconds())
print(f'Script End Time = {END_TIME.strftime("%Y-%m-%d %H:%M:%S")}')
print(f'Duration = {SCRIPT_TIME / 60:.2f}min')

Script End Time = 2025-03-30 14:19:20
Duration = 32.90min
