# CSE-6242 - Team 157 - Group Project

__TODO:__  
1. Remove other stop words.

In [1]:
# Global configuration -- every tunable knob for the notebook lives in this cell.

# --- general ---
TRAIN_MODELS = False  # True runs the full training path, which adds a lot of time

# --- data gathering ---
READ_CLEANED_OR_RAW = 'Clean'  # one of: 'Clean', 'Raw'
N_DATA_ROWS_PER_GENRE = 1500  # -1 retrieves all rows

# --- embedding ---
EMBED_STRATEGY = 'DistilBERT'  # one of: 'DistilBERT', 'GloVe'

# --- modeling: preprocessing ---
VAL_PCT = 0.15  # fraction of the data to withhold (handed to create_datasets as val_pct)
BATCH_SIZE = 32  # larger batches train faster but use more memory

# --- modeling: architecture ---
HIDDEN_SIZE = 256
NUM_LAYERS = 2
NUM_HEADS = 8
DROPOUT = 0.2
# Sanity check: the hidden width must split evenly across the attention heads.
assert HIDDEN_SIZE % NUM_HEADS == 0

# --- modeling: training ---
LEARNING_RATE = 1e-3
NUM_EPOCHS = 50
PATIENCE = 5

In [2]:
# packages

## general use
from datetime import datetime

## torch
import torch

## project code
from project_code import data_gathering, genre_classification
from embedding import distilbert, glove
from modeling import preprocessing, training
from architectures import distilbert_clf

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Run-level globals: the wall-clock start (read again by the wrap-up cell)
# and the torch device that models are moved to.
START_TIME = datetime.now()
print(f'Script Start Time = {START_TIME:%Y-%m-%d %H:%M:%S}')

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
print(f'Device = {DEVICE}')

Script Start Time = 2025-03-26 14:42:16
Device = cpu


## Data Gathering

In [4]:
# Load the lyrics (training runs only) and the genre-index -> genre-name mapping.
if TRAIN_MODELS:
    if READ_CLEANED_OR_RAW == 'Raw':
        # NOTE(review): this path never assigns `genre_map`, which the dataset
        # and model cells read -- confirm how the mapping should be obtained
        # after a raw read before relying on this branch end-to-end.
        lyrics = data_gathering.read_and_clean_raw_lyrics(
            n_rows = 'All',
            exclude_non_english = True,
            resample_genres = True,
            save_data = True
        )
    elif READ_CLEANED_OR_RAW == 'Clean':
        lyrics, genre_map = data_gathering.read_cleaned_lyrics(
            n_rows_per_genre = N_DATA_ROWS_PER_GENRE
        )
    else:
        # Fail here with a clear message instead of a NameError on `lyrics`
        # in the embedding cell.
        raise ValueError(
            f"READ_CLEANED_OR_RAW must be 'Clean' or 'Raw', got {READ_CLEANED_OR_RAW!r}"
        )
else:
    # No data is read when only loading saved models, so the mapping is hard-coded.
    genre_map = {0: 'country', 1: 'misc', 2: 'pop', 3: 'rap', 4: 'rb', 5: 'rock'}

## Embedding

In [5]:
# Embed the lyrics with the strategy selected by EMBED_STRATEGY (GloVe or DistilBERT).
if TRAIN_MODELS:
    if EMBED_STRATEGY == 'GloVe':
        lyrics_embed = glove.create_glove_matrix(data = lyrics, target_col = 'cleaned_lyrics')
    elif EMBED_STRATEGY == 'DistilBERT':
        lyrics_embed = distilbert.embed_all_lyrics_v2(
            data = lyrics,
            target_col = 'lyrics',
            batch_size = BATCH_SIZE * 2  # embedding runs at twice the training batch size
        )
    else:
        # Fail here with a clear message instead of a NameError on
        # `lyrics_embed` in the preprocessing cell.
        raise ValueError(
            f"EMBED_STRATEGY must be 'DistilBERT' or 'GloVe', got {EMBED_STRATEGY!r}"
        )

## Modeling

### Preprocessing

In [6]:
# Turn the embedded lyrics into train/val data loaders plus a test dataset.
if TRAIN_MODELS:
    dataset_splits = preprocessing.create_datasets(
        data_embed=lyrics_embed,
        labels=lyrics['genre'],
        label_mapping=genre_map,
        val_pct=VAL_PCT,
        batch_size=BATCH_SIZE,
    )
    lyrics_train, lyrics_val, lyrics_test = dataset_splits

### Train - RNN Model

In [7]:
# Build the baseline recurrent (GRU) classifier and move it to DEVICE.
# Without a training run there is no embedding matrix to inspect, so the
# input width falls back to the hard-coded 768.
if not TRAIN_MODELS:
    embed_dim = 768
else:
    n_songs, embed_dim = lyrics_embed.shape

rnn_kwargs = dict(
    input_dim=embed_dim,
    hidden_dim=HIDDEN_SIZE,
    output_dim=len(genre_map),  # one output per genre in the mapping
    type='GRU',
    num_layers=NUM_LAYERS,
    dropout=DROPOUT,
)
base_model = distilbert_clf.DistilBertRNN(**rnn_kwargs).to(DEVICE)
base_model

DistilBertRNN(
  (rnn): GRU(768, 256, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=512, out_features=6, bias=True)
  (softmax): Softmax(dim=-1)
)

In [8]:
# Score the freshly built RNN on the test dataset before any training.
if TRAIN_MODELS:
    pre_train_acc = training.evaluate_nn_model_against_test_set(
        model=base_model, test_dataset=lyrics_test
    )

In [9]:
# Train the RNN, or restore previously saved weights when training is disabled.
if TRAIN_MODELS:
    training.nn_training(
        model = base_model,
        train_loader = lyrics_train,
        val_loader = lyrics_val,
        learning_rate = LEARNING_RATE,
        num_epochs = NUM_EPOCHS,
        patience = PATIENCE,
        verbose = True,
        print_every = 1
    )
else:
    # weights_only=True restricts unpickling to tensors and plain containers,
    # so a tampered checkpoint cannot execute arbitrary code on load.
    # DEVICE is already a torch.device, so it is passed to map_location as-is.
    state_dict = torch.load(
        'models/MGC_DistilBertRNN.pth',
        map_location = DEVICE,
        weights_only = True
    )
    base_model.load_state_dict(state_dict)
    # Modules are constructed in training mode; switch to eval so dropout is
    # disabled on this inference-only path.
    base_model.eval()

In [10]:
# evaluate model performance - post training
if TRAIN_MODELS:
    post_train_acc = training.evaluate_nn_model_against_test_set(
        model = base_model,
        test_dataset = lyrics_test
    )

    # The `+` format flag prints the real sign of the change; a hard-coded '+'
    # prefix would render a regression as "+-1.23%".
    # NOTE(review): the * 100 assumes accuracies come back as fractions in [0, 1] -- confirm.
    print(f'Training Improvement On Accuracy = {(post_train_acc - pre_train_acc) * 100:+.2f}%')

### Train - Homemade Transformer

In [11]:
# Build the attention-based classifier and move it to DEVICE.
# NOTE(review): the saved repr for this cell shows the attention layers working
# on 768 features, so the NUM_HEADS divisibility that matters is presumably
# against embed_dim rather than HIDDEN_SIZE -- confirm against distilbert_clf.
if not TRAIN_MODELS:
    embed_dim = 768
else:
    n_songs, embed_dim = lyrics_embed.shape

transformer_kwargs = dict(
    input_dim=embed_dim,
    num_heads=NUM_HEADS,
    hidden_dim=HIDDEN_SIZE,
    output_dim=len(genre_map),  # one output per genre in the mapping
    num_layers=NUM_LAYERS,
    dropout=DROPOUT,
)
transformer_model = distilbert_clf.DistilBertDeepTransformer(**transformer_kwargs).to(DEVICE)
transformer_model

DistilBertDeepTransformer(
  (attention_layers): ModuleList(
    (0-1): 2 x MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
    )
  )
  (ffn_layers): ModuleList(
    (0-1): 2 x Sequential(
      (0): Linear(in_features=768, out_features=256, bias=True)
      (1): ReLU()
      (2): Linear(in_features=256, out_features=768, bias=True)
      (3): Dropout(p=0.5, inplace=False)
    )
  )
  (norm_layers_attn): ModuleList(
    (0-1): 2 x LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (norm_layers_ffn): ModuleList(
    (0-1): 2 x LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (fc): Linear(in_features=768, out_features=6, bias=True)
)

In [12]:
# Score the freshly built transformer on the test dataset before any training.
if TRAIN_MODELS:
    pre_train_acc = training.evaluate_nn_model_against_test_set(
        model=transformer_model, test_dataset=lyrics_test
    )

In [13]:
# Train the transformer, or restore previously saved weights when training is disabled.
if TRAIN_MODELS:
    training.nn_training(
        model = transformer_model,
        train_loader = lyrics_train,
        val_loader = lyrics_val,
        learning_rate = LEARNING_RATE,
        num_epochs = NUM_EPOCHS,
        patience = PATIENCE,
        verbose = True,
        print_every = 1
    )
else:
    # weights_only=True restricts unpickling to tensors and plain containers,
    # so a tampered checkpoint cannot execute arbitrary code on load.
    # DEVICE is already a torch.device, so it is passed to map_location as-is.
    state_dict = torch.load(
        'models/MGC_DistilBertDeepTransformer.pth',
        map_location = DEVICE,
        weights_only = True
    )
    transformer_model.load_state_dict(state_dict)
    # Modules are constructed in training mode; switch to eval so dropout is
    # disabled on this inference-only path.
    transformer_model.eval()

In [14]:
# evaluate model performance - post training
if TRAIN_MODELS:
    post_train_acc = training.evaluate_nn_model_against_test_set(
        model = transformer_model,
        test_dataset = lyrics_test
    )

    # The `+` format flag prints the real sign of the change; a hard-coded '+'
    # prefix would render a regression as "+-1.23%".
    # NOTE(review): the * 100 assumes accuracies come back as fractions in [0, 1] -- confirm.
    print(f'Training Improvement On Accuracy = {(post_train_acc - pre_train_acc) * 100:+.2f}%')

## Custom Lyric Genre Prediction

In [15]:
# Sample lyrics used to try the trained classifiers on custom input.
# Each string opens with exactly three quotes; a fourth quote would become the
# first character of the lyric text itself.
pop_test = """
One. Don't pick up the phone. You know he's only calling cause he's drunk and alone.
Two. Don't let him in. You'll have to kick him out again.
Three. Don't be a friend. Cause you know you'll only wake up in his bed in the morning.
Cause if you're under him. You're not getting over him.
"""

rock_test = """
We come from the land of the ice and snow
From the midnight sun where the hot springs blow
The hammer of the gods will drive our ships to new lands
To fight the horde, singing and crying: Valhalla, I am coming!
On we sweep with threshing oar
Our only goal will be the western shore
"""

In [16]:
# Classify one of the sample lyrics with the chosen classifier and echo the result.
predicted_genre = genre_classification.distilerbert_clf_prediction(
    lyrics=rock_test,
    clf_model=transformer_model,  # either base_model or transformer_model
    label_mapping=genre_map,
    device=DEVICE,
)
predicted_genre

Lyrics:

We come from the land of the ice and snow
From the midnight sun where the hot springs blow
The hammer of the gods will drive our ships to new lands
To fight the horde, singing and crying: Valhalla, I am coming!
On we sweep with threshing oar
Our only goal will be the western shore


Predicted Genre: rock (idx = 5)


'rock'

In [17]:
# wrap up: report when the run finished and how long it took
END_TIME = datetime.now()
# total_seconds() covers the whole interval; timedelta.seconds is only the
# sub-day remainder (0-86399) and would under-report runs longer than 24 hours.
SCRIPT_TIME = (END_TIME - START_TIME).total_seconds()
print(f'Script End Time = {END_TIME.strftime("%Y-%m-%d %H:%M:%S")}')
print(f'Duration = {SCRIPT_TIME / 60:.2f}min')

Script End Time = 2025-03-26 14:42:17
Duration = 0.00min
