# CSE-6242 - Team 157 - Group Project

__TODO:__  
1. Remove other stop words.

In [None]:
# global assumption panel

## data gathering
READ_CLEANED_OR_RAW = 'Clean' # ['Clean', 'Raw']
N_DATA_ROWS_PER_GENRE = 1500 # Use -1 to retrieve all rows

## embedding
EMBED_STRATEGY = 'DistilBERT' # ['DistilBERT', 'GloVe']

## modeling - preprocessing
VAL_PCT = 0.15 # the percent of data we want to withhold for testing
BATCH_SIZE = 32 # bigger means faster training, but more memory use

## modeling - architecture
HIDDEN_SIZE = 256
NUM_LAYERS = 2
DROPOUT = 0.2
NUM_HEADS = 8
assert HIDDEN_SIZE % NUM_HEADS == 0

## modeling - training
LEARNING_RATE = 0.001
NUM_EPOCHS = 50
PATIENCE = 5

In [2]:
# packages

## torch
import torch

## project code
from project_code import data_gathering, genre_classification
from embedding import distilbert, glove
from modeling import preprocessing, training
from architectures import distilbert_clf

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# more global assumptions
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device = {DEVICE}')

Device = cpu


## Data Gathering

In [4]:
# read data
if READ_CLEANED_OR_RAW == 'Raw': 
    lyrics = data_gathering.read_and_clean_raw_lyrics(
        n_rows = 'All',
        exclude_non_english = True,
        resample_genres = True,
        save_data = True
    )
elif READ_CLEANED_OR_RAW == 'Clean':
    lyrics, genre_map = data_gathering.read_cleaned_lyrics(
        n_rows_per_genre = N_DATA_ROWS_PER_GENRE
    )

Genre Counts Before Resampling:
	pop: 394195
	rap: 394195
	rock: 394195
	rb: 155082
	misc: 140986
	country: 86658

Genre Counts After Resampling:
	country: 100
	misc: 100
	pop: 100
	rap: 100
	rb: 100
	rock: 100

Cleaned Lyrics: Shape = (600, 2)
	Columns = ['lyrics', 'genre']
Genre Mapping = {0: 'country', 1: 'misc', 2: 'pop', 3: 'rap', 4: 'rb', 5: 'rock'}


## Embedding

In [5]:
# generate embeddings using word2vec
# if EMBED_STRATEGY == 'Word2Vec':
#     embedding_model = word2vec.apply_word2vec(lyrics['tokenized_text'].tolist())
if EMBED_STRATEGY == 'GloVe':
    lyrics_embed = glove.create_glove_matrix(data = lyrics, target_col = 'cleaned_lyrics')
elif EMBED_STRATEGY == 'DistilBERT':
    # lyrics_embed = distilbert.distilbert_embed_all_docs(data = lyrics, target_col = 'lyrics')
    lyrics_embed = distilbert.embed_all_lyrics_v2(
        data = lyrics,
        target_col = 'lyrics',
        batch_size = BATCH_SIZE * 2
    )

100%|██████████| 10/10 [02:21<00:00, 14.11s/it]

DistilBERT Embedded Lyrics: torch.Size([600, 768])





## Modeling

### Preprocessing

In [6]:
# create data loaders (train, val) and data sets (test)
lyrics_train, lyrics_val, lyrics_test = preprocessing.create_datasets(
    data_embed = lyrics_embed,
    labels = lyrics['genre'],
    label_mapping = genre_map,
    val_pct = VAL_PCT,
    batch_size = BATCH_SIZE
)

Train: 14 Batches of Size 32 For Training
Val: 3 Batches of Size 32 For Training
Test: 3 Batches of Size 32 For Final Eval


### Train - RNN Model

In [7]:
# define the baseline RNN Model
n_songs, embed_dim = lyrics_embed.shape
base_model = distilbert_clf.DistilBertRNN(
    input_dim = embed_dim,
    hidden_dim = HIDDEN_SIZE,
    output_dim = len(lyrics['genre'].unique()),
    type = 'GRU',
    num_layers = NUM_LAYERS,
    dropout = DROPOUT
).to(DEVICE)
base_model

DistilBertRNN(
  (rnn): GRU(768, 256, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=512, out_features=6, bias=True)
  (softmax): Softmax(dim=-1)
)

In [8]:
# evaluate model performance - pre training
pre_train_acc = training.evaluate_nn_model_against_test_set(
    model = base_model,
    test_dataset = lyrics_test
)

Model Accuracy = 20.00%


In [9]:
# train the model
training.nn_training(
    model = base_model,
    train_loader = lyrics_train,
    val_loader = lyrics_val,
    learning_rate = LEARNING_RATE,
    num_epochs = NUM_EPOCHS,
    patience = PATIENCE,
    verbose = True,
    print_every = 1
)

100%|██████████| 14/14 [00:00<00:00, 34.72it/s]


[1 / 50] Train Loss = 1.7501, Val Loss = 1.6802 **New Best Model**


100%|██████████| 14/14 [00:00<00:00, 57.02it/s]


[2 / 50] Train Loss = 1.6057, Val Loss = 1.5798 **New Best Model**


100%|██████████| 14/14 [00:00<00:00, 61.11it/s]


[3 / 50] Train Loss = 1.5430, Val Loss = 1.5617 **New Best Model**


100%|██████████| 14/14 [00:00<00:00, 60.05it/s]


[4 / 50] Train Loss = 1.4973, Val Loss = 1.5309 **New Best Model**


100%|██████████| 14/14 [00:00<00:00, 62.02it/s]


[5 / 50] Train Loss = 1.4754, Val Loss = 1.5175 **New Best Model**


100%|██████████| 14/14 [00:00<00:00, 62.75it/s]


[6 / 50] Train Loss = 1.4985, Val Loss = 1.5200


100%|██████████| 14/14 [00:00<00:00, 57.35it/s]


[7 / 50] Train Loss = 1.5294, Val Loss = 1.6124


100%|██████████| 14/14 [00:00<00:00, 60.06it/s]


[8 / 50] Train Loss = 1.5416, Val Loss = 1.5548


100%|██████████| 14/14 [00:00<00:00, 58.63it/s]


[9 / 50] Train Loss = 1.4784, Val Loss = 1.5463


100%|██████████| 14/14 [00:00<00:00, 51.34it/s]


[10 / 50] Train Loss = 1.4689, Val Loss = 1.5665
Early Stopping Triggered. Training Stopped.
	Best Epoch = 4, Best Val Loss = 1.5174845854441326


In [10]:
# evaluate model performance - post traing
post_train_acc = training.evaluate_nn_model_against_test_set(
    model = base_model,
    test_dataset = lyrics_test
)

print(f'Training Improvement On Accuracy = +{(post_train_acc - pre_train_acc) * 100:.2f}%')

Model Accuracy = 45.56%
Training Improvement On Accuracy = +25.56%


### Train - Homemade Transformer

In [11]:
# define the transformer model
n_songs, embed_dim = lyrics_embed.shape
transformer_model = distilbert_clf.DistilBertDeepTransformer(
    input_dim = embed_dim,
    num_heads = NUM_HEADS,
    hidden_dim = HIDDEN_SIZE,
    output_dim = len(lyrics['genre'].unique()),
    num_layers = NUM_LAYERS,
    dropout = DROPOUT
).to(DEVICE)
transformer_model

DistilBertDeepTransformer(
  (attention_layers): ModuleList(
    (0-1): 2 x MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
    )
  )
  (ffn_layers): ModuleList(
    (0-1): 2 x Sequential(
      (0): Linear(in_features=768, out_features=256, bias=True)
      (1): ReLU()
      (2): Linear(in_features=256, out_features=768, bias=True)
      (3): Dropout(p=0.5, inplace=False)
    )
  )
  (norm_layers_attn): ModuleList(
    (0-1): 2 x LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (norm_layers_ffn): ModuleList(
    (0-1): 2 x LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (fc): Linear(in_features=768, out_features=6, bias=True)
)

In [12]:
# evaluate model performance - pre training
pre_train_acc = training.evaluate_nn_model_against_test_set(
    model = transformer_model,
    test_dataset = lyrics_test
)

Model Accuracy = 22.22%


In [13]:
# train the model
training.nn_training(
    model = transformer_model,
    train_loader = lyrics_train,
    val_loader = lyrics_val,
    learning_rate = LEARNING_RATE,
    num_epochs = NUM_EPOCHS,
    patience = PATIENCE,
    verbose = True,
    print_every = 1
)

100%|██████████| 14/14 [00:00<00:00, 25.56it/s]


[1 / 50] Train Loss = 2.8262, Val Loss = 2.0594 **New Best Model**


100%|██████████| 14/14 [00:00<00:00, 27.35it/s]


[2 / 50] Train Loss = 1.9673, Val Loss = 1.9006 **New Best Model**


100%|██████████| 14/14 [00:00<00:00, 27.62it/s]


[3 / 50] Train Loss = 1.8854, Val Loss = 1.8952 **New Best Model**


100%|██████████| 14/14 [00:00<00:00, 28.69it/s]


[4 / 50] Train Loss = 1.9160, Val Loss = 1.8079 **New Best Model**


100%|██████████| 14/14 [00:00<00:00, 26.18it/s]


[5 / 50] Train Loss = 1.8709, Val Loss = 2.9883


100%|██████████| 14/14 [00:00<00:00, 25.39it/s]


[6 / 50] Train Loss = 2.0471, Val Loss = 1.7985 **New Best Model**


100%|██████████| 14/14 [00:00<00:00, 22.71it/s]


[7 / 50] Train Loss = 1.8604, Val Loss = 1.9168


100%|██████████| 14/14 [00:00<00:00, 22.94it/s]


[8 / 50] Train Loss = 1.8826, Val Loss = 1.8272


100%|██████████| 14/14 [00:00<00:00, 21.13it/s]


[9 / 50] Train Loss = 1.8541, Val Loss = 1.7923 **New Best Model**


100%|██████████| 14/14 [00:00<00:00, 24.68it/s]


[10 / 50] Train Loss = 1.8627, Val Loss = 1.8472


100%|██████████| 14/14 [00:00<00:00, 26.41it/s]


[11 / 50] Train Loss = 1.8609, Val Loss = 1.7810 **New Best Model**


100%|██████████| 14/14 [00:00<00:00, 22.05it/s]


[12 / 50] Train Loss = 1.8587, Val Loss = 1.8191


100%|██████████| 14/14 [00:00<00:00, 25.04it/s]


[13 / 50] Train Loss = 1.8441, Val Loss = 1.8054


100%|██████████| 14/14 [00:00<00:00, 27.53it/s]


[14 / 50] Train Loss = 1.8686, Val Loss = 1.8801


100%|██████████| 14/14 [00:00<00:00, 28.77it/s]


[15 / 50] Train Loss = 1.8624, Val Loss = 1.7923


100%|██████████| 14/14 [00:00<00:00, 28.26it/s]


[16 / 50] Train Loss = 1.8443, Val Loss = 1.8580
Early Stopping Triggered. Training Stopped.
	Best Epoch = 10, Best Val Loss = 1.7809595664342244


In [14]:
# evaluate model performance - post traing
post_train_acc = training.evaluate_nn_model_against_test_set(
    model = transformer_model,
    test_dataset = lyrics_test
)

print(f'Training Improvement On Accuracy = +{(post_train_acc - pre_train_acc) * 100:.2f}%')

Model Accuracy = 15.56%
Training Improvement On Accuracy = +-6.67%


## Custom Lyric Genre Prediction

In [None]:
# make a prediction for a custom song
custom_lyrics = "You can't hurry love.\nNo you'll just have to wait."

genre_classification.distilerbert_clf_prediction(
    lyrics = custom_lyrics,
    clf_model = transformer_model,
    label_mapping = genre_map,
    device = DEVICE
)

Lyrics:
You can't hurry love.
No you'll just have to wait. Country roads.

Predicted Genre: misc (idx = 1)


'misc'