# Deep Model Building Notebook

__Objective:__ Import the GPT-2 model from Hugging Face and fine-tune it to improve its performance on our data set (`song_lyrics_clean.csv`).

## Packages

__Non Code Package Folders__  
- data  
   - Contains the CSV files we read in  
- models  
   - Contains the state dict for the trained models  

In [1]:
# packages

## custom files

### data gathering = where the csv file is read in
### genre classification = uses the trained models to predict a custom song genre
from project_code import data_gathering, genre_classification

### nn_clf = Where Nick has made his neural networks based on DistilBERT
from architectures import nn_clf

### distilbert = Pey's work on distilbert embedding
### glove = Cassidy's work on glove embedding
from embedding import distilbert, glove

### preprocessing = for splitting tensors into numerous sets, and loading into DataLoader
### training = Contains a training loop for training a model against train and val
from modeling import preprocessing, training

In [2]:
# Import Libraries
import torch
import pandas as pd
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder

In [3]:
# Read the cleaned lyrics CSV; later cells rely on its 'lyrics' and 'genre' columns.
# Only the first N_ROWS rows are read; set N_ROWS = None to load the whole file.
# NOTE(review): the reports at the bottom of this notebook list only two genres
# ('country' and 'misc') for this slice -- confirm that is the intended subset.
DATA_PATH = 'data/song_lyrics_clean.csv'
N_ROWS = 90000
df = pd.read_csv(DATA_PATH, nrows=N_ROWS)

In [4]:
# distinct_genres = df['genre'].unique()
# print(df.head(5))
# print(distinct_genres)

In [5]:
# Map every genre string to an integer class id and keep it in a new column
label_encoder = LabelEncoder()
encoded_genres = label_encoder.fit_transform(df["genre"])
df["genre_label"] = encoded_genres

# Number of target classes known to the encoder
num_labels = len(label_encoder.classes_)
# Class ids present in the frame, in the order they first occur
distinct_labels = df["genre_label"].unique()

In [6]:
# print(num_labels)
# print(label_encoder.classes_)
# print(distinct_labels)

In [7]:
# The tokenizer converts raw lyrics text into the token ids the GPT-2 model consumes
TOKENIZER_CHECKPOINT = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

# GPT-2 has no padding token by default, so the end-of-sequence token doubles as padding
tokenizer.pad_token = tokenizer.eos_token

In [8]:
# Tokenize dataset
def tokenize_function(examples):
    """Tokenize the lyrics of a batch of examples into fixed-length inputs.

    Args:
        examples: Mapping with a "lyrics" entry holding the text(s) to encode.

    Returns:
        The tokenizer's output for those lyrics, truncated and padded to
        exactly 512 tokens per example.
    """
    lyrics = examples["lyrics"]
    encoded = tokenizer(
        lyrics,
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    return encoded

# Wrap the DataFrame as a Hugging Face Dataset, then tokenize it batch by batch
raw_dataset = Dataset.from_pandas(df)
dataset = raw_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/90000 [00:00<?, ? examples/s]

In [9]:
# Prepare Dataset for model:
# expose the encoded genre under the column name "labels", and have the dataset
# return torch tensors for just the three model-facing columns.
dataset = dataset.rename_column("genre_label", "labels")
model_columns = ["input_ids", "attention_mask", "labels"]
dataset.set_format(type="torch", columns=model_columns)

In [10]:
# dataset.data

In [11]:
# Split dataset: hold out 10% of the examples.
# The seed pins the shuffle so a fresh Restart & Run All reproduces the same
# train/test partition (and therefore comparable metrics).
# NOTE(review): test_dataset is also handed to the Trainer as eval_dataset while
# load_best_model_at_end=True, so it influences which checkpoint is kept. The
# final report on it is therefore optimistic -- consider carving out a separate
# validation split for checkpoint selection.
SPLIT_SEED = 42
train_test_split = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

In [12]:
# print(train_dataset)
# print(train_dataset.features)

# from datasets import Dataset, Features, Value, Sequence

# # Define the new feature schema with the updated 'labels' column type
# new_features = Features({
#     'lyrics': Value(dtype='string'),
#     'genre': Value(dtype='string'),
#     'labels': Value(dtype='float32'),  # Change to float32
#     'input_ids': Sequence(feature=Value(dtype='int32'), length=-1),
#     'attention_mask': Sequence(feature=Value(dtype='int8'), length=-1)
# })

# # Re-create the dataset with the updated features
# train_dataset = train_dataset.cast(new_features)

# print(train_dataset.features)

In [13]:
# GPT-2 backbone with a sequence-classification head sized to the number of genres
model = GPT2ForSequenceClassification.from_pretrained(
    "gpt2",
    num_labels=num_labels,
)

# Mirror the tokenizer setup: the EOS id is reused as the padding id
model.config.pad_token_id = model.config.eos_token_id

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Training configuration, gathered in one dict so each setting is easy to find and tweak
training_config = {
    # where checkpoints and logs are written
    "output_dir": "./results",
    "logging_dir": "./logs",
    "logging_steps": 10,
    # evaluate and checkpoint once per epoch, then restore the best checkpoint
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
    # optimisation settings
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 4,
    "num_train_epochs": 3,
    "weight_decay": 0.01,
}
training_args = TrainingArguments(**training_config)

In [15]:
# Bundle the model, its training configuration and both data splits into a Trainer.
# The held-out split serves as the per-epoch evaluation set.
trainer = Trainer(
    model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    processing_class=tokenizer,
)

In [16]:
# Run fine-tuning; the returned training summary is displayed as the cell output
train_output = trainer.train()
train_output

Epoch,Training Loss,Validation Loss
1,0.2395,0.0893
2,0.001,0.101784
3,0.1178,0.111366


TrainOutput(global_step=60750, training_loss=0.09222020588848293, metrics={'train_runtime': 47700.0867, 'train_samples_per_second': 5.094, 'train_steps_per_second': 1.274, 'total_flos': 6.3495110393856e+16, 'train_loss': 0.09222020588848293, 'epoch': 3.0})

In [17]:
# Save model and tokenizer
# First record the genre names in the model config, so the saved checkpoint can map
# class ids back to genres without the LabelEncoder that only lives in this session.
model.config.id2label = {idx: str(genre) for idx, genre in enumerate(label_encoder.classes_)}
model.config.label2id = {genre: idx for idx, genre in model.config.id2label.items()}

# Model weights/config and tokenizer files go to the same directory
SAVE_DIR = "./fine_tuned_gpt2_genre"
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

('./fine_tuned_gpt2_genre/tokenizer_config.json',
 './fine_tuned_gpt2_genre/special_tokens_map.json',
 './fine_tuned_gpt2_genre/vocab.json',
 './fine_tuned_gpt2_genre/merges.txt',
 './fine_tuned_gpt2_genre/added_tokens.json')

In [18]:
# Evaluate the Model

# Score the trained model on the held-out split and show the metric dictionary
results = trainer.evaluate(eval_dataset=test_dataset)
print(results)

{'eval_loss': 0.08929992467164993, 'eval_runtime': 408.2663, 'eval_samples_per_second': 22.044, 'eval_steps_per_second': 5.511, 'epoch': 3.0}


In [19]:
# Confusion Matrix
from sklearn.metrics import confusion_matrix
import numpy as np

# Get the model's raw outputs (logits, not probabilities) and the true label ids
predictions, labels, _ = trainer.predict(test_dataset)

# The predicted class is the index of the largest score in each row
predicted_labels = np.argmax(predictions, axis=1)

# Use every encoded class id in ascending order (0 .. n_classes - 1). That is the
# order of label_encoder.classes_, so row/column i of the matrix is genre classes_[i].
# distinct_labels is in order of first appearance in the data, which only matches
# this order by coincidence.
all_possible_labels = np.arange(len(label_encoder.classes_))

# Generate Confusion matrix (rows = true class, columns = predicted class)
cm = confusion_matrix(labels, predicted_labels, labels=all_possible_labels)
print(cm)

[[8642   41]
 [ 120  197]]


In [20]:
# Classification Report
from sklearn.metrics import classification_report

# Generate Classification report
# Pass the class ids explicitly so target_names stays aligned with them even when
# some genre has no examples in the test split; without `labels`, sklearn raises
# if the number of observed classes differs from len(target_names).
class_ids = list(range(len(label_encoder.classes_)))
print(
    classification_report(
        labels,
        predicted_labels,
        labels=class_ids,
        target_names=label_encoder.classes_,
    )
)

              precision    recall  f1-score   support

     country       0.99      1.00      0.99      8683
        misc       0.83      0.62      0.71       317

    accuracy                           0.98      9000
   macro avg       0.91      0.81      0.85      9000
weighted avg       0.98      0.98      0.98      9000

