In [1]:
!pip install datasets
!pip install accelerate -U
import transformers
import torch
import os, sys, random, re, collections, string
import numpy as np
import csv
import sklearn.model_selection
import sklearn.metrics
import heapq
import matplotlib
from sklearn.model_selection import train_test_split
import tqdm
from datasets import load_dataset
from transformers import AutoModelForCausalLM
from collections import Counter
from transformers import BertTokenizer
from transformers import XLNetTokenizer
from transformers import pipeline
from transformers import TrainingArguments
from transformers import BertForSequenceClassification, XLNetForSequenceClassification
from transformers import Trainer



tokenizer_bert = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer_xlnet = XLNetTokenizer.from_pretrained("xlnet-base-cased")

if torch.cuda.is_available():
    print("CUDA is available. Training on GPU.")
else:
    print("CUDA is not available. Training on CPU.")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


CUDA is available. Training on GPU.


In [2]:
emotion_pipeline = pipeline("text-classification", model = "joeddav/distilbert-base-uncased-go-emotions-student",device = device)
dataset = load_dataset("chloeliu/lyrics")


config.json:   0%|          | 0.00/1.75k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/421 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/20.0 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/27.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/28372 [00:00<?, ? examples/s]

In [3]:
def classify_emotions_batch(batch):
    texts = [' '.join(lyrics) for lyrics in batch['lyrics']]
    texts = [text[:512] for text in texts]
    results = emotion_pipeline(texts)
    dominant_emotions = []
    for result in results:
        if result:
            dominant_emotions.append(result['label'])
        else:
            dominant_emotions.append('None')
    return {'emotion': dominant_emotions}

emotion_labeled_dataset = dataset.map(classify_emotions_batch, batched=True)

Map:   0%|          | 0/28372 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [None]:
emotion_labeled_dataset.save_to_disk('emotion_labeled_dataset-new')

Saving the dataset (0/1 shards):   0%|          | 0/28372 [00:00<?, ? examples/s]

In [None]:
emotion_labeled_dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'artist_name', 'track_name', 'release_date', 'genre', 'lyrics', 'len', 'dating', 'violence', 'world/life', 'night/time', 'shake the audience', 'family/gospel', 'romantic', 'communication', 'obscene', 'music', 'movement/places', 'light/visual perceptions', 'family/spiritual', 'like/girls', 'sadness', 'feelings', 'danceability', 'loudness', 'acousticness', 'instrumentalness', 'valence', 'energy', 'topic', 'age', 'emotion'],
        num_rows: 28372
    })
})

In [4]:
def get_label_dict(dataset):
    unique_labels = sorted(set(dataset['emotion']))
    label_dict = {label: idx for idx, label in enumerate(unique_labels)}
    return label_dict

label_dict = get_label_dict(emotion_labeled_dataset['train'])

def preprocess_function(examples, tokenizer):
    tokenized_inputs = tokenizer(examples['lyrics'], truncation=True, max_length=512, padding="max_length")

    tokenized_inputs['labels'] = [label_dict[label] for label in examples['emotion']]

    return tokenized_inputs

def tokenize_function(examples):
    return preprocess_function(examples, tokenizer_bert)
def tokenize_function_xlnet(examples):
    return preprocess_function(examples,tokenizer_xlnet)

In [None]:
train_test_split = emotion_labeled_dataset["train"].train_test_split(test_size=0.2)
train_dataset = train_test_split['train'].map(tokenize_function, batched=True)
test_dataset = train_test_split['test'].map(tokenize_function, batched=True)
train_dataset_xlnet = train_test_split['train'].map(tokenize_function_xlnet, batched=True)
test_dataset_xlnet = train_test_split['test'].map(tokenize_function_xlnet, batched=True)
num_labels = len(label_dict)

In [9]:



model_bert = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
model_bert.to(device)
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)


trainer_bert = Trainer(
    model=model_bert,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer_bert.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.1766
1000,0.0435
1500,0.0311
2000,0.0324
2500,0.033
3000,0.0292
3500,0.0263
4000,0.015


TrainOutput(global_step=4257, training_loss=0.04663395540014746, metrics={'train_runtime': 3527.8344, 'train_samples_per_second': 19.301, 'train_steps_per_second': 1.207, 'total_flos': 1.7915816582516736e+16, 'train_loss': 0.04663395540014746, 'epoch': 3.0})

In [10]:
trainer_bert.evaluate()

{'eval_loss': 0.0450221486389637,
 'eval_runtime': 108.2145,
 'eval_samples_per_second': 52.442,
 'eval_steps_per_second': 3.281,
 'epoch': 3.0}

In [15]:
predictions = trainer_bert.predict(test_dataset)


In [17]:
predicted_labels = np.argmax(predictions.predictions, axis=-1)


In [20]:
print(label_dict)

{'admiration': 0, 'caring': 1, 'confusion': 2, 'realization': 3}


In [None]:
def get_reverse_label_dict(label_dict):
    reverse_label_dict = {idx: label for label, idx in label_dict.items()}
    return reverse_label_dict

reverse_label_dict = get_reverse_label_dict(label_dict)

In [None]:
model_xlnet = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=num_labels)
model_xlnet.to(device)
training_args_xlnet = TrainingArguments(
    output_dir='./results_xlnet',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs_xlnet',
    evaluation_strategy="epoch"
)

trainer_xlnet = Trainer(
    model=model_xlnet,
    args=training_args_xlnet,
    train_dataset=train_dataset_xlnet,
    eval_dataset=test_dataset_xlnet
)