# Install and Import Libraries

In [1]:
!pip install --upgrade transformers --quiet
!pip install evaluate --quiet
!pip install --upgrade sentencepiece --quiet

import os

import pandas as pd
import numpy as np

import evaluate

from transformers import T5ForConditionalGeneration, T5Tokenizer

# Instantiate T5 Models & Tokenizers

In [2]:
baseline_checkpoint = "t5-large"

# Directory holding the fine-tuned checkpoint. The default is the original
# author's local path; set the FINETUNED_CHECKPOINT environment variable to
# point at the checkpoint on another machine without editing the notebook.
finetuned_checkpoint = os.environ.get(
    "FINETUNED_CHECKPOINT",
    "/Users/mikelu/extssd/git/ucb_mids_W266_project/t5-large-lyric-generation/",
)

# maximum number of input tokens; used as the tokenizers' model_max_length
max_length = 512


def load_t5(checkpoint):
    """Load a T5 model and its tokenizer from the same checkpoint.

    Args:
        checkpoint: model identifier or local directory accepted by
            `from_pretrained`.

    Returns:
        A `(model, tokenizer)` tuple; the tokenizer is created with
        `model_max_length=max_length`.
    """
    model = T5ForConditionalGeneration.from_pretrained(checkpoint)
    tokenizer = T5Tokenizer.from_pretrained(
        checkpoint,
        model_max_length=max_length
    )
    return model, tokenizer


baseline_model, baseline_tokenizer = load_t5(baseline_checkpoint)
finetuned_model, finetuned_tokenizer = load_t5(finetuned_checkpoint)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Load Test Data

Each record represents a song, with lyrics stored as multi-line text in the "lyrics" column.

In [4]:
# Read the test set and discard the "Unnamed: 0" column
# (presumably an index column written when the CSV was saved).
df = pd.read_csv("test_no_markers.csv").drop(columns=["Unnamed: 0"])
df

Unnamed: 0,id,title,tag,artist,year,lyrics
0,5465961,Be Where We Are,country,Julia Cole,2021,Cash only bar with a juke box\nPut a dollar in...
1,1781039,Only Daddy That’ll Walk the Line,country,Waylon Jennings,1968,Everybody knows you've been stepping on my toe...
2,672903,I Cant Be Your Sweetheart,country,The Carter Family,1998,Last night I told my heart's love\nAll under t...
3,934006,Fire Line Road,country,James McMurtry,2008,"My name is Alice Walker, they never told me wh..."
4,1962549,Run Around,country,Austin Lucas,2011,You want answers; I don't have any\nJust more ...
...,...,...,...,...,...,...
19995,2391145,Life,rap,Criss Lyric,2015,My niggas all about cash\nMy niggas tryna rack...
19996,7456521,Dumptruck,rap,Full Tac,2021,"Dump, dump, dump, dump, dump\nDump it\nBack it..."
19997,2412522,Real Nigga Files,rap,Kodak Black,2015,Real nigga files (Hell yeah)\nProject files (Y...
19998,7799558,Science Project,rap,Fly Anakin,2018,My bitch a choosy lover never fuck without a r...


# Generate Lyrics

For each song in the test dataset:

1. Construct a prompt consisting of a task prefix followed by the first lines of the song's lyrics (up to 8 lines)
2. Tokenize the prompt
3. Have the baseline model generate tokens representing the next line of song lyrics
4. Repeat steps 1-3 for the finetuned model, which uses its own task prefix
5. Store the results for each song in a dictionary, collected in a list (to be converted to a DataFrame later)

The reference for each model prediction is the line that immediately follows the prompt lines in the song's lyrics.



In [None]:
max_input_lyrics = 8        # number of lines of the song's lyrics to pass to the model


def split_snippet_and_reference(lyrics, max_input_lines, min_token_budget=20):
    """Split a song's lyrics into a prompt snippet and the line to predict.

    Args:
        lyrics: the song's lyrics as a single newline-separated string.
        max_input_lines: maximum number of leading lines to use as input.
        min_token_budget: lower bound for the returned generation budget.

    Returns:
        A `(snippet, reference, max_new_tokens)` tuple where `snippet` is the
        leading lines joined by newlines, `reference` is the line that
        follows the snippet, and `max_new_tokens` is the number of
        space-separated words in the song's longest line (never less than
        `min_token_budget`).
    """
    lyric_lines = lyrics.split("\n")

    # if the song has fewer than max_input_lines + 1 lines, use all but
    # the last line as input so there is still a line left to predict
    num_input_lines = min(max_input_lines, len(lyric_lines) - 1)

    snippet = "\n".join(lyric_lines[:num_input_lines])
    reference = lyric_lines[num_input_lines]

    # the longest line of the whole song (counted in space-separated words)
    # is used as the budget of new tokens the model may generate
    longest_line = max(len(line.split(" ")) for line in lyric_lines)
    max_new_tokens = max(min_token_budget, longest_line)

    return snippet, reference, max_new_tokens


def generate_next_line(model, tokenizer, prompt, max_new_tokens):
    """Generate text for `prompt` with `model` and return it as a string.

    The prompt is truncated to `max_length` tokens. The attention mask
    produced by the tokenizer is passed to `generate` explicitly instead of
    leaving it to be inferred from the input ids.
    """
    input_tokens = tokenizer(
        prompt,
        max_length=max_length,
        truncation=True,
        return_tensors="pt"
    )

    output_tokens = model.generate(
        input_ids=input_tokens["input_ids"],
        attention_mask=input_tokens["attention_mask"],
        max_new_tokens=max_new_tokens,
        num_beams=2,
        repetition_penalty=1.2
    )

    # convert the output to a text string
    return "".join(
        tokenizer.batch_decode(
            output_tokens,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )
    )


# one result dictionary per processed song; appended to (rather than indexed
# by the DataFrame's index label) so it does not depend on df's index
output_dict = []

for _, row in df.iterrows():

    # rows without lyric text (e.g. NaN from an empty CSV field) cannot be
    # split into lines, so they are skipped
    if not isinstance(row.lyrics, str):
        continue

    snippet, reference, max_words = split_snippet_and_reference(
        row.lyrics,
        max_input_lyrics
    )

    # the baseline T5 model is prompted with the "summarize:" task prefix
    baseline_output = generate_next_line(
        baseline_model,
        baseline_tokenizer,
        "summarize: " + snippet,
        max_words
    )

    # the finetuned model uses a different task prefix
    finetuned_output = generate_next_line(
        finetuned_model,
        finetuned_tokenizer,
        "write next line for: " + snippet,
        max_words
    )

    # store the results for this song
    output_dict.append({
        "song_id" : row.id,
        "input" : snippet,
        "reference" : reference,
        "baseline_output" : baseline_output,
        "finetuned_output" : finetuned_output
    })

# Convert Output Dictionary to DataFrame

In [None]:
# Build a DataFrame with one row per entry in output_dict; the dictionary
# keys (song_id, input, reference, baseline_output, finetuned_output)
# become the columns.
results_df = pd.DataFrame(output_dict)
results_df

# Calculate ROUGE and BLEU Scores

In [None]:
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

# Hand the metrics plain Python lists so that scoring does not depend on
# how results_df happens to be indexed.
references = results_df["reference"].tolist()


def score_predictions(predictions):
    """Score `predictions` against the shared `references`.

    Returns:
        A `(rouge_scores, bleu_scores)` tuple holding the result
        dictionaries returned by the two metrics' `compute` calls.
    """
    rouge_scores = rouge.compute(
        predictions=predictions,
        references=references
    )
    bleu_scores = bleu.compute(
        predictions=predictions,
        references=references
    )
    return rouge_scores, bleu_scores


def summarize_scores(model_name, bleu_scores, rouge_scores):
    """Collect the headline metrics for one model into a flat dictionary."""
    return {
        "model" : model_name,
        "bleu" : bleu_scores["bleu"],
        "rouge1" : rouge_scores["rouge1"],
        "rouge2" : rouge_scores["rouge2"],
        "rougeL" : rouge_scores["rougeL"]
    }


baseline_rouge_scores, baseline_bleu_scores = score_predictions(
    results_df["baseline_output"].tolist()
)
finetuned_rouge_scores, finetuned_bleu_scores = score_predictions(
    results_df["finetuned_output"].tolist()
)

# one summary row per model
score_dict = [
    summarize_scores("baseline", baseline_bleu_scores, baseline_rouge_scores),
    summarize_scores("finetuned", finetuned_bleu_scores, finetuned_rouge_scores),
]

score_df = pd.DataFrame(score_dict)
score_df