Dependencies

In [1]:
! pip install gtts
! pip install pydub

Collecting gtts
  Downloading gTTS-2.4.0-py3-none-any.whl (29 kB)
Installing collected packages: gtts
Successfully installed gtts-2.4.0
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [2]:
# Mount Google Drive at /content/drive; text_to_note stores its audio files under that path.
from google.colab import drive
drive.mount('/content/drive')
from gtts import gTTS
import math
import librosa
import os
import matplotlib.pyplot as plt
import numpy as np
import nltk
from nltk.corpus import words
import json
import random
import time

Mounted at /content/drive


# 1. Evaluation

## 1.1 Pitch Alignment: Pitch Contour Correlation


`Pitch contour of lyrics: synthesize each syllable with text-to-speech, then extract its pitch.`


`Pitch contour of melody: given as one {duration:, pitch:, text:} entry per note.`


In [3]:
def _frequency_to_note(frequency):
  """Map a frequency in Hz to the nearest equal-tempered note as (name, octave)."""
  note_names = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
  # MIDI convention: A4 = 440 Hz = note 69, twelve semitones per octave.
  # Round (not truncate) so e.g. 439 Hz maps to A4 rather than G#4.
  midi_note = int(round(12 * math.log2(frequency / 440.0) + 69))
  # Floor division / modulo keep octave and pitch class consistent (C4 = MIDI 60).
  return (note_names[midi_note % 12], midi_note // 12 - 1)


def text_to_note(
    syllable,
    output_folder='/content/drive/My Drive/Capstone_LyricsGen/Baseline/Pitch_Extraction',
):
  """Estimate the musical note at which a syllable is spoken by text-to-speech.

  The audio is loaded from ``<output_folder>/<syllable>.mp3``. If loading
  fails, the clip is synthesized with gTTS (English), saved to that path
  and loaded again.

  The pitch is the mean, over frames, of the frequency of the strongest
  ``piptrack`` peak. Frames with no detected pitch are ignored, because
  ``piptrack`` fills every non-peak bin with 0 and averaging the whole matrix
  would drag the estimate towards zero.

  Args:
    syllable: Text to speak; also used as the mp3 file name.
    output_folder: Directory holding the mp3 files. Created if missing.

  Returns:
    A ``(note_name, octave)`` tuple such as ``("A", 4)``.

  Raises:
    ValueError: If no pitch could be detected in the audio.
  """
  os.makedirs(output_folder, exist_ok=True)
  audio_path = os.path.join(output_folder, f"{syllable}.mp3")
  try:
    y, sr = librosa.load(audio_path)  # Reuse a previously saved clip when possible
  except Exception:
    # Best effort: any load failure (missing or unreadable file) triggers synthesis.
    # `Exception` rather than a bare except lets KeyboardInterrupt/SystemExit through.
    tts = gTTS(text=syllable, lang='en')
    tts.save(audio_path)
    # presumably gives the Drive-backed file time to become readable — TODO confirm it is needed
    time.sleep(2)
    y, sr = librosa.load(audio_path)

  pitches, magnitudes = librosa.core.piptrack(y=y, sr=sr)
  # Per frame, keep the frequency of the bin with the largest magnitude.
  strongest_bins = magnitudes.argmax(axis=0)
  frame_pitches = pitches[strongest_bins, np.arange(pitches.shape[1])]
  voiced = frame_pitches[frame_pitches > 0]
  if voiced.size == 0:
    raise ValueError(f"No pitch detected for syllable {syllable!r}")
  mean_pitch = float(voiced.mean())

  return _frequency_to_note(mean_pitch)


In [4]:
def plot_melody(notes,labels):
    """Score how closely the spoken pitch of each syllable follows the melody.

    Despite its name, this function does not draw anything. It converts the
    melody notes and the notes returned by ``text_to_note`` for each label to
    MIDI numbers and returns an inverse mean-squared-error score.

    Args:
        notes: Melody pitches as strings made of a note name followed by an
            octave, e.g. ``"C4"`` or ``"F#3"``.
        labels: Syllables, one per note; each is passed to ``text_to_note``.

    Returns:
        ``10000 / mse`` where ``mse`` is the mean squared semitone difference,
        or ``float('inf')`` when the two contours match exactly.

    Raises:
        ValueError: If ``notes`` and ``labels`` differ in length or are empty.
    """
    # Validate before doing any text-to-speech work, which is the expensive part.
    if len(notes) != len(labels):
        raise ValueError("Lists must have the same length")
    if len(notes) == 0:
        raise ValueError("At least one note is required to compute a pitch alignment score")

    note_names = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
    pitch_class = {name: idx for idx, name in enumerate(note_names)}

    def to_midi(name, octave):
        # Standard MIDI numbering (C4 = 60). Any constant offset would cancel
        # in the squared differences below, so the score does not depend on it.
        return 12 * (int(octave) + 1) + pitch_class[name]

    midi_notes_ori = []
    for note in notes:
        # Split "C#4" / "A-1" / "C10" into name and octave; the octave is the
        # trailing run of digits with an optional minus sign.
        name = note.rstrip("-0123456789")
        midi_notes_ori.append(to_midi(name, note[len(name):]))

    midi_notes_gen = []
    for label in labels:
        name, octave = text_to_note(label)
        midi_notes_gen.append(to_midi(name, octave))

    # Mean squared error between the two contours, in semitones squared
    n = len(midi_notes_ori)
    mse = sum((actual - predicted) ** 2 for actual, predicted in zip(midi_notes_ori, midi_notes_gen)) / n
    return 10000 / mse if mse != 0 else float('inf')


## 1.2 Rhythm Alignment: Stress-Duration Rules


In [5]:
# Download the CMU Pronouncing Dictionary corpus through NLTK
nltk.download("cmudict")
# Load it once at module level as `prondict`; is_stressed_syllable looks words up in it
prondict = nltk.corpus.cmudict.dict()
# Function to determine whether a syllable is stressed
def is_stressed_syllable(word, pronunciations=None):
    """Return True if the word carries lexical stress according to CMUdict.

    CMUdict marks every vowel phoneme with a trailing digit: ``0`` for no
    stress, ``1`` for primary stress and ``2`` for secondary stress. A word
    counts as stressed when any phoneme of its first listed pronunciation
    ends in ``1`` or ``2``. The mere presence of a digit is not enough,
    because unstressed vowels carry the digit ``0``.

    Args:
        word: The word (or syllable text) to look up; case-insensitive.
        pronunciations: Optional mapping from lowercase word to a list of
            pronunciations, each a list of phoneme strings. Defaults to the
            module-level ``prondict``.

    Returns:
        True if the first pronunciation contains a stressed vowel, False if
        it does not or if the word is not in the dictionary.
    """
    if pronunciations is None:
        pronunciations = prondict

    entries = pronunciations.get(word.lower())
    if not entries:
        # Unknown words (including word fragments absent from the dictionary)
        # are treated as unstressed.
        return False

    # Only the first listed pronunciation is considered.
    return any(phoneme.endswith(("1", "2")) for phoneme in entries[0])


[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Unzipping corpora/cmudict.zip.


## Completed Evaluation Metrics

In [6]:
def evaluate_lyrics_alignment(data):
    """Compute average rhythm and pitch alignment scores over all lyric lines.

    Args:
        data: A list of paragraphs; each paragraph is a list of lines; each
            line is a list of dicts with the keys ``'text'``, ``'duration'``
            and ``'pitch'``.

    Returns:
        A dict with the keys ``"Rhythm Alignment Score"`` and
        ``"Pitch Alignment Score"``, each averaged over the scored lines.
        Empty lines are skipped; if there is no line to score both values
        are ``0.0``.

    Rhythm: within a line, a stressed syllable on a note longer than the
    line's average duration, or an unstressed syllable on a shorter one,
    adds 1; every other syllable subtracts 1.

    Pitch: the per-line score returned by ``plot_melody`` is summed over
    lines, so every line contributes to the average.
    """
    rhythm_alignment_score = 0
    pitch_alignment_score = 0
    scored_lines = 0

    for paragraph in data:
        for line in paragraph:
            if not line:
                # An empty line has no average duration and nothing to score.
                continue
            scored_lines += 1

            syllables = [entry['text'] for entry in line]
            note_durations = [entry['duration'] for entry in line]
            note_pitches = [entry['pitch'] for entry in line]

            # 1. Rhythm Alignment: compare each note with the line's average duration
            avg_duration = sum(note_durations) / len(note_durations)
            for syllable, duration in zip(syllables, note_durations):
                if is_stressed_syllable(syllable):
                    aligned = duration > avg_duration
                else:
                    aligned = duration < avg_duration
                rhythm_alignment_score += 1 if aligned else -1

            # 2. Pitch Alignment: accumulate so the average covers every line
            pitch_alignment_score += plot_melody(note_pitches, syllables)

    if scored_lines == 0:
        return {
            "Rhythm Alignment Score": 0.0,
            "Pitch Alignment Score": 0.0,
        }

    return {
        "Rhythm Alignment Score": rhythm_alignment_score / scored_lines,
        "Pitch Alignment Score": pitch_alignment_score / scored_lines,
    }



