<a href="https://colab.research.google.com/github/YashashGaurav/poetai/blob/master/evaluations/rhyming_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook relies heavily on the rhyming metrics proposed by @dexios1 (Chris Dare), @mfogelson (Mitchell Fogelson), @xinkaichen97 (Xinkai Chen), and @T0ny8576 (Qifei Dong), available at:
https://github.com/mfogelson/11-785_project/blob/main/rhyming_evaluation.ipynb

# Install libraries

In [None]:
!pip install pronouncing -q

[?25l[K     |▍                               | 10 kB 15.4 MB/s eta 0:00:01[K     |▊                               | 20 kB 21.3 MB/s eta 0:00:01[K     |█                               | 30 kB 13.4 MB/s eta 0:00:01[K     |█▍                              | 40 kB 4.8 MB/s eta 0:00:01[K     |█▊                              | 51 kB 4.9 MB/s eta 0:00:01[K     |██                              | 61 kB 5.7 MB/s eta 0:00:01[K     |██▍                             | 71 kB 5.7 MB/s eta 0:00:01[K     |██▉                             | 81 kB 6.3 MB/s eta 0:00:01[K     |███▏                            | 92 kB 7.0 MB/s eta 0:00:01[K     |███▌                            | 102 kB 5.5 MB/s eta 0:00:01[K     |███▉                            | 112 kB 5.5 MB/s eta 0:00:01[K     |████▏                           | 122 kB 5.5 MB/s eta 0:00:01[K     |████▌                           | 133 kB 5.5 MB/s eta 0:00:01[K     |████▉                           | 143 kB 5.5 MB/s eta 0:00:01[K  

# Load dependencies

In [None]:
import re
from pathlib import Path
from typing import List, Union

import numpy as np
import pronouncing
import pandas as pd

# Load dataset

In [None]:
# Read the limerick corpus. The CSV has no header row, so pandas assigns
# integer column labels (the limerick text ends up under label 0).
lims = pd.read_csv("limericks_no_punc_digit.csv", header=None)
lims.head(5)

Unnamed: 0,0
0,capn jack was washed over the side\nhis crew s...
1,as a soup bisque is best when served hot\nmade...
2,simply add to the grasp of a rhesus\nthe antit...
3,abeds where you sleep in the night\nunless you...
4,a smiling young fellow from spain\nfell asleep...


## Codebase

In [None]:
class Limerick:
    """A poem whose line endings are scored against expected rhyming line pairs.

    Parameters
    ----------
    lines: list of str
        The verse lines of the poem, one string per line. Every line must
        contain at least one whitespace-separated word, otherwise extracting
        its last word raises ``IndexError``.
    rhyme_patterns: list
        Pairs of 0-based line indices that are expected to rhyme,
        e.g. ``[[0, 1], [2, 3], [0, 4]]``.
    max_length: int, optional
        When truthy, only the first ``max_length`` lines are kept.
    verbose: bool, default = True
        When True, ``score`` prints one line per compared pronunciation pair.
    """

    def __init__(self, lines: List[str], rhyme_patterns: list, max_length: int = None, verbose: bool = True):
        self.verse_lines = lines
        if max_length:
            self.verse_lines = self.verse_lines[0:max_length]
        # Only the final word of each line takes part in rhyme scoring.
        self.last_words = [line.split()[-1] for line in self.verse_lines]
        # Cache: last word -> rhyming part of each of its known pronunciations.
        self.last_word_rhyming_part_pairs = {
            word: self.__get_rhyming_parts(word) for word in self.last_words
        }
        self.rhyme_patterns = rhyme_patterns
        self.verbose = verbose

    def __get_phonemes(self, text):
        """returns all possible pronunciation of a word as phonemes
        Language used: American English. Style: Arpabet

        A single string yields a list of pronunciations; any other iterable of
        words yields one such list per word.
        """
        if isinstance(text, str):
            return pronouncing.phones_for_word(text)
        return [pronouncing.phones_for_word(word) for word in text]

    def __get_rhyming_parts(self, word: str):
        """Returns the rhyming part (via ``pronouncing.rhyming_part``) of every
        pronunciation of ``word``. The list is empty when no pronunciation is found.
        """
        return [
            pronouncing.rhyming_part(phoneme)
            for phoneme in self.__get_phonemes(word)
        ]

    def __lookup_rhyming_parts(self, word: str):
        """Returns the rhyming parts cached at construction time, computing them
        only when ``word`` is missing from the cache.
        """
        cached = self.last_word_rhyming_part_pairs.get(word)
        if cached is None:
            cached = self.__get_rhyming_parts(word)
        return cached

    def __get_valid_rhyme_patterns(self):
        """Returns the rhyme patterns whose indices all fall within the poem."""
        last_index = len(self.verse_lines) - 1
        return [
            pattern
            for pattern in self.rhyme_patterns
            if all(i <= last_index for i in pattern)
        ]

    def score(self, line_pair):
        """Returns 1 if the last words of the two lines rhyme, else 0.

        Two words rhyme when the rhyming part of any pronunciation of the first
        equals the rhyming part of any pronunciation of the second.
        ``line_pair`` holds the two 0-based line indices to compare.
        """
        first_word = self.last_words[line_pair[0]]
        second_word = self.last_words[line_pair[1]]
        first_word_rhymes = self.__lookup_rhyming_parts(first_word)
        second_word_rhymes = self.__lookup_rhyming_parts(second_word)
        rhyme_score = 0
        # Every combination is visited (no early exit) so that verbose mode
        # reports each comparison.
        for first_word_rhyme in first_word_rhymes:
            for second_word_rhyme in second_word_rhymes:
                if first_word_rhyme == second_word_rhyme:
                    rhyme_score = 1
                    status = "successfully matched"
                else:
                    status = "could not match"
                if self.verbose:
                    print(f" {status} -> {first_word}({first_word_rhyme}) and {second_word}({second_word_rhyme})")
        return int(rhyme_score)

    def score_edit_distance(self, line_pair):
        """Returns the smallest edit distance between the rhyming parts of the
        last words of the two lines, or None when either word has no known
        pronunciation.

        Relies on a module-level ``calculate_edit_distance`` function being
        defined before this method is called.
        """
        first_word = self.last_words[line_pair[0]]
        second_word = self.last_words[line_pair[1]]
        first_word_rhymes = self.__lookup_rhyming_parts(first_word)
        second_word_rhymes = self.__lookup_rhyming_parts(second_word)
        # NOTE(review): rhyming parts are passed as space-separated strings, so
        # the comparison is per character rather than per phoneme - confirm
        # this is intended.
        rhyme_scores = [
            calculate_edit_distance(first_word_rhyme, second_word_rhyme)
            for first_word_rhyme in first_word_rhymes
            for second_word_rhyme in second_word_rhymes
        ]
        # rhyme_scores is empty when no rhyming parts were found for either word.
        return min(rhyme_scores) if rhyme_scores else None

    def get_rhyme_score(self):
        """returns a rhyming score for the poem between 0 and 1.

        The score is the fraction of applicable rhyme patterns whose two lines
        rhyme. When no pattern fits the poem (for example, it is too short),
        0.0 is returned instead of dividing by zero.
        """
        valid_patterns = self.__get_valid_rhyme_patterns()
        if not valid_patterns:
            return 0.0
        scores = [self.score(pattern) for pattern in valid_patterns]
        return sum(scores) / len(scores)

    def __repr__(self):
        return repr("\n".join(self.verse_lines))

In [None]:
def test_scoring_limerick(limerick_lines, rhyme_patterns=None, max_length=5):
    """Sanity check to test scoring of a single limerick

    Parameters
    ----------
    limerick_lines: list of str
        The lines of the limerick to score.
    rhyme_patterns: list, optional
        Pairs of line indices expected to rhyme. When omitted, the
        notebook-level ``limerick_pattern`` is used, which must be defined
        before this function is called.
    max_length: int, default = 5
        Maximum number of lines of the limerick that are scored.

    Returns
    -------
    The value returned by ``Limerick.get_rhyme_score`` for the limerick.
    """
    if rhyme_patterns is None:
        # Resolved at call time so the global may be defined in a later cell.
        rhyme_patterns = limerick_pattern
    limerick = Limerick(lines=limerick_lines, rhyme_patterns=rhyme_patterns, max_length=max_length)
    print("Scoring limerick...")
    score = limerick.get_rhyme_score()
    print(f"Rhyme score is {score}")
    return score

Testing

In [None]:
# Take the second limerick from the dataset and break it into lines;
# the final piece produced by the split is discarded.
test = str(lims.iloc[1, 0]).split('\n')
test = test[:-1]
test

['as a soup bisque is best when served hot',
 'made with lobster it hits the right spot',
 'i think it tastes dreamy',
 'its so rich and creamy',
 'its the soup youd be served on a yacht']

In [None]:
# Hand-written five-line sample used to sanity check the scorer.
test2 = [
    "when asked for some detail  its clear",
    "its the claim they must be clear",
    "that this date is too long",
    "but i cant understand why",
    "so i tried my best not have you see",
]

In [None]:
# Pairs of 0-based line indices that are expected to rhyme:
# lines 1 & 2, lines 3 & 4, and lines 1 & 5.
limerick_pattern = [[0, 1], [2, 3], [0, 4]]
test_scoring_limerick(test2)

Scoring limerick...
 successfully matched -> clear(IH1 R) and clear(IH1 R)
 could not match -> long(AO1 NG) and why(AY1)
 could not match -> long(AO1 NG) and why(AY1)
 could not match -> clear(IH1 R) and see(IY1)
Rhyme score is 0.3333333333333333


0.3333333333333333

In [None]:
def convert_generated_text_to_list(path, encoding=None):
    """Parses a generation log into a list of limericks.

    The log is first cut into chunks at separator lines of the form
    ``-- <word> <number> --``. Each chunk is then cut at line markers made of a
    single digit followed by ``": "``. A chunk is kept only when that second
    split yields at least 5 pieces; its first piece (the text before the first
    line marker) is dropped.

    Parameters
    ----------
    path: str or path-like
        Location of the generation log.
    encoding: str, optional
        Text encoding passed to ``open``. ``None`` keeps the platform default.

    Returns
    -------
    list of list of str
        One inner list per kept limerick. Lines are returned exactly as found
        in the log, including any trailing whitespace or newline.
    """
    separator_pattern = r'-- [A-Za-z]+ [0-9]+ --'
    line_marker_pattern = r'[0-9]: '
    with open(path, encoding=encoding) as file:
        text = file.read()

    limericks = []
    for chunk in re.split(separator_pattern, text):
        pieces = re.split(line_marker_pattern, chunk)
        # Blank chunks split into a single piece, so this length check is the
        # only filter needed to discard them.
        if len(pieces) >= 5:
            limericks.append(pieces[1:])
    return limericks

In [None]:
# Parse the generation log into a list of limericks, each a list of line strings.
limmericks = convert_generated_text_to_list('generation_log.txt')

In [None]:
# Score every parsed limerick. Entries that cannot be scored are reported and
# skipped, so rhyme_scores may be shorter than limmericks.
rhyme_scores = []
for limerick_lines in limmericks:
    try:
        rhyme_scores.append(test_scoring_limerick(limerick_lines))
    except Exception:
        # `Exception` rather than a bare `except`, so that interrupting the
        # kernel (KeyboardInterrupt) still stops the loop.
        print('Not a valid limmerick')

In [None]:
# Show the rhyme scores collected for the limericks that could be scored.
print(rhyme_scores)

In [None]:
# Count the limericks in which at least one expected line pair rhymed.
rhyming_lims = list(filter(lambda score: score > 0, rhyme_scores))
print(len(rhyming_lims))

6


## Random: scratch utilities (sigmoid, phoneme edit distance)

In [None]:
def sigmoid(X):
    """Applies the logistic function 1 / (1 + exp(-X)) using numpy's exp."""
    exp_of_negated = np.exp(-X)
    return 1 / (1 + exp_of_negated)

In [None]:
def calculate_edit_distance(phoneme_set_a: List[str], phoneme_set_b: List[str], levenshtein=True):
    """Calculates edit distance between 2 sets of phonemes

    The distance is the minimum total cost of insertions, deletions and
    substitutions needed to turn one sequence into the other, computed with
    the standard dynamic-programming recurrence. Identical sequences always
    have distance 0, and either sequence may be empty.

    Parameters
    ----------
    phoneme_set_a: list
        word or rhyming part to be compared to.
        This is represented as a string or list of phonemes representing a word or its rhyming part.
        A string is compared character by character.
    phoneme_set_b: list
        word or rhyming part for which we want compute how different it is from phoneme_set_a
        This is also represented as a string or a list of phonemes representing a word or its rhyming part.
    levenshtein: bool, default = True
        When True a substitution costs 2 (the same as one deletion plus one
        insertion); when False a substitution costs 1. Insertions and
        deletions always cost 1.

    Returns
    -------
    int
        The minimum total edit cost.

    Examples
    --------
    wonder = ["AH1","N","D","ER0"]
    one = ["AH1","N"]

    difference = calculate_edit_distance(wonder, one, levenshtein=False)  # 2
    """
    substitution_cost = 2 if levenshtein else 1

    # previous_row[j] is the cost of turning the already-processed prefix of
    # phoneme_set_a into the first j items of phoneme_set_b. For the empty
    # prefix that is j insertions.
    previous_row = list(range(len(phoneme_set_b) + 1))
    for row_index, item_a in enumerate(phoneme_set_a, start=1):
        # Turning row_index items into an empty sequence takes row_index deletions.
        current_row = [row_index]
        for column_index, item_b in enumerate(phoneme_set_b, start=1):
            deletion = previous_row[column_index] + 1
            insertion = current_row[column_index - 1] + 1
            replacement = previous_row[column_index - 1]
            if item_a != item_b:
                replacement += substitution_cost
            current_row.append(min(deletion, insertion, replacement))
        previous_row = current_row

    return previous_row[-1]

In [None]:
# Character-level example: plain strings are compared one character at a time.
difference = calculate_edit_distance(
    phoneme_set_a="execution",
    phoneme_set_b="intention",
)
difference