# Load data

In [135]:
import pandas as pd

# Load the preprocessed lyrics table from the CSV file into a DataFrame.
song_lyrics = pd.read_csv('./data/preprocessed-data/song_lyrics.csv')

In [136]:
# Print a summary of the frame: index range, column names, non-null counts and dtypes.
song_lyrics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6342 entries, 0 to 6341
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   spotify_id  6342 non-null   object
 1   lyrics      6342 non-null   object
dtypes: object(2)
memory usage: 99.2+ KB


# Find all types of musical features

In [146]:
import re

# A feature is kept only if it occurs in MORE than this many songs.
# Refactor this number to be relative to the dataset if we add more data.
MIN_FEATURE_SONGS = 10

# Bracketed markers that are not section names and must never become features.
IGNORED_MARKERS = {'x2', '?'}

feature_counts = {}           # feature base name -> number of songs that contain it
feature_with_number = set()   # base names seen at least once with a trailing number

for lyrics in song_lyrics['lyrics']:
    # Find all text enclosed in square brackets, capturing feature names with optional numbers.
    # NOTE(review): headers that carry an annotation after a colon, e.g. "[verse 1: name]",
    # do not match this pattern (the colon is excluded from the capture and nothing consumes
    # it), so they are not counted here -- confirm that this is intended.
    features_found = re.findall(r"\[([^\]:]+)(?: \d+)?\]", lyrics)
    unique_features = set()

    # Process each found feature
    for feature in features_found:
        # Ignore the [x2] and [?] markers completely
        if feature.strip().lower() in IGNORED_MARKERS:
            continue

        feature_base = re.sub(r" \d+", "", feature).strip().lower()  # Remove numbers and standardize
        if not feature_base:
            # A bracket holding only blanks/numbers would yield an empty alternative,
            # and an empty alternative matches everything.
            continue
        unique_features.add(feature_base)

        if re.search(r" \d+", feature):
            feature_with_number.add(feature_base)

    # Update counts, considering each feature only once per song
    for feature in unique_features:
        feature_counts[feature] = feature_counts.get(feature, 0) + 1

# Collect the frequent features (for display) and, in parallel, the regex alternatives.
# The literal part of every alternative is escaped so that a feature name containing a
# regex metacharacter cannot corrupt the pattern; the numeric suffix stays a real regex.
features_for_tokenization = set()
pattern_alternatives = []
for feature, count in feature_counts.items():
    if count > MIN_FEATURE_SONGS:
        features_for_tokenization.add(feature)
        pattern_alternatives.append(re.escape(feature))
        if feature in feature_with_number:
            features_for_tokenization.add(feature + r" \d+")
            pattern_alternatives.append(re.escape(feature) + r" \d+")

# Creating the pattern string for tokenization.
# Alternation takes the first alternative that lets the match succeed, and the iteration
# order of a set of strings is not stable between interpreter runs. Listing the longest
# alternatives first makes the result deterministic and stops a short name from shadowing
# a longer one that starts with it ("break" vs "breakdown", "verse" vs "verse \d+").
pattern_alternatives.sort(key=lambda alternative: (-len(alternative), alternative))
if pattern_alternatives:
    features_pattern = "(" + "|".join(pattern_alternatives) + ")"
else:
    # No frequent feature: use a group that can never match instead of an empty group,
    # which would match every bracketed text.
    features_pattern = r"((?!))"

print("Features pattern for tokenization:")
step = 5
features_for_tokenization = sorted(features_for_tokenization)
for i in range(0, len(features_for_tokenization), step):
    print(features_for_tokenization[i:i+step])

Features pattern for tokenization:
['break', 'break \\d+', 'breakdown', 'bridge', 'bridge \\d+']
['chorus', 'chorus \\d+', 'coro', 'coro \\d+', 'guitar solo']
['hook', 'hook \\d+', 'instrumental', 'instrumental break', 'instrumental bridge']
['instrumental interlude', 'instrumental intro', 'instrumental outro', 'interlude', 'interlude \\d+']
['intro', 'intro \\d+', 'outro', 'outro \\d+', 'post-chorus']
['post-chorus \\d+', 'pre-chorus', 'pre-chorus \\d+', 'pre-coro', 'pre-coro \\d+']
['refrain', 'refrain \\d+', 'saxophone solo', 'solo', 'spoken']
['verse', 'verse \\d+', 'verso', 'verso \\d+']


# Tokenize

In [147]:
# NOTE: whether a section number survives (e.g. "[Verse 1]" vs "[Verse]") depends on the
# order of the alternatives inside `features_pattern`: the regex engine takes the first
# alternative that lets the match succeed, so "verse \d+" has to be listed before "verse"
# for the number to be kept.
def tokenize_lyrics(lyrics, pattern=None):
    """Split the lyrics of one song into a flat list of tokens.

    The text is lowercased. Bracketed headers whose text starts with a known musical
    feature are reduced to that feature and emitted as one token with every word
    capitalised (for example "[Verse 1]"). The marker "[?]" is dropped, "[x2]" is kept
    as the token "[x2]" and is emitted before the other tokens of its line. The
    remaining text is split into words (optionally with one hyphenated suffix),
    ellipses and the punctuation marks . , ! ? ;. A newline token is inserted between
    consecutive lines.

    Args:
        lyrics: Lyrics of one song as a single string, lines separated by newlines.
        pattern: Regex source for the known features. It must contain exactly one
            capturing group that wraps the whole alternation, e.g. "(verse|chorus)".
            Defaults to the notebook-level `features_pattern`.

    Returns:
        The list of tokens (strings), without a trailing newline token.
    """
    if pattern is None:
        # Looked up at call time, so re-running the pattern cell takes effect here.
        pattern = features_pattern

    # Header followed by anything up to the closing bracket (artist names and the like).
    annotated_header = re.compile(fr"\[{pattern}:?[^\]]*?\]")
    # Header that is either bare or followed by a colon section.
    feature_header = re.compile(fr"\[{pattern}(?:\]|:[^\]]*\])")
    # Ellipsis first, so that "..." is not split into three periods. The hyphenated
    # suffix accepts any Unicode letter (not only a-z), so words with accented letters
    # are not cut in the middle; digits stay excluded from the suffix.
    word_or_punctuation = re.compile(r"\.{3}|[\w’']+(?:-(?:[^\W\d_]|[’'])+)?|[.,!?;]")

    # Converting to lowercase
    lyrics = lyrics.lower()

    # Removing artist names from musical features
    lyrics = annotated_header.sub(r"[\1]", lyrics)

    tokens = []
    for line in lyrics.split('\n'):
        # Ignore the [?] token
        line = line.replace('[?]', '')

        # Tokenizing special tokens like [x2] as "[x2]" and removing them from the line
        if '[x2]' in line:
            tokens.append("[x2]")
            line = line.replace('[x2]', '')

        # Tokenizing musical features as separate tokens, including those with numbers
        musical_features = feature_header.findall(line)
        if musical_features:
            for feature in musical_features:
                # Capitalize the first letter of each word and add square brackets
                tokens.append('[' + ' '.join(word.capitalize() for word in feature.split()) + ']')
            # Once a header was recognised, every bracketed span is dropped from the line.
            line = re.sub(r"\[.*?\]", "", line).strip()

        # Tokenizing words, standalone punctuation, and ellipses as separate tokens
        tokens.extend(word_or_punctuation.findall(line))

        # Adding a token for line breaks
        tokens.append("\n")

    # Removing the last line break token
    if tokens and tokens[-1] == "\n":
        tokens.pop()

    return tokens

In [148]:
import pprint
def print_in_chunks(lst, chunk_size=10):
    """Print the items of `lst` as comma-separated rows of at most `chunk_size` items."""
    row_starts = range(0, len(lst), chunk_size)
    for start in row_starts:
        row = map(str, lst[start:start + chunk_size])
        print(", ".join(row))

# Tokenize the lyrics stored under index label 0 and pretty-print the resulting token list.
lyric = song_lyrics['lyrics']
text1 = lyric[0]
tokens = tokenize_lyrics(text1)
pprint.pprint(tokens)

['[Intro]',
 '\n',
 "'cause",
 'sometimes',
 'you',
 'just',
 'feel',
 'tired',
 '\n',
 'yo',
 ',',
 'left',
 ',',
 'yo',
 ',',
 'left',
 '\n',
 'feel',
 'weak',
 'and',
 'when',
 'you',
 'feel',
 'weak',
 '\n',
 'yo',
 ',',
 'left',
 ',',
 'right',
 ',',
 'left',
 '\n',
 'you',
 'feel',
 'like',
 'you',
 'wanna',
 'just',
 'give',
 'up',
 '\n',
 'yo',
 ',',
 'left',
 ',',
 'yo',
 ',',
 'left',
 '\n',
 'but',
 'you',
 'gotta',
 'search',
 'within',
 'you',
 '\n',
 'yo',
 ',',
 'left',
 ',',
 'right',
 ',',
 'left',
 '\n',
 'try',
 'to',
 'find',
 'that',
 'inner',
 'strength',
 'and',
 'just',
 'pull',
 'that',
 'shit',
 'out',
 'of',
 'you',
 '\n',
 'yo',
 ',',
 'left',
 ',',
 'yo',
 ',',
 'left',
 '\n',
 'and',
 'get',
 'that',
 'motivation',
 'to',
 'not',
 'give',
 'up',
 '\n',
 'yo',
 ',',
 'left',
 ',',
 'right',
 ',',
 'left',
 '\n',
 'and',
 'not',
 'be',
 'a',
 'quitter',
 ',',
 'no',
 'matter',
 'how',
 'bad',
 '\n',
 'yo',
 ',',
 'left',
 ',',
 'yo',
 ',',
 'left',
 '\n',
 '