----
## Setup

**Configure Colab File System**

Make sure you shortcut the base folder to MyDrive!
> `your_folder` -> `Organize` -> `Add shortcut`



![](https://drive.google.com/uc?export=view&id=1sxFMcaAAWTGfFYyTlmvFZmEgfNdeU0Ks)

<br></br>


----
If you want to let your colab keep running while you're gone, copy-paste this into the editor console:
> - `option` + `command` + `i`
- `copy-paste` the snippet into the *console*, then hit `enter`
- check for printouts at 60s mark, then run your job

```javascript
function ClickConnect(){
    console.log("Click #1");
    document.querySelector("colab-toolbar-button#connect").click();
}
setInterval(ClickConnect,60000);
```

<br></br>


In [1]:
# @title Colab Setup
ROOT='/content/drive/MyDrive/W210 Capstone - Lyric Generation with Melody/loaf/'
import importlib

# add root to system path
import sys
sys.path.append(ROOT)

# pytorch env vars
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["TORCH_USE_CUDA_DSA"] = "1"

# filter out warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

'''
install_if_needed():
  - Install packages using pip if they are not already installed.
'''
def install_if_needed(package_names: list[str]):
    """

    """
    print('installing packages')
    if isinstance(package_names, str):
        package_names = [package_names]

    newly_installed = []
    for package_name in package_names:
        try:
            importlib.import_module(package_name)
            print(f"- {package_name} is already installed.")
        except ImportError:
            !pip install --quiet {package_name} --upgrade &> /dev/null
            newly_installed.append(package_name)

    print(f"- installed {', '.join(newly_installed)}")
    print(f"- imported {', '.join([x for x in package_names if x not in newly_installed])}")


'''
running_in_colab():
    - env check
'''
def running_in_colab():
    try:
        import google.colab

        return True
    except ImportError:
        return False


'''
mount_to_drive():
    - Mount notebook to colab file directory
'''
def mount_to_drive():
    if running_in_colab():
      print('\nmounting to drive')
      from google.colab import drive
      drive.mount("/content/drive", force_remount=True)


'''
unpack_elasticsearch():
    - Get elasticsearch jar file if necessary
'''
def unpack_elasticsearch():
  """
  """
  ![[ -d /content/elasticsearch-8.11.1 ]] && echo "- elasticsearch is already installed" \
                                           || wget "https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-8.11.1-linux-x86_64.tar.gz" \
                                                && tar -xzf "elasticsearch-8.11.1-linux-x86_64.tar.gz" 1> /dev/null && echo "- elasticsearch has been installed"


'''
force_install_torch():
    - reinstall torch to avoid weird error https://stackoverflow.com/a/77199918
'''
def force_install_torch():
  %pip install torch --force-reinstall --index-url https://download.pytorch.org/whl/nightly/cu117



In [2]:
# @title Install Dependencies
%%capture
# rip
mount_to_drive()
install_if_needed([
# "faiss",
# "dill",
"accelerate",
"apache-beam",
"datasets",
# "elasticsearch",
# "faiss_gpu",
"evaluate",
"keybert",
# "nlp",
"python-dotenv",
"pynvml",
# "rouge_score",
"sentencepiece",
"sentence-transformers",
"transformers",
# "git+https://github.com/huggingface/transformers", # source install for checkpointing
# "pretty_midi",
# "pypianoroll",
"unicodedata",
"yake",
])

# system stuff
import ast
# from collections import Counter
# from dotenv import dotenv_values
# dotenv = dotenv_values(f"{ROOT}/.env")
import numpy as np
from keybert import KeyBERT
import pandas as pd
import random
from math import ceil,floor
# import os
# # memory fragmentation & error tracing
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
import re
# import string
import unicodedata

# big boiz
from datasets import load_dataset, Dataset, DatasetDict
# from huggingface_hub import login, logout
# import nlp
# import nltk
# from nltk.tokenize import word_tokenize
# from nltk.corpus import cmudict
# # Download the CMU Pronouncing Dictionary if not already downloaded
# nltk.download('cmudict')
# nltk.download('punkt')

# import tensorflow as tf
# from tensorflow import keras
# import torch
import yake

In [3]:
# @title Load Modules
%%capture
# from src.config import get_model_configs
from src.nlp import *
from src.process_lyrics import clean_lyrics, chunk_pipeline, get_preceding_chunk
from src.hf import save_hf_dataset, load_hf_dataset, save_csv_dataset
from src.gpu import clear_gpu, print_gpu_utilization
%reload_ext autoreload
%autoreload 2

# load our .env file as a dict
from dotenv import dotenv_values
dotenv = dotenv_values(f"{ROOT}/.env")


---



# Functions

In [None]:
# @title loaf_preprocessing.py

import ast
def load_array_cols(row):
  for arr_col in ['syl_count_by_measure', 'note_onsets_by_measure', 'note_durs_by_measure', 'note_pitches_by_measure', 'note_stresses_by_measure', 'note_velocities_by_measure']:
    row[arr_col] = ast.literal_eval(row[arr_col])

  return row


'''
parse_loaf():
    - hf mapping function
    - utilizes existing tokens: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bart/tokenization_bart.py
'''
def parse_loaf(row):
  # lines
  lines = row['clean_lyrics'].split('\n')
  # create keyword extractor obj
  # kw_model = KeyBERT()
  kw_extractor = yake.KeywordExtractor(n = 1, top = 3, dedupLim=0.8, dedupFunc='seqm')
  # extract keywords for training
  kws_per_line = [[kw for kw in kw_extractor.extract_keywords(line)] for line in lines]
  # sort by score
  for kws in kws_per_line: kws.sort(key=lambda l: l[1])
  # grab only the word, not the score
  kws_per_line = [[kw[0] for kw in kws] for kws in kws_per_line if len(kws) >= 1]
  # convert to string
  kws_string_per_line = [f"{', '.join(kws)}" for i, kws in enumerate(kws_per_line)]
  masks_per_line = [f"{', '.join(['<mask>']*len(kws))}" for i, kws in enumerate(kws_per_line)]
  # create plan prompt
  plan_prompt = f"Generate a plan for the lyrics given the song summary: {str(row['lyric_summary_bartv2'])}, genre: {str(row['genre'])}, and topic: {str(row['topic'])}. "
  # create dict to return w/ special token representations
  row_dict = {
      'midi_id': row['midi_id'],
      'orig_syl_tokens': " </s> ".join([f'len_{x}' for x in row["syl_count_by_measure"]]),
      'orig_note_onsets_tokens': " </s> ".join([' '.join([f'onset_{np.round(float(val), 1)} </s>' for val in line]) for line in row["note_onsets_by_measure"]]),
      'orig_note_dur_tokens': " </s> ".join([' '.join([f'dur_{np.round(float(val), 1)}' for val in line]) for line in row["note_durs_by_measure"]]),
      'orig_note_pitch_tokens': " </s> ".join([' '.join([f'pitch_{val}' for val in line]) for line in row["note_pitches_by_measure"]]),
      'orig_note_vel_tokens': " </s> ".join([' '.join([f'vel_{val}' for val in line]) for line in row["note_velocities_by_measure"]]),
      'orig_note_stress_tokens': " </s> ".join([' '.join([f'stress_{val}' for val in line]) for line in row["note_stresses_by_measure"]]),
      'plan_orig': f"{plan_prompt} {'. '.join(masks_per_line)}",
      'lines': lines,
      'kws_per_line': kws_per_line,
      'plan_target': f"{'. '.join(kws_string_per_line)}",
      'lyrics_target': " <P> ".join(row['clean_lyrics'].split('\n')),
  }

  return row_dict


'''
preprocess_loaf_data():
    - bread & butter
'''
def preprocess_loaf_data(dotenv, hf_base='adamjweintraut/loaf', write_data=False):
  df = pd.read_csv(dotenv['source_data'])
  # add in new lyric summaries / cleaned lyrics & merge
  summ_df = load_dataset('adamjweintraut/bart_cnn_lyric_summaries', split='etl').to_pandas()[['midi_id', 'clean_lyrics', 'lyric_summary_bartv2']]
  df = pd.merge(df, summ_df, on='midi_id')
  # load splits
  train_ids = pd.read_csv(f"{dotenv['processed']}/train_dataset.csv")[['midi_id', 'topic']]
  test_ids = pd.read_csv(f"{dotenv['processed']}/test_dataset.csv")[['midi_id', 'topic']]
  valid_ids = pd.read_csv(f"{dotenv['processed']}/valid_dataset.csv")[['midi_id', 'topic']]
  # create splits to save
  train_df, test_df, valid_df = tuple([pd.merge(df, id_set, on='midi_id') for id_set in [train_ids, test_ids, valid_ids]])
  # save to csv first
  train_df.to_csv(dotenv['loaf_train'])
  test_df.to_csv(dotenv['loaf_test'])
  valid_df.to_csv(dotenv['loaf_valid'])
  # delete to save ram
  del df, train_ids, test_ids, valid_ids, train_df, test_df, valid_df
  # load hf dataset
  dataset = load_dataset('csv', data_files={'train': dotenv['loaf_train'], 'test': dotenv['loaf_test'], 'valid': dotenv['loaf_valid']}, delimiter=',')
  # define cols we don't need
  cols_to_axe=[
    'Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'Unnamed: 0_x',
    'msd_id', 'match_score', 'midi_filename', 'lpd_filename', 'lyrics_filename',
    'midi_end_time', 'midi_instruments', 'midi_program_numbers', 'lpd_resolution', 'lpd_tracks',
    'lpd_track_program_numbers', 'lpd_track_lengths', 'lyrics_time_events', 'lyrics_max_track_length',
    'artist', 'artist_terms', 'album', 'year_released', 'bpm', 'lyric_lang', 'syl_by_measure',
    'syl_count_by_measure', 'note_count_by_measure', 'measure_times', 'vocal_instrument_program', 'vocal_instrument_note_syl_corr',
    'vocal_instrument_name', 'note_onsets_by_measure', 'note_durs_by_measure', 'note_pitches_by_measure', 'note_velocities_by_measure', 'note_stresses_by_measure', 'clean_lyrics',
  ]
  ## clean & re-save
  for split in ['train', 'test', 'valid']:
    # run pipeline
    dataset[split] = dataset[split].map(clean_lyrics)
    dataset[split] = dataset[split].map(load_array_cols)
    dataset[split] = dataset[split].map(parse_loaf, remove_columns=cols_to_axe)
    # save to local csv
    save_csv_dataset(dataset[split], dotenv[f'loaf_{split}'])

  # write to hf
  if write_data:
    save_hf_dataset(dataset, dotenv, hf_base)

  return dataset




---



In [None]:
# @title lyrlen_preprocessing.py

'''
parse_lyrlen():
    - hf mapping function
'''
def rename_lyrlen_cols(row):
  return {'id': row['Unnamed: 0'], 'genre': row['tag']}

'''
create_lyrlen_inputs():
    - hf mapping function
'''
def create_lyrlen_inputs(row):
  sylls = count_line_syllables(row['clean_lyrics'])
  return  {'sylls': sylls, 'orig': f'len_{sylls}', 'target': row['clean_lyrics']}


# '''
# get_lyric_chunks():
#             - hf mapping function to convert lyrics to chunks
# '''
# def get_lyric_chunks(row, chunk_size):
#     return {'lyric_chunks': [' '.join(chunk) for chunk in list(split_into_chunks(get_lyric_lines(row['clean_lyrics']), chunk_size))]}


# '''
# explode_chunks():
#             - batched hf mapping function to create rows for each lyric chunk in a song
# '''
# def explode_chunks(batch, group):
#   obj = {}
#   # add constant groupby for each chunk
#   for g in group:
#     obj[g] = [_g for i, _g in enumerate(batch[g]) for _ in batch['lyric_chunks'][i]]
#   # add lyric chunk + id
#   obj["clean_lyrics"] = [chunk for chunk_list in batch["lyric_chunks"] for chunk in chunk_list]
#   obj["lyric_chunk_n"] = [i for chunk_list in batch["lyric_chunks"] for i, chunk in enumerate(chunk_list)]

#   return obj


    # return {
    #     "id": [id for i, id in enumerate(batch["id"]) for _ in batch["lyric_chunks"][i]],
    #     "title": [title for i, title in enumerate(batch["title"]) for _ in batch["lyric_chunks"][i]],
    #     "genre": [genre for i, genre in enumerate(batch["genre"]) for _ in batch["lyric_chunks"][i]],
    #     "lyrics": [lyrics for i, lyrics in enumerate(batch["lyrics"]) for _ in batch["lyric_chunks"][i]],
    #     "clean_lyrics": [chunk for chunk_list in batch["lyric_chunks"] for chunk in chunk_list],
    #     "lyric_chunk_n": [i for chunk_list in batch["lyric_chunks"] for i, chunk in enumerate(chunk_list)],
    # }


'''
preprocess_lyrlen_data():
    - bread & butter
'''
def preprocess_lyrlen_data(dotenv, n_examples_slice=1e6, hf_write_path='lyrlen', write_data=False):
  # load data
  lyrics_slice = load_hf_dataset('amishshah/song_lyrics',
                                 dotenv,
                                 'train',
                                 n_examples_slice,
                                 shuffle=True)
  # renamed to id & genre
  lyrics_slice = lyrics_slice.map(rename_lyrlen_cols, remove_columns=['Unnamed: 0', 'tag'])
  # clean lyrics & remove extra columns
  lyrics_slice = lyrics_slice.map(clean_lyrics, remove_columns=[col for col in lyrics_slice.features if col not in ['id', 'title', 'genre', 'lyrics', 'clean_lyrics']])
  # chunk lyrics
  lyrics_slice = chunk_pipeline(data=lyrics_slice, n_lines=1,
group=['id', 'title', 'genre', 'lyrics'])

  # lyrics = load_dataset(, split='train')
  # shuffle and grab randomized sample (param)
  # lyrics_shuffled = lyrics.shuffle(seed=42)
  # lyrics_slice = lyrics_shuffled.select(range(int(n_examples_slice)))



  # # split lyrics into chunks
  # lyrics_slice = lyrics_slice.map(get_lyric_chunks, fn_kwargs={'n_lines': 1})
  # # explode chunks into their own rows
  # lyrics_slice = lyrics_slice.map(explode_chunks,
  #                                 batched=True,
  #                                 fn_kwargs={'group': ['id', 'title', 'genre', 'lyrics']},
  #                                 remove_columns=['lyric_chunks'])


  # create inputs
  lyrics_slice = lyrics_slice.map(create_lyrlen_inputs, remove_columns=['lyrics', 'clean_lyrics'])
  # remove large syllable lines
  lyrics_slice = lyrics_slice.filter(lambda example: example["sylls"] < 30)
  # create train/test/valid splits
  lyrics_slice_train_test = lyrics_slice.train_test_split(test_size=0.2)
  lyrics_slice_test_valid = lyrics_slice_train_test['test'].train_test_split(test_size=0.5)
  del lyrics_slice # save ram
  # write to csv
  lyrics_slice_train_test['train'].to_csv(dotenv['lyrlen_train'])
  lyrics_slice_test_valid['train'].to_csv(dotenv['lyrlen_test'])
  lyrics_slice_test_valid['test'].to_csv(dotenv['lyrlen_valid'])
  # save ram
  del lyrics_slice_train_test, lyrics_slice_test_valid
  # construct hf dataset w/ splits
  dataset = load_dataset('csv', data_files={'train': dotenv['lyrlen_train'], 'test': dotenv['lyrlen_test'], 'valid': dotenv['lyrlen_valid']}, delimiter=',')

  if write_data:
    print('shmoney!') if save_hf_dataset(dataset, dotenv, hf_write_path) else print("booooooooo!")


  return dataset


In [None]:
# @title kwsylgen_preprocessing.py
'''
create_kwsylgen_inputs():
    - hf mapping function
'''
def create_kwsylgen_inputs(row):
  sylls = count_line_syllables(row['clean_lyrics'])
  kw_extractor = yake.KeywordExtractor(n = 1, top = max(floor(sylls / 2), 3))
  kws = kw_extractor.extract_keywords(row['clean_lyrics'])
  kws = [kw[0] for kw in kws if len(kws) >= 1]
  prev_line_phrase = f"naturally follow previous lyric line '{row['prev_clean_lyrics']}' and " if row['prev_clean_lyrics'] != '' else ''
  orig = f'''input {' '.join(kws)}. len_{sylls} output'''

  return  {'sylls': sylls, 'keywords': kws, 'orig': orig, 'target': row['clean_lyrics']}


'''
preprocess_kwsylgen_data():
    - bread & butter
'''
def preprocess_kwsylgen_data(dotenv, frac=0.1, n_lines=1, hf_write_path='kwsylgen', write_data=False):
  # update write path if we change the chunking
  hf_write_path = hf_write_path if n_lines == 1 else (hf_write_path + n_lines)
  df = pd.read_csv(dotenv['source_data'])
  # add in new lyric summaries / cleaned lyrics & merge
  summ_df = load_dataset('adamjweintraut/bart_cnn_lyric_summaries', split='etl').to_pandas()[['midi_id', 'clean_lyrics', 'lyric_summary_bartv2']]
  df = pd.merge(df, summ_df, on='midi_id')
  # load splits
  train_ids = pd.read_csv(f"{dotenv['processed']}/train_dataset.csv")[['midi_id', 'topic']]
  test_ids = pd.read_csv(f"{dotenv['processed']}/test_dataset.csv")[['midi_id', 'topic']]
  valid_ids = pd.read_csv(f"{dotenv['processed']}/valid_dataset.csv")[['midi_id', 'topic']]
  # create splits to save
  train_df, test_df, valid_df = tuple([pd.merge(df, id_set, on='midi_id') for id_set in [train_ids, test_ids, valid_ids]])
  # reduce
  train_df = train_df.sample(frac=frac, random_state=42)
  test_df = test_df.sample(frac=frac, random_state=42)
  valid_df = valid_df.sample(frac=frac, random_state=42)
  # save to csv first
  train_df.to_csv(dotenv[f'{hf_write_path}_train'])
  test_df.to_csv(dotenv[f'{hf_write_path}_test'])
  valid_df.to_csv(dotenv[f'{hf_write_path}_valid'])
  # delete to save ram
  del df, train_ids, test_ids, valid_ids, train_df, test_df, valid_df
  # load into hf dataset
  data = load_dataset('csv', data_files={'train': dotenv[f'{hf_write_path}_train'],
                                         'test': dotenv[f'{hf_write_path}_test'],
                                         'valid': dotenv[f'{hf_write_path}_valid']},
                             delimiter=',')
  # define cols we don't need
  cols_to_axe=[
    'Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'Unnamed: 0_x',
    'msd_id', 'match_score', 'midi_filename', 'lpd_filename', 'lyrics_filename',
    'midi_end_time', 'midi_instruments', 'midi_program_numbers', 'lpd_resolution', 'lpd_tracks',
    'lpd_track_program_numbers', 'lpd_track_lengths', 'lyrics_time_events', 'lyrics_max_track_length',
    'artist', 'artist_terms', 'album', 'year_released', 'bpm', 'lyric_lang', 'syl_by_measure',
    'syl_count_by_measure', 'note_count_by_measure', 'measure_times', 'vocal_instrument_program', 'vocal_instrument_note_syl_corr',
    'vocal_instrument_name', 'note_onsets_by_measure', 'note_durs_by_measure', 'note_pitches_by_measure', 'note_velocities_by_measure', 'note_stresses_by_measure', 'lyric_summary_bart',
  ]

  ## clean & re-save
  for split in ['train', 'test', 'valid']:
    # clean lyrics
    data[split] = data[split].map(clean_lyrics, remove_columns=cols_to_axe)
    # chunk lyrics
    data[split] = chunk_pipeline(data=data[split],
                                 n_lines=n_lines,
                                 group=['midi_id', 'song_title', 'lyrics', 'genre', 'topic', 'lyric_summary_bartv2'])
    # get preceding lyric chunk (lag)
    data[split] = get_preceding_chunk(data[split])


    # data[split] = dataset[split].map(get_lyric_chunks, fn_kwargs={'chunk_size': 1}, remove_columns=['clean_lyrics'])
    # data[split] = dataset[split].map(explode_chunks,
    #                                     batched=True,
    #                                     fn_kwargs={'group': ['midi_id', 'song_title', 'lyrics', 'genre', 'topic', 'lyric_summary_bartv2']},
    #                                     remove_columns=['lyric_chunks'])


    # create inputs for modeling
    data[split] = data[split].map(create_kwsylgen_inputs)
    # filter out weird syll
    data[split] = data[split].filter(lambda example: example["sylls"] < 30)
    data[split] = data[split].filter(lambda example: example["sylls"] > 2)
    data[split] = data[split].filter(lambda example: len(example["keywords"]) > 0)
    # write to csv
    data[split].to_csv(dotenv[f'{hf_write_path}_{split}'])

  # # write to hf
  if write_data:
    print('shmoney!') if save_hf_dataset(data, dotenv, hf_write_path) else print("booooooooo!")

  return data

In [None]:
# @title kwsylchunk_preprocessing.py
'''
create_kwsylchunk_inputs():
    - hf mapping function
'''
def create_kwsylchunk_inputs(row, kw_extractor, n_kws):
  # get lines (should be n_lines)
  lines = get_lyric_lines(row['clean_lyrics'])
  # get sylls / line
  sylls = [str(count_line_syllables(line)) for line in lines]
  # create kw extractor
  # kw_extractor = yake.KeywordExtractor(n = 1, top = 4, dedupLim=0.8, dedupFunc='seqm')
  # extract keywords for training
  # kws_per_line = [[kw for kw in kw_extractor.extract_keywords(line)] for line in lines]
  # sort by score
  # for kws in kws_per_line: kws.sort(key=lambda l: l[1])
  kws = kw_extractor.extract_keywords(row['clean_lyrics'], keyphrase_ngram_range=(1, 1), stop_words='english', use_mmr=True, diversity=0.95, top_n=n_kws)
  # out of loop, grab word only
  kws = [kw[0] for kw in kws]
  # grab only the word, not the score
  # kws_per_line = [[kw[0] for kw in kws] for kws in kws_per_line if len(kws) >= 1]
  # convert to string
  # kws_string_per_line = [f"{', '.join(kws)}" for i, kws in enumerate(kws_per_line)]
  kws_string = ', '.join(kws)
  # create orig string
  # per_line = []
  # for i, kws in enumerate(kws_per_line):
  #   if (len(kws) > 0) and (int(sylls[i]) < 30):
  #     per_line.append(f"{kws_string_per_line[i]}. len{sylls[i]}")

  syls_per_line_string = ' \n '.join([f'len_{syll}' for syll in sylls])
  orig = f'''input {kws_string}. {syls_per_line_string}  output'''

  return  {'sylls': sylls, 'keywords': kws, 'orig': orig, 'target': row['clean_lyrics'], 'target_n_words': len(row['clean_lyrics'].split())}

'''
preprocess_kwsylchunk_data():
    - bread & butter
'''
def preprocess_kwsylchunk_data(dotenv, frac=0.1, n_lines=1, hf_write_path='kwsylchunk', write_data=False):
  # update write path if we change the chunking
  # hf_write_path = f'hf_write_path{n_lines}'
  df = pd.read_csv(dotenv['source_data'])
  # add in new lyric summaries / cleaned lyrics & merge
  summ_df = load_dataset('adamjweintraut/bart_cnn_lyric_summaries', split='etl').to_pandas()[['midi_id', 'clean_lyrics', 'lyric_summary_bartv2']]
  df = pd.merge(df, summ_df, on='midi_id')
  # load splits
  train_ids = pd.read_csv(f"{dotenv['processed']}/train_dataset.csv")[['midi_id', 'topic']]
  test_ids = pd.read_csv(f"{dotenv['processed']}/test_dataset.csv")[['midi_id', 'topic']]
  valid_ids = pd.read_csv(f"{dotenv['processed']}/valid_dataset.csv")[['midi_id', 'topic']]
  # create splits to save
  train_df, test_df, valid_df = tuple([pd.merge(df, id_set, on='midi_id') for id_set in [train_ids, test_ids, valid_ids]])
  # reduce
  train_df = train_df.sample(frac=frac, random_state=42)
  test_df = test_df.sample(frac=frac, random_state=42)
  valid_df = valid_df.sample(frac=frac, random_state=42)
  # save to csv first
  train_df.to_csv(dotenv[f'{hf_write_path}_train'])
  test_df.to_csv(dotenv[f'{hf_write_path}_test'])
  valid_df.to_csv(dotenv[f'{hf_write_path}_valid'])
  # delete to save ram
  del df, train_ids, test_ids, valid_ids, train_df, test_df, valid_df
  # load into hf dataset
  data = load_dataset('csv', data_files={'train': dotenv[f'{hf_write_path}_train'],
                                         'test': dotenv[f'{hf_write_path}_test'],
                                         'valid': dotenv[f'{hf_write_path}_valid']},
                             delimiter=',')
  # define cols we don't need
  cols_to_axe=[
    'Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'Unnamed: 0_x',
    'msd_id', 'match_score', 'midi_filename', 'lpd_filename', 'lyrics_filename',
    'midi_end_time', 'midi_instruments', 'midi_program_numbers', 'lpd_resolution', 'lpd_tracks',
    'lpd_track_program_numbers', 'lpd_track_lengths', 'lyrics_time_events', 'lyrics_max_track_length',
    'artist', 'artist_terms', 'album', 'year_released', 'bpm', 'lyric_lang', 'syl_by_measure',
    'syl_count_by_measure', 'note_count_by_measure', 'measure_times', 'vocal_instrument_program', 'vocal_instrument_note_syl_corr',
    'vocal_instrument_name', 'note_onsets_by_measure', 'note_durs_by_measure', 'note_pitches_by_measure', 'note_velocities_by_measure', 'note_stresses_by_measure', 'lyric_summary_bart',
  ]

  ## clean & re-save
  for split in ['train', 'test', 'valid']:
    # clean lyrics
    data[split] = data[split].map(clean_lyrics, remove_columns=cols_to_axe)
    # chunk lyrics
    data[split] = chunk_pipeline(data[split], n_lines, ['midi_id', 'song_title', 'lyrics', 'genre', 'topic', 'lyric_summary_bartv2'])

    # create inputs for modeling
    data[split] = data[split].map(create_kwsylchunk_inputs, fn_kwargs={'kw_extractor': KeyBERT(),
                                                                       'n_kws': 2*n_lines})
    # write to csv
    if write_data:
      data[split].to_csv(dotenv[f'{hf_write_path}_{split}'])

  # # # write to hf
  if write_data:
    print('shmoney!') if save_hf_dataset(data, dotenv, hf_write_path) else print("booooooooo!")

  return data

----
# Run

In [None]:
# @title Preprocessing

# make sure we have updated environment variables (root defined in setup)
dotenv = dotenv_values(f'{ROOT}/.env')

## create our datasets (on hf & csv)
kwsylchunk = preprocess_kwsylchunk_data(dotenv, frac=1.0, n_lines=8, write_data=True)
# kwsg = preprocess_kwsylgen_data(dotenv, frac=1.0, write_data=True)
# loaf_data = preprocess_loaf_data(dotenv, write_data=True)
# lyrlen_data = preprocess_lyrlen_data(dotenv, n_examples_slice=3e4)
# lyrlen_line_data = preprocess_lyrlen_data(dotenv, n_examples_slice=5e3, hf_write_path='lyrlen', write_data=True)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating valid split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/4489 [00:00<?, ? examples/s]

Map:   0%|          | 0/4489 [00:00<?, ? examples/s]

Map:   0%|          | 0/4489 [00:00<?, ? examples/s]

Map:   0%|          | 0/28382 [00:00<?, ? examples/s]

Creating CSV from Arrow format:   0%|          | 0/29 [00:00<?, ?ba/s]

Map:   0%|          | 0/571 [00:00<?, ? examples/s]

Map:   0%|          | 0/571 [00:00<?, ? examples/s]

Map:   0%|          | 0/571 [00:00<?, ? examples/s]

Map:   0%|          | 0/3708 [00:00<?, ? examples/s]

Creating CSV from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Map:   0%|          | 0/622 [00:00<?, ? examples/s]

Map:   0%|          | 0/622 [00:00<?, ? examples/s]

Map:   0%|          | 0/622 [00:00<?, ? examples/s]

Map:   0%|          | 0/3890 [00:00<?, ? examples/s]

Creating CSV from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/29 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

shmoney!


In [None]:
sample = kwsylchunk['train'].select(range(int(8)))

In [None]:
sample[3]

{'midi_id': '0715ff87446ca514712ad156bd630f15',
 'song_title': 'Never Ending Story (Club Mix)',
 'lyrics': 'Turn around look at what you see.\nIn her face the mirror of your dreams.\nMake believe on everywhere\nget it in the line.\nWritten on the pages\nIs the answer to a never ending story\nAhaha ahaha ahaha\nReach the stars fly a fantasy.\nDream a dream\nand what you see will be.\nRhymes that keep their secrets\nwill unfold behind the clouds\nAnd there upon the rainbow\nIs the answer to a never ending story\nAhaha ahaha ahaha\nStory ahaha ahaha ahaha\nShow your fear for she may fade away\nIn your hands the birth of a new day.\nRhymes that keep their secrets\nwill unfold behind the clouds\nAnd there upon the rainbow\nIs the answer to a never ending story\nAhaha ahaha ahaha\nNever ending story\nAhaha ahaha ahaha\nNever ending story\nAhaha ahaha ahaha\nNever ending story\nAhaha ahaha ahaha',
 'genre': 'dance',
 'clean_lyrics': 'ahaha ahaha ahaha \n never ending story \n ahaha ahaha ahah

In [None]:
clean_lyrics = 'turn around look at what you see.\nin her face the mirror of your dreams.\nmake believe on everywhere\nget it in the line.'
# get lines (should be n_lines)
lines = get_lyric_lines(clean_lyrics)
# get sylls / line
sylls = [str(count_line_syllables(line)) for line in lines]

In [None]:
sylls

['8', '9', '7', '5']

In [None]:
kw_extractor = KeyBERT()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
kws = kw_extractor.extract_keywords(sample[0]['clean_lyrics'],
                                    keyphrase_ngram_range=(2, 2),
                                    stop_words='english',
                                    use_mmr=True, diversity=0.95, top_n=4)
                                    # use_maxsum=True,
                                    # )
kws = [kw[0] for kw in kws]
kws

['make believe',
 'dreams',
 'line',
 'look',
 'turn',
 'make',
 'face mirror',
 'face']

In [None]:
kw_extractor = yake.KeywordExtractor(n = 2, top = 4, dedupLim=0.8, dedupFunc='seqm')
# extract keywords for training
kws_per_line = [[kw for kw in kw_extractor.extract_keywords(line)] for line in lines]

In [None]:
kws_per_line

[[('turn', 0.15831692877998726)],
 [('dreams', 0.15831692877998726),
  ('face', 0.29736558256021506),
  ('mirror', 0.29736558256021506)],
 [('make', 0.15831692877998726)],
 [('line', 0.15831692877998726)]]

In [None]:
for kws in kws_per_line: kws.sort(key=lambda l: l[1])

In [None]:
# grab only the word, not the score
kws_per_line = [[kw[0] for kw in kws] for kws in kws_per_line if len(kws) >= 1]

In [None]:
kws_string_per_line = [f"{', '.join(kws)}" for i, kws in enumerate(kws_per_line)]

In [None]:
kws_string_per_line

['turn', 'dreams, face, mirror', 'make', 'line']

In [None]:
per_line = []
for i, kws in enumerate(kws_per_line):
  if (len(kws) > 0) and (int(sylls[i]) < 30):
    per_line.append(f"{kws_string_per_line[i]}. len{sylls[i]}")
per_line_string = '\n'.join(per_line)
orig = f'''input {per_line_string} output'''

In [None]:
orig

'input turn. len8\ndreams, face, mirror. len9\nmake. len7\nline. len5 output'

In [None]:
# sort by score
# grab only the word, not the score
kws_per_line = [[kw[0] for kw in kws] for kws in kws_per_line if len(kws) >= 1]
# convert to string
kws_string_per_line = [f"{', '.join(kws)}" for i, kws in enumerate(kws_per_line)]


  # orig = f'''input {' '.join(kws)}. len_{sylls} output'''

  return  {'sylls': sylls, 'keywords': kws, 'orig': orig, 'target': row['clean_lyrics']}


In [None]:
sample_kwsyl = sample.map(create_kwsylchunk_inputs)

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

ArrowTypeError: Expected bytes, got a 'numpy.float64' object

In [None]:
sample_kwsyl

In [None]:
# clean lyrics
data = kwsylchunk
split = 'test'
n_lines = 4

data[split] = data[split].map(clean_lyrics, remove_columns=cols_to_axe)
# chunk lyrics
data[split] = chunk_pipeline(data[split], n_lines, ['midi_id', 'song_title', 'lyrics', 'genre', 'topic', 'lyric_summary_bartv2'])

# create inputs for modeling
data[split] = data[split].map(create_kwsylchunk_inputs)
# filter out weird syll
# data[split] = data[split].filter(lambda example: example["sylls"] < 30)
# data[split] = data[split].filter(lambda example: example["sylls"] > 2)
# data[split] = data[split].filter(lambda example: len(example["keywords"]) > 0)
# write to csv
# data[split].to_csv(dotenv[f'{hf_write_path}_{split}'])

ValueError: Column to remove ['artist_terms', 'lyric_summary_bart', 'note_velocities_by_measure', 'note_stresses_by_measure', 'lpd_track_program_numbers', 'syl_by_measure', 'syl_count_by_measure', 'Unnamed: 0_x', 'vocal_instrument_name', 'note_pitches_by_measure', 'measure_times', 'artist', 'note_count_by_measure', 'note_onsets_by_measure', 'Unnamed: 0.1', 'lpd_track_lengths', 'lyric_lang', 'lyrics_time_events', 'midi_filename', 'vocal_instrument_program', 'midi_end_time', 'midi_program_numbers', 'match_score', 'Unnamed: 0', 'vocal_instrument_note_syl_corr', 'album', 'lpd_resolution', 'note_durs_by_measure', 'bpm', 'lpd_tracks', 'Unnamed: 0.2', 'lyrics_max_track_length', 'lpd_filename', 'midi_instruments', 'msd_id', 'lyrics_filename', 'year_released'] not in the dataset. Current columns in the dataset: ['midi_id', 'song_title', 'lyrics', 'genre', 'clean_lyrics', 'lyric_summary_bartv2', 'topic', 'lyric_chunk_n']

In [None]:
datap

<br>

<br>

<br>

<br>

## Testing

In [None]:
one_row = dataset['train'][0]

NameError: name 'dataset' is not defined

In [None]:
one_row.keys()

NameError: name 'one_row' is not defined

In [None]:
one_row['syl_count_by_measure'] = ast.literal_eval(one_row['syl_count_by_measure'])
# one_row['note_onsets_by_measure'] = ast.literal_eval(one_row['note_onsets_by_measure'])
one_row['note_durs_by_measure'] = ast.literal_eval(one_row['note_durs_by_measure'])
one_row['note_pitches_by_measure'] = ast.literal_eval(one_row['note_pitches_by_measure'])
one_row['note_stresses_by_measure'] = ast.literal_eval(one_row['note_stresses_by_measure'])
one_row['note_velocities_by_measure'] = ast.literal_eval(one_row['note_velocities_by_measure'])

In [None]:
one_row["note_onsets_by_measure"][:2]

[[11.02294921875, 11.26953125, 11.4990234375, 11.9482421875, 12.3974609375],
 [12.6416015625,
  14.0185546875,
  14.2578125,
  14.47265625,
  14.70703125,
  15.4052734375,
  15.654296875,
  16.083984375,
  16.2841796875]]

In [None]:
"<P> " + " <P> ".join([' '.join([f'onset_{np.round(float(val),1)}' for val in line]) for line in one_row["note_onsets_by_measure"]])

'<P> onset_11.0 onset_11.3 onset_11.5 onset_11.9 onset_12.4 <P> onset_12.6 onset_14.0 onset_14.3 onset_14.5 onset_14.7 onset_15.4 onset_15.7 onset_16.1 onset_16.3 <P> onset_16.7 onset_17.3 onset_17.8 onset_18.0 onset_18.3 onset_18.5 onset_18.7 onset_19.2 <P> onset_19.5 onset_20.4 onset_20.6 onset_21.1 onset_21.3 onset_21.8 <P> onset_22.0 onset_25.3 onset_25.6 onset_25.8 onset_26.0 <P> onset_26.4 onset_29.0 onset_29.3 onset_29.5 onset_29.7 onset_30.4 onset_30.7 onset_31.1 onset_31.4 <P> onset_31.7 onset_32.1 onset_32.4 onset_32.6 onset_33.0 onset_33.3 onset_33.5 <P> onset_33.9 onset_35.2 onset_35.4 onset_35.6 onset_36.1 onset_36.3 onset_36.7 <P> onset_37.0 onset_40.8 onset_41.0 onset_41.3 onset_41.7 onset_42.0 onset_42.8 onset_43.1 onset_43.4 onset_43.7 <P> onset_44.1 onset_44.5 onset_44.8 onset_45.0 onset_45.5 onset_45.7 onset_46.2 <P> onset_46.4 onset_48.3 onset_48.5 onset_48.7 onset_49.0 onset_49.2 onset_49.5 onset_49.9 onset_50.2 <P> onset_50.9 onset_51.8 onset_52.0 onset_52.3 onset

In [None]:
one_row['lyrics']

'Goodbye Norma Jean\nThough I never knew you at all\nYou had the grace to hold yourself\nWhile those around you crawled\nThey crawled out of the woodwork\nAnd they whispered into your brain\nThey set you on the treadmill\nAnd they made you change your name\nAnd it seems to me you lived your life\nLike a candle in the wind\nNever knowing who to cling to\nWhen the rain set in\nAnd I would have\nliked to have known you\nBut I was just a kid\nYour candle burned out long before\nYour legend ever did\nLoneliness was tough\nThe toughest role you ever played\nHollywood created a superstar\nAnd pain was the price you paid\nEven when you died\nOh the press still hounded you\nAll the papers had to say\nWas that Marilyn was found in the nude\nAnd it seems to me you lived your life\nLike a candle in the wind\nNever knowing who to cling to\nWhen the rain set in\nAnd I would have\nliked to have known you\nBut I was just a kid\nYour candle burned out long before\nYour legend ever did\nGoodbye Norma Je

In [None]:
"<P> " + " <P> ".join(one_row['lyrics'].split('\n'))

'<P> Goodbye Norma Jean <P> Though I never knew you at all <P> You had the grace to hold yourself <P> While those around you crawled <P> They crawled out of the woodwork <P> And they whispered into your brain <P> They set you on the treadmill <P> And they made you change your name <P> And it seems to me you lived your life <P> Like a candle in the wind <P> Never knowing who to cling to <P> When the rain set in <P> And I would have <P> liked to have known you <P> But I was just a kid <P> Your candle burned out long before <P> Your legend ever did <P> Loneliness was tough <P> The toughest role you ever played <P> Hollywood created a superstar <P> And pain was the price you paid <P> Even when you died <P> Oh the press still hounded you <P> All the papers had to say <P> Was that Marilyn was found in the nude <P> And it seems to me you lived your life <P> Like a candle in the wind <P> Never knowing who to cling to <P> When the rain set in <P> And I would have <P> liked to have known you <P>

In [None]:
 "<P> " + " <P> ".join([' '.join([f'stress_{val}' for val in line]) for line in one_row["note_stresses_by_measure"]]),

('<P> stress_1 stress_1 stress_1 stress_1 stress_0 <P> stress_0 stress_2 stress_2 stress_2 stress_2 stress_2 stress_2 stress_2 stress_1 <P> stress_1 stress_1 stress_2 stress_1 stress_1 stress_1 stress_2 stress_1 <P> stress_0 stress_1 stress_1 stress_1 stress_1 stress_1 <P> stress_2 stress_2 stress_2 stress_2 stress_2 <P> stress_2 stress_2 stress_2 stress_2 stress_2 stress_2 stress_2 stress_2 stress_1 <P> stress_2 stress_1 stress_2 stress_2 stress_2 stress_2 stress_2 <P> stress_1 stress_1 stress_2 stress_2 stress_2 stress_2 stress_2 <P> stress_1 stress_2 stress_2 stress_2 stress_1 stress_2 stress_1 stress_2 stress_2 stress_2 <P> stress_2 stress_2 stress_1 stress_2 stress_1 stress_1 stress_1 <P> stress_1 stress_2 stress_2 stress_2 stress_1 stress_1 stress_2 stress_2 stress_2 <P> stress_2 stress_2 stress_2 stress_2 stress_2 <P> stress_2 stress_2 stress_2 stress_2 <P> stress_0 stress_2 stress_2 stress_2 <P> stress_2 stress_2 stress_2 stress_1 stress_2 stress_2 <P> stress_2 stress_1 stress_

In [None]:
 "<P> " + " <P> ".join([' '.join([f'pitch_{val}' for val in line]) for line in one_row["note_pitches_by_measure"]]),


('<P> pitch_56 pitch_56 pitch_56 pitch_56 pitch_54 <P> pitch_56 pitch_52 pitch_54 pitch_56 pitch_52 pitch_61 pitch_61 pitch_62 pitch_61 <P> pitch_59 pitch_59 pitch_61 pitch_59 pitch_61 pitch_59 pitch_59 pitch_52 <P> pitch_52 pitch_56 pitch_57 pitch_56 pitch_54 pitch_56 <P> pitch_49 pitch_52 pitch_54 pitch_56 pitch_59 <P> pitch_59 pitch_52 pitch_54 pitch_56 pitch_52 pitch_61 pitch_61 pitch_62 pitch_61 <P> pitch_59 pitch_59 pitch_61 pitch_61 pitch_61 pitch_59 pitch_59 <P> pitch_52 pitch_56 pitch_56 pitch_57 pitch_56 pitch_54 pitch_56 <P> pitch_49 pitch_56 pitch_57 pitch_59 pitch_59 pitch_59 pitch_56 pitch_59 pitch_59 pitch_59 <P> pitch_56 pitch_56 pitch_57 pitch_59 pitch_59 pitch_59 pitch_61 <P> pitch_52 pitch_56 pitch_57 pitch_59 pitch_59 pitch_59 pitch_59 pitch_59 pitch_61 <P> pitch_59 pitch_57 pitch_56 pitch_56 pitch_54 <P> pitch_54 pitch_59 pitch_59 pitch_61 <P> pitch_61 pitch_61 pitch_63 pitch_64 <P> pitch_63 pitch_63 pitch_61 pitch_59 pitch_61 pitch_61 <P> pitch_56 pitch_56 pitch_5

In [None]:
"<P> " + " <P> ".join([' '.join([f'vel_{val}' for val in line]) for line in one_row["note_velocities_by_measure"]]),


('<P> vel_103 vel_103 vel_107 vel_101 vel_90 <P> vel_92 vel_126 vel_127 vel_111 vel_114 vel_112 vel_112 vel_113 vel_106 <P> vel_109 vel_109 vel_112 vel_98 vel_101 vel_100 vel_113 vel_97 <P> vel_89 vel_97 vel_99 vel_101 vel_105 vel_97 <P> vel_113 vel_114 vel_120 vel_120 vel_120 <P> vel_125 vel_127 vel_121 vel_127 vel_123 vel_127 vel_123 vel_115 vel_110 <P> vel_118 vel_104 vel_114 vel_123 vel_120 vel_113 vel_120 <P> vel_98 vel_107 vel_113 vel_114 vel_114 vel_113 vel_117 <P> vel_105 vel_127 vel_119 vel_127 vel_96 vel_127 vel_109 vel_125 vel_116 vel_119 <P> vel_127 vel_126 vel_100 vel_113 vel_96 vel_110 vel_110 <P> vel_104 vel_127 vel_115 vel_126 vel_108 vel_108 vel_123 vel_112 vel_127 <P> vel_127 vel_118 vel_123 vel_126 vel_118 <P> vel_117 vel_127 vel_114 vel_127 <P> vel_88 vel_127 vel_123 vel_127 <P> vel_123 vel_113 vel_116 vel_107 vel_121 vel_113 <P> vel_126 vel_109 vel_127 vel_109 vel_118 vel_120 vel_117 vel_96 <P> vel_110 vel_105 vel_113 vel_107 vel_109 vel_109 <P> vel_107 vel_127 vel