# Imports

In [None]:
%pip install pretty_midi

Collecting pretty_midi
  Downloading pretty_midi-0.2.10.tar.gz (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mido>=1.1.16 (from pretty_midi)
  Downloading mido-1.3.3-py3-none-any.whl.metadata (6.4 kB)
Downloading mido-1.3.3-py3-none-any.whl (54 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pretty_midi
  Building wheel for pretty_midi (setup.py) ... [?25l[?25hdone
  Created wheel for pretty_midi: filename=pretty_midi-0.2.10-py3-none-any.whl size=5592287 sha256=e6b9cb12888ed5a393d1a33e8c34e2de91b67462c257bdbed1c31f56ebdd7442
  Stored in directory: /root/.cache/pip/wheels/cd/a5/30/7b8b7f58709f5150f67f98fde4b891ebf0be9ef07a8af49f25
Successfully built pretty_midi
Installing collected packages: mido, pretty_midi
Successfu

In [None]:
import contextlib
import copy
import gc
import io
import json
import math
import os
import re
import sys

sys.path.append("../")

import numpy as np
import pretty_midi
import torch
import yaml
from transformers import AutoModel, AutoTokenizer


In [None]:
# Print the GPU / driver status reported by `nvidia-smi` for this runtime.
!nvidia-smi

Fri Dec  6 09:46:40 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0              45W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# Utils

In [None]:
# Pitch-class name -> semitone offset inside an octave (C = 0 ... B = 11).
base_tones = {
    name: offset
    for offset, name in enumerate(
        ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
    )
}
# Zero-based position -> English ordinal word ('first' ... 'tenth').
line_index = dict(enumerate(
    ['first', 'second', 'third', 'fourth', 'fifth',
     'sixth', 'seventh', 'eighth', 'ninth', 'tenth']
))


def log_discretize(x, bins=512):
    """Quantize a scalar onto ``bins`` integer levels of a shifted-log scale.

    The value is clamped to ``[-0.3, 6]``, shifted by 1, passed through
    ``np.log`` and rescaled linearly so that -0.3 maps to bin 0 and 6 maps to
    bin ``bins - 1``. The scaled value is rounded to the nearest integer.

    Args:
        x: scalar to quantize (the clamp uses the builtin ``min``/``max``,
            so array inputs are not supported).
        bins: number of quantization levels.

    Returns:
        A numpy integer in ``[0, bins - 1]``.
    """
    shift = 1
    log_lo = np.log(shift - 0.3)
    log_hi = np.log(6 + shift)
    # Clamp first (upper bound, then lower bound) so the log stays in range.
    clamped = max(-0.3, min(6, x))
    fraction = (np.log(clamped + shift) - log_lo) / (log_hi - log_lo)
    return np.round(fraction * (bins - 1)).astype(int)

def reverse_log_float(x, bins=512):
    """Map a bin index on the shifted-log scale back to a float value.

    The scale spans ``[-0.3, 6]`` after a shift of 1: bin 0 decodes to -0.3 and
    bin ``bins - 1`` decodes to 6. The bin that the value 0 quantizes to is
    decoded to exactly ``0`` instead of the small non-zero value the formula
    would give. That bin is derived from ``bins`` (79 for the default 512), so
    the shortcut stays correct for other bin counts.

    Args:
        x: bin index (number).
        bins: number of quantization levels used when the bin was produced.

    Returns:
        ``0`` for the zero bin, otherwise a float rounded to 3 decimals.
    """
    eps = 1
    x_min = np.log(eps-0.3)
    x_max = np.log(6+eps)
    # Bin index that the value 0 lands on for this bin count.
    zero_bin = int(np.round((np.log(eps) - x_min) / (x_max - x_min) * (bins - 1)))
    if x == zero_bin:
        return 0
    value = np.exp(x * (x_max - x_min)/(bins-1) + x_min) - eps
    return float("{:.3f}".format(value))

def bin_time(list_d):
    """Turn duration entries into strings of ``<bin>`` tokens.

    Each entry of ``list_d`` is converted to ``str`` if needed and split on
    single spaces. Every piece is parsed with ``float`` and quantized with
    ``log_discretize``, and the resulting ``<bin>`` tokens are concatenated
    without separators.

    Args:
        list_d: iterable of strings or numbers; a string may hold several
            space-separated numbers.

    Returns:
        A list with one token string per input entry.
    """
    def _encode(entry):
        text = entry if isinstance(entry, str) else str(entry)
        return ''.join(
            f'<{log_discretize(float(piece))}>' for piece in text.split(' ')
        )

    return [_encode(entry) for entry in list_d]

def append_song_token(model, tokenizer, config):
    """Add the song vocabulary to ``tokenizer`` and grow ``model`` to match.

    The song vocabulary is the six section markers (``<bol>``, ``<bom>``,
    ``<bop>``, ``<eol>``, ``<eom>``, ``<eop>``), one ``<{note}{octave}>`` token
    for every pitch class in ``base_tones`` and every octave from -1 to 9, and
    the 512 duration-bin tokens ``<0>`` ... ``<511>``. Only tokens missing from
    the tokenizer vocabulary are added, in sorted order.

    For the added tokens, new rows are appended to ``model.output.weight``
    (Kaiming-uniform initialised) and to ``model.model.tok_embeddings.weight``
    (standard-normal initialised). Both weights are marked trainable, and
    ``model.config.vocab_size`` / ``model.model.vocab_size`` are updated.

    Args:
        model: model exposing ``output``, ``model.tok_embeddings``, ``config``
            and ``device``; it is modified in place.
        tokenizer: tokenizer supporting ``get_vocab``, ``add_tokens`` and
            ``len``; it is modified in place and attached as ``model.tokenizer``.
        config: object providing ``hidden_size``.

    Returns:
        The ``(model, tokenizer)`` pair that was passed in.
    """
    old_token_len = len(tokenizer)
    new_tokens = ['<bol>','<bom>','<bop>','<eol>','<eom>','<eop>']
    for note in base_tones:
        for i in range(-1, 10): # -1 -> 9
            new_tokens.append(f'<{note}{i}>')
    for t_bin in range(512):
        new_tokens.append(f'<{t_bin}>')
    # Sort so token ids are assigned in the same order on every run.
    new_tokens = sorted(set(new_tokens) - set(tokenizer.get_vocab().keys()))
    tokenizer.add_tokens(new_tokens)
    new_token_len = len(tokenizer)
    model.tokenizer = tokenizer
    num_added = new_token_len - old_token_len

    # Output head: `torch.nn` is referenced explicitly because the notebook
    # never binds a bare `nn` name.
    weight = torch.empty((num_added, config.hidden_size))
    torch.nn.init.kaiming_uniform_(weight, a=math.sqrt(5))
    model.config.vocab_size = new_token_len
    old_output_weight = model.output.weight.detach()
    # Cast the new rows to the existing dtype so a half-precision model is not
    # promoted to float32 by the concatenation.
    model.output.weight.data = torch.cat(
        [old_output_weight, weight.to(device=model.device, dtype=old_output_weight.dtype)],
        dim=0,
    )
    model.output.weight.requires_grad = True

    # Input embedding table.
    new_token_embed = torch.randn(num_added, config.hidden_size)
    old_embed_weight = model.model.tok_embeddings.weight.detach()
    new_weight = torch.cat(
        [old_embed_weight, new_token_embed.to(device=model.device, dtype=old_embed_weight.dtype)],
        dim=0,
    )
    model.model.vocab_size = new_token_len
    model.model.tok_embeddings.weight.data = new_weight
    model.model.tok_embeddings.weight.requires_grad = True
    return model, tokenizer


def tuple2dict(line):
    """Parse a generated song string into per-note parallel lists.

    All spaces, newlines and periods are stripped from ``line`` and every
    ``The<ordinal>line:`` header (first ... tenth) is turned into a ``|``
    separator. Each ``|``-separated item is then read as
    ``lyric,<note>...,<duration bin>...,<rest bin>``. An item may carry several
    notes; the lyric is attached to the first note and ``'-'`` to the others,
    and the rest is attached to the last note with 0 for the others.

    Args:
        line: song text such as
            ``'The first line:a, <E4> , <154> , <88> |b, ...'``.

    Returns:
        A dict with the keys ``lyrics``, ``notes``, ``notes_duration``,
        ``rest_duration``, ``pitch``, ``notes_dict`` and ``rest_dict``, each a
        list with one entry per note.

    Raises:
        IndexError: an item has fewer than four comma-separated fields or no
            ``<...>`` token in its rest field.
        AssertionError: an item has a different number of notes and durations.
    """
    ordinals = ('first', 'second', 'third', 'fourth', 'fifth',
                'sixth', 'seventh', 'eighth', 'ninth', 'tenth')
    token_re = re.compile(r'<(.*?)>')

    text = line.replace(" ", "").replace("\n", "")
    text = re.sub(r'\. |\.', '', text)
    for ordinal in ordinals:
        text = text.replace(f'The{ordinal}line:', ' |')

    song = {'lyrics':[], 'notes':[], 'notes_duration':[], 'rest_duration':[], 'pitch':[], 'notes_dict': [], 'rest_dict': []}

    # Whatever precedes the first separator is not a song item.
    for chunk in text.split('|')[1:]:
        fields = chunk.split(',')
        notes = token_re.findall(fields[1])
        note_ds = token_re.findall(fields[2])
        rest_d = token_re.findall(fields[3])[0]
        assert len(notes)== len(note_ds), f"notes:{'|'.join(notes)}, note_ds:{'|'.join(note_ds)}"
        last = len(notes) - 1
        for idx, note in enumerate(notes):
            song['lyrics'].append(fields[0] if idx == 0 else '-')
            song['notes'].append(note)
            song['pitch'].append(int(pretty_midi.note_name_to_number(note)))
            song['notes_duration'].append(reverse_log_float(int(note_ds[idx])))
            song['notes_dict'].append(int(note_ds[idx]))
            if idx == last:
                song['rest_duration'].append(reverse_log_float(int(rest_d)))
                song['rest_dict'].append(int(rest_d))
            else:
                song['rest_duration'].append(0)
                song['rest_dict'].append(0)
    return song

def dict2midi(song):
    """Render a parsed song as a single-instrument PrettyMIDI object.

    Notes are laid out one after another: each note starts at the running
    time, lasts ``notes_duration[i]`` and is followed by a gap of
    ``rest_duration[i]`` before the next note begins.

    Args:
        song: dict with the parallel lists ``notes`` (note names),
            ``notes_duration``, ``rest_duration`` and ``lyrics``.

    Returns:
        ``(midi, lyrics)``, where ``midi`` is the ``pretty_midi.PrettyMIDI``
        object and ``lyrics`` is the lyric entries joined by single spaces.
    """
    midi = pretty_midi.PrettyMIDI()
    track = pretty_midi.Instrument(program=0)

    cursor = 0  # running time since the start of the song
    for idx, note_name in enumerate(song["notes"]):
        duration = song["notes_duration"][idx]
        track.notes.append(
            pretty_midi.Note(
                velocity=100,
                pitch=int(pretty_midi.note_name_to_number(note_name)),
                start=cursor,
                end=cursor + duration,
            )
        )
        cursor += duration + song["rest_duration"][idx]

    midi.instruments.append(track)
    return midi, ' '.join(song["lyrics"])


def gen_midi(line, file_name):
    """Convert a generated song string into a MIDI file and a lyrics file.

    ``line`` is parsed with ``tuple2dict`` and rendered with ``dict2midi``. The
    melody is written to ``<file_name>.mid`` and the space-joined lyrics to
    ``<file_name>.txt``.

    Args:
        line: song text in the format accepted by ``tuple2dict``.
        file_name: output path without extension.
    """
    song = tuple2dict(line)
    new_midi, lyrics = dict2midi(song)

    # save midi file and lyric text
    new_midi.write(file_name+'.mid')

    # Lyrics may be non-ASCII (e.g. Chinese); write UTF-8 explicitly instead of
    # depending on the platform's default encoding.
    with open(file_name+'.txt', "w", encoding="utf-8") as file:
        file.write(lyrics)
    print(f'midi saved at ~/{file_name}.mid, lyrics saved at ~/{file_name}.txt')

# Running and Testing

In [None]:
# Load the tokenizer and model for the SongComposer SFT checkpoint from the Hugging Face Hub.
# NOTE(review): trust_remote_code=True runs Python files shipped with the checkpoint;
# the download log recommends pinning a revision so newer code files are not picked up silently.
from transformers import AutoTokenizer, AutoModelForCausalLM
ckpt_path = "Mar2Ding/songcomposer_sft"
tokenizer = AutoTokenizer.from_pretrained(ckpt_path, trust_remote_code=True)
# Move the model to the GPU, then convert its weights to half precision.
model = AutoModel.from_pretrained(ckpt_path, trust_remote_code=True).cuda().half()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/370 [00:00<?, ?B/s]

tokenization_internlm.py:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Mar2Ding/songcomposer_sft:
- tokenization_internlm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


tokenizer.model:   0%|          | 0.00/1.48M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/11.5k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/95.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/913 [00:00<?, ?B/s]

configuration_internlm.py:   0%|          | 0.00/7.49k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Mar2Ding/songcomposer_sft:
- configuration_internlm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_internlm2.py:   0%|          | 0.00/57.8k [00:00<?, ?B/s]

build_mlp.py:   0%|          | 0.00/7.89k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Mar2Ding/songcomposer_sft:
- build_mlp.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/Mar2Ding/songcomposer_sft:
- modeling_internlm2.py
- build_mlp.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin.index.json:   0%|          | 0.00/46.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.99G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/6.70G [00:00<?, ?B/s]

Set max length to 2048


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [None]:
# Run a garbage-collection pass, then release PyTorch's cached, unused GPU memory.
import gc
gc.collect()
torch.cuda.empty_cache()

**Format Notation**

\<bop\>Paired data\<eop\>. \<bop\> stands for the **b**eginning **o**f the **p**air. \<eop\> stands for the **e**nd **o**f the **p**air.

\<bom\>Pure melody\<eom\>. \<bom\> stands for the **b**eginning **o**f the **m**elody. \<eom\> stands for the **e**nd **o**f the **m**elody.

\<bol\>Pure lyric\<eol\>. \<bol\> stands for the **b**eginning **o**f the **l**yric. \<eol\> stands for the **e**nd **o**f the **l**yric.

The conversation format would be:

[UNUSED_TOKEN_146]user\n**Question**[UNUSED_TOKEN_145]\n

[UNUSED_TOKEN_146]assistant\n**Answer**[UNUSED_TOKEN_145]\n

In [None]:
# Example prompts for the SFT model, one per task.
# Lyric-to-melody (l2m): the lyrics are wrapped in <bol> ... <eol>.
prompt1 = 'Compose a tune in harmony with the accompanying lyrics. <bol> Total 6 lines.\
The first line:轻|轻|笑|声|在|为|我|送|温|暖\n\
The second line:你|为|我|注|入|快|乐|强|电\n\
The third line:轻|轻|说|声|漫|长|路|快|要|走|过\n\
The fourth line:终|于|走|到|明|媚|晴|天\n\
The fifth line:声|声|欢|呼|跃|起|像|红|日|发|放|金|箭\n\
The sixth line:我|伴|你|往|日|笑|面|重|现\n<eol>'
# Melody-to-lyric (m2l): the melody is wrapped in <bom> ... <eom>.
prompt2 = 'Create lyrics to accompany the given melody. <bom> Total 7 lines.\
The first line:<D4>,<141>,<79>|<F4>,<151>,<79>|<G4>,<172>,<79>|<A4>,<147>,<112>|<D4>,<147>,<79>|<G4>,<166>,<79>|<A4>,<172>,<79>|<C5>,<144>,<79>|<A4>,<268>,<212>\n\
The second line:<D4>,<141>,<79>|<F4>,<154>,<79>|<G4>,<169>,<79>|<A4>,<144>,<108>|<D4>,<151>,<79>|<G4>,<151>,<137>\n\
The third line:<E4>,<151>,<88>|<E4>,<141>,<79>|<F4>,<144>,<79>|<E4>,<137>,<79>|<F4>,<141>,<79>|<G4>,<130>,<79>|<A4>,<157>,<79>\n\
The fourth line:<G4>,<144>,<79>|<A4>,<137>,<79>|<D4>,<160>,<141>|<G4>,<144>,<79>|<A4>,<134>,<79>|<D4>,<200>,<79>|<C4>,<194>,<104>|<D4>,<264>,<154>\n\
The fifth line:<A4>,<141>,<79>|<C5>,<147>,<79>|<A4>,<157>,<79>|<G4>,<224>,<92>|<A4>,<151>,<79>|<G4> <F4>,<189><134>,<79>|<D4>,<197>,<79>|<A4>,<233>,<79>\n\
The sixth line:<A4>,<144>,<79>|<B4>,<141>,<79>|<A4>,<154>,<79>|<G4>,<237>,<79>|<A4>,<137>,<79>|<G4>,<120>,<79>|<G4>,<120>,<144>|<D5>,<233>,<96>|<A4>,<226>,<154>\n\
The seventh line:<G4> <A4>,<154><104>,<79>|<C5>,<175>,<79>|<D5>,<137>,<79>|<C5>,<151>,<79>|<A4>,<151>,<79>|<C5>,<151>,<79>|<G4>,<137>,<154>\n<eom>'
# Song continuation: the existing lyric/melody pairs are wrapped in <bop> ... <eop>.
prompt3 = 'Continue the existing song script by adding 2 additional lines. <bop> Total 2 lines.\
The first line:当,<B3>,<147>,<79>|春,<D#4>,<144>,<79>|天,<E4>,<157>,<144>|为,<E4> <F#4>,<134><147>,<88>|我,<F#4>,<137>,<88>|冒,<F#4>,<147>,<79>|着,<B4>,<197>,<79>|雨,<D#4>,<144>,<79>|到,<F#4>,<157>,<79>|达,<E4>,<160>,<79>\n\
The second line:只,<E4>,<116>,<116>|为,<E4> <E4>,<147><144>,<79>|了,<G#4> <A4>,<134><154>,<79>|带,<G#4> <F#4>,<88><108>,<79>|我,<G#4> <F#4>,<127><134>,<79>|去,<E4>,<151>,<79>|往,<C#4>,<202>,<235>\n<eop>'
# Text-to-song: a free-form description with no song markup.
prompt4 = 'Create a song on brave and sacrificing with a rapid pace.'
# The inference function generates a three-shot answer (several attempts for one prompt);
# pick the attempt that fits best.
infence_result = model.inference(prompt4, tokenizer)
print(infence_result)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Create a song on brave and sacrificing with a rapid pace.
------attempt 0------
e a song on brave and sacrificing with a rapid pace.[UNUSED_TOKEN_145]
[UNUSED_TOKEN_146]assistant
The song is as follows. <bop> Total 6 lines. The first line:he, <B4> , <127> , <79> |left, <C#5> , <127> , <79> |the, <E5> , <127> , <79> |coun, <E5> , <214> , <79> |ty, <E5> , <166> , <79> |liv, <E5> , <127> , <79> |ing, <G#4> , <324> , <132> The second line:he, <B4> , <127> , <79> |had, <C#5> , <127> , <79> |to, <E5> , <127> , <79> |run, <E5> , <214> , <79> |and, <E5> , <166> , <79> |hide, <G#4> , <226> , <241> The third line:he, <B4> , <127> , <79> |could, <C#5> , <127> , <79> |not, <E5> , <127> , <79> |face, <E5> , <214> , <79> |it, <E5> , <166> , <79> |an, <E5> , <127> , <79> |y, <G#4> , <307> , <132> The fourth line:the, <A4> , <127> , <79> |law, <D5> , <127> , <79> |was, <C#5> , <127> , <79> |on, <C#5> , <127> , <79> |his, <D5> , <127> , <79> |track, <C#5> , <127> , <79> |now, <C#5> , <127> , <79> |he, 

In [None]:
# Parse a string-form song and write it out as a MIDI file plus a lyrics text file
# ('text.mid' and 'text.txt' for the file name used here).
line = 'The first line:勇, <E4> , <154> , <88> |敢, <E4> , <134> , <88> |地, <E4> , <137> , <79> |去, <F#4> , <151> , <79> |相, <E4> , <154> , <79> |信, <D#4> , <154> , <79> |自, <C#4> , <157> , <79> |己, <B3> , <172> , <127> The second line:坚, <E4> , <151> , <88> |定, <E4> , <137> , <88> |地, <E4> , <137> , <79> |向, <F#4> , <151> , <79> |前, <E4> , <151> , <79> |奔, <D#4> , <160> , <79> |跑, <C#4> , <157> , <79> The third line:哪, <B3> , <151> , <79> |里, <G#3> , <137> , <79> |会, <B3> , <151> , <79> |有, <G#3> , <189> , <79> |风, <F#3> , <157> , <79> |雨, <G#3> , <137> , <79> The fourth line:不, <G#3> , <147> , <79> |回, <F#3> , <144> , <79> |头, <E3> , <151> , <79> |的, <F#3> , <141> , <79> |笑, <G#3> , <166> , <79> |着, <B3> , <219> , <160> The fifth line:每, <E4> , <154> , <88> |一, <E4> , <130> , <88> |个, <E4> , <144> , <79> |起, <F#4> , <147> , <79> |点, <E4> , <157> , <79> |都, <D#4> , <154> , <79> |是, <C#4> , <151> , <79> |我, <B3> , <118> , <79> |们, <B3> , <118> , <79> |成, <G#3> , <207> , <79> |功, <B3> , <205> , <79> |的, <G#3> , <205> , <79>'
gen_midi(line, 'text')

# Collect Data

In [None]:
# Mount Google Drive at /content/drive; the cells below write their output files under it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
def extract_lines(input_text):
    """Pull one song string out of each attempt in captured inference output.

    ``input_text`` is split on ``------attempt N------`` markers and the text
    before the first marker is dropped. In every non-blank attempt, the first
    span that starts at a ``The <word> line:`` header, runs through at least
    one newline and stops before the next header (or the end of the text) is
    kept, with surrounding whitespace stripped. Attempts with no such span
    contribute nothing.

    Args:
        input_text: text printed by the model's inference call.

    Returns:
        A list with at most one extracted string per attempt, in order.
    """
    header_span = re.compile(
        r"(The \w+ line:.*?\n.*?)(?=The \w+ line:|$)", re.DOTALL
    )
    sections = re.split(r'------attempt \d+------', input_text)[1:]

    results = []
    for section in sections:
        if not section.strip():
            continue
        first = header_span.search(section)
        if first is not None:
            results.append(first.group(1).strip())
    return results

In [None]:
# Smoke test: capture printed text and write it to a file on the mounted Drive.
# redirect_stdout puts back whichever stream was active before the block (the
# notebook's own output stream), which assigning sys.__stdout__ does not do.
captured_output = io.StringIO()
with contextlib.redirect_stdout(captured_output):
  print("banana")
with open("/content/drive/MyDrive/tempIO.txt","w") as f2:
  f2.write(captured_output.getvalue().strip())


In [None]:
# Generate songs for each target emotion, parse every attempt the model prints,
# and save the parsed songs to Drive as JSON.
N_GENERATIONS_PER_EMOTION = 15  # inference calls per emotion

emotions = ["joy","sadness","anger","fear","disgust"]
generated_dict = {emotion:[] for emotion in emotions}
for emotion in emotions:
  prompt = f"Create a song that conveys the emotion of {emotion}."
  for i in range(N_GENERATIONS_PER_EMOTION):
    # model.inference prints its attempts, so capture stdout for the call.
    # The context manager restores the previous stream even if inference raises.
    captured_output = io.StringIO()
    with contextlib.redirect_stdout(captured_output):
      model.inference(prompt, tokenizer)
    response = captured_output.getvalue().strip()
    results = extract_lines(response)
    for result in results:
      try:
        di = tuple2dict(result)
      except (IndexError, AssertionError, ValueError):
        # Malformed generation (missing fields, note/duration count mismatch,
        # unparsable note name or bin): skip it instead of aborting the run.
        continue
      di["full_text"] = result
      generated_dict[emotion].append(di)
    # Free per-iteration buffers before the next generation.
    del response,results
    gc.collect()
    torch.cuda.empty_cache()


with open("/content/drive/MyDrive/generated_data.json","w") as f:
  json.dump(generated_dict,f)
