In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# the project's .py files are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
from encode_data import *
from midi_data import *

In [3]:
from tqdm import tqdm
import pandas as pd
from data_sources import process_parallel

In [4]:
from collections import Counter

## Encode music21 stream to text representation 

This notebook encodes notes using the full component format:
- Includes measure separators, instruments, and separated octaves
- Format: note, octave, action type, instrument
- Example note representation: `nG# o4 t1 i0`

### Load midi data

In [5]:
# Root directory for all MIDI data; a relative path, so it is resolved against
# the notebook's current working directory.
path = Path('data/midi')

In [6]:
# Stage names double as both the sub-directory under `path` and the name of the
# metadata column that stores each row's file path for that stage.
source_dir, out_dir = 'midi_transform_v1', 'midi_transcribe_v1'

# Each stage keeps its metadata CSV inside its own directory, named after the stage.
source_csv = path / source_dir / (source_dir + '.csv')
out_csv = path / out_dir / (out_dir + '.csv')

In [7]:
# Load the metadata produced by the previous stage, using the first CSV column as the index.
# NOTE(review): the recorded output of this cell ends with what looks like the tail of
# pandas' mixed-dtype warning. low_memory=False parses each column in one pass so that
# dtype inference is consistent across the whole file - confirm the warning is gone.
df = pd.read_csv(source_csv, index_col=0, low_memory=False)
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,index,artist,bpm,genres,ht_bpm,ht_key,ht_mode,ht_offset,ht_time_signature,inferred_key,...,midi,midi_title,parts,seconds,section,song_url,source,time_signature,title,midi_transform_v1
0,0,wayne-sharpe,128.0,,128.0,C,1.0,0.0,4.0,C major,...,data/midi/midi_sources/hooktheory/pianoroll/w/...,yu-gi-oh3,"intro,chorus",15.0,chorus,https://www.hooktheory.com/theorytab/view/wayn...,hooktheory,4/4,yu-gi-oh-theme-song,data/midi/midi_transform_v1/hooktheory/pianoro...
1,1,weezer,,,128.0,C,,0.0,4.0,,...,data/midi/midi_sources/hooktheory/pianoroll/w/...,My New Song,intro-and-verse,,intro-and-verse,https://www.hooktheory.com/theorytab/view/weez...,hooktheory,,beverly-hills,
2,2,weezer,108.0,,108.0,Eb,1.0,-3.0,4.0,E- major,...,data/midi/midi_sources/hooktheory/pianoroll/w/...,falling for you intro,"intro,verse,chorus,solo",11.111111,intro,https://www.hooktheory.com/theorytab/view/weez...,hooktheory,4/4,falling-for-you,data/midi/midi_transform_v1/hooktheory/pianoro...
3,3,weezer,121.0,"Pop,Rock",121.0,Ab,1.0,4.0,4.0,A- major,...,data/midi/midi_sources/hooktheory/pianoroll/w/...,Buddy Holly,"verse,pre-chorus,chorus,bridge,solo",43.38843,solo,https://www.hooktheory.com/theorytab/view/weez...,hooktheory,4/4,buddy-holly,data/midi/midi_transform_v1/hooktheory/pianoro...
4,4,wavves,180.0,,180.0,A,1.0,3.0,4.0,B minor,...,data/midi/midi_sources/hooktheory/pianoroll/w/...,dreams of grandeur,verse,21.333333,verse,https://www.hooktheory.com/theorytab/view/wavv...,hooktheory,4/4,dreams-of-grandeur,data/midi/midi_transform_v1/hooktheory/pianoro...


In [8]:
# Keep only the rows whose time signature is exactly '4/4'.
is_four_four = df['time_signature'] == '4/4'
df_filtered = df.loc[is_four_four]
df_filtered.shape

(30201, 22)

In [13]:
def transcribe_file(idxrow):
    """Transcribe one row's MIDI file into a text file under the `out_dir` tree.

    Args:
        idxrow: An ``(index, row)`` pair as yielded by ``DataFrame.iterrows()``.
            ``row[source_dir]`` holds the path of the MIDI file to transcribe.

    Returns:
        ``(index, out_path)``, where ``out_path`` is the text file path as a
        string, or ``None`` when the row has no existing MIDI file or the
        conversion raised. An already existing output file is reused as is.
    """
    idx,row = idxrow
    midi_file = row[source_dir]
    # Non-string values (e.g. NaN from an empty CSV cell) mean there is no file for this row.
    if not isinstance(midi_file, str) or not Path(midi_file).exists(): return idx,None
    # Mirror the source directory layout under out_dir, swapping the extension for .txt.
    out_file = Path(midi_file.replace(f'/{source_dir}/', f'/{out_dir}/')).with_suffix('.txt')
    out_file.parent.mkdir(parents=True, exist_ok=True)
    # Results are cached on disk: skip files transcribed by an earlier run.
    if out_file.exists(): return idx,str(out_file)
    try:
        seq = midi2seq(midi_file)
        seq_comp = remove_seq_rests(trim_seq_rests(seq))
        delta_trim = len(seq) - len(seq_comp)
        if delta_trim > 100: print(f'Removed {delta_trim} rests from {midi_file}')
        # Serialize the trimmed/compressed sequence; writing `seq` here would
        # silently discard the rest removal computed above.
        string_repr = seq2str(seq_comp)
        with open(out_file, 'w') as tf:
            tf.write(string_repr)
    except Exception as e:
        # Best effort: one bad MIDI file must not abort the whole batch.
        print('Error converting midi to sequence', e)
        return idx,None
    return idx,str(out_file)

In [14]:
# Serial pass over every filtered row, calling transcribe_file one row at a time
# in this process; the returned (idx, path) pairs are discarded. Files written
# here are found on disk and reused by later calls to transcribe_file.
for r in df_filtered.iterrows():
    transcribe_file(r)

Error converting midi to sequence badly formated midi bytes, got: b''
Error converting midi to sequence badly formated midi bytes, got: b''
Trimming rests. Start, end: 432 4147 48
Compressing rests: 640 -> 16
Removed 1240 rests from data/midi/midi_transform_v1/freemidi/genre-dance-eletric/New Order - Perfect Kiss.mid
Trimming rests. Start, end: 28 1379 48
Trimming rests. Start, end: 4 1625 48
Trimming rests. Start, end: 0 771 48
Trimming rests. Start, end: 32 547 48
Trimming rests. Start, end: 28 1687 48
Trimming rests. Start, end: 0 1577 48
Trimming rests. Start, end: 48 1122 48
Removed 224 rests from data/midi/midi_transform_v1/freemidi/genre-dance-eletric/Cher - Half Breed.mid
Trimming rests. Start, end: 0 1706 48
Removed 163 rests from data/midi/midi_transform_v1/freemidi/genre-dance-eletric/Alanis Morissette - You Oughta Know.mid
Trimming rests. Start, end: 12 1491 48
Trimming rests. Start, end: 0 1027 48


KeyboardInterrupt: 

In [11]:
# Run transcribe_file over every filtered row via the project's process_parallel helper.
# The result is used below as a mapping of row index -> output path (or None).
n_filtered_rows = len(df_filtered)
transcribed_files = process_parallel(transcribe_file, df_filtered.iterrows(), total=n_filtered_rows)

Compressing rests: 24 -> 16
Compressing rests: 24 -> 16
Compressing rests: 24 -> 16
Compressing rests: 24 -> 16
Compressing rests: 24 -> 16
Removed 134 rests from data/midi/midi_transform_v1/hooktheory/pianoroll/j/jacob-mann/kogi/intro_key.mid
Removed 101 rests from data/midi/midi_transform_v1/hooktheory/pianoroll/j/john-williams/the-raiders-march---indiana-jones-theme/intro-and-verse_key.mid
Compressing rests: 32 -> 16
Compressing rests: 28 -> 16
Compressing rests: 24 -> 16
Compressing rests: 24 -> 16
Compressing rests: 52 -> 16
Removed 129 rests from data/midi/midi_transform_v1/hooktheory/pianoroll/b/boyinaband/lets-summon-the-devil/instrumental_key.mid
Removed 136 rests from data/midi/midi_transform_v1/hooktheory/pianoroll/b/bruno-mars/uptown-funk/intro-and-verse_key.mid
Compressing rests: 24 -> 16
Compressing rests: 24 -> 16
Removed 153 rests from data/midi/midi_transform_v1/hooktheory/pianoroll/d/dawid-podsiadlo/trojkaty-i-kwadraty/verse_key.mid
Removed 129 rests from data/midi/mi

KeyboardInterrupt: 

Compressing rests: 22 -> 18
Compressing rests: 22 -> 18
Compressing rests: 32 -> 16
Compressing rests: 32 -> 16
Compressing rests: 36 -> 16
Removed 641 rests from data/midi/midi_transform_v1/hooktheory/pianoroll/t/the-wailers/roots-rock-reggae/intro-and-verse_key.mid
Compressing rests: 22 -> 18
Removed 137 rests from data/midi/midi_transform_v1/hooktheory/pianoroll/t/the-wrecks/favorite-liar/bridge_key.mid
Compressing rests: 32 -> 16
Compressing rests: 34 -> 18
Removed 101 rests from data/midi/midi_transform_v1/hooktheory/pianoroll/m/mstrkrft/easy-love/intro_key.mid
Compressing rests: 28 -> 16
Compressing rests: 32 -> 16
Compressing rests: 26 -> 18
Compressing rests: 24 -> 16
Removed 141 rests from data/midi/midi_transform_v1/freemidi/genre-dance-eletric/Prodigy - Spitfire.mid


In [None]:
# One-column frame of output paths, indexed by the original row index so it can
# be joined back onto the metadata.
row_ids = list(transcribed_files.keys())
out_paths = list(transcribed_files.values())
tdf = pd.DataFrame(data={out_dir: out_paths}, index=row_ids)

In [None]:
# Attach the transcription paths to the full (unfiltered) metadata by index;
# rows that were not transcribed end up with a missing value in the new column.
merged_df = df.join(tdf, how='outer')
tdf.shape, df.shape, merged_df.shape

In [None]:
# Persist the merged metadata (the index is not written) and preview it.
merged_df.to_csv(out_csv, index=False)
merged_df.head()

### Calculate timesteps

In [None]:
# Reload the merged metadata from disk. The file was written without its index,
# so the frame gets a fresh default integer index here.
merged_df = pd.read_csv(out_csv)

In [None]:
def calc_timesteps(idxrow):
    """Count the TSEP tokens in one row's transcribed text file.

    Args:
        idxrow: An ``(index, row)`` pair as yielded by ``DataFrame.iterrows()``.
            ``row[out_dir]`` holds the path of the transcribed text file.

    Returns:
        ``(index, count)``, where ``count`` is the number of space-separated
        tokens equal to ``TSEP``, or ``None`` when the row has no existing
        text file or reading it raised.
    """
    idx, row = idxrow
    text_file = row[out_dir]
    has_text_file = isinstance(text_file, str) and Path(text_file).exists()
    if not has_text_file:
        return idx, None
    try:
        with open(text_file, 'r') as handle:
            tokens = handle.read().split(' ')
        # TSEP is presumably the timestep separator token - verify against encode_data.
        timesteps = sum(1 for token in tokens if token == TSEP)
        return idx, timesteps
    except Exception as e:
        print('Error reading text', e)
        return idx, None

In [None]:
# Run calc_timesteps over every metadata row via the project's process_parallel helper.
# The result is used below as a mapping of row index -> timestep count (or None).
n_merged_rows = len(merged_df)
file2steps = process_parallel(calc_timesteps, merged_df.iterrows(), total=n_merged_rows)

In [None]:
# One-column frame of timestep counts, indexed by row index.
timesteps_col = f'{out_dir}_timesteps'
len_df = pd.DataFrame(data={timesteps_col: list(file2steps.values())}, index=list(file2steps.keys()))
# out_csv is both the file this section loads merged_df from and the file it
# overwrites at the end. On a re-run merged_df therefore already carries the
# timesteps column, and join() would raise on the overlapping column name.
# Drop any stale copy first so the section can be re-run safely.
merged_len_df = merged_df.drop(columns=[timesteps_col], errors='ignore').join(len_df, how='outer')
len_df.shape, merged_df.shape, merged_len_df.shape

In [None]:
# Overwrite the stage CSV with the timestep counts included (the index is not
# written) and preview the result.
merged_len_df.to_csv(out_csv, index=False)
merged_len_df.head()