In [4]:
pip install torch fastai music21 pebble fluidsynth midi2audio

Collecting torch
  Using cached torch-1.8.1-cp38-cp38-win_amd64.whl (190.5 MB)
Collecting fastai
  Using cached fastai-2.3.0-py3-none-any.whl (193 kB)
Collecting music21
  Using cached music21-6.7.1.tar.gz (19.2 MB)
Collecting pebble
  Using cached Pebble-4.6.1-py2.py3-none-any.whl (25 kB)
Collecting fluidsynth
  Using cached fluidsynth-0.2.tar.gz (3.7 kB)
Collecting midi2audio
  Using cached midi2audio-0.1.1-py2.py3-none-any.whl (8.7 kB)
Collecting spacy<3
  Using cached spacy-2.3.5-cp38-cp38-win_amd64.whl (9.7 MB)
Collecting fastcore<1.4,>=1.3.8
  Using cached fastcore-1.3.19-py3-none-any.whl (53 kB)
Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement torchvision<0.9,>=0.8 (from fastai) (from versions: 0.1.6, 0.1.7, 0.1.8, 0.1.9, 0.2.0, 0.2.1, 0.2.2, 0.2.2.post2, 0.2.2.post3, 0.5.0, 0.9.0, 0.9.1)
ERROR: No matching distribution found for torchvision<0.9,>=0.8 (from fastai)


In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
os.chdir('../../')

In [3]:
from musicautobot.numpy_encode import *
from musicautobot.config import *
from musicautobot.music_transformer import *
from musicautobot.utils.midifile import *
from musicautobot.utils.file_processing import process_all

## Preprocessing Large Dataset

This Notebook is specifically for preprocessing and encoding a large dataset. It uses multi-processing and handles encoding errors

***

**Note** This is the same training same code as `Train.ipynb`. Only thing that is different is a more robust pre-processing step

In [19]:
# Location of your midi files
midi_path = r'D:\VIP\musicRepo2\musicautobot\data\midi'
# Location of preprocessed numpy files
numpy_path = Path('data/numpy/lm')

# Location of models and cached dataset
data_path = Path('data/cached')
data_save_name = 'reddit_musicitem_data_save.pkl'

#[p.mkdir(parents=True, exist_ok=True) for p in [midi_path, numpy_path, data_path]];

## 1. Gather midi dataset

Make sure all your midi data is in `musicautobot/data/midi` directory

Here's a pretty good dataset with lots of midi data:  
https://www.reddit.com/r/datasets/comments/3akhxy/the_largest_midi_collection_on_the_internet/

1. Download the folder and unzip it to `data/midi`

2. Rename `130000_Pop_...` to `reddit`

## 2. Create dataset from MIDI files

In [20]:
# num_tracks = [1, 2] # number of tracks to support
cutoff = 5 # max instruments
min_variation = 3 # minimum number of different midi notes played
# max_dur = 128

In [21]:
midi_files = get_files(midi_path, '.mid', recurse=True); len(midi_files)

42781

In [22]:
print(midi_path)

D:\VIP\musicRepo2\musicautobot\data\midi


In [23]:
def process_metadata(midi_file):
    # Get outfile and check if it exists
    out_file = numpy_path/midi_file.relative_to(midi_path).with_suffix('.npy')
    out_file.parent.mkdir(parents=True, exist_ok=True)
    if out_file.exists(): return
    
    npenc = transform_midi(midi_file)
    if npenc is not None: np.save(out_file, npenc)

In [24]:
def transform_midi(midi_file):
    input_path = midi_file
    
    # Part 1: Filter out midi tracks (drums, repetitive instruments, etc.)
    try: 
#         if duet_only and num_piano_tracks(input_path) not in [1, 2]: return None
        input_file = compress_midi_file(input_path, min_variation=min_variation, cutoff=cutoff) # remove non note tracks and standardize instruments
        
        if input_file is None: return None
    except Exception as e:
        if 'badly form' in str(e): return None # ignore badly formatted midi errors
        if 'out of range' in str(e): return None # ignore badly formatted midi errors
        print('Error parsing midi', input_path, e)
        return None
        
    # Part 2. Compress rests and long notes
    stream = file2stream(input_file) # 1.
    try:
        chordarr = stream2chordarr(stream) # 2. max_dur = quarter_len * sample_freq (4). 128 = 8 bars
    except Exception as e:
        print('Could not encode to chordarr:', input_path, e)
        print(traceback.format_exc())
        return None
    
    # Part 3. Compress song rests - Don't want songs with really long pauses 
    # (this happens because we filter out midi tracks).
    chord_trim = trim_chordarr_rests(chordarr)
    chord_short = shorten_chordarr_rests(chord_trim)
    delta_trim = chord_trim.shape[0] - chord_short.shape[0]
#     if delta_trim > 500: 
#         print(f'Removed {delta_trim} rests from {input_path}. Skipping song')
#         return None
    chordarr = chord_short
    
    # Part 3. Chord array to numpy
    npenc = chordarr2npenc(chordarr)
    if not is_valid_npenc(npenc, input_path=input_path):
        return None
    
    return npenc

In [25]:
# # sanity check
import random
for r in random.sample(midi_files, 10):
    process_metadata(r)

In [26]:
def timeout_func(data, seconds):
    print("Timeout:", seconds, data.get('midi'))

In [27]:
processed = process_all(process_metadata, midi_files, timeout=120, timeout_func=timeout_func)

RuntimeError: Unexpected error within the Pool

In [12]:
def create_databunch(files, data_save_name, path=data_path):
    save_file = path/data_save_name
    if save_file.exists():
        data = load_data(path, data_save_name)
    else:
        save_file.parent.mkdir(exist_ok=True, parents=True)
        vocab = MusicVocab.create()
        processors = [OpenNPFileProcessor(), MusicItemProcessor()]

        data = MusicDataBunch.from_files(files, path, processors=processors, encode_position=True)
        data.save(data_save_name)
    return data

## Create Data Bunch

In [13]:
numpy_files = get_files(numpy_path, extensions='.npy', recurse=True); len(numpy_files)

616

In [14]:
all_data = create_databunch(numpy_files, data_save_name=data_save_name); all_data

## 3. Load Model

In [15]:
batch_size = 1
encode_position = True
dl_tfms = [batch_position_tfm] if encode_position else []
data = load_data(data_path, data_save_name, bs=batch_size, encode_position=encode_position, dl_tfms=dl_tfms)

In [16]:
config = default_config()
config['encode_position'] = encode_position
learn = music_model_learner(data, config=config.copy())

## 4. Train

In [17]:
learn.fit_one_cycle(4)

epoch,train_loss,valid_loss,accuracy,time


KeyboardInterrupt: 

In [None]:
learn.save('example')

## 5. Predict

---
See [Generate.ipynb](Generate.ipynb) to use a pretrained model and generate better predictions

---

In [11]:
midi_file = Path('data/midi/notebook_examples/single_bar_example.mid'); midi_file
item = MusicItem.from_file(midi_file, data.vocab);

In [None]:
item.show()

Here's what the seed sounds like:

### Start Predictions:

In [16]:
pred, full = learn.predict(item, n_words=100)

Prediction

In [None]:
# Prediction
pred.show()

In [None]:
pred.play()