# Data generation: contour extraction

We extract contours of entire songs, phrases and motifs from multiple collections of symbolic music. 
The original collections should be located in the 'datasets' directory, which is excluded from this repository. The datasets are named by their id in [Catafolk](https://bacor.github.io/catafolk/).

In [7]:
from music21 import converter
import chant21
import sys
sys.path.append('../src')
from melodic_contour import Contour, stream_to_contour

import numpy as np
import pandas as pd
import scipy
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
matplotlib.style.use('styles.mplstyle')

import os
import glob

## Songs

In [2]:
def extract_song_contour(filename, num_samples=100):
    """Extract the contour of a complete song from a kern file"""
    stream = converter.parse(filename)
    contour = stream_to_contour(stream)
    pitches = contour.interpolate(num_samples=num_samples).pitches
    return pitches, contour.duration

extract_song_contour('../datasets/densmore-choctaw/data/choct01.krn')

(array([61., 58., 58., 61., 61., 61., 58., 61., 58., 56., 57., 56., 56.,
        56., 56., 56., 56., 56., 56., 56., 56., 63., 58., 58., 61., 61.,
        58., 61., 58., 56., 56., 58., 58., 68., 56., 56., 56., 56., 56.,
        56., 56., 61., 58., 58., 61., 61., 61., 58., 58., 56., 56., 56.,
        58., 56., 56., 56., 56., 56., 56., 56., 56., 56., 58., 58., 61.,
        61., 58., 56., 56., 56., 58., 58., 56., 56., 56., 56., 56., 56.,
        56., 56., 58., 58., 61., 61., 58., 56., 56., 56., 58., 58., 56.,
        56., 56., 56., 56., 56., 56., 56., 56., 56.]),
 61.0)

In [3]:
def get_song_contours(dataset_dir, max_songs=-1, num_samples=100):
    """Extract all contours from songs in a (Catafolk) dataset"""
    meta_fn = os.path.join(dataset_dir, 'index.csv')
    if os.path.exists(meta_fn):
        meta = pd.read_csv(meta_fn, index_col=0)
        if 'file_path' in meta.columns:
            filenames = meta['file_path'][:max_songs]
        else:
            filenames = meta['path'][:max_songs]
        paths = filenames.map(lambda p: os.path.join(dataset_dir, p))
    else:
        paths = (glob.glob(f'{dataset_dir}/**/*.krn')
                 + glob.glob(f'{dataset_dir}/*.krn')
                 + glob.glob(f'{dataset_dir}/**/*.gabc'))[:max_songs]
        ids = [os.path.basename(os.path.splitext(p)[0]) for p in paths]
        paths = {idx: p for idx, p in zip(ids, paths)}
        
    contours = []
    durations = []
    ids = []
    for idx, path in paths.items():
        try:
            contour, duration = extract_song_contour(path, num_samples=num_samples)
            ids.append(idx)
            durations.append([duration])
            contours.append(contour.astype(int))
        except:
            print(f'Error: could not extract contour from {path}')

    df = pd.DataFrame(np.concatenate([durations, contours], axis=1))
    df.columns = ['duration'] + list(range(num_samples))
    df.index = ids
    return df

get_song_contours(f'../datasets/han/', max_songs=3)

Unnamed: 0,duration,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
han0037,16.0,79.0,79.0,79.0,79.0,74.0,74.0,74.0,72.0,72.0,...,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0
han0751,24.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,71.0,71.0,...,66.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0,64.0
han0989,44.0,76.0,76.0,76.0,78.0,76.0,73.0,71.0,69.0,66.0,...,71.0,71.0,69.0,66.0,66.0,64.0,64.0,64.0,64.0,64.0


In [4]:
datasets_dir = '../datasets'
datasets = [
    'densmore-teton-sioux',
    'densmore-menominee',
    'densmore-nootka',
    'densmore-northern-ute',
    'densmore-maidu',
    'densmore-pawnee',
    'densmore-pueblo',
    'densmore-papago',
    'densmore-ojibway',
    'densmore-choctaw',
    'boehme-altdeutsches-liederbuch',
    'han',
    'shanxi',
    'natmin',
    'erk',
    'gregobase',
]

In [5]:
refresh = False
for dataset in datasets:
    output_fn = f'../data/song-contours/{dataset}.csv'
    if not os.path.exists(output_fn) or refresh:
        print(f'Starting with {dataset}')
        df = get_song_contours(
            os.path.join(datasets_dir, dataset),
            max_songs=-1, num_samples=100)
        df.to_csv(output_fn)
    else:
        print(f'Skpping {dataset}: already exists')

Starting with densmore-teton-sioux
Error: could not extract contour from ../datasets/densmore-teton-sioux/data/kern/sioux175.krn
Starting with densmore-menominee
Starting with densmore-nootka
Starting with densmore-northern-ute
Starting with densmore-maidu
Starting with densmore-pawnee
Starting with densmore-pueblo
Starting with densmore-papago
Starting with densmore-ojibway
Starting with densmore-choctaw
Error: could not extract contour from ../datasets/densmore-choctaw/data/choct02.krn
Starting with boehme-altdeutsches-liederbuch
Error: could not extract contour from ../datasets/boehme-altdeutsches-liederbuch/data/deut3939.krn
Error: could not extract contour from ../datasets/boehme-altdeutsches-liederbuch/data/deut4016.krn
Starting with han
Error: could not extract contour from ../datasets/han/han0586.krn
Error: could not extract contour from ../datasets/han/han0953.krn
Starting with shanxi
Starting with natmin
Starting with erk
Starting with gregobase
Error: could not extract conto

AxisError: axis 1 is out of bounds for array of dimension 1

## Phrases

Phrases are extracted using the code base in the `src` directory (similar to what was used in the [DLfM'20 paper](https://github.com/bacor/DLfM2020)). Running the `generate_data.sh` script regenerates all the phrase data.

## Motifs


We extract motifs from Gregorian chant. 
We use the exact same data that was used in our ISMIR2020 paper on mode classification.
This data can be downloaded from https://github.com/bacor/ISMIR2020/tree/master/data/run-0; please put it in `../datasets/cornelissen-etal-2020-run0/`

In [6]:
_VOLPIANO_TO_MIDI = {
    "8": 53, # F
    "9": 55, # G
    "a": 57,
    "y": 58, # B flat
    "b": 59,
    "c": 60,
    "d": 62,
    "w": 63, # E flat
    "e": 64,
    "f": 65,
    "g": 67,
    "h": 69,
    "i": 70, # B flat
    "j": 71,
    "k": 72, # C
    "l": 74,
    "x": 75, # E flat
    "m": 76,
    "n": 77,
    "o": 79,
    "p": 81,
    "z": 82, # B flat
    "q": 83, # B
    "r": 84, # C
    "s": 86,
    
    # Liquescents
    "(": 53,
    ")": 55,
    "A": 57,
    "B": 59,
    "C": 60,
    "D": 62,
    "E": 64,
    "F": 65,
    "G": 67,
    "H": 69,
    "J": 71,
    "K": 72, # C
    "L": 74,
    "M": 76,
    "N": 77,
    "O": 79,
    "P": 81,
    "Q": 83,
    "R": 84, # C
    "S": 86, # D
    
    # Naturals
    "Y": 59, # Natural at B
    "W": 64, # Natural at E
    "I": 71, # Natural at B
    "X": 76, # Natural at E
    "Z": 83,
}

def volpiano_to_midi(volpiano, fill_na=False, skip_accidentals=False):
    """
    Translates volpiano pitches to a list of midi pitches

    All non-note characters are ignored or filled with `None`, if `fill_na=True`
    Unless `skip_accidentals=True`, accidentals are converted to midi pitches
    as well. So an i (flat at the B) becomes 70, a B flat. Or a W (a natural at
    the E) becomes 64 (E).
    """
    accidentals = 'iwxyz' + 'IWXYZ'
    midi = []
    for char in volpiano:
        if skip_accidentals and char in accidentals:
            pass
        elif char in _VOLPIANO_TO_MIDI:
            midi.append(_VOLPIANO_TO_MIDI[char])
        elif fill_na:
            midi.append(None)
    return midi

In [7]:
def volpiano_to_contour(volpiano, num_samples=100):
    pitches = volpiano_to_midi(volpiano + volpiano[-1])
    xs = np.linspace(0, 1, len(pitches))
    func = scipy.interpolate.interp1d(xs, pitches, kind='previous')
    return func(np.linspace(0, 1, num_samples))

In [8]:
def extract_volpiano_contours(df, num_samples=100, normalize=True):
    contours = []
    motif_nums = []
    durations = []
    chant_ids = []
    
    for chant_id, volpiano in df.items():
        segments = volpiano.split()
        for i, segment in enumerate(segments):
            contour = volpiano_to_contour(segment, num_samples=num_samples)
            contours.append(contour.astype(int))
            chant_ids.append([chant_id])
            motif_nums.append([i+1])
            durations.append([len(segment)])
            
    df = pd.DataFrame(
        np.concatenate([chant_ids, motif_nums, durations, contours], axis=1))
    df.columns = ['chant_id', 'motif_num', 'duration'] + list(range(num_samples))
    return df
                   
# extract_volpiano_contours(pitches['syllables'][:200])

In [9]:
chants = pd.read_csv('../datasets/cornelissen-etal-2020-run-0/responsory/subset/train-chants.csv', index_col=0)
pitches = pd.read_csv('../datasets/cornelissen-etal-2020-run-0/responsory/subset/train-representation-pitch.csv', index_col=0)

In [10]:
for segmentation in ['neumes', 'syllables', 'words']:
    output_fn = f'../data/motif-contours/responsory-subset-{segmentation}.csv'
    if not os.path.exists(output_fn):
        print(f'Working on {segmentation}')
        df = extract_volpiano_contours(pitches[segmentation])
        df.to_csv(output_fn)
    else:
        print(f'Already exists: {segmentation}')

Already exists: neumes
Already exists: syllables
Already exists: words


## Random walks

### Poisson, contour-like

In [11]:
def random_contour(lam, num_samples=50, n=10, p=0.5):
    length = max(3, np.random.poisson(lam=lam))
    contour = [np.random.randint(60,85)]
    for i in range(1, length):
        step = np.random.binomial(n, p) - n*p
        step = min(max(-12, step), 12)
        if (contour[i-1] + step > 84) or (contour[i-1] + step < 60):
            contour.append(contour[i-1] - step)
        else:
            contour.append(contour[i-1] + step)
            
    contour.append(contour[-1])
    c = Contour(contour)
    return c.interpolate(num_samples).pitches, length

In [12]:
def generate_random_phrase_contours(lam, num_contours=5000, num_samples=100):
    contours = []
    durations = []
    for i in range(num_contours):
        contour, duration = random_contour(lam, num_samples=num_samples)
        contours.append(contour.astype(int))
        durations.append([duration])
        
    df = pd.DataFrame(np.concatenate([durations, contours], axis=1))
    df.columns = ['duration'] + list(range(num_samples))
    return df

# generate_random_phrase_contours(lam_phrases, num_contours=10)

In [13]:
np.random.seed(0)
lam_phrases = 12 
df = generate_random_phrase_contours(lam_phrases, num_contours=5000)
df.to_csv(f'../data/random-contours/random-lam-{lam_phrases}.csv')

In [14]:
np.random.seed(1)
df = generate_random_phrase_contours(50, num_contours=5000)
df.to_csv('../data/random-contours/random-lam-50.csv')

### Plain random walks

In [15]:
def uniform_random_walk(length=50, max_step=12, num_samples=100):
    contour = [np.random.randint(-max_step,max_step+1)]
    for i in range(1, length):
        step = np.random.randint(-max_step, max_step+1)
        contour.append(contour[i-1] + step)
        
    contour.append(contour[-1])
    c = Contour(contour)
    return c.interpolate(num_samples).pitches

In [16]:
def generate_random_walk_contours(num_contours=5000, length=50, **kwargs):
    contours = []
    for i in range(num_contours):
        contour = uniform_random_walk(length=length, **kwargs)
        contours.append(contour.astype(int))

    durations = np.array([[length] * num_contours]).T
    df = pd.DataFrame(contours)
    df['duration'] = length
    return df

# generate_random_walk_contours(num_contours=10)

In [17]:
np.random.seed(2)
df = generate_random_walk_contours(num_contours=5000, length=25)
df.to_csv(f'../data/random-contours/random-walk-25.csv')