# Extract data

## Configuration

In [1]:
from music21 import converter
from collections import Counter
import sys
sys.path.append('../../')
import matplotlib.pyplot as plt
import numpy as np
import glob
import os
import glob
import pandas as pd
import h5py
import seaborn as sns

In [2]:
# The directory with all catafolk data. The path below is the default; set
# the CATAFOLK_DIR environment variable to point at another location.
CATAFOLK_DIR = os.environ.get('CATAFOLK_DIR', '/Users/Bas/surfdrive/catafolk')

# The directory with all Tsimané transcriptions (MusicXML files). Override
# the default with the TSIMANE_DIR environment variable.
TSIMANE_DIR = os.environ.get(
    'TSIMANE_DIR',
    '/Users/Bas/repos/papers/tsimane/transcriptions/musicxml',
)

# The main data file (HDF5) that the extracted data is written to. Override
# the default with the DATA_STORE environment variable.
DATA_STORE = os.environ.get('DATA_STORE', '../data/test.hdf5')
# DATA_STORE = '../data/data.hdf5'

In [3]:
# Build index of Tsimané files.
# The song ids are used as the row labels *and* kept in the 'index' column.
# Row labels matter because attribute access `tsimane_index.index` always
# returns the row labels (never a column called 'index'); without them the
# ids would silently be 0, 1, 2, ... The catafolk indices are read with
# `index_col=0`, so their ids are row labels as well.
# The paths are sorted so that the order does not depend on the file system.
tsimane_paths = sorted(glob.glob(f'{TSIMANE_DIR}/*.musicxml'))
tsimane_ids = ['tsimane-' + os.path.basename(p).split('.')[0] for p in tsimane_paths]
tsimane_index = pd.DataFrame(
    {'local_path': tsimane_paths, 'index': tsimane_ids},
    index=tsimane_ids,
)

## Helper functions

In [4]:
def get_step_curve(filename, include_repetitions=False):
    """Extract a melody from a score as a step curve of onsets and pitches.

    The file is parsed with `converter.parse` and the part with the largest
    number of distinct pitches is used as the melody.

    Parameters
    ----------
    filename : str
        Path of the score to parse.
    include_repetitions : bool, optional
        If True, every note of the selected part is kept. If False (default),
        a note is dropped when its pitch equals the pitch of the previously
        kept note, so only pitch changes remain.

    Returns
    -------
    numpy.ndarray
        Array with two rows: row 0 holds the note offsets (as floats), row 1
        the pitches (`pitch.ps` of every kept note).

    Raises
    ------
    ValueError
        If the score has no parts, or the selected part has no notes.

    Notes
    -----
    NOTE(review): every element of `part.flat.notes` must have a `.pitch`
    attribute; presumably chords do not, in which case parsing such a file
    raises -- confirm against music21.
    """
    stream = converter.parse(filename)
    unique_counts = [
        len({n.pitch.ps for n in part.flat.notes}) for part in stream.parts
    ]
    if len(unique_counts) == 0:
        raise ValueError(f'No parts found in {filename}')
    part = stream.parts[int(np.argmax(unique_counts))]

    # Start from an empty list: seeding it with the first note and then
    # looping over all notes would emit the first note twice whenever
    # repetitions are included.
    kept = []
    for note in part.flat.notes:
        if include_repetitions or not kept or note.pitch.ps != kept[-1].pitch.ps:
            kept.append(note)
    if not kept:
        raise ValueError(f'No notes found in {filename}')

    pitches = [n.pitch.ps for n in kept]
    onsets = [float(n.offset) for n in kept]
    return np.array([onsets, pitches])

# sioux_files = sorted(glob.glob(patterns['sioux']))
# ts, ps = get_step_curve(sioux_files[0], include_repetitions=True)
# plt.plot(ts, ps, '.-')
# ts, ps = get_step_curve(sioux_files[0])
# plt.plot(ts, ps, '.-')

In [5]:
def process_files(filenames, repeats=False, min_length=3):
    """Compute step-curve entries for a collection of score files.

    Every file is passed to `get_step_curve`. Files whose curve has
    `min_length` points or fewer are skipped silently; files that raise any
    exception are reported with `print` and skipped. The returned list can
    therefore be shorter than `filenames` and is not aligned with it.

    Parameters
    ----------
    filenames : iterable of str
        Paths of the files to process.
    repeats : bool, optional
        Passed on to `get_step_curve` as `include_repetitions`.
    min_length : int, optional
        A curve is only kept if it has more than this number of points.

    Returns
    -------
    list of dict
        One dict per kept file with the arrays 'onsets', 'pitches' and
        'intervals' (differences between successive pitches).
    """
    results = []
    for path in filenames:
        try:
            curve_onsets, curve_pitches = get_step_curve(
                path, include_repetitions=repeats)
            if len(curve_onsets) > min_length:
                steps = curve_pitches[1:] - curve_pitches[:-1]
                results.append({
                    'onsets': np.array(curve_onsets),
                    'pitches': np.array(curve_pitches),
                    'intervals': np.array(steps),
                })
        except Exception as err:
            print(path, err)
    return results

In [6]:
def load_corpus_index(corpus, catafolk_dir=CATAFOLK_DIR):
    """Load the index of a catafolk corpus and add local file paths.

    The index is read from `index.csv`, which is looked up first directly in
    `{catafolk_dir}/{corpus}` and then in its subdirectories (in sorted
    order). The first directory that contains an `index.csv` is used as the
    corpus directory, so the result does not depend on the order in which the
    file system lists entries.

    Parameters
    ----------
    corpus : str
        Name of the corpus directory inside `catafolk_dir`.
    catafolk_dir : str, optional
        Directory containing all catafolk corpora.

    Returns
    -------
    pandas.DataFrame
        The index (first CSV column used as row labels) with an extra
        'local_path' column: the corpus directory joined with the 'path'
        column, or with 'file_path' if there is no 'path' column.

    Raises
    ------
    FileNotFoundError
        If no `index.csv` is found for the corpus.
    KeyError
        If the index has neither a 'path' nor a 'file_path' column.
    """
    corpus_root = f'{catafolk_dir}/{corpus}'
    subdirs = sorted(p for p in glob.glob(f'{corpus_root}/*') if os.path.isdir(p))
    corpus_dir = next(
        (d for d in [corpus_root] + subdirs if os.path.isfile(f'{d}/index.csv')),
        None,
    )
    if corpus_dir is None:
        raise FileNotFoundError(
            f'No index.csv found for corpus {corpus!r} in {corpus_root}')

    index = pd.read_csv(f'{corpus_dir}/index.csv', index_col=0)
    if 'path' in index.columns:
        col = 'path'
    elif 'file_path' in index.columns:
        col = 'file_path'
    else:
        raise KeyError(
            f"Index of corpus {corpus!r} has no 'path' or 'file_path' column")
    index['local_path'] = [f'{corpus_dir}/{fp}' for fp in index[col]]
    return index

In [7]:
def generate_corpus_data(corpus, refresh=False, repeats=False):
    """Extract the step curves of a corpus and store them in DATA_STORE.

    The data is written below `{corpus}/with-repeats` or
    `{corpus}/without-repeats`, depending on `repeats`:

    - `entries/{id}/onsets` and `entries/{id}/pitches` for every song;
    - `onsets` and `pitches`: all songs concatenated, with a NaN after every
      song to mark its end.

    Parameters
    ----------
    corpus : str
        Corpus name; 'tsimane' uses `tsimane_index`, any other name is loaded
        with `load_corpus_index`.
    refresh : bool, optional
        If False (default) and the data already exists, nothing is done. If
        True, existing data is deleted and regenerated.
    repeats : bool, optional
        Whether repeated notes are kept; passed on to `process_files`.
    """
    with h5py.File(DATA_STORE, "a") as file:
        # No trailing slash, so the dataset paths below contain no '//'.
        root = f"{corpus}/{'with' if repeats else 'without'}-repeats"

        if root in file:
            if not refresh:
                print('>  Skipping', corpus)
                return
            del file[root]

        if corpus == 'tsimane':
            index = tsimane_index
        else:
            index = load_corpus_index(corpus)

        # process_files drops files that fail or are too short, so its output
        # is not aligned with its input. Process one file at a time to keep
        # every entry paired with the id of the file it came from.
        entries = []
        for entry_id, path in zip(index.index, index['local_path']):
            processed = process_files([path], repeats=repeats)
            if processed:
                entries.append((entry_id, processed[0]))

        all_pitches = []
        all_onsets = []
        for entry_id, entry in entries:
            pitches = entry['pitches']
            onsets = entry['onsets']

            # Store in database
            file.create_dataset(f"{root}/entries/{entry_id}/onsets", data=onsets)
            file.create_dataset(f"{root}/entries/{entry_id}/pitches", data=pitches)

            # Append to seq of all pitches/onsets; mark end of the song by NAN
            all_pitches.extend(pitches)
            all_onsets.extend(onsets)
            all_pitches.append(np.nan)
            all_onsets.append(np.nan)

        file.create_dataset(f"{root}/onsets", data=np.array(all_onsets))
        file.create_dataset(f"{root}/pitches", data=np.array(all_pitches))

# generate_corpus_data('creighton-nova-scotia', refresh=True)
# generate_corpus_data('creighton-nova-scotia', refresh=True, repeats=True)

In [8]:
def remove_corpus(corpus, data_store=None):
    """Delete all stored data of a corpus from the data file.

    Parameters
    ----------
    corpus : str
        Name of the top-level group to delete.
    data_store : str, optional
        Path of the HDF5 file. Defaults to DATA_STORE, the file that
        `generate_corpus_data` writes to.
    """
    if data_store is None:
        data_store = DATA_STORE
    # Opening a missing file in append mode would create an empty one.
    if not os.path.exists(data_store):
        return
    with h5py.File(data_store, "a") as file:
        if corpus in file:
            del file[corpus]

# remove_corpus('densmore-papago')

## Extract data

In [9]:
# Names of the catafolk corpora to extract. Only the uncommented names are
# part of the list; the commented-out names are kept for reference.
kern_corpora = [
    # "boehme-altdeutsches-liederbuch",
    # "boehme-volksthumliche-lieder",
    "creighton-nova-scotia",
    # "densmore-choctaw",
    # "densmore-maidu",
    # "densmore-menominee",
    # "densmore-nootka",
    # "densmore-northern-ute",
    # "densmore-ojibway",

    # "densmore-pawnee",
    # "densmore-pueblo",
    # "densmore-teton-sioux",
    # "erk-deutscher-liederhort",
    # "essen-china-han",
    # "essen-china-natmin",
    # "essen-china-shanxi",
    # "essen-china-xinhua",
    # "pinck-verklingende-weisen",
    # "sagrillo-ireland",
    # "sagrillo-luxembourg",
    # "haydn-scottish-songs",
    # #
    # # "bronson-child-ballads",
    # # "densmore-papago",
    # # "natural-history-of-song",
    # # "finnish-folk-tunes",
]

In [10]:
# Extract the Tsimané transcriptions twice: first keeping repeated notes,
# then without them.
for with_repeats in (True, False):
    generate_corpus_data('tsimane', repeats=with_repeats)

In [11]:
# Extract every selected corpus twice: first keeping repeated notes, then
# without them. The corpus name is printed as a progress indicator.
for corpus in kern_corpora:
    print(corpus)
    for with_repeats in (True, False):
        generate_corpus_data(corpus, repeats=with_repeats)

creighton-nova-scotia
