## Helpers

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import scipy.io


In [2]:
def report_intervals(intervals, title):
    """Print a short summary of an array of intervals.

    The report lists the number of entries, the number of NaN entries, and
    the minimum, mean and maximum of the remaining (non-NaN) entries.

    Parameters
    ----------
    intervals : array-like of float
        Intervals to summarise; may contain NaN entries.
    title : str
        Heading printed above the summary.

    Notes
    -----
    The statistics ignore NaN entries: arrays saved by this notebook use NaN
    as a separator, and a plain ``np.min``/``np.mean``/``np.max`` would then
    report ``nan`` for all three. If there are no non-NaN values, ``n/a`` is
    printed instead of raising.
    """
    intervals = np.asarray(intervals, dtype=float)
    valid = intervals[~np.isnan(intervals)]

    pre = '>  '
    line = '—'*30
    print(f'{title}\n{line}')
    print(f'{pre}Length      {len(intervals)}')
    print(f'{pre}NA count    {np.isnan(intervals).sum()}')
    if valid.size > 0:
        print(f'{pre}Minimum     {np.min(valid):.2e}s')
        print(f'{pre}Mean        {np.mean(valid):.2e}s')
        print(f'{pre}Maximum     {np.max(valid):.2e}s')
    else:
        # Nothing to summarise: avoid np.min/np.max raising on an empty array
        print(f'{pre}Minimum     n/a')
        print(f'{pre}Mean        n/a')
        print(f'{pre}Maximum     n/a')
    print(f'{line}\n')

In [3]:
def save_intervals(intervals, group, name, report=False):
    """Save intervals as text to ``intervals/<group>/<name>.txt``.

    Parameters
    ----------
    intervals : array-like
        Values to store; written with ``np.savetxt``.
    group : str
        Sub-directory (possibly nested) of ``intervals/`` to write to.
        It is created if it does not exist yet.
    name : str
        File name without the ``.txt`` extension.
    report : bool, optional
        If True, print a summary of the intervals before saving.
    """
    if report:
        report_intervals(intervals, f'Saving {group}/{name}')
    path = os.path.join('intervals', group, f'{name}.txt')
    # exist_ok=True replaces the check-then-create pattern, which can fail
    # if the directory appears between the check and the makedirs call
    os.makedirs(os.path.dirname(path), exist_ok=True)
    np.savetxt(path, intervals)

In [4]:
def drop_na(x):
    """Return the entries of `x` that are not NaN, keeping their order."""
    is_missing = np.isnan(x)
    return x[~is_missing]

## IEMP Cuban Son and Salsa (IEMP-CSS) — Poole, Tarsitani & Clayton

**Download** the `annotations` directory from https://osf.io/sfxa2/ to `data/iemp-css`

The `IEMP-CSS` corpus is a collection of Cuban son and salsa recordings by the group Asere, a Havana-based group of seven musicians. The original recordings were made in 2009 by Adrian Poole, and are publicly available as part of the Interpersonal Entrainment in Music Performance (IEMP) corpus (https://osf.io/37fws/). Five songs were recorded:

In [5]:
# Metadata of the five recordings, one row per song, indexed by song id
css_columns = ['num', 'id', 'title', 'genre', 'clave', 'duration', 'stereo_mix']
css_rows = [
    (1, 'Song_1', 'El Cantante', 'Son', '2:3 son', "7'15", 'https://osf.io/tzhfd'),
    (2, 'Song_2', 'Palo Santo', 'Son', '3:2 son', "5'58", 'https://osf.io/z9uxs'),
    (3, 'Song_3', 'Yo Naci En Un Solar', 'Rumba/Salsa', '3:2 rumba', "7'45", 'https://osf.io/syu7b'),
    (4, 'Song_4', 'Tumbao Sangreao', 'Son', '2:3 son', "5'20", 'https://osf.io/cezr7'),
    (5, 'Song_5', 'Habanera', 'Son', '2:3 son', "6'53", 'https://osf.io/fe6bs'),
]
CSS_metadata = pd.DataFrame(css_rows, columns=css_columns).set_index('id')
CSS_metadata

Unnamed: 0_level_0,num,title,genre,clave,duration,stereo_mix
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Song_1,1,El Cantante,Son,2:3 son,7'15,https://osf.io/tzhfd
Song_2,2,Palo Santo,Son,3:2 son,5'58,https://osf.io/z9uxs
Song_3,3,Yo Naci En Un Solar,Rumba/Salsa,3:2 rumba,7'45,https://osf.io/syu7b
Song_4,4,Tumbao Sangreao,Son,2:3 son,5'20,https://osf.io/cezr7
Song_5,5,Habanera,Son,2:3 son,6'53,https://osf.io/fe6bs


For each song (see below), three annotations are provided: **metre annotations** "based on manually tapping the first beat of each cycle", **raw onset data** and **selected onset data**, "tabulated with metrical positions and other information". We use the raw onset data, which are split per instrument: "onset times for most instruments (vocals, crash cymbal and shakers were excluded) were extracted based on envelope characteristics using MIR Toolbox".

> Each CSS_[Recording]_Onsets_Raw.csv file contains the  raw onset data for all 
> instruments. In some cases these have been edited, so e.g. when the bongo is 
> played for half a track and the bell for the second half, only raw onsets from 
> the relevant time range are included in each column. Also, onset extraction of 
> the bell on song 5 uses different settings for different time ranges [...]
> Columns alternate between times and peak levels, e.g. Bass = time of bass 
> guitar onset, Bass peak = corresponding peak level

In [6]:
# Example: first onsets in Song 1 (it starts with a spoken introduction)
df = pd.read_csv('data/iemp-css/Song_1/CSS_Song1_Onsets_Raw.csv')
# Keep only the onset-time columns; the "... peak" columns are left out
time_columns = [col for col in df.columns if not col.endswith('peak')]
onsets = df[time_columns]
onsets.head()

Unnamed: 0,Bass,Clave,Guitar,Tres,Bongos,Bell,Cajon,Conga,Trumpet
0,42.05369,278.341708,43.45051,43.615348,65.688889,276.846238,40.449487,41.874961,65.972955
1,42.296115,278.842562,46.462723,46.442251,65.879865,277.028109,41.178212,42.746861,66.41885
2,42.535768,279.049702,49.274543,49.349568,66.104993,277.2032,41.358699,43.269066,66.871378
3,42.975715,279.57851,51.229845,50.983572,67.356844,277.289063,41.875127,44.703581,68.298741
4,43.247475,279.771593,51.710341,51.172598,67.559424,277.426148,42.571815,45.199161,68.518859


In [7]:
# Some instruments of course have fewer onsets: their columns are NaN in
# the last rows of the table, as the tail shows
onsets.tail()

Unnamed: 0,Bass,Clave,Guitar,Tres,Bongos,Bell,Cajon,Conga,Trumpet
1897,,,,,,,,431.554631,
1898,,,,,,,,431.733755,
1899,,,,,,,,432.199869,
1900,,,,,,,,433.13735,
1901,,,,,,,,433.270826,


In [24]:
def get_surface_intervals(onsets_df, min_diff=25/1000):
    """Combine onsets from all instruments, but discard those that are practically simultaneous.

    All values in `onsets_df` are pooled, NaN entries are removed and the
    remaining onsets are sorted. An onset is then dropped when it follows
    the onset directly before it in that sorted list by `min_diff` or less
    (the first onset is always kept). The intervals between consecutive
    remaining onsets are returned.

    Parameters
    ----------
    onsets_df : pandas.DataFrame
        Onset times; every value of every column is used. NaN marks
        missing entries.
    min_diff : float, optional
        Threshold below which two successive onsets count as simultaneous,
        in the same unit as the onset times.

    Returns
    -------
    numpy.ndarray
        Intervals between the retained onsets; empty if fewer than two
        onsets are retained.
    """
    onsets = np.ravel(onsets_df.values)
    # np.sort replaces np.msort, which was deprecated in NumPy 1.24 and
    # removed in NumPy 2.0
    surface = np.sort(onsets[~np.isnan(onsets)])
    if surface.size == 0:
        # No onsets at all: there are no intervals to report
        return np.array([], dtype=float)

    # Indices of onsets that are more than min_diff after their predecessor;
    # +1 because the comparison starts at the second onset
    not_simultaneous = np.where(surface[1:] > surface[:-1] + min_diff)[0] + 1
    not_simultaneous = np.r_[0, not_simultaneous]
    surface = surface[not_simultaneous]
    intervals = np.diff(surface)
    # Sanity check; skipped when a single onset leaves no intervals
    assert intervals.size == 0 or np.min(intervals) > min_diff
    return intervals

In [25]:
def IEMPCSS_extract_intervals_from_song(song_num):
    """Extract and save the inter-onset intervals of one IEMP-CSS song.

    Reads ``data/iemp-css/Song_<n>/CSS_Song<n>_Onsets_Raw.csv``, keeps the
    columns whose name does not end in ``peak`` and writes, to the group
    ``iemp_css/song_<n>``:

    * one file per instrument with the intervals between its successive
      onsets;
    * a ``-concatenated`` file with the intervals of all instruments one
      after the other, each instrument followed by a NaN separator;
    * a ``-surface`` file with the intervals of the combined onsets of all
      instruments (see ``get_surface_intervals``), followed by a NaN.

    Parameters
    ----------
    song_num : int
        Number of the song, used in the input and output file names.
    """
    df = pd.read_csv(f'data/iemp-css/Song_{song_num}/CSS_Song{song_num}_Onsets_Raw.csv')
    onsets_df = df[[c for c in df.columns if not c.endswith('peak')]]

    group = f'iemp_css/song_{song_num}'
    base = f'iemp_css-song_{song_num}'

    # Store instruments separately
    concatenated = []
    for instrument in onsets_df.columns:
        onsets = onsets_df[instrument].dropna().values
        intervals = np.diff(onsets)
        concatenated.extend(intervals.tolist())
        # NaN marks the boundary between two instruments
        concatenated.append(np.nan)
        save_intervals(intervals, group, f'{base}-{instrument.lower()}')

    # Concatenate all instruments
    save_intervals(np.array(concatenated), group, f'{base}-concatenated')

    # Surface rhythm: all onsets of all instruments combined
    surface = get_surface_intervals(onsets_df)
    # Trailing NaN so that songs can later be chained with a separator
    surface = np.r_[surface, np.nan]
    save_intervals(surface, group, f'{base}-surface')

for song_num in range(1, 6):
    IEMPCSS_extract_intervals_from_song(song_num)

In [30]:
# combine all songs
concatenated = []
surface = []
for song_num in range(1, 6):
    base = f'intervals/iemp_css/song_{song_num}/iemp_css-song_{song_num}'
    concat = np.loadtxt(f'{base}-concatenated.txt')
    surf = np.loadtxt(f'{base}-surface.txt')
    assert np.isnan(concat[-1]) and np.isnan(surf[-1])
    concatenated.extend(concat.tolist())
    surface.extend(surf.tolist())

save_intervals(np.array(concatenated), 'iemp_css', 'iemp_css-concatenated')
save_intervals(np.array(surface), 'iemp_css', 'iemp_css-surface')

## Roeske et al

**Instructions**: Download all the supplementary data from https://data.mendeley.com/datasets/s4cjj7h5sv/2 and move all files to a `data/roeske/` directory

**Note:** most of the text below is copy-pasted from the above data repository or other parts of their supplementary materials.

| Style | onsets | pieces/performances | solo/multi-instrumental |
|---|---|---|---|
| North Indian Raga | 86,822 | 8 | multi |
| Cuban Salsa | 38,056 | 40 | multi |
| Uruguayan Candombe | 47,998 | 39 | multi |
| Malian Jembe | 42,195 | 46 | multi |
| Tunisian Stambeli | 27,451 | 9 | multi |
| Persian Zarb | 32,416 | 64 | solo |
| Western piano / Bach | 260,653 | 156 | solo |

"The musical corpora include performances in which several instruments played simultaneously. In these cases, we analyzed each instrument separately, and aggregated the resulting two-interval ratios across instruments as a second step. For comparison, we also present instrument-combined interval data. We obtained these by extracting onsets from each instrument individually, but then combining these onsets."

"Overall the two types of extraction (separate for each instrument and simultaneous for combined ‘‘surface’’ rhythm) produced similar results (see Figure S1)."
*(BC: Well, if you compare them in S1, they are quite different: in 3 out of 5 corpora they note strong differences)*

*BC: It's not entirely clear to me how they combined/aggregated ratios across instruments. The paper suggests that the surface rhythm is all instruments overlaid (mixed, if you like). And then I guess the separate version just concatenates all instruments?*

### Cuban Salsa

**`IntervalData_CubanSalsa_corpus.mat`** contains two variables: 
1. `CubanSalsa_intervals_ms`: vector of onset-onset intervals, instruments separated;
2. `CubanSalsa_INSTRUMENTSCOMBINED_intervals_ms`: vector of onset-onset intervals for all instruments combined (for details, see file "METHODS4IntervalData_processing of musical corpora.pdf")

In [31]:
# Store both Cuban Salsa variables, converted from milliseconds to seconds
salsa = scipy.io.loadmat('data/roeske/IntervalData_CubanSalsa_corpus.mat')
salsa_outputs = [
    ('CubanSalsa_intervals_ms', 'cuban_salsa'),
    ('CubanSalsa_INSTRUMENTSCOMBINED_intervals_ms', 'cuban_salsa_combined'),
]
for variable, target in salsa_outputs:
    # The data is stored as a single column
    intervals_in_seconds = salsa[variable][:, 0] / 1000
    save_intervals(intervals_in_seconds, 'roeske', target)

In [None]:
2. "IntervalData_MalianJembe_corpus.mat": 
Malian Jembe, multi-instrument music, from IEMP corpus (https://osf.io/y5jxm/). 
Two variables: 
"MalianJembe_intervals_ms": onset-onset intervals, instruments separated;
"MalianJembe_INSTRUMENTSCOMBINED_intervals_ms": onset-onset intervals, instruments combined

3. "IntervalData_NorthIndianRaga_corpus.mat": 
North Indian (Hindustani) Raga, multi-instrument, original recordings from the IEMP corpus (https://osf.io/3cmg4/). 
Two variables: 
"NorthIndianRaga_intervals_ms": onset-onset intervals, instruments separated;
"NorthIndianRaga_INSTRUMENTSCOMBINED_intervals_ms": onset-onset intervals, instruments combined. 

4. "IntervalData_TunisianStambeli_corpus.mat": 
Tunisian Stambeli, multi-instrument music, original recordings from the IEMP corpus (https://osf.io/qaxdy/). 
Two variables: 
"TunisianStambeli_intervals_ms": onset-onset intervals, instruments separated;
"TunisianStambeli_INSTRUMENTSCOMBINED_intervals_ms": onset-onset intervals, instruments combined.

5. "IntervalData_UruguayanCandombe_corpus.mat": 
Uruguayan Candombe, multi-instrument music, original recordings from the IEMP corpus (https://osf.io/4q9g7/). 
Two variables: 
"UruguayanCandombe_intervals_ms": onset-onset intervals, instruments separated;
"UruguayanCandombe_INSTRUMENTSCOMBINED_intervals_ms": onset-onset intervals, instruments combined (see above).

6. "IntervalData_PianoBach_corpus.mat":
Western piano (Bach) solo performances, from MAESTRO dataset (https://arxiv.org/abs/1810.12247). 
One variable:
"PianoBach_intervals_ms": onset-onset intervals.

7. "IntervalData_PersianZarb_corpus.mat":
Zarb solo playing by 5 performers from commercial recordings.
One variable:
"PersianZarb_intervals_ms": vector of onset-onset intervals.

8. Folder "MatlabCode_RhythmVisualization": 
"plotRatioHist.m" plots histogram of dyadic interval ratios,
"plot_FlowerShapedRasterPlot.m" plots sorted raster plots of dyadic interval pairs.

In [None]:
# Music corpora
corpora = [
    'CubanSalsa',
    'MalianJembe',
    'NorthIndianRaga',
    'PersianZarb',
    'PianoBach',
    'TunisianStambeli',
    'UruguayanCandombe'
]
for corpus in corpora:
    mat = scipy.io.loadmat(f'data/IntervalData_{corpus}_corpus.mat')a
    
    DATA[corpus] = drop_na(mat[f'{corpus}_intervals_ms'][:,0])