In [17]:
%load_ext autoreload
%autoreload 2
from jh import *
from tools.ms3 import Score # MuseScore 3 parser

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Configuration

In [23]:
# Directory that the cached TSV dataframes are written to and read from.
data_tsv = 'data/tsv/'
# Directory that contains the file list and the MuseScore 3 files.
data_ms3 = 'data/MuseScore_3'
# If False, the computed data is loaded from the cached dataframes instead of being recomputed.
compute_anew = False

# Parsing the data set

The MuseScore 3 parser is based on the `BeautifulSoup` library. It is invoked like this:

    score = Score('filename.mscx')
    
* **ll. 1303-23** First, it stores the references to the most important structural nodes such as &lt;staff&gt; and &lt;measure&gt;,  creates the measure counts (`mc`) starting at 0, and performs some basic checks. 
* **ll. 1325-1444** Then, the most important structural content of all measure nodes is stored in one dataframe per staff (dict `self.mc_info`) to then be merged into the final **measure list** called `self.info`.
* **ll. 1447-52** From there, measure numbers as displayed in MuseScore can be calculated,
* **ll. 1455-1538** as well as the section structure which I have determined by repeat signs and double bar lines: Internally, for every section the parser creates and stores a `Section` object containing an individual note list and other information.
* **ll. 1540-1668** Finally, the measure list is enriched with section information and error checks are performed.

Afterwards, the measure list can be directly accessed as the property `score.info`, the section objects can be addressed through the dictionary `score.sections` and - most importantly - note lists are conveniently obtained using the `score.get_notes()` interface.

## Loading the file list

In [10]:
# Path of the tab-separated file list inside the MuseScore 3 directory.
merged_ids = os.path.join(data_ms3, 'merged_ids.tsv')
# The first column of the file list becomes the index of `files`.
files = pd.read_csv(
    merged_ids,
    sep='\t',
    index_col=0,
)
files.head(3)

Unnamed: 0_level_0,D,no,dance,path
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,41,1,menuett,041/D041menuett01a.mscx
2,41,1,trio,041/D041trio01b.mscx
3,41,2,menuett,041/D041menuett02a.mscx


## Parsing the files and concatenating their measure lists and note lists

In [24]:
# Either parse all scores and cache the concatenated tables as TSV files, or load the
# cached tables. Both branches define `note_list`, `measure_list` and `section_order`.
if compute_anew:
    # store the parsed scores in memory (one Score object per file, in a column named 'object')
    parse_score = lambda path: Score(os.path.join(data_ms3, path))
    score_objects = pd.DataFrame(files.path.apply(parse_score))\
                    .rename(columns={'path': 'object'})
    # use get_notes() on every object
    note_list = score_objects.groupby('id').apply(lambda df: df.iloc[0,0].get_notes(volta_warning=False))
    note_list.to_csv(os.path.join(data_tsv, 'note_list_complete.tsv'), sep='\t')
    # retrieve measure list from every object
    measure_list = score_objects.groupby('id').apply(lambda df: df.iloc[0,0].info)\
                                .astype({'section': int,
                                         'keysig': int,
                                         'voices': int,
                                         'volta': 'Int64',
                                         'numbering_offset': 'Int64',
                                         'dont_count': 'Int64'})
    measure_list.to_csv(os.path.join(data_tsv, 'measure_list_complete.tsv'), sep='\t')
    # retrieve section order from every object, e.g. [0,0,1,1] in the case of two repeated sections
    # (the column is still called 'object' here, so the cached file keeps that column name)
    section_order = score_objects.applymap(lambda x: x.section_order)
    section_order.to_csv(os.path.join(data_tsv, 'section_order_complete.tsv'), sep='\t')
else:
    note_list = read_note_list(os.path.join(data_tsv, 'note_list_complete.tsv'), index_col=[0,1,2])
    measure_list = read_measure_list(os.path.join(data_tsv, 'measure_list_complete.tsv'), index_col=[0,1])
    section_order = pd.read_csv(os.path.join(data_tsv, 'section_order_complete.tsv'), sep='\t', index_col = [0])
# Both branches deliver the column as 'object'; expose it as 'sections' in either case.
# (Previously the compute branch renamed 'path', which no longer existed, so the column
# name differed between the two branches.)
# NOTE(review): values read back from the TSV are plain strings unless converted -
# confirm that downstream code does not rely on list values from the compute branch.
section_order = section_order.rename(columns={'object': 'sections'})
note_list.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mc,mn,onset,duration,gracenote,nominal_duration,scalar,tied,tpc,midi,staff,voice,volta
id,section,ix,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,0,0,0,0,0,3/16,,1/8,3/2,,3,57,1,1,
1,0,1,0,0,0,3/16,,1/8,3/2,,-1,65,1,1,
1,0,2,0,0,3/16,1/16,,1/16,1,,3,57,1,1,


In [21]:
# Preview the first three rows of the measure list.
measure_list.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,section,keysig,timesig,duration,act_dur,offset,voices,repeats,volta,barline,numbering_offset,dont_count,mn,next
id,mc,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,0,0,-1,3/4,3/4,1/4,1/2,2,firstMeasure,,,,1.0,0,[1]
1,1,0,-1,3/4,3/4,3/4,0,3,,,,,,1,[2]
1,2,0,-1,3/4,3/4,3/4,0,2,,,,,,2,[3]


In [25]:
# Preview the first three rows of the section order table.
section_order.head(3)

Unnamed: 0_level_0,sections
id,Unnamed: 1_level_1
1,"[0, 0, 1, 1]"
2,"[0, 0, 1, 1]"
3,"[0, 0, 1, 1]"


In [2]:
# The transposed note list is needed by both branches and is not created anywhere in the
# visible part of this notebook, so it is always loaded from its TSV file. (Previously it
# was only loaded in the `else` branch, so the compute branch failed on a fresh kernel.)
transposed = read_note_list(os.path.join(data_tsv,'transposed_schubert.tsv'), index_col=[0,1,2], converters={'beatsize': frac, 'subbeat': frac})
if compute_anew:
    # Bass notes taken from staff 2, then chord boundaries added per piece.
    bass = apply_to_pieces(bass_per_beat, transposed[transposed.staff==2], resolution=1/8)
    bass = apply_to_pieces(add_chord_boundaries, bass, measure_list, next_ids='segment_id', multiple_pieces=True)
    bass.to_csv(os.path.join(data_tsv,'schubert_bass.tsv'), sep='\t')
    # Collect the notes belonging to every segment.
    schubert_segments =  apply_to_pieces(all_chord_notes, bass, transposed, by='segment_id', multiple_pieces=True)
    schubert_segments.to_csv(os.path.join(data_tsv,'schubert_segmented.tsv'), sep='\t')
    # One row of summarized features per (id, segment_id).
    segment_features = schubert_segments.groupby(level=['id','segment_id']).apply(summarize_ints)
    segment_features = pd.DataFrame(segment_features).unstack().droplevel(0, axis=1)
    segment_features.to_csv(os.path.join(data_tsv,'schubert_segment_features.tsv'), sep='\t')
else:
    bass = read_note_list(os.path.join(data_tsv,'schubert_bass.tsv'), index_col=[0,1,2,3], dtypes={'next_id': 'Int64'}, converters={'beatsize': frac, 'subbeat': frac, 'onset_next': frac, })
    schubert_segments = read_note_list(os.path.join(data_tsv,'schubert_segmented.tsv'), index_col=[0,1,2,3])
    # The 'intervals' column is stored as the text of a tuple; the converter strips the
    # parentheses and quotes and rebuilds a tuple of strings.
    segment_features = pd.read_csv(os.path.join(data_tsv,'schubert_segment_features.tsv'), sep='\t', index_col=[0,1],
                             converters={'intervals': lambda t: tuple(i.strip("\',") for i in t.strip("() ").split(", ") if i != '')})

  interactivity=interactivity, compiler=compiler, result=result)


## Onset patterns

In [3]:
# Drop the rows whose volta is 1 and the rows in measure number 0.
df = transposed[(transposed.volta != 1) & (transposed.mn != 0)]
# Split by staff: staff 1 is treated as the right hand, staff 2 as the left hand.
right = df[df.staff == 1]
left = df[df.staff == 2]
# Number of distinct measure numbers per piece.
n_measures_per_piece = df.groupby(['id']).mn.nunique()

# One onset pattern per (piece, measure), followed by the list of patterns selected with
# occurring_in_min=3 - first for both staves together, then for each staff separately.
onset_patterns = df.groupby(['id', 'mn']).apply(os_pattern)
pattern_list = get_pattern_list(onset_patterns, occurring_in_min=3)

onset_patterns_left = left.groupby(['id', 'mn']).apply(os_pattern)
pattern_list_left = get_pattern_list(onset_patterns_left, occurring_in_min=3)

onset_patterns_right = right.groupby(['id', 'mn']).apply(os_pattern)
pattern_list_right = get_pattern_list(onset_patterns_right, occurring_in_min=3)

In [7]:
# Write one feature table per selection (both staves, left staff, right staff), keeping
# only the measures whose pattern occurs in the corresponding pattern list.
# The target directory is `data_tsv` from the configuration cell; the previously used
# name `data` is not defined anywhere in the visible notebook.
create_os_features(onset_patterns[onset_patterns.isin(pattern_list.index)], n_measures_per_piece)\
    .to_csv(os.path.join(data_tsv, 'os_patterns.tsv'), sep='\t')
create_os_features(onset_patterns_left[onset_patterns_left.isin(pattern_list_left.index)], n_measures_per_piece)\
    .to_csv(os.path.join(data_tsv, 'os_patterns_left.tsv'), sep='\t')
create_os_features(onset_patterns_right[onset_patterns_right.isin(pattern_list_right.index)], n_measures_per_piece)\
    .to_csv(os.path.join(data_tsv, 'os_patterns_right.tsv'), sep='\t')

### Most stereotypical piece in double meter

In [None]:
# Onset patterns of the double-meter pieces: overall and for the two staff selections.
# The helpers are called without the `jh.` prefix because the notebook imports them via
# `from jh import *`, which does not bind the name `jh`.
# NOTE(review): `mask` and `double_meter` are not defined in the visible part of this
# notebook, and `right`/`left` are assigned as dataframes (not boolean masks) in the
# onset-pattern cell - confirm where these selectors should come from before running.
dm_pieces = note_list[mask & double_meter]
n_measures_per_dm_piece = dm_pieces.groupby(['id']).mn.nunique()
osp_double = dm_pieces.groupby(['id', 'mn']).apply(os_pattern)
osp_double_r = note_list[mask & double_meter & right].groupby(['id', 'mn']).apply(os_pattern)
osp_double_l = note_list[mask & double_meter & left].groupby(['id', 'mn']).apply(os_pattern)
# Pattern lists requested with n_most_frequent=15 and normalize=True, shown side by side.
df0 = get_pattern_list(osp_double, n_most_frequent=15, normalize=True)
df1 = get_pattern_list(osp_double_r, n_most_frequent=15, normalize=True)
df2 = get_pattern_list(osp_double_l, n_most_frequent=15, normalize=True)
display_side_by_side({'overall': df0,'right': df1, 'left': df2})

In [None]:
# Per piece: number of measures whose pattern in `osp_double_r` is 'TitiTiti', divided by
# the piece's number of measures; the ten largest ratios are displayed.
typical_dm_right = osp_double_r[osp_double_r.eq('TitiTiti')].groupby('id').count()
typical_dm_right.div(n_measures_per_dm_piece.loc[typical_dm_right.index])\
    .sort_values(ascending=False)\
    .head(10)

In [None]:
# Per piece: number of measures whose pattern in `osp_double_l` is 'TaTa', divided by the
# piece's number of measures. Only a hard-coded list of piece ids is looked up (ids missing
# from the counts are skipped), then sorted in descending order.
# NOTE(review): the ids are presumably the pieces ranked highest in the previous cell - confirm.
typical_dm_left = osp_double_l[osp_double_l.eq('TaTa')].groupby('id').count()
typical_dm_left.div(n_measures_per_dm_piece.loc[typical_dm_left.index])\
    .loc[[i for i in [232, 135, 426, 279, 240, 424, 241, 257, 91, 136] if i in typical_dm_left]]\
    .sort_values(ascending=False)

### Most stereotypical piece in triple meter

In [None]:
# Onset patterns of the triple-meter pieces: overall and for the two staff selections.
# The helpers are called without the `jh.` prefix because the notebook imports them via
# `from jh import *`, which does not bind the name `jh`.
# NOTE(review): `mask` and `triple_meter` are not defined in the visible part of this
# notebook, and `right`/`left` are assigned as dataframes (not boolean masks) in the
# onset-pattern cell - confirm where these selectors should come from before running.
tm_pieces = note_list[mask & triple_meter]
n_measures_per_tm_piece = tm_pieces.groupby(['id']).mn.nunique()
osp_triple = tm_pieces.groupby(['id', 'mn']).apply(os_pattern)
osp_triple_r = note_list[mask & triple_meter & right].groupby(['id', 'mn']).apply(os_pattern)
osp_triple_l = note_list[mask & triple_meter & left].groupby(['id', 'mn']).apply(os_pattern)
# Pattern lists requested with n_most_frequent=15 and normalize=True, shown side by side.
df0 = get_pattern_list(osp_triple, n_most_frequent=15, normalize=True)
df1 = get_pattern_list(osp_triple_r, n_most_frequent=15, normalize=True)
df2 = get_pattern_list(osp_triple_l, n_most_frequent=15, normalize=True)
display_side_by_side({'overall': df0,'right': df1, 'left': df2})

In [None]:
# Per piece: number of measures whose pattern in `osp_triple_r` is 'TitiTitiTiti', divided
# by the piece's number of measures; the three largest ratios are displayed.
typical_tm_right = osp_triple_r[osp_triple_r.eq('TitiTitiTiti')].groupby('id').count()
typical_tm_right.div(n_measures_per_tm_piece.loc[typical_tm_right.index])\
    .sort_values(ascending=False)\
    .head(3)

In [None]:
# Per piece: number of measures whose pattern in `osp_triple_l` is 'TaTaTa', divided by the
# piece's number of measures. The ratios are sorted and then looked up for a hard-coded
# list of piece ids (ids missing from the counts are skipped), so the displayed rows
# follow the order of that list.
# NOTE(review): the ids are presumably the three pieces shown by the previous cell - confirm.
typical_tm_left = osp_triple_l[osp_triple_l.eq('TaTaTa')].groupby('id').count()
typical_tm_left.div(n_measures_per_tm_piece.loc[typical_tm_left.index])\
    .sort_values(ascending=False)\
    .loc[[i for i in [60, 163, 269] if i in typical_tm_left]]

In [None]:
# Enrich the segment features: `add_previous_ix` is applied per piece, then
# `add_previous_vals` is applied to the result.
# NOTE(review): this reassigns `segment_features`, so re-running the cell applies the
# helpers to an already enriched frame - confirm that this is harmless.
segment_features = add_previous_vals(apply_to_pieces(add_previous_ix, segment_features))
# Exploratory look-ups: the 50 most frequent interval tuples, and the segments whose
# intervals are exactly ('P1', 'm3') with all columns containing missing values dropped.
segment_features.intervals.value_counts().head(50)
segment_features.loc[segment_features.intervals == ('P1', 'm3')].dropna(axis=1)
# Start with an empty label column.
segment_features['label'] = np.nan

def _applied(figure, tpc_offset, home_bass, minor=False):
    """Build a labeler for a chord `figure` that may be applied to another degree.

    The returned function takes a row `r` that has a `bass` note name.
    If `r.bass` equals `home_bass`, the plain `figure` is returned. Otherwise the
    label is ``f"{figure}/{degree}"`` with ``degree = tpc2rn(name2tpc(r.bass) + tpc_offset)``,
    lower-cased if `minor` is True.
    """
    def labeler(r):
        if r.bass == home_bass:
            return figure
        degree = tpc2rn(name2tpc(r.bass) + tpc_offset)
        return f"{figure}/{degree.lower() if minor else degree}"
    return labeler


# Maps a tuple of interval names (as stored in `segment_features.intervals`) to a function
# that turns a row with a `bass` note name into a chord label.
# Entries marked with ### carry the original author's marker.
type2label = {
    ('M3', 'P5'):             lambda r: name2rn(r.bass),
    ('M2', 'M3', 'P5'):       lambda r: name2rn(r.bass), ###
    ('M6', 'P4'):             _applied('V(64)', -1, 'G'),
    ('P5', 'm3'):             lambda r: name2rn(r.bass).lower(),
    ('M3', 'P5', 'm7'):       _applied('V7', -1, 'G'),
    ('M2', 'M3', 'P5', 'm7'): _applied('V7', -1, 'G'), ###
    ('M3', 'M6', 'P5', 'm7'): _applied('V7', -1, 'G'), ###
    ('M3', 'm7'):             _applied('V7', -1, 'G'),
    ('M2', 'M3', 'm7'):       _applied('V7', -1, 'G'),
    ('m3', 'm6'):             lambda r: f"{tpc2rn(name2tpc(r.bass)-4)}6",
    ('P4', 'm6'):             _applied('V(64)', -1, 'G', minor=True),
    #('m6'): lambda r: f"{tpc2rn(name2tpc(r.bass)-4)}6",
    ('M3', 'M6'):             lambda r: f"{tpc2rn(name2tpc(r.bass)-4).lower()}6",
    #('M6'): lambda r: f"{tpc2rn(name2tpc(r.bass)-4).lower()}6",
    ('D5', 'm3', 'm6'):       _applied('V65', -4, 'B'),
    ('D5', 'm6'):             _applied('V65', -4, 'B'),
    ('M6', 'P4', 'm3'):       _applied('V43', -2, 'D'),
    ('P4', 'm3'):             _applied('V43', -2, 'D'),
    ('A4', 'M2', 'M6'):       _applied('V2', +1, 'F'),
    ('A4', 'M2'):             _applied('V2', +1, 'F'),
    ('M3', 'M6', 'P5'):       _applied('ii65', +1, 'F'),
    # The bass name is converted with name2tpc() first, like in every other entry
    # (previously the note name was passed to tpc2rn() directly).
    ('M2', 'M7', 'P4', 'P5'): lambda r: f"{tpc2rn(name2tpc(r.bass))}(742)",
    ('D5', 'D7', 'm3'):       _applied('viio7', -5, 'B'),
    ('M6', 'm3'):             _applied('viio6', -2, 'D'),
}

def hard_labeling(segment_features):
    """Fill the `label` column of `segment_features` in place using `type2label`.

    For every interval tuple in `type2label`, the rows whose `intervals` value equals
    that tuple get the label that the associated function returns for the row.
    Nothing is returned.
    """
    for interval_type, labeler in type2label.items():
        matches = segment_features.intervals == interval_type
        segment_features.loc[matches, 'label'] = segment_features[matches].apply(labeler, axis=1)
# Label all segments whose interval tuple has an entry in `type2label`.
hard_labeling(segment_features)
# Exploratory look-up: first 50 rows of the segments stored under key 1.
schubert_segments.loc[1].head(50)
# Attach the segment-level features to every row of the segmented note list.
status = schubert_segments.join(
    segment_features[['bass', 'intervals', 'label', 'prev_ints', 'chord_length', 'offbeat']],
    on=['id', 'segment_id'],
)
# Exploratory look-ups on selected interval tuples (50 most frequent values each).
status[status.intervals == ('M2', 'M6', 'P4')].prev_ints.value_counts().head(50)
status[status.intervals == ('M3',)].chord_length.value_counts().head(50)
status[(status.intervals == ('M3',)) & status.offbeat].prev_ints.value_counts().head(50)

# Interval tuples collected under the name `underspecified`; every one maps to None.
# NOTE(review): ('D5', 'm6') also has an entry in `type2label` - confirm which one is intended.
underspecified = dict.fromkeys([
    (),
    ('M3',),
    ('P5',),
    ('m3',),
    ('P5', 'm7'),
    ('m7',),
    ('P4',),
    ('M2',),
    ('M6', 'P4', 'P5'),
    ('M7',),
    ('D5', 'm6'),
])



