**YOU CAN VIEW THIS NOTEBOOK INCLUDING THE GRAPHICS USING THIS LINK:** https://nbviewer.jupyter.org/github/DCMLab/schubert_dances/blob/master/schubert_project.ipynb

# Prerequisites
This is a tested setup using a new conda environment but of course you can install everything in your root/base environment.

## Update your managers

    conda update conda
    conda update conda-build
    python -m pip install --upgrade pip
    
### And make sure to have ipykernel installed in all environments of which you might want to use the kernel
    
## (Either) Create new environment with required packages
Here with the arbitrary name `schubert`.

    conda create -n schubert python=3.7 nb_conda_kernels jupyter
    conda activate schubert
    conda install -c plotly plotly-orca psutil requests
    python -m pip install cufflinks Beautifulsoup4 lxml scipy
    
## (Or) Install them in an existing environment
    conda install nb_conda_kernels jupyter
    conda install -c plotly plotly-orca psutil requests
    python -m pip install cufflinks Beautifulsoup4 lxml scipy
    
## (Optional) If you need Jupyter Lab

So far the notebook should run in Jupyter Notebook, but if, in addition, you want to use **Jupyter Lab**, you will need to follow these instructions taken from https://plot.ly/python/getting-started/#jupyterlab-support-python-35:

Install via `pip`:

    python -m pip install jupyterlab==1.2 "ipywidgets>=7.5"
    
or `conda`:

    conda install -c conda-forge jupyterlab=1.2
    conda install "ipywidgets=7.5"
    
Set system variable to avoid "JavaScript heap out of memory" errors during extension installation:

    # (OS X/Linux)
    export NODE_OPTIONS=--max-old-space-size=4096
    # (Windows)
    set NODE_OPTIONS=--max-old-space-size=4096

Then, run the following commands:

    jupyter labextension install @jupyter-widgets/jupyterlab-manager@1.1 --no-build
    jupyter labextension install jupyterlab-plotly@1.3.0 --no-build
    jupyter labextension install plotlywidget@1.3.0 --no-build
    jupyter lab build

Then you can unset the system variable again:

    # (OS X/Linux)
    unset NODE_OPTIONS
    # (Windows)
    set NODE_OPTIONS=

In [13]:
%load_ext autoreload
%autoreload 2
# NOTE(review): the star import hides where names come from. Later cells use `os`, `pd`, `np`,
# `logging`, `Score`, `frac`, `tpc2name`, `tpc2degree`, `tpc2pc`, `NL` and `plt` without any
# explicit import in this cell, so they presumably all come from tools.ms3 — confirm.
from tools.ms3 import *
# NOTE(review): later cells call `sp.stats.entropy` / `sp.stats.pearsonr`; a plain `import scipy`
# may not load the `stats` submodule on every SciPy version — TODO confirm.
import scipy as sp
from plotly.offline import iplot
import cufflinks as cf # for creating plots from pandas on the fly
import plotly.io as IO
cf.go_offline()
cf.set_config_file(theme='ggplot') # other themes: {'solar', 'pearl', 'white', 'ggplot'}

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Configuration 

In [14]:
# Master switch; documented as setting all 'compute_anew' flags to True.
compute_all = False

# Logging level of the root logger.
logging.getLogger().setLevel(logging.WARNING)

# Data folders: the root folder, the MuseScore 3 files and the TSV files.
data = 'data'
data_ms3 = os.path.join(data, 'MuseScore_3')
data_tsv = os.path.join(data, 'tsv')

# TODO: Pass selected features as kwargs to get_notes()
note_features = None

# To change the plot size for matplotlib in the entire notebook, uncomment:
#plt.rcParams['figure.figsize'] = [15,10]

# Standard plot size (in px) for storing plotly images to disk
# (the ones in the notebook adapt to the window size).
std_plotsize = (1100, 500)

# Helper functions

In [15]:
def plot(df, fname='test.png', kind='scatter', size=std_plotsize,  **kwargs):
    """Plot `df` via its `.iplot()` method, store the figure on disk and display it.

    Parameters
    ----------
    df : object exposing `.iplot()`
        The data to plot.
    fname : str
        Path of the image file handed to `plotly.io.write_image`.
    kind : str
        Plot kind passed on to `.iplot()`.
    size : sequence
        `size[0]` is used as width and `size[1]` as height of the stored image.
    **kwargs
        Passed on unchanged to `.iplot()`.
    """
    figure = df.iplot(asFigure=True, kind=kind, **kwargs)
    width = size[0]
    height = size[1]
    IO.write_image(figure, fname, width=width, height=height)
    iplot(figure)

In [16]:
def bag_of_notes(df, tpc='tpc'):
    """Summarize how often and for how long every value of the 'tpc' column occurs.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs to include the columns ['tpc', 'duration'].
    tpc : str
        Determines the row labels of the result. One of {'tpc', 'name', 'degree', 'pc'}
        or a '+'-separated combination such as 'tpc+pc' or 'name+degree+tpc'.
        In a combination, the first element is the main label and the others
        follow in parentheses. Unknown elements trigger a warning and are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per occurring 'tpc' value (sorted) with the columns
        'count_a' (number of rows), 'count_n' (share of all rows),
        'duration_a' (summed 'duration') and 'duration_n' (share of the total duration).
    """
    occurring = np.sort(df.tpc.unique())
    grouped = df.groupby('tpc')
    bag = pd.DataFrame(index=occurring, columns=['count_a', 'count_n', 'duration_a', 'duration_n'])
    bag['count_a'] = grouped.size()
    bag['count_n'] = bag['count_a'] / bag['count_a'].sum()
    bag['duration_a'] = grouped['duration'].sum().astype(float)
    bag['duration_n'] = (bag['duration_a'] / bag['duration_a'].sum()).astype(float)

    # Default labelling: keep the plain 'tpc' values as index.
    if tpc == 'tpc':
        return bag

    # The converters are wrapped in lambdas so that each helper is only looked up when requested.
    converters = {
        'tpc': lambda values: values,
        'name': lambda values: tpc2name(values),
        'degree': lambda values: tpc2degree(values),
        'pc': lambda values: tpc2pc(values),
    }
    note_names = []
    for part in tpc.split('+'):
        if part in converters:
            note_names.append(converters[part](occurring))
        else:
            logging.warning("Parameter tpc can only be {'tpc', 'name', 'degree', 'pc'} or a combination such as 'tpc+pc' or 'name+degree+tpc'.")

    # Nothing valid was requested: fall back to the existing index.
    if not note_names:
        note_names.append(bag.index)

    if len(note_names) == 1:
        bag.index = note_names[0]
    else:
        bag.index = [f"{t[0]} ({', '.join(str(e) for e in t[1:])})" for t in zip(*note_names)]
    return bag

# Loading the data
The dataframe `merged_ids` maps the 435 IDs to the files in the `data_ms3` folder. The different types of trios (trio I, trio II, alternative trio) are all replaced by the term 'trio'.

In [17]:
# Read the table that maps every ID to a file path relative to `data_ms3`.
merged_ids = os.path.join(data_ms3, 'merged_ids.tsv')
files = pd.read_csv(merged_ids, index_col=0, sep='\t')

# Every dance label containing 'trio' is replaced by the plain label 'trio'.
files.loc[files['dance'].str.contains('trio'), 'dance'] = 'trio'
files.head()

Unnamed: 0_level_0,D,no,dance,path
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,41,1,menuett,041/D041menuett01a.mscx
2,41,1,trio,041/D041trio01b.mscx
3,41,2,menuett,041/D041menuett02a.mscx
4,41,2,trio,041/D041trio02b.mscx
5,41,3,menuett,041/D041menuett03a.mscx


#### Accessibility
We check whether all files are accessible.

In [18]:
# Collect every listed path that does not point to an existing file below `data_ms3`.
missing = [
    rel_path
    for rel_path in files.path.values
    if not os.path.isfile(os.path.join(data_ms3, rel_path))
]
if not missing:
    print("All files found.")
else:
    print("These files are missing:\n" + '\n'.join(missing))

All files found.


## Computing or loading the DataFrame representation of the music
We will be working on the data set in the form of note_lists. These can be newly computed from the scores or loaded from the precomputed TSV.

In [97]:
# Recompute the TSV files from the scores if the global `compute_all` switch of the
# configuration cell is set; replace `False` by `True` to recompute in this cell only.
compute_anew = compute_all or False

if compute_anew:

    def parse_score(path):
        """Parse the score stored under `path` (relative to `data_ms3`)."""
        return Score(os.path.join(data_ms3, path))

    # One Score object per ID, stored in a single column called 'object'.
    score_objects = pd.DataFrame(files.path.apply(parse_score)).rename(columns={'path': 'object'})
    note_list = score_objects.groupby('id').apply(lambda df: df.iloc[0,0].get_notes())
    note_list.to_csv(os.path.join(data_tsv, 'note_list_complete.tsv'), sep='\t')
    measure_list = score_objects.groupby('id').apply(lambda df: df.iloc[0,0].info)\
                                .astype({'section': int, 'keysig': int, 'voices': int, 'volta': 'Int64', 'numbering_offset': 'Int64', 'dont_count': 'Int64'})
    measure_list.to_csv(os.path.join(data_tsv, 'measure_list_complete.tsv'), sep='\t')
    # The column of `score_objects` is called 'object' at this point (renamed above), so this is
    # the name that has to be mapped to 'sections', the column name used by the later cells.
    section_order = score_objects.applymap(lambda x: x.section_order).rename(columns={'object': 'sections'})
    section_order.to_csv(os.path.join(data_tsv, 'section_order_complete.tsv'), sep='\t')

else:
    # Load the precomputed tables; `frac` converts the stored values of the fraction columns.
    note_list = pd.read_csv(os.path.join(data_tsv, 'note_list_complete.tsv'), sep='\t', index_col=[0,1,2],
                            dtype={'tied': 'Int64', 'volta': 'Int64'},
                            converters={'onset':frac, 'duration':frac, 'nominal_duration':frac, 'scalar':frac})
    measure_list = pd.read_csv(os.path.join(data_tsv, 'measure_list_complete.tsv'), sep='\t', index_col=[0,1],
                               dtype={'volta': 'Int64', 'numbering_offset': 'Int64', 'dont_count': 'Int64'},
                               converters={'timesig': frac, 'duration': frac, 'act_dur': frac, 'offset': frac, 'next': lambda l: l.strip('[]').split(', ')})
    section_order = pd.read_csv(os.path.join(data_tsv, 'section_order_complete.tsv'), sep='\t', index_col = [0])

# Display the preview as cell output regardless of which branch was taken.
note_list.head()

# Basic statistics

### Pieces per dance type

The corpus contains 435 pieces. The different types are distributed as follows:
walzer       132
ländler       78
ecossaise     74
deutscher     71
trio          48
menuett       29
galopp         2
cotillon       1
Name: dance, dtype: int64


The corpus contains 435 pieces. The different types are distributed as follows:
walzer       132
ländler       78
ecossaise     74
deutscher     71
trio          48
menuett       29
galopp         2
cotillon       1
Name: dance, dtype: int64


In [151]:
# Number of pieces per dance type, most frequent first.
dance_types = files['dance'].value_counts()
print(f"The corpus contains {len(files.index)} pieces. The different types are distributed as follows:{NL}{dance_types}")
dance_types.iplot(kind='bar')

The corpus contains 435 pieces. The different types are distributed as follows:
walzer       132
ländler       78
ecossaise     74
deutscher     71
trio          48
menuett       29
galopp         2
cotillon       1
Name: dance, dtype: int64


### Sections

In [109]:
print('The vast majority of the dances is notated in two repeated sections, AABB.')
# Count how many pieces share each section order and plot the counts in descending order.
(
    section_order
    .groupby('sections')
    .size()
    .sort_values(ascending=False)
    .iplot(kind='bar')
)

The vast majority of the dances is notated in two repeated sections, AABB.


### Length of individual sections

In [153]:
print('Most sections are normatively 8 measures long, but there are also several 16 measure-long sections: it could be hypothesised that these sections are internally binary.')
# Number of rows of `measure_list` per (id, section) pair, shown as a normalized histogram.
(
    measure_list
    .groupby(['id', 'section'])
    .size()
    .iplot(kind='hist', bins=40, histnorm='probability')
)

Most sections are normatively 8 measures long, but there are also several 16 measure-long sections: it could be hypothesised that these sections are internally binary.


### Distribution of key signatures

In [152]:
# Normalized histogram of the 'keysig' column of the measure list.
measure_list['keysig'].iplot(
    kind='hist',
    histnorm='probability',
    yTitle='Fraction of bars with given key signature',
)

### Number and distribution of notes

In [21]:
# The first number counts all rows of `note_list`; the second one only the rows whose
# `gracenote` value is missing.
print(f"The data set has {len(note_list.index)} note events in total, and {len(note_list.index[note_list.gracenote.isna()])} notes without grace notes.")

The data set has 101810 note events in total, and 101231 notes without grace notes.


In [157]:
# Bag of notes over the whole corpus, labelled via the 'name' option of `bag_of_notes`.
tpc_distribution = bag_of_notes(note_list, tpc='name')

In [158]:
# Absolute counts and durations side by side.
tpc_distribution[['count_a', 'duration_a']].iplot(
    kind='bar',
    xTitle='Tonal Pitch Class',
    yTitle='Absolute note counts<br>Durations (in quarter beats)',
)

In [159]:
# Normalized counts and durations side by side.
tpc_distribution[['count_n', 'duration_n']].iplot(
    kind='bar',
    xTitle='Tonal Pitch Class',
    yTitle='Normalized note counts & durations',
)

The duration and count of tonal pitch-classes are analogously distributed over the corpus. This implies that the average duration of each note is roughly independent of its tonal pitch class.

In [174]:
# Summed duration divided by the number of occurrences, per row of `tpc_distribution`.
average_duration = (
    tpc_distribution['duration_a'] / tpc_distribution['count_a']
).to_frame('Average duration')

average_duration.iplot(kind='bar')

### Transposed to C major / A minor

In [28]:
# Add a column with the corresponding key signature for every note.
# `left_on` is used instead of `on` because recent pandas versions raise a MergeError when `on`
# is combined with `right_index=True`; the keys being matched are the same.
note_list_transposed = note_list.merge(measure_list['keysig'], left_on=['id', 'mc'], right_index=True)

# Subtract the key signature from the tonal pitch class (= transposition to C).
note_list_transposed['tpc'] -= note_list_transposed['keysig']

# Convert the key signature to a pitch class and decide whether the MIDI pitches are shifted
# downwards (pitch class <= 6, positive shift) or upwards (otherwise, negative shift via `% -12`).
midi_transposition = tpc2pc(note_list_transposed.keysig)\
                     .apply(lambda x: x if x <= 6 else x % -12)

# If the shift is 6, the direction depends on the key signature:
# for F# (positive key signature) shift downwards, for Gb (negative) shift upwards.
up_or_down = (midi_transposition == 6)
midi_transposition.loc[up_or_down] = note_list_transposed[up_or_down].keysig\
                                     .apply(lambda x: 6 if x > 0 else -6)

# Transpose the MIDI pitches.
note_list_transposed['midi'] -= midi_transposition
transposed_distribution = bag_of_notes(note_list_transposed, 'degree+name')

In [29]:
# Absolute counts and durations of the transposed notes.
transposed_distribution[['count_a', 'duration_a']].iplot(
    kind='bar',
    xTitle='Tonal Pitch Class',
    yTitle='Absolute note counts<br>Durations (in quarter beats)',
)

In [30]:
# Normalized counts and durations of the transposed notes.
transposed_distribution[['count_n', 'duration_n']].iplot(
    kind='bar',
    xTitle='Tonal Pitch Class',
    yTitle='Normalized note counts & durations',
)

**Comment** After shifting all major pieces to C and all minor pieces to A, the distribution reveals two things:
* As is typical for music where most of the chords are rooted on `I` (*tonic* chords) or on `V` (*dominant* chords), the note of the `V`th degree is even more frequent than the root of the key (`I`) because the note `V` appears in the chord on `I` **and** in the chord on `V`.
* Apparently, the majority of the dances are in major keys because otherwise we would expect higher ratios for `VI` (the minor root) and `#V` (its leading tone).

# Team Schubert: How plotting works

As you have seen, it is enough to put `.iplot()` behind your dataframe to plot it. First argument is `kind=`.

    transposed_distribution[['count_n', 'duration_n']]\
    .iplot('bar', xTitle='Tonal Pitch Class', yTitle='Normalized note counts & durations')
(This is the only page you need for Cufflinks documentation including examples: https://plot.ly/python/v3/ipython-notebooks/cufflinks/)

If you want to store the file to disk in a format of your choice, you can use my wrapper function `plot`, where `args` are the arguments used for `.iplot()` above:

    plot(df, filename, args)
So, in order to create the same plot as above while storing it as `test.pdf`, you can do:

In [16]:
# Same bar plot as above, additionally written to 'test.pdf'.
plot(
    transposed_distribution[['count_n', 'duration_n']],
    fname='test.pdf',
    kind='bar',
    xTitle='Tonal Pitch Class',
    yTitle='Normalized note counts & durations',
)

#### Or, if you want to have the same code in a more readable form:

In [17]:
# Collect the plot options first, then hand them to `plot` in one go.
df = transposed_distribution[['count_n', 'duration_n']]
kwargs = dict(
    kind='bar',
    xTitle='Tonal Pitch Class',
    yTitle='Normalized note counts & durations',
)
plot(df, 'test.pdf', **kwargs)

In [18]:
# NOTE(review): `DANCES`, `failed_parse` and `notes_bag` are not defined in the visible part of
# the notebook and the recorded output of this cell is a NameError — presumably legacy code; confirm.
# One value per entry of DANCES.index: NaN for entries listed in `failed_parse`, otherwise the
# entropy of the entry's bag of notes divided by log(35).
temp = []
for dance in DANCES.index:
    if dance not in failed_parse:
        notes = notes_bag(dance)
        temp.append(float(sp.stats.entropy(notes.dropna())/np.log(35)))
    else:
        temp.append(np.nan)

DANCES['entropy'] = temp

NameError: name 'DANCES' is not defined

In [None]:
# Preview of `DANCES` (first ten entries).
DANCES.head(10)

In [None]:
# Histogram of the 'entropy' column; the trailing semicolon suppresses the text output.
DANCES['entropy'].hist();

In [None]:
# Mean of the 'entropy' column per value of the 'Type' column.
DANCES.groupby('Type')['entropy'].mean()

In [None]:
def keysig(n):
    """Return `mc_info[0].keysig[0]` of the score parsed from `DANCES.File[n]`.

    Presumably this is the key signature at the beginning of the piece — TODO confirm
    against the `Score` class.
    """
    score = Score(DANCES.File[n])
    first_info = score.mc_info[0]
    return first_info.keysig[0]

In [None]:
# One key signature per entry of DANCES.index; NaN for entries listed in `failed_parse`.
temp = []
for i in DANCES.index:
    if i not in failed_parse:
        temp.append(keysig(i))
    else:
        temp.append(np.nan)

DANCES['Keysig'] = temp

In [None]:
# Pearson correlation between the 'entropy' and 'Keysig' columns, ignoring rows with missing values.
sp.stats.pearsonr(DANCES.dropna()['entropy'], DANCES.dropna()['Keysig'])

In [None]:
# Scatter plot of 'Keysig' (x) against 'entropy' (y), ignoring rows with missing values.
plt.scatter(DANCES.dropna()['Keysig'], DANCES.dropna()['entropy']);

In [None]:
def KK(mode, transposition):
    """Krumhansl and Kessler (1982) key profiles for major and minor modes.

    Parameters
    ----------
    mode : int
        0 selects the major profile, 1 the minor profile.
    transposition : int
        Number of positions by which the profile is rotated to the right, i.e. the
        profile's first value ends up at index `transposition`. Negative values rotate
        to the left. The value is taken modulo 12, so 13 gives the same result as 1.

    Returns
    -------
    list of float or None
        The rotated profile with 12 values. For any other `mode`, a hint is printed
        and None is returned.
    """
    major = [6.20, 2.55, 3.45, 2.85, 4.22, 4.57, 2.67, 5.25, 2.45, 3.35, 2.70, 2.70]
    minor = [6.03, 3.35, 3.67, 5.28, 2.58, 3.55, 2.87, 4.80, 4.35, 2.67, 2.50, 3.42]

    # Optional normalization, currently disabled:
    #major = list(map(lambda x: x/sum(major), major))
    #minor = list(map(lambda x: x/sum(minor), minor))

    if mode == 0:
        profile = major
    elif mode == 1:
        profile = minor
    else:
        print('0 = major, 1 = minor')
        return None

    # Without the modulo, slicing saturates for |transposition| > 12 and nothing is rotated.
    size = len(profile)
    split = size - transposition % size
    return profile[split:] + profile[:split]