In [10]:
import numpy as np
import pandas as pd
import re, os, random

# Part 2: Grammar Generation
This stage of the pipeline moves towards generation of "s-expression", which encode little snippets of style from the training data. This information is passed along to create the model.
## Note and Chord Functionality
### Note Definition and Functions
We define these as helper functions for later on.
Note that a note (ahaha) is a string with first character a capital letter from A-G, followed by an option '#' or 'b' for sharp/flat.

In [11]:
# defines numerical values for notes
notes = ['C', 'Db', 'D', 'Eb', 'E', 'F', 'Gb', 'G', 'Ab', 'A', 'Bb', 'B']
note_to_num = dict([[n, i] for i, n in enumerate(notes)])
num_to_note = dict([[v, k] for k, v in note_to_num.items()])
same_note = {'A#':'Bb', 'C#':'Db', 'D#':'Eb', 'F#': 'Gb', 'G#':'Ab'}
print(note_to_num)
print(num_to_note)

{'C': 0, 'Db': 1, 'D': 2, 'Eb': 3, 'E': 4, 'F': 5, 'Gb': 6, 'G': 7, 'Ab': 8, 'A': 9, 'Bb': 10, 'B': 11}
{0: 'C', 1: 'Db', 2: 'D', 3: 'Eb', 4: 'E', 5: 'F', 6: 'Gb', 7: 'G', 8: 'Ab', 9: 'A', 10: 'Bb', 11: 'B'}


In [12]:
# checks if a note is formatted correctly and splits it into its component parts
def split_note(note):
    assert re.fullmatch('[A-G][#|b]?[0-7]', note) is not None, 'Note not formatted correctly.'
    note, octave = note[:-1], int(note[-1])
    if note in same_note:
        note = same_note[note]
    return note, octave

# shifts the note by amount half-steps (possibly negative)
def shift_note(note, amount):
    note, octave = split_note(note)
    new_num = note_to_num[note] + amount
    if new_num > 11:
        octave += 1
    elif new_num < 0:
        octave -= 1
    return num_to_note[(new_num) % 12] + str(octave)

def get_root(chord):
    r = re.findall('^[A-G][#|b]?', chord) 
    assert r is not None, 'Chord does not contain root note'
    return r[0]

# output is positive if note2 is above noteorchord1, 0 if same
def find_note_dist(note_or_chord1, note2, chord=False):
    note1 = '%s0'%get_root(note_or_chord1) if chord else note_or_chord1
    note1, octave1 = split_note(note1)
    note2, octave2 = split_note(note2)
    dist = (octave2 - octave1) * 12 + note_to_num[note2] - note_to_num[note1]
    return dist % 12 if chord else dist

### Chord Dictionary and Type Categorization
Defines the half step numbers for each category for each chord type.

In [13]:
chord_dictionary = {
    "major": {"C": [0, 4, 7], "L": [2, 5, 9, 11]},
    "minor": {"C": [0, 3, 7], "L": [2, 5, 8, 10]},
#     "augmented": {"C": [0, 4, 8], "L": [2]},
    "diminished": {"C": [0, 3, 6, 9] ,  "L": [2]},
    "half-diminished": {"C": [0, 3, 6, 10], "L": [2]},
    "dominant-seventh": {"C": [0, 4, 7, 10], "L": [2, 5]}
}

extension_dictionary = {
    "major": {2: 2, 4: 5, 5: 7, 6: 9, 7: 11, 9: 2, 11: 5},
    "minor": {2: 2, 4: 5, 5: 7, 6: 8, 7: 10, 9: 2, 11: 5}
}

In [14]:
w = pd.read_csv('midi_to_csv/WebsterNight_chords.csv')
set(w['chord'])

{'A7', 'C7', 'D6', 'Dj7', 'E-7', 'Em7b5', 'F#-7', 'Fj7', 'Fo', 'G#m7b5', 'G-7'}

In [15]:
def find_chord_type(chord):    
    if "m7b5" in chord:
        return "half-diminished"
    elif "j7" in chord:
        return "dominant-seventh"
    elif "-7" in chord:
        return "minor" # TODO: figure out what this actually is
    elif "o" in chord:
        return "diminished"
    elif "m" in chord:
        return "minor"
    else:
        return "major"

In [16]:
def find_chord_modifiers(chord, chord_type):
    '''
    Currently this is not-so-good code - fix pls
    '''
    if chord_type not in extension_dictionary or '-' in chord: # bad bad
        return []
    root = get_root(chord)
    chord_desc = chord[len(root):]
    modifiers = [0]
    for c in chord_desc:
        if c in ['b', '#']:
            modifiers.append(c)
        elif c != 'm':
            offset = extension_dictionary[chord_type][int(c)]
            if modifiers[-1] == 'b':
                modifiers[-1] = offset - 1
            elif modifiers[-1] == '#':
                modifiers[-1] = offset + 1
            else:
                modifiers.append(offset)
    return modifiers[1:]

In [17]:
chord = 'Eb7b5'
find_chord_modifiers(chord, find_chord_type(chord))

[11, 6]

## Generating S-expressions
This requires a function to find the maximum and minimum slope between two notes, as well as a function to categorize a note given the chord it is played over.
### Finding Slope Bounds

In [18]:
def find_slope_bounds(lst):
    '''
    lst: list of full notes
    '''
    max_jump, min_jump = 0, 0
    for i in range(len(lst) - 1):
        max_jump = max(max_jump, find_note_dist(lst[i], lst[i+1]))
        min_jump = min(min_jump, find_note_dist(lst[i], lst[i+1]))
    return min_jump, max_jump

### Categorizing a Note

In [19]:
def categorize_note(note, chord, modifiers=True, helpful=0.0):
    root = get_root(chord)
    chord_type = find_chord_type(chord)
    dist = find_note_dist(root, note, True)
    if modifiers and dist in find_chord_modifiers(chord, chord_type):
        return 'C'
#     approach_flag = False
    for note_type, interval_list in chord_dictionary[chord_type].items():
        if dist in interval_list:
            if note_type in ['C', 'L'] and random.random() < helpful:
                return 'H'
            return note_type
#         if (dist + 1) % 12 in interval_list or (dist - 1) % 12 in interval_list:
#             approach_flag = True
    return 'X'

### Generation of S-expression String
S-expressions are in the form `"min_slope max_slope note_string|start_time|duration ..."`

**This is the main function that will be used from this section (calls the other two functions above)**

In [20]:
# Not the prettiest...
def create_s_exp(notes):
    # notes is list of tuples of (note_string, start_time, duration, chord)
    s_exp = ''
    notes_only = [chunk[0] for chunk in notes] # could probably just splice categories array
    categories = [categorize_note(chunk[0], chunk[3]) for chunk in notes]
    for i, (note, start, duration, chord) in enumerate(notes):
        category = categories[i]
        if i < len(notes) - 1 and category == 'X' and categories[i + 1] in ['C', 'L'] \
          and find_note_dist(notes_only[i], notes_only[i + 1]) in [-1, 1]:
            category = 'A'
        catcount[category] += 1
        s_exp += category + '|%.3f|%.3f '%(start % 1, duration)
    return '%d %d '%find_slope_bounds(notes_only) + s_exp.strip()

In [21]:
create_s_exp([('Ab5', 2.4, .25, 'Fm'), ('A4', 2.7, .1, 'C')])

'-11 0 C|0.400|0.250 L|0.700|0.100'

## Featurizing S-expressions
We currently use 6 different functions to featurize our s-expressions.

### 1. Number of Notes
Easy enough - take the length of the list of notes

In [22]:
len = len # :P

### 2. Location of the First Note
The fraction of how far into the measure the first note starts.

In [23]:
loc_first = lambda notes: notes[0][1] % 1

### 3. Total Duration of Rests
$1 - \sum\limits_{note\in measure} \text{duration}(note)$

In [24]:
tot_rests = lambda notes: 1 - sum([note[1] for note in notes])

### 4/5. Average Maximum Slope and Order of Contour
The average magnitude of each rising/falling run and how many times the slope changes direction, respectively.

In [25]:
def slope_process(notes):
    slopes = []
    ascending = True
    curr_max = 0
    last_note = notes[0][0]
    direction_changes = 0
    for term in notes[1:]:
        note = term[0]
        dist = find_note_dist(last_note, note)
        if dist > 0:
            if ascending:
                curr_max = max(curr_max, dist)
            else:
                ascending = True
                slopes.append(curr_max)
                curr_max = dist
                direction_changes += 1
        if dist < 0:
            if not ascending:
                curr_max = max(curr_max, -dist)
            else:
                ascending = False
                slopes.append(curr_max)
                curr_max = -dist
                direction_changes += 1
        last_note = note
    slopes.append(curr_max)
    return np.mean(slopes), direction_changes

avg_max_slope = lambda notes: slope_process(notes)[0]
order_contour = lambda notes: slope_process(notes)[1]

### 6. Consonance
Depends on the category of the notes - a higher value corresponds to more chord tones.

In [26]:
def consonance(s_exp):
    total = 0.0
    measure = s_exp.split(' ')[2:]
    
    weights = {'C': 0.8, 'L': 0.4, 'H': 0.6, 'X': 0.1, 'A': 0.4}
    for term in measure:
        note_info = term.split('|')
        note = note_info[0]
        duration = note_info[2]
        if note in weights:
            total += weights[note] * float(duration)
    return total

### Using all of the above
**This is the main function that will be used from this section (calls the other five functions above)**

In [27]:
def featurize(args):
    feature_funcs = [len, loc_first, tot_rests, avg_max_slope, order_contour, consonance]
    assert len(feature_funcs) == num_features, "Incorrect number of features"
    arg_num = [0, 0, 0, 0, 0, 1]
    features = {}
    for i, func in enumerate(feature_funcs):
        features[str(i)] = func(args[arg_num[i]])
    return features

## Processing a Solo MIDI
Returns two pandas DataFrames representing the s-expressions of this solo and their features.

In [28]:
def process_solo(filename, s_exp, features):
    measure = 0
    curr_s_exp = []
    song = pd.read_csv('%s/%s'%(start_directory, filename))
    for i in range(len(song)):
        curr_note = song.iloc[i]
        if measure != int(curr_note['start_time']):
            s = create_s_exp(curr_s_exp)
            row_s = {'exp': s, 'song_id': song_num, 'song_index': measure}
            s_exp = s_exp.append(row_s, ignore_index=True)
            row_f = featurize([curr_s_exp, s])
            features = features.append(row_f, ignore_index=True)
            curr_s_exp = []
            measure = int(curr_note['start_time'])
        curr_s_exp.append((curr_note['note_name'], curr_note['start_time'], curr_note['duration'], curr_note['chord']))
    # TODO: try to not have this tail of loop - just repeat of chunk of for-loop
    s = create_s_exp(curr_s_exp)
    row_s = {'exp': s, 'song_id': song_num, 'song_index': measure}
    s_exp = s_exp.append(row_s, ignore_index=True)
    row_f = featurize([curr_s_exp, s])
    features = features.append(row_f, ignore_index=True)
    return s_exp, features

## All together now...
It's all come down to this one for loop...

In [29]:
start_directory = 'midi_to_csv'
end_directory = 'test_files'

num_features = 6

In [30]:
catcount = {'C': 0, 'L': 0, 'H': 0, 'A': 0, 'X': 0}

In [31]:
s_exp = pd.DataFrame(columns=['exp', 'song_id', 'song_index'])
features = pd.DataFrame(columns=[str(i) for i in range(num_features)])
song_num = 0
for filename in os.listdir(start_directory):
    if filename.endswith('chords.csv'): # TODO: CHANGE THIS
        s_exp, features = process_solo(filename, s_exp, features)
        song_num += 1

In [32]:
catcount

{'A': 21, 'C': 174, 'H': 0, 'L': 80, 'X': 100}

In [33]:
b = {'A': 63, 'C': 174, 'H': 0, 'L': 80, 'X': 58}

In [34]:
a = {'A': 116, 'C': 170, 'H': 0, 'L': 84, 'X': 5}

In [35]:
s_exp.head()

Unnamed: 0,exp,song_id,song_index
0,0 4 X|0.125|0.155 X|0.290|0.725,0,0
1,0 0 L|0.678|0.340,0,1
2,-3 0 X|0.036|0.174 X|0.255|0.164 C|0.443|0.195...,0,2
3,0 0 C|0.004|0.243,0,3
4,-3 7 A|0.171|0.073 C|0.263|0.130 X|0.408|0.185...,0,4


In [36]:
features.head()

Unnamed: 0,0,1,2,3,4,5
0,2.0,0.125,0.584635,4.0,0.0,0.088
1,1.0,0.678385,-0.678385,0.0,0.0,0.136
2,5.0,0.036458,-11.315104,1.5,1.0,0.3913
3,1.0,0.003906,-2.003906,0.0,0.0,0.1944
4,7.0,0.170573,-31.157552,4.0,2.0,0.5759


In [37]:
s_exp.to_csv('%s/s_exp.csv'%end_directory)
features.to_csv('%s/s_exp_features.csv'%end_directory)