## TODO
 - Convert chord letters to numerals based on home key
     - handle key changes?
 - Handle enharmonics
 - Parse through all songs that have data (refer to the CSV)

## Import modules

In [18]:
import numpy as np
import pandas as pd

## Define global variables

In [49]:
# dictionary of chord (key) to numeral (value)
CHORD_DICT = {
    'A':0,
    'A#':1,
    'Bb':1,
    'B':2,
    'Cb':2,
    'C':3,
    'C#':4,
    'Db':4,
    'D':5,
    'D#':6,
    'Eb':6,
    'E':7,
    'Fb':7,
    'F':8,
    'F#':9,
    'Gb':8,
    'G':10,
    'G#':11,
    'Ab':11,
}

## Define Data Parser Function

In [78]:
def parse_data(path):
    # initialize global lists
    all_chord = []
    all_quality = []
    # read chord file line by line
    with open(path) as fp:
        # iterate over every line
        for line in fp:
            # set new tonic, if applicable
            if "# tonic:" in line:
                tonic = CHORD_DICT[line.rsplit(" ")[-1][:-1]]
                print(tonic)
            # initialize line-specific lists
            chord_line = []
            quality_line = []
            # find all indices where there is a "|"
            idx_bar = [pos for pos, char in enumerate(line) if char == "|"]
            # iterate over bars
            for i in range(len(idx_bar)-1):
                # select a bar from the whole line
                bar = line[idx_bar[i]+1:idx_bar[i+1]]
                # find all indices where there is a ":"
                idx_chord = [pos for pos, char in enumerate(bar) if char == ":"]
                # iterate over chords in a bar
                for j in idx_chord:
                    # grab chord (convert to numeral relative to tonic)
                    chord = ""
                    offset = 1
                    my_char = bar[j-offset]
                    while my_char != " ":
                        chord += my_char
                        offset += 1
                        my_char = bar[j-offset]
                    chord_num = CHORD_DICT[chord[::-1]]
                    chord_rel = chord_num - tonic
                    if chord_rel < 0:
                        chord_line.append(chord_rel + 12)
                    else:
                        chord_line.append(chord_rel)

                    # grab chord quality
                    quality = ""
                    offset = 1
                    my_char = bar[j+offset]
                    while my_char != " ":
                        quality += my_char
                        offset += 1
                        my_char = bar[j+offset]
                    quality_line.append(quality)
            # append to global list
            if chord_line:
                all_chord.append(chord_line)
            if quality_line:
                all_quality.append(quality_line)
    
    return all_chord, all_quality

In [None]:
# a more elegant parser using built-in str functions such as rsplit
def parse_data_temp(path):

## Load in CSV file of all songs

In [4]:
# load CSV
songs_df = pd.read_csv("data/billboard-2.0-index.csv");

In [13]:
# filter out rows that have NaN in the "actual_rank" column
songs_filter_df = songs_df[songs_df.actual_rank.notnull()]

In [23]:
# get song indices
song_id = songs_filter_df.id.as_matrix()

## Make integer keys for all unique chords/qualities

In [26]:
all_chord = []
all_quality = []
# iterate through all songs
for song in song_id:
    chord, quality = parse_data(f"/Users/bradenyang/biof509/final-project/data/McGill-Billboard/{str(song).zfill(4)}/salami_chords.txt")
    for i in range(len(chord)):
        all_chord += chord[i]
        all_quality += quality[i]

chord_unique = set(all_chord)
quality_unique = set(all_quality)

## Scratch

In [21]:
my_str = "123.887596371	| G:maj | G:maj | C:maj | C:maj |, voice)"

In [28]:
idx = [pos for pos, char in enumerate(my_str) if char == "|"]

In [34]:
my_str[idx[0]+1:idx[1]]

2

In [43]:
foo = ""

In [45]:
foo += "s"

In [46]:
foo

's'

In [41]:
foo.append("t")

In [58]:
my_str = " A:sus4(b7,9) "

In [64]:
x = my_str.rsplit(":")

In [66]:
x[-1]

'sus4(b7,9) '

In [62]:
" 3 4 6 7".strip()

'3 4 6 7'

In [79]:
chord, quality = parse_data(f"/Users/bradenyang/biof509/final-project/data/McGill-Billboard/0260/salami_chords.txt")

5
1
5


In [77]:
chord

[[2, 2, 2, 2],
 [2, 2, 0, 0],
 [2, 2, 0, 0],
 [2, 7, 0, 0],
 [2, 7, 9, 9],
 [2, 2, 0, 0],
 [2, 2, 0, 0],
 [2, 7, 0, 0],
 [2, 7, 9, 9],
 [0, 7, 5, 0, 0, 7, 5, 0],
 [0, 7, 5, 0, 0, 7, 5],
 [9, 11, 11],
 [2, 7, 0, 0],
 [2, 7, 9, 9],
 [2, 7, 0, 0],
 [2, 7, 9, 9],
 [2, 7, 0, 0],
 [2, 7, 0, 0],
 [2, 7, 9, 9]]

In [74]:
print(my_str[:-1])

D
