## TODO
 - Handle repeated chords (*)
 - Analyze sparcity of chord qualities and extensions
     - if too sparce, then ignore extensions and simplify quality
 - 

## Import modules

In [51]:
import numpy as np
import pandas as pd

## Define global variables

In [52]:
# dictionary of chord (key) to numeral (value)
CHORD_DICT = {
    'A':0,
    'A#':1,
    'Bb':1,
    'B':2,
    'Cb':2,
    'C':3,
    'C#':4,
    'Db':4,
    'D':5,
    'D#':6,
    'Eb':6,
    'E':7,
    'Fb':7,
    'F':8,
    'F#':9,
    'Gb':8,
    'G':10,
    'G#':11,
    'Ab':11,
}

In [96]:
# absolute path to McGill Billboard data
mcgill_path = "/Users/bradenyang/biof509/final-project/data/McGill-Billboard"

## Define Data Parser Functions

In [95]:
def convert_numeral(chord,tonic):
    """
    Function convert_numeral converts a chord (string) into a
    numeric representation (int) that is relative to the tonic key (string)
    
    For example, if chord = "F" and tonic = "Bb", then the output would be
    the integer 7
    -----------------------
    Input
        chord: string of the chord to be converted to a numeral
        tonic: string of the tonic key
    Output
        numeral: int representation of "chord"
    """
    numeral = CHORD_DICT[chord] - CHORD_DICT[tonic]
    if numeral < 0:
        return numeral + 12
    else:
        return numeral

def parse_data(path):
    """
    Funtion parse_data parses text file provided from the McGill Billboard
    project and returns two lists of lists containing a sequence of chords
    and their qualities respectively
    -----------------------
    Input
        path: string containing absolute path to text file to parse
    
    Output
        all_chord: list containing chord sequences at each line of the text
            file
        all_quality: list containing corresponding chord qualities for the
            chords appearing in "all_chord"
    """
    # initialize global lists
    all_chord = []
    all_quality = []
    # iterate over every line
    for line in open(path):
        # set new tonic, if applicable
        if "# tonic:" in line:
            tonic = line.rsplit(" ")[-1][:-1]
        # initialize line-specific lists
        chord_line = []
        quality_line = []
        # split bars (separated by "|")
        bars = line.rsplit("|")[1:-1]
        # iterate over bars
        for bar in bars:
            # split chords+quality (separated by space)
            cqs = bar.rsplit(" ")[1:-1]
            # iterate over chords+quality
            for cq in cqs:
                # check for existence of ":"
                if ":" in cq:
                    # get chord, convert to relative numeral and append
                    chord_line.append(convert_numeral(cq.rsplit(":")[0],tonic))
                    # get quality and append
                    quality_line.append(cq.rsplit(":")[-1])
        # append line list to global list (ignore empty lines)
        if chord_line:
            all_chord.append(chord_line)
            all_quality.append(quality_line)

    return all_chord, all_quality

## Load in CSV file of metadata

In [69]:
# load CSV
songs_df = pd.read_csv("data/billboard-2.0-index.csv");

In [70]:
# filter out rows that have NaN in the "actual_rank" column
songs_filter_df = songs_df[songs_df.actual_rank.notnull()]

In [71]:
# get song indices
song_id = songs_filter_df.id.as_matrix()

## Parse all text files

In [104]:
# initialize dicts to keep track of global counts
num_chord = {}
num_quality = {}

# iterate through all songs
for song in song_id:
    # parse song
    all_chord, all_quality = parse_data(mcgill_path + f"/{str(song).zfill(4)}/salami_chords.txt")
    # iterate through lines in a song
    for (chord_line,quality_line) in zip(all_chord,all_quality):
        # iterate through individual chords
        for (chord,quality) in zip(chord_line,quality_line):
            # populate dicts
            if chord in num_chord:
                num_chord[chord] += 1
            else:
                num_chord[chord] = 1
                
            if quality in num_quality:
                num_quality[quality] += 1
            else:
                num_quality[quality] = 1

In [106]:
num_chord

{0: 42440,
 1: 765,
 2: 7006,
 3: 2603,
 4: 3641,
 5: 20280,
 6: 653,
 7: 18341,
 8: 3362,
 9: 6810,
 10: 5778,
 11: 628}

## Scratch

In [88]:
my_dict = {}

In [90]:
my_dict["foo"] = 1

In [91]:
my_dict["foo"] += 1

In [98]:
str(1).zfill(4)
zip()

'0001'

## Trash

In [76]:
# def parse_data(path):
#     # initialize global lists
#     all_chord = []
#     all_quality = []
#     # read chord file line by line
#     with open(path) as fp:
#         # iterate over every line
#         for line in fp:
#             # set new tonic, if applicable
#             if "# tonic:" in line:
#                 tonic = CHORD_DICT[line.rsplit(" ")[-1][:-1]]
#             # initialize line-specific lists
#             chord_line = []
#             quality_line = []
#             # find all indices where there is a "|"
#             idx_bar = [pos for pos, char in enumerate(line) if char == "|"]
#             # iterate over bars
#             for i in range(len(idx_bar)-1):
#                 # select a bar from the whole line
#                 bar = line[idx_bar[i]+1:idx_bar[i+1]]
#                 # find all indices where there is a ":"
#                 idx_chord = [pos for pos, char in enumerate(bar) if char == ":"]
#                 # iterate over chords in a bar
#                 for j in idx_chord:
#                     # grab chord (convert to numeral relative to tonic)
#                     chord = ""
#                     offset = 1
#                     my_char = bar[j-offset]
#                     while my_char != " ":
#                         chord += my_char
#                         offset += 1
#                         my_char = bar[j-offset]
#                     chord_num = CHORD_DICT[chord[::-1]]
#                     chord_rel = chord_num - tonic
#                     if chord_rel < 0:
#                         chord_line.append(chord_rel + 12)
#                     else:
#                         chord_line.append(chord_rel)

#                     # grab chord quality
#                     quality = ""
#                     offset = 1
#                     my_char = bar[j+offset]
#                     while my_char != " ":
#                         quality += my_char
#                         offset += 1
#                         my_char = bar[j+offset]
#                     quality_line.append(quality)
#             # append to global list
#             if chord_line:
#                 all_chord.append(chord_line)
#             if quality_line:
#                 all_quality.append(quality_line)
    
#     return all_chord, all_quality