## TODO
 - Handle repeated chords (*)
 - Find out what these chords mean: 1, 5
 - Normalize sequence counts based on the total number of sequences of a specific length (to account for song size)
 - Apply PCA to the chord-progression features to reduce dimensionality

## Import modules

In [161]:
import numpy as np
import pandas as pd
import os.path
# import sklearn

## Define global variables

In [164]:
# dictionary of chord (key) to numeral (value)
# numerals count semitones upward from A (A = 0, ..., G#/Ab = 11);
# enharmonic spellings of the same pitch share one numeral
CHORD_DICT = {
    'A':0,
    'A#':1,
    'Bb':1,
    'B':2,
    'Cb':2,
    'B#':3,
    'C':3,
    'C#':4,
    'Db':4,
    'D':5,
    'D#':6,
    'Eb':6,
    'E':7,
    'Fb':7,
    'E#':8,
    'F':8,
    'F#':9,
    'Gb':9,  # same pitch as F# (was 8, which collided with F)
    'G':10,
    'G#':11,
    'Ab':11,
}

# McGill Billboard data directory, located under the current user's home
MCGILL_PATH = "/".join((
    os.path.expanduser("~"),
    "biof509",
    "final-project",
    "data",
    "McGill-Billboard",
))

# when True, parse_data keeps only the part of each chord quality that
# precedes the first "(" and the first "/"
QUALITY_SIMPLIFY = True

# TODO: bool to indicate whether to normalize sequence counts based on the total
# number of sequences of the same length (not implemented yet)

## Define Data Parser Functions

In [165]:
def convert_numeral(chord,tonic):
    """
    Function convert_numeral expresses a chord as the number of semitones
    it lies above the tonic key, using the numerals stored in CHORD_DICT

    For example, chord = "F" with tonic = "Bb" gives 8 - 1 = 7
    -----------------------
    Input
        chord: string naming the chord root (must be a key of CHORD_DICT)
        tonic: string naming the tonic key (must be a key of CHORD_DICT)
    Output
        numeral: int in the range 0..11
    """
    # the modulo wraps a negative difference back into 0..11
    return (CHORD_DICT[chord] - CHORD_DICT[tonic]) % 12

def parse_data(path):
    """
    Function parse_data parses a text file provided from the McGill Billboard
    project and returns the chords it contains, line by line, as pairs of
    (numeral relative to the tonic, quality)
    -----------------------
    Input
        path: string containing absolute path to text file to parse

    Output
        chords: list of lists; each inner list corresponds to a chord sequence
            in an individual line of the song, and contains 2-element tuples
            that give the relative chord numeral and its quality. Lines
            without any chord are left out.

    Raises
        ValueError: if a chord appears before any "# tonic:" line
        KeyError: (from convert_numeral) if a chord root or the tonic is
            not a key of CHORD_DICT
    """
    # initialize global list
    chords = []
    # no tonic is known until the first "# tonic:" line has been read
    tonic = None
    # the with-block closes the file even if parsing raises
    with open(path) as chord_file:
        # iterate over every line
        for line in chord_file:
            # set new tonic, if applicable
            if "# tonic:" in line:
                # strip() copes with "\n", "\r\n", trailing blanks and a
                # missing final newline alike
                tonic = line.split("# tonic:", 1)[1].strip()
            # initialize line-specific list
            chords_line = []
            # split bars (separated by "|"); the text before the first and
            # after the last "|" is not part of any bar
            for bar in line.split("|")[1:-1]:
                # split chords+quality on any run of whitespace
                for cq in bar.split():
                    # tokens without ":" are not chord:quality pairs
                    if ":" not in cq:
                        continue
                    if tonic is None:
                        raise ValueError(
                            f"chord {cq!r} found before any '# tonic:' line in {path}"
                        )
                    parts = cq.split(":")
                    # get chord and convert to relative numeral
                    c = convert_numeral(parts[0],tonic)
                    # get quality
                    q = parts[-1]
                    if QUALITY_SIMPLIFY:
                        q = q.split("(")[0]
                        q = q.split("/")[0]
                    # append to line list as tuple
                    chords_line.append((c,q))
            # append line list to global list (ignore empty lines)
            if chords_line:
                chords.append(chords_line)

    return chords

## Define feature extractor functions

In [166]:
def get_chord_seq(chords,seq_len):
    """
    Function get_chord_seq counts every run of seq_len consecutive entries
    found within the lines of a song. Runs never span two lines.
    -----------------------
    Input
        chords: list of lists; each inner list is one line of the song
            (for example the output of parse_data)
        seq_len: int giving the number of consecutive entries per sequence

    Output
        seq_dict: dict mapping each sequence (a tuple of seq_len entries)
            to the number of times it occurs over all lines
    """
    seq_dict = {}
    for line in chords:
        # a line shorter than the window cannot hold a full sequence
        if len(line) < seq_len:
            continue
        last_start = len(line) - seq_len
        # slide the window one entry at a time across the line
        for start in range(last_start + 1):
            window = tuple(line[start:start + seq_len])
            seq_dict[window] = seq_dict.get(window, 0) + 1

    return seq_dict

## Load in CSV file of metadata

In [167]:
# read the song index shipped with the data set
songs_df = pd.read_csv("data/billboard-2.0-index.csv")

# keep only the rows whose "actual_rank" is present; copy so that later
# column assignments act on an independent frame
songs_filter_df = songs_df.loc[songs_df["actual_rank"].notnull()].copy()

# ids of the retained songs, in row order
song_id = songs_filter_df["id"].values

## Parse all text files

In [168]:
# global occurrence counters, filled while the songs are parsed:
# per (chord, quality) pair, per chord numeral, and per quality
num_cq, num_chord, num_quality = {}, {}, {}

# one dict of chord-sequence counts per song, in parsing order
song_seq_all = []

In [169]:
# process every retained song id in turn
for song in song_id:
    # each song's annotation file sits in a directory named after its
    # id, zero-padded to 4 characters
    chords = parse_data(MCGILL_PATH + f"/{str(song).zfill(4)}/salami_chords.txt")

    # -------- EXTRACT SONG-SPECIFIC CHORD SEQUENCES --------
    # merge the counts for sequence lengths 1 to 4 into a single dict;
    # keys of different lengths can never collide
    song_seq = {}
    for seq_len in range(1,5):
        song_seq.update(get_chord_seq(chords,seq_len))

    # keep the per-song dict, in the same order as song_id
    song_seq_all.append(song_seq)

    # ----------------- FIND GLOBAL COUNTS ------------------
    for line in chords:
        for chord in line:
            # chord is a (numeral, quality) tuple: count the pair itself,
            # then each of its two components
            num_cq[chord] = num_cq.get(chord, 0) + 1
            num_chord[chord[0]] = num_chord.get(chord[0], 0) + 1
            num_quality[chord[1]] = num_quality.get(chord[1], 0) + 1

In [170]:
# order each counter's (key, count) pairs from most to least frequent
cq_sort, quality_sort, chord_sort = (
    sorted(counts.items(), key=lambda item: item[1], reverse=True)
    for counts in (num_cq, num_quality, num_chord)
)

In [171]:
# merge every song's sequences into one dict; only its keys matter, they
# are the distinct chord progressions seen over all songs
temp = {
    seq: count
    for song_seq in song_seq_all
    for seq, count in song_seq.items()
}
seq_unique = temp.keys()

## Add chord sequence features to pandas dataframe

In [172]:
# Build one count column per chord sequence, then attach the columns to the
# metadata frame.
#
# song_seq_all[k] belongs to the k-th ROW of songs_filter_df (both follow the
# order of song_id), but songs_filter_df still carries the index labels of
# the unfiltered songs_df. A label-based `.loc[k, ...]` therefore addresses
# the wrong row, or appends a new one, whenever a row was filtered out.
# Filling plain lists by position and assigning whole columns avoids that,
# and replaces one DataFrame write per (song, sequence) pair with one write
# per sequence.
n_songs = len(songs_filter_df)
if len(song_seq_all) != n_songs:
    raise ValueError(
        f"expected one sequence dict per song ({n_songs}), got {len(song_seq_all)}"
    )

seq_counts = {}
# iterate over all song chord progressions
for idx, song_seq in enumerate(song_seq_all):
    for seq, count in song_seq.items():
        # first sighting of a sequence: start a zero-filled column
        if seq not in seq_counts:
            seq_counts[seq] = [0] * n_songs
        seq_counts[seq][idx] = count

# a list is assigned positionally, so counts line up with the rows
for seq, counts in seq_counts.items():
    songs_filter_df[seq] = counts

## Apply PCA on chord progression features

## Build predictive models

## Scratch

## Trash

In [173]:
# def parse_data(path):
#     # initialize global lists
#     all_chord = []
#     all_quality = []
#     # read chord file line by line
#     with open(path) as fp:
#         # iterate over every line
#         for line in fp:
#             # set new tonic, if applicable
#             if "# tonic:" in line:
#                 tonic = CHORD_DICT[line.rsplit(" ")[-1][:-1]]
#             # initialize line-specific lists
#             chord_line = []
#             quality_line = []
#             # find all indices where there is a "|"
#             idx_bar = [pos for pos, char in enumerate(line) if char == "|"]
#             # iterate over bars
#             for i in range(len(idx_bar)-1):
#                 # select a bar from the whole line
#                 bar = line[idx_bar[i]+1:idx_bar[i+1]]
#                 # find all indices where there is a ":"
#                 idx_chord = [pos for pos, char in enumerate(bar) if char == ":"]
#                 # iterate over chords in a bar
#                 for j in idx_chord:
#                     # grab chord (convert to numeral relative to tonic)
#                     chord = ""
#                     offset = 1
#                     my_char = bar[j-offset]
#                     while my_char != " ":
#                         chord += my_char
#                         offset += 1
#                         my_char = bar[j-offset]
#                     chord_num = CHORD_DICT[chord[::-1]]
#                     chord_rel = chord_num - tonic
#                     if chord_rel < 0:
#                         chord_line.append(chord_rel + 12)
#                     else:
#                         chord_line.append(chord_rel)

#                     # grab chord quality
#                     quality = ""
#                     offset = 1
#                     my_char = bar[j+offset]
#                     while my_char != " ":
#                         quality += my_char
#                         offset += 1
#                         my_char = bar[j+offset]
#                     quality_line.append(quality)
#             # append to global list
#             if chord_line:
#                 all_chord.append(chord_line)
#             if quality_line:
#                 all_quality.append(quality_line)
    
#     return all_chord, all_quality