In [1]:
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import random

In [2]:
#get data from grammar's csv file of a single type format
#index_col = 0 makes the file's first column the row index rather than a feature column
s_exp_features = pd.read_csv("test_files/s_exp_features_test.csv", index_col = 0)
#preview the first rows to sanity-check the load
s_exp_features.head()

Unnamed: 0,0,1,2,3,4,5
0,2.0,0.125,0.584635,4.0,0.0,0.0415
1,1.0,0.678385,-0.678385,0.0,0.0,0.4068
2,5.0,0.036458,-11.315104,1.5,1.0,1.5536
3,1.0,0.003906,-2.003906,0.0,0.0,0.0004
4,7.0,0.170573,-31.157552,4.0,2.0,1.562


In [3]:
#k_size is the number of clusters; the ratio of one cluster per four data points is arbitrary
#max(1, ...) keeps the cluster count valid when there are fewer than four rows
k_size = max(1, len(s_exp_features) // 4)
print(k_size)
#random_state pins the centroid initialisation so a fresh Restart & Run All reproduces the same
#clusters (and therefore the same Markov chain below); n_init is spelled out because its default
#differs between scikit-learn versions
kmeans = KMeans(n_clusters=k_size, n_init=10, random_state=0).fit(s_exp_features)

23


In [4]:
#outputs which cluster number each data point is placed into
#(one entry per row of the data passed to fit, in the same order)
kmeans.labels_

array([ 6,  6,  6,  6,  9,  9,  9,  9, 17,  9,  0, 17,  9, 21, 17,  9, 18,
       14, 21, 17, 19, 18, 14,  0,  8,  8,  7, 14,  3,  8,  9, 14, 12,  9,
       13,  7, 13, 19, 12, 15, 21,  7, 15, 14, 12, 21, 12, 16, 13, 17, 18,
        5,  1,  1, 12,  1, 19, 20, 20,  3,  5,  5,  8,  7, 11,  0, 20, 16,
        0, 20, 15, 15,  4, 22, 12, 12,  0, 11, 12,  0,  5,  7, 18,  7,  2,
        2,  2,  1,  1,  1, 14, 10])

In [5]:
#kmeans.predict(new_features) will classify our own feature vectors (same columns as s_exp_features) into the existing clusters

In [6]:
class Node:
    """A single node of the chain: a numbered group of s-expression ids plus its transition table."""

    def __init__(self, node_num):
        """Create an empty node identified by `node_num`."""
        self.node_num = node_num
        self.s_exp = list() #s-exp-ids collected in this node
        self.cpt = dict() #node_num -> conditional probability of moving to that node

    def add_exp(self, s):
        """Append the s-exp id `s` to this node's list of members."""
        self.s_exp += [s]

In [7]:
#assume s_exp_labels is grammar output with 3 columns: exp (the s-exp string), song_id, song_index
#index_col=0 makes the file's first column the row index
s_exp_labels = pd.read_csv('test_files/s_exp_test.csv', index_col=0)
#preview the first rows to sanity-check the load
s_exp_labels.head()

Unnamed: 0,exp,song_id,song_index
0,0 4 C|0.155|0.125 H|0.725|0.290,0,0
1,0 0 H|0.340|0.678,0,1
2,-3 0 R|0.174|0.036 R|0.164|0.255 R|0.195|0.443...,0,2
3,0 0 R|0.243|0.004,0,3
4,-3 7 C|0.073|0.171 C|0.130|0.263 C|0.185|0.408...,0,4


In [8]:
# don't really need a separate id -> s-exp dictionary; can just index into the DataFrame
# by row label and column name, e.g. the s-exp string of row 4:
s_exp_labels.loc[4, 'exp']

'-3 7 C|0.073|0.171 C|0.130|0.263 C|0.185|0.408 R|0.091|0.656 R|0.138|0.757 C|0.089|0.908 R|0.333|0.996 '

In [9]:
node_objects = [Node(cluster_id) for cluster_id in range(k_size)] #list of nodes for the Markov chain, one per cluster

#file every data point under the node of its cluster; the data point's position in
#kmeans.labels_ doubles as its s-exp id
for s_exp_id, cluster_id in enumerate(kmeans.labels_):
    node_objects[cluster_id].add_exp(s_exp_id)

In [11]:
#create cpt
#index the s-expressions once so every transition is found by a dictionary lookup instead of
#comparing each s-expression against every other one
node_of_s_exp = {} #maps s-exp id to the node_num of the node holding it
ids_at_position = {} #maps (song_id, song_index) to the s-exp ids found at that position
for node in node_objects:
    for s_exp_id in node.s_exp:
        node_of_s_exp[s_exp_id] = node.node_num
        position = (s_exp_labels.loc[s_exp_id, 'song_id'], s_exp_labels.loc[s_exp_id, 'song_index'])
        ids_at_position.setdefault(position, []).append(s_exp_id)

for outer_node in node_objects:
    #start from a zero count for every node so the cpt has one entry per node, in node order
    outer_node.cpt = {inner_node.node_num: 0.0 for inner_node in node_objects}
    outer_node_count = 0
    for s_exp_outer in outer_node.s_exp:
        #a successor is an s-exp of the same song whose song_index is exactly one higher
        following = (s_exp_labels.loc[s_exp_outer, 'song_id'], s_exp_labels.loc[s_exp_outer, 'song_index'] + 1)
        for s_exp_inner in ids_at_position.get(following, []):
            outer_node.cpt[node_of_s_exp[s_exp_inner]] += 1
            outer_node_count += 1
    #creates the probability of going to the next node, not s-exp
    #a node with no observed successor keeps an all-zero cpt
    if outer_node_count:
        outer_node.cpt = {k: (v / outer_node_count) for k, v in outer_node.cpt.items()}

In [12]:
def weighted_random_by_dct(dct):
    """Pick a key of `dct` at random, with probability proportional to its weight.

    Parameters:
        dct: mapping from key to a numeric weight. Weights do not have to sum to 1;
            entries with a weight of zero or less are never selected.

    Returns:
        The selected key, or None when `dct` is empty or holds no positive weight.
    """
    weight_sum = sum(weight for weight in dct.values() if weight > 0)
    if weight_sum <= 0:
        return None
    #scale the draw to the total weight so the weights need not be normalised
    threshold = random.random() * weight_sum
    running_total = 0.0
    last_positive_key = None
    for key, weight in dct.items():
        if weight <= 0:
            continue #a zero-weight key must not win, even when the draw is exactly 0.0
        running_total += weight
        last_positive_key = key
        if threshold <= running_total: #the running total has reached the draw: this is the pick
            return key
    #floating-point rounding can leave the running total a hair below the threshold;
    #the draw then belongs to the last key that carries weight
    return last_positive_key

In [13]:
#takes in a number and returns list of s-exp ids generated by the cpt ordering given by probabilities
#cpt jumps from node to node; an s-exp is then drawn from the node that was reached
#nodes are collection of s-exp (s-exp is 1 measure with all those features)
def sequence_s_expressions(n): #n is the number of s-exp ids to generate
    """Walk the chain in node_objects for n steps and return the s-exp ids visited.

    The walk starts at a random node that itself contributes no id. Each step moves
    to a node drawn from the current node's cpt and then picks one of that node's
    s-exp ids uniformly at random.

    Parameters:
        n: number of s-exp ids to produce; zero or a negative value yields an empty list.

    Returns:
        A list of n s-exp ids.

    Raises:
        ValueError: if n is positive but no node holds any s-expression.
    """
    s_exp_ids = []
    if n <= 0:
        return s_exp_ids
    populated_nodes = [node for node in node_objects if node.s_exp]
    if not populated_nodes:
        raise ValueError("no node contains any s-expression to draw from")
    #random.choice cannot step past the end of the list, unlike int(random.uniform(0, len)),
    #whose upper end-point may be returned because of floating-point rounding
    current_node = random.choice(populated_nodes) #random start node
    for _ in range(n):
        next_node_num = weighted_random_by_dct(current_node.cpt) #get the next node based on the current node's cpt
        if next_node_num is None or not node_objects[next_node_num].s_exp:
            #dead end: the current node has no usable outgoing transition, so restart the
            #walk from a random populated node instead of failing
            current_node = random.choice(populated_nodes)
        else:
            current_node = node_objects[next_node_num]
        s_exp_ids.append(random.choice(current_node.s_exp)) #store a random s-expression of that node
    return s_exp_ids

In [14]:
#dummy placeholder functions
def possible_notes(a, b):
    """Placeholder: ignores both arguments and returns the one-element list ['G']."""
    return ['G']

def select_note(a, b, c, d):
    """Placeholder: ignores all four arguments and always returns 'C'."""
    return 'C'

In [15]:
#go from s-exp to producing actual notes by calling possible_notes which passes into select_note
def produce_notes(num_of_measures, list_of_chords): #length of chords list = num of measures
    """Generate notes for num_of_measures measures, one s-expression per measure.

    Each s-exp string is read as "<min_slope> <max_slope> <term> <term> ...", where every
    term is "category|start|duration". For each term a note is chosen with
    select_note(possible_notes(chord, category), previous note, min_slope, max_slope).

    Parameters:
        num_of_measures: number of measures (s-expressions) to generate.
        list_of_chords: chords indexed by measure number; needs one entry per measure.

    Returns:
        A DataFrame with columns note_name, start_time (the term's start plus the
        measure number) and duration, both numeric, one row per generated note.
    """
    s_exp_ids = sequence_s_expressions(num_of_measures)
    curr_note = "Bb4" #figure out how to start the curr_note initially
    note_rows = [] #collected as dicts and turned into a DataFrame once; DataFrame.append no longer exists in pandas 2
    for i in range(num_of_measures): #i refers to measure number
        s_exp = s_exp_labels.loc[s_exp_ids[i], 'exp'] #ith s-exp, which is a string
        #split on any run of whitespace so the last term is kept whether or not the
        #string ends with a trailing space
        tokens = s_exp.split()
        min_slope, max_slope = tokens[0], tokens[1]
        for term in tokens[2:]:
            elements = term.split("|")
            category, start, duration = elements[0], elements[1], elements[2]
            poss_notes_list = possible_notes(list_of_chords[i], category)
            selected_note = select_note(poss_notes_list, curr_note, min_slope, max_slope)
            note_rows.append({
                'note_name': selected_note,
                'start_time': float(start) + i, #offset by the measure number
                'duration': float(duration), #numeric, like start_time
            })
            curr_note = selected_note #the next selection is made relative to this note
    return pd.DataFrame(note_rows, columns=["note_name", "start_time", "duration"])

#demo: generate two measures, passing one chord per measure ('Bb', then 'A'), and display the result
notes_df = produce_notes(2, ['Bb', 'A'])
notes_df

Unnamed: 0,note_name,start_time,duration
0,C,0.095,0.121
1,C,0.434,0.311
2,C,0.134,0.767
3,C,1.158,0.341
4,C,1.225,0.516
5,C,1.254,0.773
