In [1]:
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import random

In [2]:
# Get data from the grammar's CSV file (a single-type format).
# index_col = 0 makes the first CSV column the row index, so the DataFrame
# columns are only the feature values that are handed to KMeans later on.
s_exp_features = pd.read_csv("test_files/s_exp_features_test.csv", index_col = 0)
s_exp_features.head()
# TODO: at what index is the s-expression id in this file? Once we know this, we can
# add the s-expression id to each node's list of s-expressions.

Unnamed: 0,0,1,2,3,4,5
0,2.0,0.125,0.584635,4.0,0.0,0.0415
1,1.0,0.678385,-0.678385,0.0,0.0,0.4068
2,5.0,0.036458,-11.315104,1.5,1.0,1.5536
3,1.0,0.003906,-2.003906,0.0,0.0,0.0004
4,7.0,0.170573,-31.157552,4.0,2.0,1.562


In [3]:
# k_size is the number of clusters. It is set arbitrarily to a quarter of the number
# of data points; max(1, ...) keeps it a valid cluster count when there are fewer
# than four rows.
k_size = max(1, len(s_exp_features) // 4)
print(k_size)
# random_state pins the (otherwise random) centroid initialisation so that the cluster
# labels -- and every cell that depends on them -- are reproducible after
# Restart & Run All. n_init is spelled out because its default value changed between
# scikit-learn releases.
kmeans = KMeans(n_clusters=k_size, n_init=10, random_state=0).fit(s_exp_features)

23


In [4]:
# Outputs which cluster number each data point was placed into:
# kmeans.labels_[i] is the cluster of the i-th row passed to fit().
kmeans.labels_

array([10, 10, 17, 10,  0, 17, 17,  0, 12, 17,  6, 12, 17,  0, 12,  0, 11,
       19,  0, 12, 20, 11, 19,  6,  4,  4, 18, 19,  8, 20,  0, 19,  2,  0,
        4, 18, 11, 20,  2, 21,  0, 18, 14, 19, 14, 12,  2, 13, 11, 12, 11,
        1,  9,  9, 14,  9, 20,  5,  5,  8,  1,  1,  4, 18, 16,  6,  5, 13,
        6, 15, 14, 14,  3, 22, 14,  2,  6, 16,  2,  6,  1, 18, 11, 18, 15,
       15, 15,  9,  9,  9, 19,  7])

In [5]:
# TODO: kmeans.predict(...) -- fill in with our own 7-unit vector data; predict will classify each vector into a cluster

In [6]:
class Node:
    """A single state of the Markov chain.

    Attributes (all set in ``__init__``):
        node_num: identifier given by the caller.
        s_exp:    list of s-exp-ids collected through ``add_exp``.
        cpt:      dict mapping a node_num to a conditional probability.
        count:    integer counter, starts at 0.
    """

    def __init__(self, node_num):
        """Create an empty node labelled ``node_num``."""
        self.node_num = node_num
        self.s_exp = list()  # s-exp-ids, filled via add_exp
        self.cpt = dict()  # node_num -> conditional probability
        self.count = 0

    def add_exp(self, s):
        """Append the s-exp-id ``s`` to this node's list of s-expressions."""
        self.s_exp += [s]

In [7]:
# Dictionary that maps s-exp-id to: s-exp, song-id (which song), song-index (position in song).
# It starts out empty and is filled in a later cell.
s_exp_dict = {} 

# Assume s_exp_labels is the grammar output with 3 columns: s-exp, song-id, song-index
# (named exp, song_id and song_index in the CSV). The first CSV column becomes the row index.
# Older loading approach, kept for reference:
#s_exp_labels = np.loadtxt(open("test_files/test_exp.csv", "r"), dtype="str", delimiter=",", skiprows=1)
s_exp_labels = pd.read_csv('test_files/s_exp_test.csv', index_col=0)
s_exp_labels.head()

Unnamed: 0,exp,song_id,song_index
0,0 4 C|0.155|0.125 H|0.725|0.290,0,0
1,0 0 H|0.340|0.678,0,1
2,-3 0 R|0.174|0.036 R|0.164|0.255 R|0.195|0.443...,0,2
3,0 0 R|0.243|0.004,0,3
4,-3 7 C|0.073|0.171 C|0.130|0.263 C|0.185|0.408...,0,4


In [8]:
# Fill s_exp_dict: key is the row label of s_exp_labels (the s-exp-id), value is the
# list [exp, song_id, song_index] taken from that row.
for index, row in s_exp_labels.iterrows():
    s_exp_dict[index] = [row[column] for column in ('exp', 'song_id', 'song_index')]

len(s_exp_dict)

# The dictionary is not really needed: the same values can be read by indexing
# straight into the DataFrame, for example:

s_exp_labels.loc[4, 'exp']

'-3 7 C|0.073|0.171 C|0.130|0.263 C|0.185|0.408 R|0.091|0.656 R|0.138|0.757 C|0.089|0.908 R|0.333|0.996 '

In [9]:
# List of nodes for the Markov chain: one Node per cluster number, stored so that
# node_objects[c] is the node for cluster c.
node_objects = list(map(Node, range(k_size)))

# Iterate through all the data points. i is the position of the data point and is
# what gets stored on the node as the s-expression id.
for i, label in enumerate(kmeans.labels_):
    cluster_num = label  # the cluster number this data point corresponds to
    node_objects[cluster_num].add_exp(i)
    # TODO: find how to add the s-expression id corresponding to that node
    # (index 0 is the s-expression)

In [10]:
#create cpt
# For every node, count how often one of its s-expressions is directly followed
# (same song_id, song_index exactly one higher) by an s-expression belonging to each
# node, then turn those counts into transition probabilities.
#
# Successors are found through a (song_id, song_index) lookup table rather than by
# comparing every pair of s-expressions with DataFrame.loc; the counts are the same
# but the work is linear in the number of s-expressions instead of quadratic.
song_id_of = s_exp_labels['song_id'].to_dict()
song_index_of = s_exp_labels['song_index'].to_dict()

# (song_id, song_index) -> node_num of every s-expression found at that position
nodes_at_position = {}
for node in node_objects:
    for s_exp_id in node.s_exp:
        position = (song_id_of[s_exp_id], song_index_of[s_exp_id])
        nodes_at_position.setdefault(position, []).append(node.node_num)

for outer_node in node_objects:
    transitions = {inner_node.node_num: 0.0 for inner_node in node_objects}
    outer_node.count = 0
    for s_exp_outer in outer_node.s_exp:
        following = (song_id_of[s_exp_outer], song_index_of[s_exp_outer] + 1)
        for next_node_num in nodes_at_position.get(following, ()):
            outer_node.count += 1
            transitions[next_node_num] += 1
    #creates the probability of going to the next node, not s-exp
    # A node with no observed successor (count == 0) keeps an all-zero row.
    if outer_node.count:
        transitions = {k: (v / outer_node.count) for k, v in transitions.items()}
    outer_node.cpt = transitions

In [11]:
def weighted_random_by_dct(dct):
    """Return a random key of ``dct``, chosen with probability proportional to its value.

    Args:
        dct: mapping from key to a non-negative weight. The weights do not have to
            sum to exactly 1; the random draw is scaled by their total, so neither an
            unnormalised table nor floating-point rounding can make the draw fall
            past the last entry.

    Returns:
        One of the keys of ``dct``. Keys whose weight is not positive are never
        returned while at least one positive weight exists. If no weight is positive
        (for example an all-zero row) a key is chosen uniformly at random instead of
        silently returning None. An empty ``dct`` returns None.
    """
    weighted = [(k, v) for k, v in dct.items() if v > 0]
    if not weighted:
        # No usable probability mass: fall back to a uniform pick so that callers
        # indexing with the result do not receive None (unless there are no keys).
        return random.choice(list(dct)) if dct else None

    total_weight = sum(v for _, v in weighted)
    rand_val = random.random() * total_weight #random value in [0, total_weight)
    running = 0
    for k, v in weighted:
        running += v
        if rand_val < running: #first key whose cumulative weight exceeds the draw
            return k
    # Only reachable through rounding in the running sum; the draw belongs to the last key.
    return weighted[-1][0]

In [12]:
#takes in a number and returns list of s-exp ids generated by the cpt ordering given by probabilities
#cpt jumps from s-exp to s-exp
#nodes are collection of s-exp (s-exp is 1 measure with all those features)
def s_expressions(n): #n is the number of s-exp ids to generate
    """Random-walk over ``node_objects`` and return ``n`` s-exp ids.

    The walk starts at a uniformly chosen node. At every step the next node is drawn
    from the current node's ``cpt`` with ``weighted_random_by_dct`` and one s-exp id
    is picked uniformly from that next node's ``s_exp`` list. The start node itself
    contributes no id.

    If the draw yields None (the current node has no usable outgoing transition),
    the walk jumps to a uniformly chosen node instead of failing on
    ``node_objects[None]``.

    Args:
        n: number of steps, i.e. length of the returned list.

    Returns:
        List of ``n`` s-exp ids.

    Raises:
        IndexError: if ``node_objects`` is empty or a visited node has no s-expressions.
    """
    s_exp_ids = []
    next_node = random.choice(node_objects) #random start node
    for _ in range(n):
        next_node_num = weighted_random_by_dct(next_node.cpt) #next node based on the current node's cpt
        if next_node_num is None:
            # dead end: restart the walk from a random node
            next_node = random.choice(node_objects)
        else:
            next_node = node_objects[next_node_num]
        s_exp_ids.append(random.choice(next_node.s_exp)) #a random s-expression in that node
    return s_exp_ids

In [13]:
def possible_notes(a, b):
    """Placeholder: ignores both arguments and returns a fresh list holding 'G'."""
    return ['G']


def select_note(a, b, c, d):
    """Placeholder: ignores all four arguments and returns 'C'."""
    return 'C'

In [24]:
#go from s-exp to producing actual notes by calling possible_notes which passes into select_note
def produce_notes(num_of_measures, list_of_chords): #length of chords list = num of measures
    """Generate a table of notes, one s-expression per measure.

    For every measure an s-exp id is taken from ``s_expressions(num_of_measures)``
    and its ``exp`` string is read from ``s_exp_labels``. The string is
    whitespace-separated: the first two fields are passed on as min_slope and
    max_slope (still as strings), every further field is a term of the form
    ``category|x|y``. Each term produces one note via ``possible_notes`` followed by
    ``select_note``.

    NOTE(review): the code reads a term as category|start|duration. In the sample
    expressions the third field rises steadily within a measure and
    (second + third) is close to the next term's third field, which suggests the
    layout may actually be category|duration|start -- confirm against the grammar
    output. The field order is left as it was.

    Args:
        num_of_measures: number of measures to generate.
        list_of_chords: one chord per measure; ``list_of_chords[i]`` is used for measure i.

    Returns:
        DataFrame with columns note_name, start_time and duration, one row per
        term. start_time is the term's start value plus the measure number;
        start_time and duration are floats.
    """
    columns = ["note_name", "start_time", "duration"]
    s_exp_ids = s_expressions(num_of_measures)
    curr_note = "Bb4" #figure out how to start the curr_note initially
    # Rows are collected in a list and turned into a DataFrame once: DataFrame.append
    # no longer exists in pandas 2.x and re-copied the whole frame for every row.
    records = []
    for i in range(num_of_measures): #i refers to measure number
        s_exp = s_exp_labels.loc[s_exp_ids[i], 'exp'] #ith s-exp, which is a string
        # split() without an argument ignores leading/trailing/repeated blanks, so the
        # last term is kept whether or not the string ends with a space.
        split_list = s_exp.split()
        min_slope, max_slope = split_list[0], split_list[1]
        for term in split_list[2:]:
            elements = term.split("|")
            category, start, duration = elements[0], elements[1], elements[2]
            poss_notes_list = possible_notes(list_of_chords[i], category)
            selected_note = select_note(poss_notes_list, curr_note, min_slope, max_slope)
            records.append({
                'note_name': selected_note,
                'start_time': float(start) + i,
                'duration': float(duration), #numeric, like start_time
            })
            curr_note = selected_note
    return pd.DataFrame(records, columns=columns)

# Demo: generate two measures with one chord per measure ('Bb', then 'A') and
# display the resulting table of notes.
notes_df = produce_notes(2, ['Bb', 'A'])
notes_df

9
{0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0, 7: 0.0, 8: 0.0, 9: 0.3333333333333333, 10: 0.0, 11: 0.0, 12: 0.0, 13: 0.0, 14: 0.3333333333333333, 15: 0.3333333333333333, 16: 0.0, 17: 0.0, 18: 0.0, 19: 0.0, 20: 0.0, 21: 0.0, 22: 0.0}
9
{0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0, 7: 0.0, 8: 0.0, 9: 0.5, 10: 0.0, 11: 0.0, 12: 0.0, 13: 0.0, 14: 0.16666666666666666, 15: 0.0, 16: 0.0, 17: 0.0, 18: 0.0, 19: 0.16666666666666666, 20: 0.16666666666666666, 21: 0.0, 22: 0.0}


Unnamed: 0,note_name,start_time,duration
0,C,0.137,0.09
1,C,0.224,0.232
2,C,0.154,0.746
3,C,1.057,0.129
4,C,1.152,0.197
5,C,1.064,0.349
6,C,1.238,0.423
7,C,1.264,0.71
