In [1]:
import pandas as pd
import numpy as np


#### Encoding/Cleaning
- load json data / basic cleaning (sort, drop id, reset indices etc.)
- calculate event duration and interarrival times
- convert and bin values
- concatenate rows into strings (words for NLP) and return as a pd.series

In [9]:
# sort, drop id column and reset index (supercollider seems to occasionally drop events )
df_raw = pd.read_json('./data/test.json',orient='index').sort_values('noteOn_timestamp').drop('id',axis=1).reset_index(drop=True)

#calculate duration: noteOn[i] - noteOff[i]
df_raw['duration'] = df_raw.apply(lambda row: row['noteOff_timestamp'] - row['noteOn_timestamp'],axis=1)

# calculate interevent duration (wait time between events): noteOn[i] - noteOn[i-1] ...
# shifting and then dividing by -1 
df_raw['inter_event_duration'] = (df_raw['noteOn_timestamp'] - df_raw['noteOn_timestamp'].shift(-1))/ -1

# timestamps no longer needed
df_raw = df_raw.drop(['noteOn_timestamp','noteOff_timestamp'],axis=1)

#new_index = 
binned_df = pd.DataFrame(columns=df_raw.columns)

In [3]:
import string
#used for amplitude
bin4 = [i for i in range(0,127,32)]
label4 = [string.ascii_lowercase[i] for i in range(3)]
# used for voices
bin8 = [i for i in range(0,127,16)]
label8 = [string.ascii_lowercase[i] for i in range(7)]

# used for grain rate/dur and deviations
bin16 = [i for i in range(0,127,8)]
label16 = [string.ascii_lowercase[i] for i in range(15)]

dur_bins = np.logspace(-1,1.25,12)  # need to test edge cases for this one.. 
dur_labels = [string.ascii_lowercase[i] for i in range(11)]

In [51]:
# load the binned df using cut -- keep freq as integer value (for now.. )
binned_df['amp'] = pd.cut(df_raw['amp'], bin4,labels=label4)
binned_df['freq_dev'] = pd.cut(df_raw['freq_dev'], bin16,labels=label16)
binned_df['grain_dur'] = pd.cut(df_raw['grain_dur'], bin16,labels=label16)
binned_df['grain_dur_dev'] = pd.cut(df_raw['grain_dur_dev'],bin16,labels=label16)
binned_df['grain_rate']  = pd.cut(df_raw['grain_rate'],bin16,labels=label16)
binned_df['grain_rate_dev'] = pd.cut(df_raw['grain_rate_dev'],bin16,labels=label16)
binned_df['n_voices'] = pd.cut(df_raw['n_voices'],bin8,labels=label8)
binned_df['rel'] = pd.cut(df_raw['rel'], bin4,labels=label4)

binned_df['duration'] = pd.cut(df_raw['duration'],dur_bins,labels=dur_labels)
binned_df['inter_event_duration'] = pd.cut(df_raw['inter_event_duration'],dur_bins,labels=dur_labels)

binned_df['freq'] = df_raw['freq']
binned_df.dropna()

Unnamed: 0,freq,amp,freq_dev,grain_dur,grain_dur_dev,grain_rate,grain_rate_dev,n_voices,rel,duration,inter_event_duration
0,72,b,d,d,d,c,c,a,a,g,i
1,78,b,i,d,d,d,f,b,b,h,j
2,69,b,e,f,i,d,c,e,a,g,h
3,73,b,e,f,i,f,c,e,a,d,f
4,74,b,l,f,i,f,c,e,a,e,h
5,70,b,l,f,i,f,c,g,b,g,h
6,79,b,l,f,c,f,c,g,b,f,h
7,84,b,l,f,c,d,c,g,b,d,d
8,83,b,l,f,c,d,c,g,b,d,f
9,75,b,e,f,c,d,c,g,b,f,g


In [11]:
cols = list(binned_df)
cols[1], cols[0] = cols[0], cols[1]
binned_df = binned_df.ix[:,cols]  # works for this sample, but may need to reorder if json file inputs differently
binned_df = binned_df.dropna().reset_index(drop=True)

In [12]:

word_series = binned_df.iloc[:,0].astype(str) + '_' # freq first then string of categories 
for i in range(1, len(binned_df.columns)):
    word_series += binned_df.iloc[:,i].astype(str)

In [126]:
word_series

import re
bin_list = pd.cut(df_raw['grain_dur'], bin16).cat.categories.tolist()

test = '[0.1, 0.5)'
re.findall('[-+]?\d+[\.]?\d*',test)
# [i for j in xx for i in k]
bin_list = [re.findall('[-+]?\d+[\.]?\d*',i) for i in bin_list]

for i in range(len(bin_list)):
    for j in range(len(bin_list[i])):
        bin_list[i][j] = int(bin_list[i][j])
    bin_list[i] = tuple(bin_list[i])
bin_list

[(0, 8),
 (8, 16),
 (16, 24),
 (24, 32),
 (32, 40),
 (40, 48),
 (48, 56),
 (56, 64),
 (64, 72),
 (72, 80),
 (80, 88),
 (88, 96),
 (96, 104),
 (104, 112),
 (112, 120)]

#### Decoding

- Specs for the binnings need to be stored in a config file/data structure 
- After tensorflow produces results they need to be decoded back to midi-json format
- midi values are randomized within the binned ranges and returned to supercollider
