In [1]:
import pandas as pd
import numpy as np


#### Encoding/Cleaning
- load json data / basic cleaning (sort, drop id, reset indices etc.)
- calculate event duration and interarrival times
- convert and bin values
- concatenate each row into a single string (a "word" for NLP) and return the result as a `pd.Series`

In [2]:
# Load the recorded events, order them by note-on time, drop the id column and
# renumber the rows (SuperCollider seems to occasionally drop events).
events_sorted = (
    pd.read_json('./data/test.json', orient='index')
    .sort_values('noteOn_timestamp')
    .drop('id', axis=1)
    .reset_index(drop=True)
)

note_on = events_sorted['noteOn_timestamp']
note_off = events_sorted['noteOff_timestamp']

# The timestamps are only needed to derive the two duration columns below,
# so they are left out of the working frame.
df_raw = events_sorted.drop(['noteOn_timestamp', 'noteOff_timestamp'], axis=1)

# Event duration: noteOff[i] - noteOn[i] (vectorized, no row-wise apply needed).
df_raw['duration'] = note_off - note_on

# Inter-event duration (wait time until the next event): noteOn[i+1] - noteOn[i].
# The last event has no successor, so its value is NaN.
df_raw['inter_event_duration'] = note_on.shift(-1) - note_on

# Empty frame with the same columns as df_raw; it is filled with the binned values later.
binned_df = pd.DataFrame(columns=df_raw.columns)

In [3]:
import string

# MIDI values span 0-127. range()'s stop is exclusive, so a stop of 129 makes
# 128 the last edge and the top bin covers values up to 127. With the previous
# stop of 127 the highest bin was missing (3/7/15 bins instead of 4/8/16), and
# pd.cut turned every value above the last edge into NaN.
_EDGE_STOP = 129

# used for amplitude and release: 4 bins of width 32, labelled a-d
bin4 = list(range(0, _EDGE_STOP, 32))
label4 = list(string.ascii_lowercase[:len(bin4) - 1])

# used for voices: 8 bins of width 16, labelled a-h
bin8 = list(range(0, _EDGE_STOP, 16))
label8 = list(string.ascii_lowercase[:len(bin8) - 1])

# used for grain rate/dur and deviations: 16 bins of width 8, labelled a-p
bin16 = list(range(0, _EDGE_STOP, 8))
label16 = list(string.ascii_lowercase[:len(bin16) - 1])

In [5]:
# Fill binned_df using pd.cut -- freq is kept as its integer value (for now..).
# Each entry is (column, bin edges, labels). include_lowest=True keeps a value
# that sits exactly on the lowest edge (e.g. a MIDI value of 0) in the first
# bin instead of turning it into NaN, which would drop the whole row later.
midi_binning = [
    ('amp', bin4, label4),
    ('freq_dev', bin16, label16),
    ('grain_dur', bin16, label16),
    ('grain_dur_dev', bin16, label16),
    ('grain_rate', bin16, label16),
    ('grain_rate_dev', bin16, label16),
    ('n_voices', bin8, label8),
    ('rel', bin4, label4),
]
for column, edges, labels in midi_binning:
    binned_df[column] = pd.cut(df_raw[column], edges, labels=labels, include_lowest=True)

# 27 log-spaced edges from 10**-1 to 10**1.25 give 26 bins, one per letter.
# TODO: still need to test edge cases for this one -- values outside the edge
# range become NaN.
dur_bins = np.logspace(-1, 1.25, 27)
dur_labels = list(string.ascii_lowercase[:26])
for column in ('duration', 'inter_event_duration'):
    binned_df[column] = pd.cut(df_raw[column], dur_bins, labels=dur_labels, include_lowest=True)

binned_df['freq'] = df_raw['freq']

# Preview only; the frame itself is not modified here.
binned_df.dropna().head(10)



Unnamed: 0,amp,freq,freq_dev,grain_dur,grain_dur_dev,grain_rate,grain_rate_dev,n_voices,rel,duration,inter_event_duration
0,b,72,d,d,d,c,c,a,a,p,u
1,b,78,i,d,d,d,f,b,b,r,w
2,b,69,e,f,i,d,c,e,a,p,s
3,b,73,e,f,i,f,c,e,a,i,m
4,b,74,l,f,i,f,c,e,a,k,q
5,b,70,l,f,i,f,c,g,b,p,s
6,b,79,l,f,c,f,c,g,b,m,r
7,b,84,l,f,c,d,c,g,b,i,i
8,b,83,l,f,c,d,c,g,b,i,m
9,b,75,e,f,c,d,c,g,b,m,o


In [6]:
# Move freq to the front by name, keeping the other columns in their current
# order. Selecting by name does not depend on where freq lands when the json is
# loaded, and re-running the cell leaves the order unchanged.
# (.loc replaces DataFrame.ix, which was removed in pandas 1.0.)
cols = ['freq'] + [name for name in binned_df.columns if name != 'freq']
binned_df = binned_df.loc[:, cols]

# Drop events that have a value outside the bin ranges (NaN after pd.cut) and renumber.
binned_df = binned_df.dropna().reset_index(drop=True)

In [25]:

# Build one "word" per event: the first column (freq) followed by '_' and then
# the category letters of every remaining column, in column order.
text_df = binned_df.astype(str)
freq_prefix = text_df.iloc[:, 0] + '_'
letter_columns = [text_df.iloc[:, pos] for pos in range(1, text_df.shape[1])]
# str.cat keeps the calling Series' index and name, like the in-place += did.
word_series = freq_prefix.str.cat(letter_columns) if letter_columns else freq_prefix

In [31]:
# Display the encoded words: one "<freq>_<category letters>" string per event.
word_series


0     72_bdddccaapu
1     78_bidddfbbrw
2     69_befidceaps
3     73_befifceaim
4     74_blfifceakq
5     70_blfifcgbps
6     79_blfcfcgbmr
7     84_blfcdcgbii
8     83_blfcdcgbim
9     75_befcdcgbmo
10    80_befcfcgbdn
11    78_bbccffgbnp
12    73_bbccfegbjm
13    68_bbccfegbkn
Name: freq, dtype: object

#### Decoding

- Specs for the binnings need to be stored in a config file/data structure 
- After TensorFlow produces results, they need to be decoded back to the MIDI-JSON format
- MIDI values are randomized within the binned ranges and returned to SuperCollider
