In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
import pickle as pkl

# Pickled (features, labels) pair for the 15-second MusicNet clips.
dataset_path_15sec_clips = './DataStreamFeatures/TrainData_MusicNet_15sec_313.pkl'

# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
with open(dataset_path_15sec_clips, "rb") as f:
    train_X, train_Y = pkl.load(f)

# m: number of samples, n: total number of available features
m, n = train_X.shape

# Report the array shapes, then the sample / feature counts, one per line.
print(
    f'{train_X.shape=} {train_Y.shape=}',
    f'{m=} training examples with {n=} features each',
    sep='\n',
)

train_X.shape=(313, 15) train_Y.shape=(313,)
m=313 training examples with n=15 features each


In [3]:
# Composer names listed in label order; per the original note, this encoding
# order is set in the Feature Extraction Pipeline and must be kept in sync.
Composers = ['Bach', 'Beethoven', 'Brahms', 'Schubert']

# Integer label -> composer name, numbered by position in `Composers`.
label_encodings = dict(enumerate(Composers))

print(label_encodings)

{0: 'Bach', 1: 'Beethoven', 2: 'Brahms', 3: 'Schubert'}


In [5]:
# Feature names, applied positionally as the column labels of the feature
# matrix when the DataFrame is built.
column_names = [
    'Range',
    'PitchVariety',
    'NumberOfCommonPitches',
    'NoteDensity',
    'AmountOfArpeggiation',
    'IntervalBetweenStrongestPitches',
    'MostCommonPitchClassPrevalence',
    'MostCommonPitchPrevalence',
    'RelativeStrengthOfTopPitchClasses',
    'RepeatedNotes',
    'MelodicOctaves',
    'SizeOfMelodicArcs',
    'VariabilityOfNoteDuration',
    'MelodicTritones',
    'PitchClassVariety',
]


In [6]:
# Labels as a Series so they can be mapped to composer names.
series = pd.Series(train_Y)

# Feature matrix with named columns, plus the numeric label and its
# human-readable composer name appended (in that order).
df = pd.DataFrame(train_X, columns=column_names).assign(
    Composer_label=train_Y,
    Composer_name=series.map(label_encodings),
)

df.head(10)

Unnamed: 0,Range,PitchVariety,NumberOfCommonPitches,NoteDensity,AmountOfArpeggiation,IntervalBetweenStrongestPitches,MostCommonPitchClassPrevalence,MostCommonPitchPrevalence,RelativeStrengthOfTopPitchClasses,RepeatedNotes,MelodicOctaves,SizeOfMelodicArcs,VariabilityOfNoteDuration,MelodicTritones,PitchClassVariety,Composer_label,Composer_name
0,39.0,30.0,3.0,13.75,0.221198,2.0,0.145455,0.104545,0.96875,0.013825,0.004608,5.340206,0.171883,0.018433,11.0,0.0,Bach
1,39.0,32.0,0.0,13.3125,0.214286,17.0,0.173709,0.079812,0.945946,0.009524,0.004762,5.764045,0.136698,0.004762,11.0,0.0,Bach
2,39.0,30.0,3.0,13.75,0.221198,2.0,0.145455,0.104545,0.96875,0.013825,0.004608,5.340206,0.171883,0.018433,11.0,0.0,Bach
3,36.0,27.0,0.0,13.6875,0.189815,1.0,0.164384,0.086758,0.972222,0.009259,0.009259,5.069307,0.176578,0.00463,10.0,0.0,Bach
4,39.0,32.0,0.0,13.4375,0.231132,17.0,0.176744,0.07907,0.921053,0.009434,0.004717,5.684211,0.141548,0.014151,11.0,0.0,Bach
5,39.0,32.0,0.0,13.25,0.22488,17.0,0.169811,0.089623,1.0,0.014354,0.004785,5.747253,0.169896,0.014354,11.0,0.0,Bach
6,39.0,31.0,3.0,13.5625,0.219626,2.0,0.147465,0.110599,1.0,0.018692,0.0,5.315789,0.178937,0.018692,11.0,0.0,Bach
7,36.0,26.0,1.0,13.8125,0.178899,4.0,0.18552,0.099548,0.829268,0.004587,0.009174,5.0,0.160251,0.004587,10.0,0.0,Bach
8,40.0,32.0,0.0,11.875,0.117647,2.0,0.168421,0.089474,0.96875,0.02139,0.0,5.130435,0.09678,0.016043,11.0,0.0,Bach
9,45.0,39.0,0.0,13.8125,0.133028,8.0,0.162896,0.085973,0.805556,0.013761,0.009174,5.5,0.07414,0.022936,12.0,0.0,Bach


In [7]:
# One-hot encode the composer name: one indicator column per composer present.
one_hot = pd.get_dummies(df['Composer_name'])
# 'Composer_name' is deliberately kept in df; it is selected again when the
# feature table is assembled for export.
# Drop indicator columns left over from an earlier run of this cell before
# joining: a plain df.join(one_hot) raises "columns overlap but no suffix
# specified" when the cell is executed a second time.
df = df.drop(columns=one_hot.columns, errors='ignore').join(one_hot)

In [8]:
# Preview the one-hot indicator columns (one per composer).
df.loc[:, Composers].head()

Unnamed: 0,Bach,Beethoven,Brahms,Schubert
0,1,0,0,0
1,1,0,0,0
2,1,0,0,0
3,1,0,0,0
4,1,0,0,0


In [9]:
# Keep the feature columns followed by the composer name and numeric label;
# the one-hot indicator columns are left out of this table.
df_best_features = df.loc[:, [*column_names, 'Composer_name', 'Composer_label']]

In [10]:
# Show the first five rows of the table that will be exported.
df_best_features.head(5)

Unnamed: 0,Range,PitchVariety,NumberOfCommonPitches,NoteDensity,AmountOfArpeggiation,IntervalBetweenStrongestPitches,MostCommonPitchClassPrevalence,MostCommonPitchPrevalence,RelativeStrengthOfTopPitchClasses,RepeatedNotes,MelodicOctaves,SizeOfMelodicArcs,VariabilityOfNoteDuration,MelodicTritones,PitchClassVariety,Composer_name,Composer_label
0,39.0,30.0,3.0,13.75,0.221198,2.0,0.145455,0.104545,0.96875,0.013825,0.004608,5.340206,0.171883,0.018433,11.0,Bach,0.0
1,39.0,32.0,0.0,13.3125,0.214286,17.0,0.173709,0.079812,0.945946,0.009524,0.004762,5.764045,0.136698,0.004762,11.0,Bach,0.0
2,39.0,30.0,3.0,13.75,0.221198,2.0,0.145455,0.104545,0.96875,0.013825,0.004608,5.340206,0.171883,0.018433,11.0,Bach,0.0
3,36.0,27.0,0.0,13.6875,0.189815,1.0,0.164384,0.086758,0.972222,0.009259,0.009259,5.069307,0.176578,0.00463,10.0,Bach,0.0
4,39.0,32.0,0.0,13.4375,0.231132,17.0,0.176744,0.07907,0.921053,0.009434,0.004717,5.684211,0.141548,0.014151,11.0,Bach,0.0


In [11]:
import os

# Output folder for exported DataFrames; created on demand.
data_dir = './DataStreamFeatures/DataFrames/'
os.makedirs(data_dir, exist_ok=True)

# Write the selected features and labels to CSV without the row index.
df_best_features.to_csv(
    os.path.join(data_dir, 'BestMusicFeatures_15seconds_MusicNet_Kaggle_313.csv'),
    index=False,
)