In [1]:
# standard library
import sys
import warnings
from fractions import Fraction

# third-party
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials, space_eval
from hyperopt.pyll.base import scope
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import classification_report
import shap

warnings.filterwarnings('ignore', category=UserWarning)

# import custom functions from src/utils.py
sys.path.append('../')
from src.utils import remove_correlated_features
from src.utils import plot_correlation_heatmap
from src.utils import remove_low_variance_features
from src.utils import plot_pca
from src.utils import plot_feature_importances
from src.utils import confusion_matrix_heatmap
from src.search_space_config import create_search_space
from src.utils import extract_features_from_mdi



In [2]:
# Process MIDI files and extract features.
# Build one feature table per composer and concatenate them once at the end,
# instead of re-copying an ever-growing DataFrame on every loop iteration.
composers = ['Bach', 'Beethoven', 'Brahms', 'Schubert']
composer_frames = [
    extract_features_from_mdi(f'../data/raw/Part1(PS1)/{composer}/', composer)
    for composer in composers
]
processed_data = pd.concat(composer_frames, ignore_index=True)
# save processed data to csv
processed_data.to_csv('../data/processed/processed_data.csv', index=False)

Error processing ../data/raw/Part1(PS1)/Beethoven\Piano Sonata No 17 in D minor_OP31NO2_2391_ps17_01.mid: 2361926683664
Error processing ../data/raw/Part1(PS1)/Beethoven\Piano Sonata No 5 in C minor_OP10NO1_2529_ps05_03.mid: 2361879403920
Error processing ../data/raw/Part1(PS1)/Beethoven\Piano Sonata No 9 in E major_OP14NO1_2371_ps09_02.mid: 2361832714592
Error processing ../data/raw/Part1(PS1)/Beethoven\String Quartet No 12 in E-flat major_OP127_2365_qt12_1.mid: 2361869483072
Error processing ../data/raw/Part1(PS1)/Beethoven\String Quartet No 7 in F major_OP59NO1_2621_qt07_1.mid: 2361898299456
Error analyzing key for ../data/raw/Part1(PS1)/Brahms\Clarinet Sonata No 1 in F minor_OP120NO1_2116_brahms_sonata_opus_120-1.mid: 'Unpitched' object has no attribute 'pitch'
Error analyzing key for ../data/raw/Part1(PS1)/Brahms\Clarinet Sonata No 1 in F minor_OP120NO1_2118_brahms_sonata_opus_120-3.mid: 'Unpitched' object has no attribute 'pitch'
Error analyzing key for ../data/raw/Part1(PS1)/Bra

<ul>
    <li>The error messages above indicate that those MIDI files are corrupt: they are either unreadable or contain no musical elements (no sound).</li>
    <li>I implemented the "extract_features_from_mdi" function to catch the errors during feature extraction.</li>
</ul>

In [3]:
# Load the feature table from the CSV written by the feature-extraction cell above.
processed_data = pd.read_csv('../data/processed/processed_data.csv')
processed_data

Unnamed: 0,key_name,key_mode,key_strength,num_key_signature_changes,most_frequent_key_signature,average_pitch,median_pitch,std_dev_pitch,pitch_range,unique_pitch_classes,...,most_common_instrument,average_tempo,min_tempo,max_tempo,tempo_variability,time_signature_changes,most_frequent_time_signature,measure_count,total_duration,composer
0,C,major,0.883717,4,0.0,52.871897,53.0,6.164130,31.0,12,...,Johann Sebastian Bach (1685-1750),52.500000,35.0,70.0,17.500000,4,3/4,0,264.0,Bach
1,C,major,0.886779,4,0.0,52.956790,53.0,6.183093,31.0,11,...,Johann Sebastian Bach (1685-1750),150.000000,50.0,250.0,100.000000,4,4/4,0,196.0,Bach
2,A,minor,0.877006,4,0.0,52.731313,53.0,6.011649,28.0,12,...,Johann Sebastian Bach (1685-1750),202.500000,155.0,250.0,47.500000,4,3/4,0,507.0,Bach
3,C,major,0.876151,4,0.0,53.013825,54.0,6.446861,31.0,12,...,Johann Sebastian Bach (1685-1750),40.000000,40.0,40.0,0.000000,4,3/4,0,144.0,Bach
4,C,major,0.838132,9,0.0,54.097643,55.0,5.423326,29.0,11,...,Johann Sebastian Bach (1685-1750),158.333333,75.0,250.0,71.686044,3,4/4,0,532.0,Bach
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184,D,major,0.811751,4,2.0,62.860843,64.0,13.709972,71.0,12,...,Instrument17,185.000000,168.0,202.0,17.000000,4,4/4,0,1458.0,Schubert
185,D,major,0.817013,12,2.0,62.413847,64.0,12.497622,60.0,12,...,Instrument17,167.563636,154.0,180.0,8.448267,176,3/4,0,1510.75,Schubert
186,E-,major,0.930132,12,-3.0,67.584334,67.0,10.366027,55.0,12,...,Instrument17,126.000000,110.0,138.0,13.856406,144,3/4,0,507.0,Schubert
187,B-,major,0.890524,5,-2.0,66.388654,67.0,12.710969,72.0,12,...,Staff-3,170.527778,130.0,218.0,17.944337,35,2/4,0,1618.25,Schubert


In [4]:
# check for missing values or empty cells: count the null entries in each column
processed_data.isnull().sum()

key_name                         3
key_mode                         3
key_strength                     0
num_key_signature_changes        0
most_frequent_key_signature     16
average_pitch                    0
median_pitch                     0
std_dev_pitch                    0
pitch_range                      0
unique_pitch_classes             0
pitch_entropy                    0
average_melodic_interval         0
note_density                     0
rhythmic_variance                0
rest_proportion                  0
chord_diversity                  0
most_common_chord               11
consonance_ratio                 0
num_instruments                  0
instrument_diversity             0
most_common_instrument           0
average_tempo                    0
min_tempo                        0
max_tempo                        0
tempo_variability                0
time_signature_changes           0
most_frequent_time_signature     0
measure_count                    0
total_duration      

In [5]:
# names of the columns that contain at least one missing value
missing_columns = [
    column_name
    for column_name in processed_data.columns
    if processed_data[column_name].isnull().any()
]
missing_columns

['key_name', 'key_mode', 'most_frequent_key_signature', 'most_common_chord']

<ul>
    <li>Dropping columns with a high number of NaN values.</li>
    <li>Dropping "most_frequent_key_signature" and "most_common_chord" columns.</li>
</ul>

In [6]:
# Drop the two columns with the most missing values (16 and 11 NaN entries).
# Reassigning instead of using inplace=True, together with errors='ignore', keeps the
# cell re-runnable: a second run is a no-op rather than a KeyError on the missing labels.
processed_data = processed_data.drop(
    columns=['most_frequent_key_signature', 'most_common_chord'],
    errors='ignore',
)
processed_data

Unnamed: 0,key_name,key_mode,key_strength,num_key_signature_changes,average_pitch,median_pitch,std_dev_pitch,pitch_range,unique_pitch_classes,pitch_entropy,...,most_common_instrument,average_tempo,min_tempo,max_tempo,tempo_variability,time_signature_changes,most_frequent_time_signature,measure_count,total_duration,composer
0,C,major,0.883717,4,52.871897,53.0,6.164130,31.0,12,3.134150,...,Johann Sebastian Bach (1685-1750),52.500000,35.0,70.0,17.500000,4,3/4,0,264.0,Bach
1,C,major,0.886779,4,52.956790,53.0,6.183093,31.0,11,3.075162,...,Johann Sebastian Bach (1685-1750),150.000000,50.0,250.0,100.000000,4,4/4,0,196.0,Bach
2,A,minor,0.877006,4,52.731313,53.0,6.011649,28.0,12,3.185360,...,Johann Sebastian Bach (1685-1750),202.500000,155.0,250.0,47.500000,4,3/4,0,507.0,Bach
3,C,major,0.876151,4,53.013825,54.0,6.446861,31.0,12,3.267104,...,Johann Sebastian Bach (1685-1750),40.000000,40.0,40.0,0.000000,4,3/4,0,144.0,Bach
4,C,major,0.838132,9,54.097643,55.0,5.423326,29.0,11,3.231178,...,Johann Sebastian Bach (1685-1750),158.333333,75.0,250.0,71.686044,3,4/4,0,532.0,Bach
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184,D,major,0.811751,4,62.860843,64.0,13.709972,71.0,12,3.444292,...,Instrument17,185.000000,168.0,202.0,17.000000,4,4/4,0,1458.0,Schubert
185,D,major,0.817013,12,62.413847,64.0,12.497622,60.0,12,3.400973,...,Instrument17,167.563636,154.0,180.0,8.448267,176,3/4,0,1510.75,Schubert
186,E-,major,0.930132,12,67.584334,67.0,10.366027,55.0,12,3.135565,...,Instrument17,126.000000,110.0,138.0,13.856406,144,3/4,0,507.0,Schubert
187,B-,major,0.890524,5,66.388654,67.0,12.710969,72.0,12,3.322906,...,Staff-3,170.527778,130.0,218.0,17.944337,35,2/4,0,1618.25,Schubert


In [7]:
# show every row that still has at least one missing value
processed_data.loc[processed_data.isna().any(axis=1)]


Unnamed: 0,key_name,key_mode,key_strength,num_key_signature_changes,average_pitch,median_pitch,std_dev_pitch,pitch_range,unique_pitch_classes,pitch_entropy,...,most_common_instrument,average_tempo,min_tempo,max_tempo,tempo_variability,time_signature_changes,most_frequent_time_signature,measure_count,total_duration,composer
144,,,0.0,6,60.775749,61.0,12.467054,57.0,12,3.414932,...,No. 1,120.0,120.0,120.0,0.0,2,3/4,0,2152/3,Brahms
145,,,0.0,2,60.569524,61.0,12.578191,65.0,12,3.007152,...,grazioso,130.0,120.0,140.0,10.0,2,3/4,0,584.5,Brahms
146,,,0.0,2,62.545813,64.0,11.758678,69.0,12,3.200194,...,Opus 20,180.0,120.0,240.0,60.0,2,2/2,0,888.0,Brahms


<ul>
    <li>Dropping rows (data points) that have no musical key information (no key name, no key mode), because these contain no musical sound.</li>
    <li>Dropping rows 144 to 146.</li>
</ul>

In [8]:
# drop the rows with missing values and renumber the index from 0;
# a single chained reassignment replaces the two in-place mutations
processed_data = processed_data.dropna().reset_index(drop=True)
processed_data

Unnamed: 0,key_name,key_mode,key_strength,num_key_signature_changes,average_pitch,median_pitch,std_dev_pitch,pitch_range,unique_pitch_classes,pitch_entropy,...,most_common_instrument,average_tempo,min_tempo,max_tempo,tempo_variability,time_signature_changes,most_frequent_time_signature,measure_count,total_duration,composer
0,C,major,0.883717,4,52.871897,53.0,6.164130,31.0,12,3.134150,...,Johann Sebastian Bach (1685-1750),52.500000,35.0,70.0,17.500000,4,3/4,0,264.0,Bach
1,C,major,0.886779,4,52.956790,53.0,6.183093,31.0,11,3.075162,...,Johann Sebastian Bach (1685-1750),150.000000,50.0,250.0,100.000000,4,4/4,0,196.0,Bach
2,A,minor,0.877006,4,52.731313,53.0,6.011649,28.0,12,3.185360,...,Johann Sebastian Bach (1685-1750),202.500000,155.0,250.0,47.500000,4,3/4,0,507.0,Bach
3,C,major,0.876151,4,53.013825,54.0,6.446861,31.0,12,3.267104,...,Johann Sebastian Bach (1685-1750),40.000000,40.0,40.0,0.000000,4,3/4,0,144.0,Bach
4,C,major,0.838132,9,54.097643,55.0,5.423326,29.0,11,3.231178,...,Johann Sebastian Bach (1685-1750),158.333333,75.0,250.0,71.686044,3,4/4,0,532.0,Bach
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181,D,major,0.811751,4,62.860843,64.0,13.709972,71.0,12,3.444292,...,Instrument17,185.000000,168.0,202.0,17.000000,4,4/4,0,1458.0,Schubert
182,D,major,0.817013,12,62.413847,64.0,12.497622,60.0,12,3.400973,...,Instrument17,167.563636,154.0,180.0,8.448267,176,3/4,0,1510.75,Schubert
183,E-,major,0.930132,12,67.584334,67.0,10.366027,55.0,12,3.135565,...,Instrument17,126.000000,110.0,138.0,13.856406,144,3/4,0,507.0,Schubert
184,B-,major,0.890524,5,66.388654,67.0,12.710969,72.0,12,3.322906,...,Staff-3,170.527778,130.0,218.0,17.944337,35,2/4,0,1618.25,Schubert


In [None]:
# confirm no missing values remain: every per-column null count should be 0
processed_data.isnull().sum()

key_name                        0
key_mode                        0
key_strength                    0
num_key_signature_changes       0
average_pitch                   0
median_pitch                    0
std_dev_pitch                   0
pitch_range                     0
unique_pitch_classes            0
pitch_entropy                   0
average_melodic_interval        0
note_density                    0
rhythmic_variance               0
rest_proportion                 0
chord_diversity                 0
consonance_ratio                0
num_instruments                 0
instrument_diversity            0
most_common_instrument          0
average_tempo                   0
min_tempo                       0
max_tempo                       0
tempo_variability               0
time_signature_changes          0
most_frequent_time_signature    0
measure_count                   0
total_duration                  0
composer                        0
dtype: int64

In [11]:
# number of rows (pieces) per composer, i.e. how the target classes are distributed
processed_data['composer'].value_counts()

composer
Beethoven    127
Schubert      25
Bach          17
Brahms        17
Name: count, dtype: int64

In [13]:
# identify categorical features: every column pandas loaded with dtype object.
# This also picks up columns that hold numbers stored as text, so the result needs review.
categorical_features = processed_data.select_dtypes(include=['object']).columns
categorical_features

Index(['key_name', 'key_mode', 'note_density', 'most_common_instrument',
       'most_frequent_time_signature', 'total_duration', 'composer'],
      dtype='object')

<ul style="color:yellow">
    <li>"key_name" and "key_mode" will be treated as categorical variables and encoded using one-hot encoding.</li>
    <li>"note_density" has an entry in the string format "8586/2125", which needs to be converted to float.</li>
    <li>I will also treat "most_frequent_time_signature" as a categorical feature because the model needs to differentiate between distinct time signatures without assuming any inherent order or numerical relationship between them.</li>
    <li>I will convert "total_duration" column to float</li>
    <li>The "composer" column is the target and I will handle it with label encoding.</li>
</ul>

In [None]:
# Convert note_density to float. Some entries are fraction strings such as '8586/2125'.
# Fraction parses both those and plain decimal strings, so eval() is not needed and no
# text read from the CSV is ever executed as code. Values that are already numeric are
# passed through float() unchanged, which keeps the cell safe to re-run.
processed_data['note_density'] = processed_data['note_density'].apply(
    lambda value: float(Fraction(value)) if isinstance(value, str) else float(value)
)
processed_data['note_density']

In [21]:
# list the distinct values found in the most_frequent_time_signature column
processed_data['most_frequent_time_signature'].unique()

array(['3/4', '4/4', '3/8', '12/8', '6/4', '2/4', '8/8', '6/8', '1/2',
       '1/4', '1/8', '12/16', '3/16', '4/8', '2/2', '9/8'], dtype=object)

In [23]:
# Convert total_duration to float. Entries may be decimal strings ('1510.75') or fraction
# strings ('2152/3'); Fraction parses both without eval(), so no text read from the CSV is
# ever executed as code. Values that are already numeric are passed through float()
# unchanged, which keeps the cell safe to re-run.
processed_data['total_duration'] = processed_data['total_duration'].apply(
    lambda value: float(Fraction(value)) if isinstance(value, str) else float(value)
)
processed_data['total_duration']

0       264.00
1       196.00
2       507.00
3       144.00
4       532.00
        ...   
181    1458.00
182    1510.75
183     507.00
184    1618.25
185     576.00
Name: total_duration, Length: 186, dtype: float64

In [15]:
# convert note_density to float
# NOTE: astype(float) cannot parse fraction strings such as '8586/2125' (it raises
# ValueError), so this cell only succeeds once the earlier note_density conversion
# cell has turned those entries into numbers.
processed_data['note_density'] = processed_data['note_density'].astype(float)
processed_data['note_density']

ValueError: could not convert string to float: '8586/2125'

0      3.814394
1      3.908163
2      1.952663
3      2.972222
4      1.674812
         ...   
181    4.063100
182    2.194936
183    2.285996
184    5.025799
185    5.664931
Name: note_density, Length: 186, dtype: float64

In [12]:
# summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
processed_data.describe()

Unnamed: 0,key_strength,num_key_signature_changes,average_pitch,median_pitch,std_dev_pitch,pitch_range,unique_pitch_classes,pitch_entropy,average_melodic_interval,rhythmic_variance,...,chord_diversity,consonance_ratio,num_instruments,instrument_diversity,average_tempo,min_tempo,max_tempo,tempo_variability,time_signature_changes,measure_count
count,186.0,186.0,186.0,186.0,186.0,186.0,186.0,186.0,186.0,186.0,...,186.0,186.0,186.0,186.0,186.0,186.0,186.0,186.0,186.0,186.0
mean,0.884598,6.419355,62.982724,63.475806,12.085782,59.833333,11.973118,3.247129,-0.002319,0.233301,...,31.231183,0.071158,3.258065,3.258065,114.752629,69.552903,147.529086,17.592025,9.548387,0.0
std,0.051661,7.161945,3.511224,3.830278,2.205774,10.490107,0.162174,0.135767,0.006982,0.183454,...,20.019593,0.072535,1.589854,1.589854,60.592798,49.325959,73.747209,17.140064,22.838808,0.0
min,0.691156,0.0,52.652174,53.0,5.27221,25.0,11.0,2.832748,-0.025,0.001716,...,0.0,0.0,1.0,1.0,17.385321,10.0,20.0,0.0,1.0,0.0
25%,0.854379,2.0,61.429892,62.0,11.261471,58.0,12.0,3.166554,-0.005125,0.090675,...,19.0,0.012137,2.0,2.0,62.105263,30.0,90.0,6.221866,3.0,0.0
50%,0.89141,4.0,63.245311,64.0,12.439096,60.0,12.0,3.253541,-0.001412,0.191507,...,30.0,0.055046,3.0,3.0,108.864672,50.0,149.0,12.474817,4.0,0.0
75%,0.923469,7.0,64.979936,66.0,13.370445,65.75,12.0,3.332769,0.000703,0.319826,...,44.0,0.111111,4.0,4.0,149.727273,100.0,200.0,23.37958,6.75,0.0
max,0.967793,36.0,72.565085,73.0,16.131535,77.0,12.0,3.553842,0.028152,0.905232,...,115.0,0.430127,9.0,9.0,300.0,300.0,320.0,100.0,176.0,0.0
