In [1]:
!pip install librosa
!pip install soundfile



In [4]:
import os
import librosa
import numpy as np
import pandas as pd

pathToAllFiles = '../Downloads/genres'
allGenres = os.listdir(pathToAllFiles)
print(allGenres)

['pop', 'metal', 'disco', 'blues', 'reggae', 'classical', 'rock', 'hiphop', 'country', 'jazz']


In [69]:
chroma_features = ['C', 'Csharp', 'D', 'Dsharp', 'E', 'F', 'Fsharp', 'G', 'Gsharp', 'A', 'Asharp', 'B'] #Each chroma value represents the intensity (or strength) of the corresponding pitch class in the audio signal.
tonnetz_len = 6 #The Tonnetz (Tonal Centroids) representation typically consists of six features. These features represent the summary statistics of the tonal content of the audio signal. Each feature captures different aspects of the tonal characteristics of the audio.
mfcc_len = 20 #The Mel-frequency cepstral coefficients (MFCCs) are typically computed as a set of coefficients that capture various aspects of the spectral characteristics of an audio signal, particularly those that are relevant to human perception of sound.




columns = ['filename']
columns += [f'chroma_{chroma_features[i]}' for i in range(len(chroma_features))]
columns += ['chroma_allPitches']
columns += [f'tonnetz_dim{i}' for i in range(1, tonnetz_len + 1)]
columns += ['tonnetz_allDims', 'spectral_centroid', 'spectral_bandwidth', 'spectral_rolloff', 'rmse']
columns += [f'mfcc{i}' for i in range(1, mfcc_len + 1)]
columns += ['genre']
print(len(columns))

46


In [57]:
allRowsOrig = []
for genre in allGenres:
    print(f'{genre} started')
    pathToGenreFiles = os.path.join(pathToAllFiles, genre)
    for file in os.listdir(pathToGenreFiles):
        pathToFile = os.path.join(pathToGenreFiles, file)

        try:
            row = []
            y, sr = librosa.load(pathToFile, res_type='kaiser_fast')

            row.append(file)
            
            chroma = librosa.feature.chroma_stft(y=y, sr=sr)
            chroma_mean_for_different_pitches = np.mean(chroma, axis=1)
            row += chroma_mean_for_different_pitches.tolist()
            
            chroma_mean_all = np.mean(chroma)
            row.append(chroma_mean_all)

            tonnetz = librosa.feature.tonnetz(y=y, sr=sr)
            tonnetz_mean_each_dimension = np.mean(tonnetz, axis=1)
            row += tonnetz_mean_each_dimension.tolist()

            tonnetz_mean_all = np.mean(tonnetz)
            row.append(tonnetz_mean_all)
            
            spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)
            row.append(np.mean(spectral_centroids))
            
            spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)
            row.append(np.mean(spectral_bandwidth))
            
            spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
            row.append(np.mean(spectral_rolloff))
            
            rmse = librosa.feature.rms(y=y)
            row.append(np.mean(rmse))
            
            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
            mfcc_mean_each_dimension = np.mean(mfcc, axis=1)
            row += mfcc_mean_each_dimension.tolist()
            
            row.append(genre) #this will be the last column, it will be the "y" column
            
            #print(len(row))
            allRowsOrig.append(row)

        except Exception as e:
            print("Exception thrown: ", e)
            continue
print(allRowsOrig)

pop started
metal started
disco started
blues started
reggae started
classical started
rock started
hiphop started
country started
jazz started
[['pop.00080.au', 0.39151230454444885, 0.429697185754776, 0.524728536605835, 0.38569796085357666, 0.3663402497768402, 0.4338560700416565, 0.44559744000434875, 0.5933113694190979, 0.43868547677993774, 0.4130147397518158, 0.4182511270046234, 0.3617902398109436, 0.4335403, 0.06444818637471501, 0.07851908530826449, -0.010317929500350043, 0.053865077390356286, 0.023850091866197416, -0.03303237211093167, 0.029555356554708492, 3151.406048473924, 3220.6233818211003, 7294.5599124492455, 0.21566187, -44.98688888549805, 69.77589416503906, 11.84067153930664, 8.117905616760254, 12.628617286682129, 7.812100410461426, 11.339221000671387, 0.18483147025108337, 9.635790824890137, 2.840994358062744, 3.853339195251465, -0.8417093753814697, 2.132197618484497, 1.0832056999206543, -0.07710439711809158, 0.6480727195739746, -1.4845343828201294, 3.7256293296813965, 4.55

In [65]:
dfOrig = pd.DataFrame(allRowsOrig, columns=columns)
print(len(dfOrig))

csv_file = 'dataOrig.csv'
dfOrig.to_csv(csv_file, index=False)

1000


In [68]:
dfOrig.head()

Unnamed: 0,filename,chroma_C,chroma_Csharp,chroma_D,chroma_Dsharp,chroma_E,chroma_F,chroma_Fsharp,chroma_G,chroma_Gsharp,...,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,genre
0,pop.00080.au,0.391512,0.429697,0.524729,0.385698,0.36634,0.433856,0.445597,0.593311,0.438685,...,-0.841709,2.132198,1.083206,-0.077104,0.648073,-1.484534,3.725629,4.553656,4.274244,pop
1,pop.00022.au,0.544175,0.420254,0.424198,0.399732,0.300949,0.312941,0.363381,0.445987,0.342582,...,2.602134,4.048971,2.384604,1.780818,-2.392593,-1.346176,-2.879179,-0.849864,-2.598134,pop
2,pop.00073.au,0.344232,0.477342,0.461315,0.383294,0.430534,0.360271,0.595679,0.406584,0.354102,...,0.874806,-1.936897,-2.78313,-2.666644,-3.311479,1.708261,-0.152766,4.84682,2.906235,pop
3,pop.00047.au,0.392071,0.428158,0.523789,0.385555,0.366227,0.433113,0.444056,0.591267,0.437267,...,-0.844541,2.09968,1.122588,0.006671,0.74826,-1.474304,3.672843,4.469373,4.198666,pop
4,pop.00016.au,0.396111,0.442692,0.408399,0.486187,0.353345,0.326342,0.372,0.300399,0.334868,...,-0.441958,-1.08406,-3.450706,-2.475977,-3.302751,-2.228499,-2.649557,-2.744086,0.305748,pop


# Pre-processing: using harmonic of audio file

This is basically the same code as above, but this time we're isolates the harmonic component of the audio signal and removes any percussive or noise components.

The main difference in this code vs the code in the previous section is that, here we're adding
harmonic_only = librosa.effects.harmonic(y)

In [66]:
allRowsHarmonic = []

In [67]:
for genre in allGenres:
    print(f'{genre} started')
    pathToGenreFiles = os.path.join(pathToAllFiles, genre)
    for file in os.listdir(pathToGenreFiles):
        pathToFile = os.path.join(pathToGenreFiles, file)

        try:
            row = []
            y, sr = librosa.load(pathToFile, res_type='kaiser_fast')

            row.append(file)
            
            chroma = librosa.feature.chroma_stft(y=y, sr=sr)
            chroma_mean_for_different_pitches = np.mean(chroma, axis=1)
            row += chroma_mean_for_different_pitches.tolist()
            
            chroma_mean_all = np.mean(chroma)
            row.append(chroma_mean_all)
            
            harmonic_only = librosa.effects.harmonic(y)

            tonnetz = librosa.feature.tonnetz(y=harmonic_only, sr=sr)
            tonnetz_mean_each_dimension = np.mean(tonnetz, axis=1)
            row += tonnetz_mean_each_dimension.tolist()

            tonnetz_mean_all = np.mean(tonnetz)
            row.append(tonnetz_mean_all)
            
            spectral_centroids = librosa.feature.spectral_centroid(y=harmonic_only, sr=sr)
            row.append(np.mean(spectral_centroids))
            
            spectral_bandwidth = librosa.feature.spectral_bandwidth(y=harmonic_only, sr=sr)
            row.append(np.mean(spectral_bandwidth))
            
            spectral_rolloff = librosa.feature.spectral_rolloff(y=harmonic_only, sr=sr)
            row.append(np.mean(spectral_rolloff))
            
            rmse = librosa.feature.rms(y=harmonic_only)
            row.append(np.mean(rmse))
            
            mfcc = librosa.feature.mfcc(y=harmonic_only, sr=sr, n_mfcc=20)
            mfcc_mean_each_dimension = np.mean(mfcc, axis=1)
            row += mfcc_mean_each_dimension.tolist()
            
            row.append(genre) #this will be the last column, it will be the "y" column
            
            #print(len(row))
            allRowsHarmonic.append(row)

        except Exception as e:
            print("Exception thrown: ", e)
            continue
print(len(allRowsHarmonic))

pop started
metal started
disco started
blues started
reggae started
classical started
rock started
hiphop started
country started
jazz started
1000


In [74]:
dfHarmonic = pd.DataFrame(allRowsHarmonic, columns=columns)
csv_file = 'dataForHarmonicAudio.csv'
dfHarmonic.to_csv(csv_file, index=False)
print(len(dfHarmonic))

1000


In [75]:
dfHarmonic.head()

Unnamed: 0,filename,chroma_C,chroma_Csharp,chroma_D,chroma_Dsharp,chroma_E,chroma_F,chroma_Fsharp,chroma_G,chroma_Gsharp,...,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,genre
0,pop.00080.au,0.391512,0.429697,0.524729,0.385698,0.36634,0.433856,0.445597,0.593311,0.438685,...,0.164212,2.695709,1.318642,0.632315,0.957001,-1.566437,6.275709,7.807974,6.363476,pop
1,pop.00022.au,0.544175,0.420254,0.424198,0.399732,0.300949,0.312941,0.363381,0.445987,0.342582,...,5.130442,6.827752,3.071533,3.366904,-2.422404,-2.135466,-3.336569,-0.564991,-3.893568,pop
2,pop.00073.au,0.344232,0.477342,0.461315,0.383294,0.430534,0.360271,0.595679,0.406584,0.354102,...,1.111607,-2.048577,-4.366745,-3.73261,-3.398216,2.417332,1.289159,7.578645,4.909898,pop
3,pop.00047.au,0.392071,0.428158,0.523789,0.385555,0.366227,0.433113,0.444056,0.591267,0.437267,...,0.235207,2.670471,1.338333,0.699207,1.112011,-1.565463,6.195435,7.648764,6.243628,pop
4,pop.00016.au,0.396111,0.442692,0.408399,0.486187,0.353345,0.326342,0.372,0.300399,0.334868,...,-0.269821,-1.64258,-4.781828,-3.302547,-4.714705,-3.559506,-3.702941,-4.042264,0.333003,pop


In [5]:
import os
import librosa
import numpy as np
import pandas as pd
pathToAllFiles = '../Downloads/genres'
allGenres = os.listdir(pathToAllFiles)
allRowsDeltas = []
for genre in allGenres:
    print(f'{genre} started')
    pathToGenreFiles = os.path.join(pathToAllFiles, genre)
    for file in os.listdir(pathToGenreFiles):
        pathToFile = os.path.join(pathToGenreFiles, file)

        try:
            row = []
            y, sr = librosa.load(pathToFile, res_type='kaiser_fast')
            row.append(file)
            
            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
            delta_mfcc = librosa.feature.delta(mfcc)
            row += np.mean(delta_mfcc, axis=1).tolist()
            
            row.append(genre) #this will be the last column, it will be the "y" column
            
            allRowsDeltas.append(row)

        except Exception as e:
            print("Exception thrown: ", e)
            continue

pop started
metal started
disco started
blues started
reggae started
classical started
rock started
hiphop started
country started
jazz started


In [9]:
mfcc_len = 20
columnsDeltas = ['filename']
columnsDeltas += [f'mfcc{i}_delta' for i in range(1, mfcc_len + 1)]
columnsDeltas += ['genre']
print(len(columnsDeltas))

dfMFCCDeltas = pd.DataFrame(allRowsDeltas, columns=columnsDeltas)
csv_file = 'dataForMFCCDeltas.csv'
dfMFCCDeltas.to_csv(csv_file, index=False)
dfMFCCDeltas.head()

22


Unnamed: 0,filename,mfcc1_delta,mfcc2_delta,mfcc3_delta,mfcc4_delta,mfcc5_delta,mfcc6_delta,mfcc7_delta,mfcc8_delta,mfcc9_delta,...,mfcc12_delta,mfcc13_delta,mfcc14_delta,mfcc15_delta,mfcc16_delta,mfcc17_delta,mfcc18_delta,mfcc19_delta,mfcc20_delta,genre
0,pop.00080.au,0.105265,0.006372,0.029301,-0.007364,0.029296,0.032802,0.009167,-0.024216,-0.010839,...,-0.009615,-0.006449,-0.022793,-0.02955,-0.035698,-0.004061,0.015403,0.027763,0.015784,pop
1,pop.00022.au,0.104148,-0.016984,0.01399,0.00135,0.002648,-0.009762,-0.0111,0.001414,-0.00945,...,-0.00243,-0.005793,-0.01306,-0.011172,0.002258,-0.003963,-0.001441,-0.002932,0.00043,pop
2,pop.00073.au,0.029883,0.028489,-0.009955,0.002261,-0.003852,0.027426,0.019771,0.010638,0.019054,...,0.006772,0.004722,0.002172,0.008963,0.016858,0.007511,0.007263,0.006925,0.013436,pop
3,pop.00047.au,-0.018792,0.010139,0.01476,0.006951,0.015445,0.017319,-0.025722,-0.025269,-0.018115,...,-0.003096,-0.00507,-0.003745,-0.017687,-0.029552,0.005382,0.010811,0.009575,0.001937,pop
4,pop.00016.au,0.056777,0.052818,-0.01021,-0.004239,-0.001627,0.034322,-0.005006,0.00821,-0.004956,...,-0.004104,0.000187,0.011135,-0.00623,-0.003322,0.004229,-0.003618,-0.006311,0.007206,pop
