In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

In [4]:
# Load the harmonic-audio feature table, keeping every CSV column except 'filename'.
def _keepColumn(column):
    """Return True for every CSV column name other than 'filename'."""
    return column != 'filename'


data = pd.read_csv("../feature_extraction/dataForHarmonicAudio.csv", usecols=_keepColumn)

In [5]:
# Preview the first five rows of the loaded table.
data.head(n=5)

Unnamed: 0,chroma_C,chroma_Csharp,chroma_D,chroma_Dsharp,chroma_E,chroma_F,chroma_Fsharp,chroma_G,chroma_Gsharp,chroma_A,...,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,genre
0,0.391512,0.429697,0.524729,0.385698,0.36634,0.433856,0.445597,0.593311,0.438685,0.413015,...,0.164212,2.695709,1.318642,0.632315,0.957001,-1.566437,6.275709,7.807974,6.363476,pop
1,0.544175,0.420254,0.424198,0.399732,0.300949,0.312941,0.363381,0.445987,0.342582,0.325424,...,5.130442,6.827752,3.071533,3.366904,-2.422404,-2.135466,-3.336569,-0.564991,-3.893568,pop
2,0.344232,0.477342,0.461315,0.383294,0.430534,0.360271,0.595679,0.406584,0.354102,0.464613,...,1.111607,-2.048577,-4.366745,-3.73261,-3.398216,2.417332,1.289159,7.578645,4.909898,pop
3,0.392071,0.428158,0.523789,0.385555,0.366227,0.433113,0.444056,0.591267,0.437267,0.413407,...,0.235207,2.670471,1.338333,0.699207,1.112011,-1.565463,6.195435,7.648764,6.243628,pop
4,0.396111,0.442692,0.408399,0.486187,0.353345,0.326342,0.372,0.300399,0.334868,0.334352,...,-0.269821,-1.64258,-4.781828,-3.302547,-4.714705,-3.559506,-3.702941,-4.042264,0.333003,pop


In [9]:
# Split the table by position: the final column is the label, everything before it is a feature.
labelPosition = data.shape[1] - 1
X = data.iloc[:, :labelPosition]  # all columns before the label
y = data.iloc[:, labelPosition]  # the label

# Pre-processing

## Doing one-hot encoding

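Check which features are categorical by counting the number of distinct values in each column (a column with only a few distinct values is likely categorical).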

In [8]:
# Report, for every column, how many distinct values it holds.
for column, columnValues in data.items():
    unique_values = columnValues.unique()
    print(f"Column '{column}': {len(unique_values)}")

Column 'chroma_C': 986
Column 'chroma_Csharp': 986
Column 'chroma_D': 986
Column 'chroma_Dsharp': 986
Column 'chroma_E': 986
Column 'chroma_F': 986
Column 'chroma_Fsharp': 986
Column 'chroma_G': 986
Column 'chroma_Gsharp': 986
Column 'chroma_A': 986
Column 'chroma_Asharp': 986
Column 'chroma_B': 986
Column 'chroma_allPitches': 986
Column 'tonnetz_dim1': 986
Column 'tonnetz_dim2': 986
Column 'tonnetz_dim3': 986
Column 'tonnetz_dim4': 986
Column 'tonnetz_dim5': 986
Column 'tonnetz_dim6': 986
Column 'tonnetz_allDims': 986
Column 'spectral_centroid': 986
Column 'spectral_bandwidth': 986
Column 'spectral_rolloff': 983
Column 'rmse': 986
Column 'mfcc1': 986
Column 'mfcc2': 986
Column 'mfcc3': 986
Column 'mfcc4': 986
Column 'mfcc5': 986
Column 'mfcc6': 986
Column 'mfcc7': 986
Column 'mfcc8': 986
Column 'mfcc9': 986
Column 'mfcc10': 986
Column 'mfcc11': 986
Column 'mfcc12': 986
Column 'mfcc13': 986
Column 'mfcc14': 986
Column 'mfcc15': 986
Column 'mfcc16': 986
Column 'mfcc17': 986
Column 'mfcc

Finding which genres appear most commonly

In [10]:
# Count how many rows carry each label value.
label_counts = y.value_counts()

# Heading and counts are emitted on separate lines by a single print call.
print("Frequency of each label:", label_counts, sep="\n")

Frequency of each label:
genre
pop          100
metal        100
disco        100
blues        100
reggae       100
classical    100
rock         100
hiphop       100
country      100
jazz         100
Name: count, dtype: int64


Feature selection

1st way: rank the features by their correlation (`DataFrame.corr`) with the integer-encoded genre label. This approach is taken from HW 2, Q3.

In [13]:
# Genre names listed in the order of their integer codes (position in the list == code).
# NOTE(review): the codes are arbitrary identifiers, not a ranking; any statistic that
# treats them as ordered (e.g. a Pearson correlation) depends on this particular assignment.
genreOrder = [
    'pop',
    'metal',
    'disco',
    'blues',
    'reggae',
    'classical',
    'rock',
    'hiphop',
    'country',
    'jazz',
]
mappingGenresToIntegers = {genre: code for code, genre in enumerate(genreOrder)}

# Build a new frame whose 'genre' column holds the integer codes; `data` itself is left untouched.
dfWithGenresMappedToInts = data.assign(genre=data['genre'].map(mappingGenresToIntegers))

dfWithGenresMappedToInts.head()

Unnamed: 0,chroma_C,chroma_Csharp,chroma_D,chroma_Dsharp,chroma_E,chroma_F,chroma_Fsharp,chroma_G,chroma_Gsharp,chroma_A,...,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,genre
0,0.391512,0.429697,0.524729,0.385698,0.36634,0.433856,0.445597,0.593311,0.438685,0.413015,...,0.164212,2.695709,1.318642,0.632315,0.957001,-1.566437,6.275709,7.807974,6.363476,0
1,0.544175,0.420254,0.424198,0.399732,0.300949,0.312941,0.363381,0.445987,0.342582,0.325424,...,5.130442,6.827752,3.071533,3.366904,-2.422404,-2.135466,-3.336569,-0.564991,-3.893568,0
2,0.344232,0.477342,0.461315,0.383294,0.430534,0.360271,0.595679,0.406584,0.354102,0.464613,...,1.111607,-2.048577,-4.366745,-3.73261,-3.398216,2.417332,1.289159,7.578645,4.909898,0
3,0.392071,0.428158,0.523789,0.385555,0.366227,0.433113,0.444056,0.591267,0.437267,0.413407,...,0.235207,2.670471,1.338333,0.699207,1.112011,-1.565463,6.195435,7.648764,6.243628,0
4,0.396111,0.442692,0.408399,0.486187,0.353345,0.326342,0.372,0.300399,0.334868,0.334352,...,-0.269821,-1.64258,-4.781828,-3.302547,-4.714705,-3.559506,-3.702941,-4.042264,0.333003,0


In [15]:
# Pairwise Pearson correlations between all columns of the integer-encoded frame.
correlationMatrix = dfWithGenresMappedToInts.corr(method="pearson")

# Keep only the column that relates every variable to the encoded 'genre' label.
correlationWithGenre = correlationMatrix.loc[:, "genre"]
print(correlationWithGenre)

chroma_C             -0.267631
chroma_Csharp        -0.281706
chroma_D             -0.229623
chroma_Dsharp        -0.256004
chroma_E             -0.234266
chroma_F             -0.240727
chroma_Fsharp        -0.259485
chroma_G             -0.261392
chroma_Gsharp        -0.263401
chroma_A             -0.179309
chroma_Asharp        -0.260394
chroma_B             -0.282890
chroma_allPitches    -0.356863
tonnetz_dim1          0.080835
tonnetz_dim2          0.085719
tonnetz_dim3         -0.038150
tonnetz_dim4         -0.003440
tonnetz_dim5         -0.055401
tonnetz_dim6          0.021939
tonnetz_allDims       0.080169
spectral_centroid    -0.381355
spectral_bandwidth   -0.343107
spectral_rolloff     -0.378688
rmse                 -0.189231
mfcc1                -0.322031
mfcc2                 0.310179
mfcc3                 0.015514
mfcc4                 0.020977
mfcc5                 0.042827
mfcc6                -0.126293
mfcc7                -0.043208
mfcc8                -0.173193
mfcc9   

In [19]:
# Rank features only: the 'genre' entry is the label's correlation with itself (always 1.0)
# and would otherwise occupy the first slot of the top-5 list on a fresh kernel.
# The exclusion is written into a new series instead of reassigning `correlationWithGenre`,
# so this cell can be re-run any number of times; errors='ignore' keeps it working even if
# 'genre' has already been removed from the series.
featureCorrelations = correlationWithGenre.drop(labels='genre', errors='ignore')

# Five features with the largest (most positive) correlation with the encoded genre.
mostPositive5 = featureCorrelations.nlargest(5)
print(mostPositive5)

mfcc2           0.310179
mfcc13          0.132915
mfcc11          0.100913
tonnetz_dim2    0.085719
mfcc15          0.085100
Name: genre, dtype: float64


In [20]:
# Five entries with the smallest (most negative) correlation with the encoded genre.
mostNegative5 = correlationWithGenre.nsmallest(n=5)
print(mostNegative5)

spectral_centroid    -0.381355
spectral_rolloff     -0.378688
chroma_allPitches    -0.356863
spectral_bandwidth   -0.343107
mfcc1                -0.322031
Name: genre, dtype: float64


In [None]:
# 2nd way - using .corr - got this from hw 2 q3 (TODO: placeholder copied from the 1st way; not implemented yet)