In [2]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [19]:
# Utility functions
import pandas as pd
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier

def split_data(dataframe):
  # Split data into features and target
  X = dataframe.drop(columns=['label', 'label_1'])
  y = dataframe['label']

  # Encode labels if they are strings
  if y.dtype == 'object':
      label_encoder = LabelEncoder()
      y = label_encoder.fit_transform(y)

  # Split data into train and test sets
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  return X_train, X_test, y_train, y_test


def train_classifier(X_train, X_test, y_train, classifier):
  # Train the classifier
  if classifier == 'SVM':
    classifier = SVC(kernel='rbf', decision_function_shape='ovr')
  else:
    classifier = KNeighborsClassifier()
  classifier.fit(X_train, y_train)

  # Predict on the test set
  return classifier.predict(X_test)


def train_on_all_features(dataframe, classifier):
  # Split data into train and test sets
  X_train, X_test, y_train, y_test = split_data(dataframe)

  y_pred = train_classifier(X_train, X_test, y_train, classifier)

  # Evaluate the classifier
  report = classification_report(y_test, y_pred)
  print("Classification Report:\n", report)



def train_on_single_features(dataframe, selected_features, classifier):
  # Split data into train and test sets
  X_train, X_test, y_train, y_test = split_data(dataframe)

  for subset_name, subset_regex in selected_features.items():
      # Filter columns based on regular expression patterns
      subset_X_train = X_train.filter(regex=subset_regex)
      subset_X_test = X_test.filter(regex=subset_regex)

      y_pred = train_classifier(subset_X_train, subset_X_test, y_train, classifier)

      # Calculate and print the accuracy for each classifier
      accuracy = accuracy_score(y_test, y_pred)
      print(f"Accuracy for {subset_name}: {accuracy}")

def string_to_list(string):
    try:
        print(string)
        # Remove '[' and ']' characters, then split the string by whitespace
        parts = string.strip('[]').replace(',', '').replace('j', '').replace('e', '').replace('+', ' ').replace('-', ' ').split()

        # Convert each part to a double and create a list
        result = [float(part) for part in parts]
        return result
    except ValueError as e:
        print(f"Error: {e}")
        return None

# Function to unpack list columns into multiple columns
def unpack_lists(df):
    for col in df.columns:
        if isinstance(df[col][0], list):
            # print(col)
            max_len = max(len(sublist) for sublist in df[col])
            for i in range(max_len):
                df[f'{col}{i}'] = df[col].apply(lambda x: x[i] if len(x) > i else None)
            df.drop(columns=[col], inplace=True)
    return df


def compute_mean_std(col):
    means = col.apply(lambda x: np.mean(x) if isinstance(x, list) else x)
    stds = col.apply(lambda x: np.std(x) if isinstance(x, list) else x)
    return means, stds

def mean_and_std_from_list(df):
  for col in df.columns:
    if df[col].apply(lambda x: isinstance(x, list)).any():
        # Compute mean and std if the column contains lists
        mean_col, std_col = compute_mean_std(df[col])
        df[f'{col}_mean'] = mean_col
        df[f'{col}_std'] = std_col
        df.drop(columns=[col], inplace=True)
    else:
        # Copy the column as-is if it does not contain lists
        df[col] = df[col]

  return df


def load_dataframe_from_csv(file_path, mean_and_std = False):
    df = pd.read_csv(file_path)

    df = df.dropna()

    transformed_df = df.copy()
    for column in df.columns:
        if column != 'label' and column != 'label_1':
            transformed_df[column] = df[column].apply(string_to_list)

    transformed_df = transformed_df.dropna()

    if mean_and_std:
      unpacked_df = mean_and_std_from_list(transformed_df)
    else:
      # Unpack the lists into separate columns
      unpacked_df = unpack_lists(transformed_df)

    unpacked_df = unpacked_df.dropna(axis=1, how='any')

    return unpacked_df

# Comparing classifiers on spectrum features and acoustic features

## Dataframe test

### Load dataframes test

In [4]:
import pandas as pd

# Replace 'your_file.csv' with the path to your CSV file
acoustic_feature_file = '/content/drive/MyDrive/Università/NAML_project/datasets_processed/gtzan_acoustic_features.csv'
spectrum_feature_file = '/content/drive/MyDrive/Università/NAML_project/datasets_processed/gtzan_spectrum_texture_features.csv'
aim_feature_file = '/content/drive/MyDrive/Università/NAML_project/datasets_processed/gtzan_aim_texture_features.csv'

# Read the CSV file into a pandas DataFrame
af = pd.read_csv(acoustic_feature_file)
sf = pd.read_csv(spectrum_feature_file)
aim = pd.read_csv(aim_feature_file)

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
af.head()

Unnamed: 0,label_1,label,mfcc,ssd,rh,mvd,tssd,trh
0,pop_1,pop,"[-222.6293487548828, 109.60015106201172, 8.275...","[1.013566441466116, 1.8810141526842126, 2.1647...","[10.316460536043884, 16.805600024205884, 9.250...","[0.42985252233516186, 0.7002333343419118, 0.38...","[1.013566441466116, 1.8810141526842126, 2.1647...","[10.316460536043884, 16.805600024205884, 9.250..."
1,pop_2,pop,"[-14.67728042602539, 77.67280578613281, -0.129...","[3.3161911645993554, 5.175367302148178, 5.2249...","[19.991905650495276, 8.239650511032966, 9.3679...","[0.8329960687706365, 0.3433187712930403, 0.390...","[3.3161911645993554, 5.175367302148178, 5.2249...","[19.991905650495276, 8.239650511032966, 9.3679..."
2,pop_3,pop,"[-93.86768341064453, 43.44512176513672, -7.478...","[2.02241963837741, 3.3769240258079094, 3.98909...","[37.47621078068581, 10.006705764254962, 10.279...","[1.5615087825285754, 0.4169460735106234, 0.428...","[2.02241963837741, 3.3769240258079094, 3.98909...","[37.47621078068581, 10.006705764254962, 10.279..."
3,pop_4,pop,"[-152.0604705810547, 35.5008430480957, 30.3779...","[1.7117733249487153, 2.9075074643143615, 2.963...","[13.645965233051191, 8.265738314798698, 6.7350...","[0.5685818847104663, 0.3444057631166124, 0.280...","[1.7117733249487153, 2.9075074643143615, 2.963...","[13.645965233051191, 8.265738314798698, 6.7350..."
4,pop_5,pop,"[-217.27813720703125, 126.35074615478516, -20....","[1.3593559292472313, 2.04508792652116, 1.83619...","[17.419743302424763, 12.87252191253119, 8.6262...","[0.7258226376010318, 0.5363550796887996, 0.359...","[1.3593559292472313, 2.04508792652116, 1.83619...","[17.419743302424763, 12.87252191253119, 8.6262..."


In [None]:
sf.head()

Unnamed: 0,label_1,label,LBP,LBPHF,RICLBP,LPQ
0,reggae_1,reggae,[0.0716 0.11115333 0.05763 0.08357 0...,[ 2. +0.00000000e+00j 0. +0.000...,"[0, 28417, 14978, 17588, 26849, 26881, 45898, ...",[0.00261727 0.00611591 0.00160689 0.00119805 0...
1,reggae_2,reggae,[0.076365 0.10769 0.058775 0.08472333 0...,[ 2. +0.00000000e+00j 0. +0.000...,"[0, 28887, 15957, 18439, 25088, 31953, 47267, ...",[0.00212968 0.00470339 0.00158176 0.00105395 0...
2,reggae_3,reggae,[0.07215667 0.10451167 0.05877167 0.08574167 0...,[ 2. +0.00000000e+00j 0. +0.000...,"[0, 26571, 15024, 19260, 24963, 28487, 42734, ...",[0.01165709 0.00542557 0.00161192 0.00126172 0...
3,reggae_4,reggae,[0.07133 0.10719333 0.058595 0.08407167 0...,[ 2. +0.00000000e+00j 0. +0.000...,"[0, 27246, 14909, 17920, 25668, 28095, 42923, ...",[0.00457269 0.00670069 0.00163873 0.00127513 0...
4,reggae_5,reggae,[0.06144333 0.09379333 0.05081 0.078955 0...,[ 2. +0.00000000e+00j 0. +0.000...,"[0, 23046, 12904, 15486, 21692, 24766, 41992, ...",[0.10136829 0.00932802 0.00135723 0.00110422 0...


In [None]:
aim.head()

Unnamed: 0,label_1,label,LBP,LBPHF,RICLBP,LPQ
0,country_1,country,[0.00084394 0.00704893 0. 0.05100228 0...,[ 2. +0.j 0. +0.j ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[8.91337817e-05 6.25719148e-02 1.29649137e-04 ...
1,country_2,country,[0.0016477 0.01045685 0. 0.05960648 0...,[ 2. +0.00000000e+00j 0. +0.000...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[1.17494530e-04 5.36423304e-02 3.24122843e-04 ...
2,country_3,country,[0.00240725 0.01306906 0. 0.06346049 0...,[ 2. +0.00000000e+00j 0. +0.000...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[3.80844340e-04 4.58714853e-02 3.72741269e-04 ...
3,country_4,country,[0.00189686 0.01071406 0. 0.05537873 0...,[ 2. +0.j 0. +0.j ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[2.30937525e-04 5.52467385e-02 2.87659023e-04 ...
4,country_5,country,[0.0029538 0.0120282 0. 0.05998023 0...,[ 2. +0.j 0. +0.j ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[3.76792804e-04 4.93882181e-02 3.20071307e-04 ...


### Process dataframes

In [None]:
af = af.dropna()

transformed_af = af.copy()
for column in af.columns:
    if column != 'label' and column != 'label_1':
        transformed_af[column] = af[column].apply(string_to_list)

transformed_af = transformed_af.dropna()

# Unpack the lists into separate columns
unpacked_af = unpack_lists(transformed_af)

unpacked_af.head()

Output hidden; open in https://colab.research.google.com to view.

In [None]:
sf = sf.dropna()

transformed_sf = sf.copy()
for column in sf.columns:
    if column != 'label' and column != 'label_1':
        transformed_sf[column] = sf[column].apply(string_to_list)

transformed_sf = transformed_sf.dropna()

# Unpack the lists into separate columns
unpacked_sf = unpack_lists(transformed_sf)

unpacked_sf.head()

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
 -0.27147483-2.34120036e-01j  0.        +0.00000000e+00j
 -0.02775334-1.72296980e-01j  0.        +0.00000000e+00j
 -0.08584   +2.77555756e-17j  0.        +0.00000000e+00j
 -0.02775334+1.72296980e-01j  0.        +0.00000000e+00j
 -0.27147483+2.34120036e-01j  0.        +0.00000000e+00j
 -0.12686333-2.90841766e-01j  0.        +0.00000000e+00j
  0.13036149-3.83280092e-01j  0.        +0.00000000e+00j]
[ 2.        +0.j          0.        +0.j          0.12804964+0.40264416j
  0.        +0.j         -0.14610827+0.29872197j  0.        +0.j
 -0.28802298-0.27607188j  0.        +0.j         -0.02919173-0.19621929j
  0.        +0.j         -0.06755333+0.j          0.        +0.j
 -0.02919173+0.19621929j  0.        +0.j         -0.28802298+0.27607188j
  0.        +0.j         -0.14610827-0.29872197j  0.        +0.j
  0.12804964-0.40264416j  0.        +0.j        ]
[ 2.        +0.00000000e+00j  0.        +0.00000000e+00j
  0.11530077+5.

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
 0.00486257 0.00268765 0.00356566 0.00547751 0.00689003 0.00341151
 0.00738936 0.00819867 0.00520606 0.00627342 0.0067208  0.00787863
 0.0075653  0.00814673 0.00863097 0.00913365 0.00474863 0.00539708
 0.00653146 0.0074279  0.00526639 0.00613267 0.00133712 0.0028904
 0.00123659 0.00118297 0.00198223 0.00396948 0.00252344 0.00187164
 0.00137399 0.00276473 0.00127513 0.00130696 0.0017577  0.00284516
 0.00265581 0.00204087 0.00227378 0.00165716 0.00167894 0.00298255
 0.00265246 0.0019839  0.00181131 0.00378181 0.00525131 0.00311828
 0.00250501 0.0043247  0.0072771  0.00388402 0.00298255 0.00513904
 0.00034852 0.00054122 0.0002949  0.000501   0.00058813 0.00078083
 0.00051106 0.00066186 0.00061829 0.00077245 0.0004876  0.00064008
 0.00171748 0.00365614 0.00256366]
[0.0019839  0.00571209 0.00177613 0.00110589 0.0007825  0.00074731
 0.00076742 0.00070542 0.00099865 0.00090817 0.00088304 0.0007825
 0.00045911 0.00050268 0.0004490

  df[f'{col}{i}'] = df[col].apply(lambda x: x[i] if len(x) > i else None)
  df[f'{col}{i}'] = df[col].apply(lambda x: x[i] if len(x) > i else None)
  df[f'{col}{i}'] = df[col].apply(lambda x: x[i] if len(x) > i else None)
  df[f'{col}{i}'] = df[col].apply(lambda x: x[i] if len(x) > i else None)
  df[f'{col}{i}'] = df[col].apply(lambda x: x[i] if len(x) > i else None)
  df[f'{col}{i}'] = df[col].apply(lambda x: x[i] if len(x) > i else None)
  df[f'{col}{i}'] = df[col].apply(lambda x: x[i] if len(x) > i else None)
  df[f'{col}{i}'] = df[col].apply(lambda x: x[i] if len(x) > i else None)
  df[f'{col}{i}'] = df[col].apply(lambda x: x[i] if len(x) > i else None)
  df[f'{col}{i}'] = df[col].apply(lambda x: x[i] if len(x) > i else None)
  df[f'{col}{i}'] = df[col].apply(lambda x: x[i] if len(x) > i else None)
  df[f'{col}{i}'] = df[col].apply(lambda x: x[i] if len(x) > i else None)
  df[f'{col}{i}'] = df[col].apply(lambda x: x[i] if len(x) > i else None)
  df[f'{col}{i}'] = df[col].apply(lamb

Unnamed: 0,label_1,label,LBP0,LBP1,LBP2,LBP3,LBP4,LBP5,LBP6,LBP7,...,LPQ500,LPQ501,LPQ502,LPQ503,LPQ504,LPQ505,LPQ506,LPQ507,LPQ508,LPQ509
0,reggae_1,reggae,0.0716,0.111153,0.05763,0.08357,0.093472,0.111418,0.068083,0.111858,...,,,,,,,,,,
1,reggae_2,reggae,0.076365,0.10769,0.058775,0.084723,0.091895,0.112508,0.068788,0.11223,...,,,,,,,,,,
2,reggae_3,reggae,0.072157,0.104512,0.058772,0.085742,0.090067,0.116915,0.069157,0.106552,...,,,,,,,,,,
3,reggae_4,reggae,0.07133,0.107193,0.058595,0.084072,0.089605,0.113775,0.069648,0.107643,...,,,,,,,,,,
4,reggae_5,reggae,0.061443,0.093793,0.05081,0.078955,0.083465,0.11157,0.061807,0.094005,...,,,,,,,,,,


In [None]:
aim = aim.dropna()

transformed_aim = aim.copy()
for column in aim.columns:
    if column != 'label' and column != 'label_1':
        transformed_aim[column] = aim[column].apply(string_to_list)

transformed_aim = transformed_aim.dropna()

# Unpack the lists into separate columns
unpacked_aim = unpack_lists(transformed_aim)

unpacked_aim.head()

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
  0.19344657-1.3115192j   0.        +0.j        ]
[ 2.        +0.00000000e+00j  0.        +0.00000000e+00j
  0.08638239+1.31406117e+00j  0.        +0.00000000e+00j
 -0.91768123+8.70754252e-01j  0.        +0.00000000e+00j
 -1.32753305-7.64448673e-01j  0.        +0.00000000e+00j
  0.77874412-1.33175274e+00j  0.        +0.00000000e+00j
  0.78742284+5.55111512e-17j  0.        +0.00000000e+00j
  0.77874412+1.33175274e+00j  0.        +0.00000000e+00j
 -1.32753305+7.64448673e-01j  0.        +0.00000000e+00j
 -0.91768123-8.70754252e-01j  0.        +0.00000000e+00j
  0.08638239-1.31406117e+00j  0.        +0.00000000e+00j]
[ 2.        +0.j          0.        +0.j          0.06449516+1.31457948j
  0.        +0.j         -0.84696394+0.87291236j  0.        +0.j
 -1.40520696-0.76769523j  0.        +0.j          0.80395501-1.33997936j
  0.        +0.j          0.79710005+0.j          0.        +0.j
  0.80395501+1.33997936j  0.        +0.

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
 0.00000000e+00 2.02576777e-04 5.26699619e-04 0.00000000e+00
 0.00000000e+00 5.55060368e-04 3.97050482e-04 4.29462766e-04
 8.50822462e-04 7.21173325e-04 0.00000000e+00 3.40328985e-04
 9.49274775e-03 8.18410177e-04 0.00000000e+00 6.96864111e-04
 1.45855279e-04 5.10493477e-04 0.00000000e+00 1.25597601e-04
 2.02576777e-05 5.95575723e-04 0.00000000e+00 1.90422170e-04
 4.69978122e-04 1.55578964e-03 0.00000000e+00 4.45668909e-04
 9.15647030e-04 5.82853902e-02 5.91524188e-04]
[2.79555952e-04 6.04772709e-02 3.56535127e-04 1.58009886e-04
 1.97714934e-03 2.26885990e-04 3.76792804e-04 6.48245685e-05
 3.24122843e-04 1.25597601e-03 3.64638198e-04 0.00000000e+00
 6.48245685e-04 2.18782919e-03 0.00000000e+00 1.53958350e-04
 8.34616320e-04 1.14253302e-03 1.64897496e-03 1.53958350e-04
 8.68244065e-03 7.17932096e-03 6.56348756e-04 2.81986873e-03
 1.25597601e-04 6.48245685e-05 3.64638198e-05 2.15946844e-03
 5.46957297e-04 4.05153553e-06 0.00

  df[f'{col}{i}'] = df[col].apply(lambda x: x[i] if len(x) > i else None)
  df[f'{col}{i}'] = df[col].apply(lambda x: x[i] if len(x) > i else None)
  df[f'{col}{i}'] = df[col].apply(lambda x: x[i] if len(x) > i else None)
  df[f'{col}{i}'] = df[col].apply(lambda x: x[i] if len(x) > i else None)
  df[f'{col}{i}'] = df[col].apply(lambda x: x[i] if len(x) > i else None)
  df[f'{col}{i}'] = df[col].apply(lambda x: x[i] if len(x) > i else None)
  df[f'{col}{i}'] = df[col].apply(lambda x: x[i] if len(x) > i else None)
  df[f'{col}{i}'] = df[col].apply(lambda x: x[i] if len(x) > i else None)
  df[f'{col}{i}'] = df[col].apply(lambda x: x[i] if len(x) > i else None)
  df[f'{col}{i}'] = df[col].apply(lambda x: x[i] if len(x) > i else None)
  df[f'{col}{i}'] = df[col].apply(lambda x: x[i] if len(x) > i else None)
  df[f'{col}{i}'] = df[col].apply(lambda x: x[i] if len(x) > i else None)
  df[f'{col}{i}'] = df[col].apply(lambda x: x[i] if len(x) > i else None)
  df[f'{col}{i}'] = df[col].apply(lamb

Unnamed: 0,label_1,label,LBP0,LBP1,LBP2,LBP3,LBP4,LBP5,LBP6,LBP7,...,LPQ500,LPQ501,LPQ502,LPQ503,LPQ504,LPQ505,LPQ506,LPQ507,LPQ508,LPQ509
0,country_1,country,0.000844,0.007049,0.0,0.051002,0.0,0.185876,0.0,0.039452,...,0.0,0.0,3.038652,4.0,8.224617,4.0,6.817924,2.0,2.87659,4.0
1,country_2,country,0.001648,0.010457,0.0,0.059606,0.0,0.165063,0.0,0.040746,...,0.0,0.0,4.780812,4.0,9.926262,4.0,5.756017,2.0,3.281744,4.0
2,country_3,country,0.002407,0.013069,0.0,0.06346,0.0,0.147252,0.0,0.042398,...,0.0,0.0,5.064419,4.0,1.203306,3.0,4.929503,2.0,4.456689,4.0
3,country_4,country,0.001897,0.010714,0.0,0.055379,0.0,0.16702,0.0,0.038789,...,0.0,0.0,3.889474,4.0,8.791832,4.0,5.988575,2.0,3.686897,4.0
4,country_5,country,0.002954,0.012028,0.0,0.05998,0.0,0.155896,0.0,0.041345,...,0.0,0.0,4.456689,4.0,1.13443,3.0,5.378413,2.0,4.861843,4.0


In [None]:
rows_with_nan = unpacked_af[unpacked_af[unpacked_af.columns].isna().any(axis=1)]
print("Rows with NaN values in some columns:")
print(rows_with_nan)

len(rows_with_nan)

Rows with NaN values in some columns:
Empty DataFrame
Columns: [label_1, label, mfcc0, mfcc1, mfcc2, mfcc3, mfcc4, mfcc5, mfcc6, mfcc7, mfcc8, mfcc9, mfcc10, mfcc11, mfcc12, mfcc13, mfcc14, mfcc15, mfcc16, mfcc17, mfcc18, mfcc19, mfcc20, mfcc21, mfcc22, mfcc23, mfcc24, mfcc25, mfcc26, mfcc27, mfcc28, mfcc29, mfcc30, mfcc31, mfcc32, mfcc33, mfcc34, mfcc35, mfcc36, mfcc37, mfcc38, mfcc39, mfcc40, mfcc41, mfcc42, mfcc43, mfcc44, mfcc45, mfcc46, mfcc47, mfcc48, mfcc49, mfcc50, mfcc51, ssd0, ssd1, ssd2, ssd3, ssd4, ssd5, ssd6, ssd7, ssd8, ssd9, ssd10, ssd11, ssd12, ssd13, ssd14, ssd15, ssd16, ssd17, ssd18, ssd19, ssd20, ssd21, ssd22, ssd23, ssd24, ssd25, ssd26, ssd27, ssd28, ssd29, ssd30, ssd31, ssd32, ssd33, ssd34, ssd35, ssd36, ssd37, ssd38, ssd39, ssd40, ssd41, ssd42, ssd43, ssd44, ssd45, ...]
Index: []

[0 rows x 2298 columns]


0

In [None]:
unpacked_af = unpacked_af.dropna(axis=1, how='any')
# unpacked_af = unpacked_af.dropna()


In [None]:
unpacked_af.head()

Unnamed: 0,label_1,label,mfcc0,mfcc1,mfcc2,mfcc3,mfcc4,mfcc5,mfcc6,mfcc7,...,trh410,trh411,trh412,trh413,trh414,trh415,trh416,trh417,trh418,trh419
0,pop_1,pop,222.629349,109.600151,8.275423,2.573467,9.491745,11.674646,15.712113,4.138751,...,0.772719,0.645714,0.909812,0.605699,0.934178,0.750894,0.609174,1.134917,1.148288,0.850899
1,pop_2,pop,14.67728,77.672806,0.129021,22.647038,17.026323,3.309944,9.159575,2.875973,...,1.808488,2.761815,2.086776,1.823407,1.785082,1.444638,1.830156,2.368714,5.49322,4.114556
2,pop_3,pop,93.867683,43.445122,7.478451,0.088349,6.007252,4.361706,0.094539,3.452164,...,3.795983,3.241748,2.975564,2.030982,1.273109,1.783881,2.051244,1.616951,2.305112,1.510026
3,pop_4,pop,152.060471,35.500843,30.377958,14.888762,21.653246,0.808262,3.577241,0.581427,...,0.966207,0.977252,1.350585,1.510566,1.535846,4.028608,4.373429,1.372121,1.21089,1.372938
4,pop_5,pop,217.278137,126.350746,20.07049,19.959452,1.837735,10.51269,0.844215,8.748989,...,1.240624,1.036558,0.936757,0.785368,0.863049,0.749478,0.842445,0.500647,0.502126,0.579084


In [None]:
rows_with_nan = unpacked_sf[unpacked_sf[unpacked_sf.columns].isna().any(axis=1)]
print("Rows with NaN values in some columns:")
print(rows_with_nan)

len(rows_with_nan)

Rows with NaN values in some columns:
Empty DataFrame
Columns: [label_1, label, LBP0, LBP1, LBP2, LBP3, LBP4, LBP5, LBP6, LBP7, LBP8, LBP9, LBP10, LBP11, LBP12, LBP13, LBP14, LBP15, LBP16, LBP17, LBP18, LBP19, LBPHF0, LBPHF1, LBPHF2, LBPHF3, LBPHF4, LBPHF5, LBPHF6, LBPHF7, LBPHF8, LBPHF9, LBPHF10, LBPHF11, LBPHF12, LBPHF13, LBPHF14, LBPHF15, LBPHF16, LBPHF17, LBPHF18, LBPHF19, LBPHF20, LBPHF21, LBPHF22, LBPHF23, LBPHF24, LBPHF25, LBPHF26, LBPHF27, LBPHF28, LBPHF29, LBPHF30, LBPHF31, LBPHF32, LBPHF33, LBPHF34, LBPHF35, LBPHF36, LBPHF37, LBPHF38, LBPHF39, RICLBP0, RICLBP1, RICLBP2, RICLBP3, RICLBP4, RICLBP5, RICLBP6, RICLBP7, RICLBP8, RICLBP9, RICLBP10, RICLBP11, RICLBP12, RICLBP13, RICLBP14, RICLBP15, RICLBP16, RICLBP17, RICLBP18, RICLBP19, RICLBP20, RICLBP21, RICLBP22, RICLBP23, RICLBP24, RICLBP25, RICLBP26, RICLBP27, RICLBP28, RICLBP29, RICLBP30, RICLBP31, RICLBP32, RICLBP33, RICLBP34, RICLBP35, RICLBP36, RICLBP37, ...]
Index: []

[0 rows x 453 columns]


0

In [None]:
unpacked_sf = unpacked_sf.dropna(axis=1, how='any')
# unpacked_sf = unpacked_sf.dropna()

In [None]:
unpacked_sf.head()

Unnamed: 0,label,LBP0,LBP1,LBP2,LBP3,LBP4,LBP5,LBP6,LBP7,LBP8,...,LPQ245,LPQ246,LPQ247,LPQ248,LPQ249,LPQ250,LPQ251,LPQ252,LPQ253,LPQ254
0,reggae,0.0716,0.111153,0.05763,0.08357,0.093472,0.111418,0.068083,0.111858,0.140412,...,0.000861,0.000677,0.000741,0.000618,0.000861,0.000538,0.000628,0.001763,0.005286,0.003167
1,reggae,0.076365,0.10769,0.058775,0.084723,0.091895,0.112508,0.068788,0.11223,0.127197,...,0.000972,0.000675,0.000868,0.0007,0.000772,0.000628,0.00065,0.002192,0.00341,0.002946
2,reggae,0.072157,0.104512,0.058772,0.085742,0.090067,0.116915,0.069157,0.106552,0.14317,...,0.00095,0.000597,0.00085,0.000633,0.000829,0.000568,0.000712,0.002148,0.00382,0.003187
3,reggae,0.07133,0.107193,0.058595,0.084072,0.089605,0.113775,0.069648,0.107643,0.145628,...,0.000866,0.000695,0.000796,0.000615,0.000856,0.000553,0.000746,0.001823,0.006439,0.003014
4,reggae,0.061443,0.093793,0.05081,0.078955,0.083465,0.11157,0.061807,0.094005,0.230853,...,0.000767,0.00063,0.000722,0.000523,0.000756,0.000479,0.00065,0.00153,0.007081,0.002703


In [None]:
rows_with_nan = unpacked_aim[unpacked_aim[unpacked_aim.columns].isna().any(axis=1)]
print("Rows with NaN values in some columns:")
print(rows_with_nan)

len(rows_with_nan)

Rows with NaN values in some columns:
Empty DataFrame
Columns: [label_1, label, LBP0, LBP1, LBP2, LBP3, LBP4, LBP5, LBP6, LBP7, LBP8, LBP9, LBP10, LBP11, LBP12, LBP13, LBP14, LBP15, LBP16, LBP17, LBP18, LBP19, LBPHF0, LBPHF1, LBPHF2, LBPHF3, LBPHF4, LBPHF5, LBPHF6, LBPHF7, LBPHF8, LBPHF9, LBPHF10, LBPHF11, LBPHF12, LBPHF13, LBPHF14, LBPHF15, LBPHF16, LBPHF17, LBPHF18, LBPHF19, LBPHF20, LBPHF21, LBPHF22, LBPHF23, LBPHF24, LBPHF25, LBPHF26, LBPHF27, LBPHF28, LBPHF29, LBPHF30, LBPHF31, LBPHF32, LBPHF33, LBPHF34, LBPHF35, LBPHF36, LBPHF37, LBPHF38, LBPHF39, RICLBP0, RICLBP1, RICLBP2, RICLBP3, RICLBP4, RICLBP5, RICLBP6, RICLBP7, RICLBP8, RICLBP9, RICLBP10, RICLBP11, RICLBP12, RICLBP13, RICLBP14, RICLBP15, RICLBP16, RICLBP17, RICLBP18, RICLBP19, RICLBP20, RICLBP21, RICLBP22, RICLBP23, RICLBP24, RICLBP25, RICLBP26, RICLBP27, RICLBP28, RICLBP29, RICLBP30, RICLBP31, RICLBP32, RICLBP33, RICLBP34, RICLBP35, RICLBP36, RICLBP37, ...]
Index: []

[0 rows x 708 columns]


0

In [None]:
unpacked_aim = unpacked_aim.dropna(axis=1, how='any')
# unpacked_aim = unpacked_aim.dropna()


In [None]:
unpacked_aim.head()

Unnamed: 0,label_1,label,mfcc0,mfcc1,mfcc2,mfcc3,mfcc4,mfcc5,mfcc6,mfcc7,...,trh410,trh411,trh412,trh413,trh414,trh415,trh416,trh417,trh418,trh419
0,pop_1,pop,222.629349,109.600151,8.275423,2.573467,9.491745,11.674646,15.712113,4.138751,...,0.772719,0.645714,0.909812,0.605699,0.934178,0.750894,0.609174,1.134917,1.148288,0.850899
1,pop_2,pop,14.67728,77.672806,0.129021,22.647038,17.026323,3.309944,9.159575,2.875973,...,1.808488,2.761815,2.086776,1.823407,1.785082,1.444638,1.830156,2.368714,5.49322,4.114556
2,pop_3,pop,93.867683,43.445122,7.478451,0.088349,6.007252,4.361706,0.094539,3.452164,...,3.795983,3.241748,2.975564,2.030982,1.273109,1.783881,2.051244,1.616951,2.305112,1.510026
3,pop_4,pop,152.060471,35.500843,30.377958,14.888762,21.653246,0.808262,3.577241,0.581427,...,0.966207,0.977252,1.350585,1.510566,1.535846,4.028608,4.373429,1.372121,1.21089,1.372938
4,pop_5,pop,217.278137,126.350746,20.07049,19.959452,1.837735,10.51269,0.844215,8.748989,...,1.240624,1.036558,0.936757,0.785368,0.863049,0.749478,0.842445,0.500647,0.502126,0.579084


## Load dataframes

In [30]:
acoustic_feature_file = '/content/drive/MyDrive/Università/NAML_project/datasets_processed/ismir2004_acoustic_features.csv'
spectrum_feature_file = '/content/drive/MyDrive/Università/NAML_project/datasets_processed/ismir2004_spectrum_texture_features.csv'
aim_feature_file = '/content/drive/MyDrive/Università/NAML_project/datasets_processed/ismir2004_aim_texture_features.csv'

In [None]:
af = load_dataframe_from_csv(acoustic_feature_file)
# af = load_dataframe_from_csv(acoustic_feature_file, True)
af.head()

In [None]:
sf = load_dataframe_from_csv(spectrum_feature_file)
# sf = load_dataframe_from_csv(spectrum_feature_file, True)
sf.head()

In [None]:
aim = load_dataframe_from_csv(aim_feature_file)
# aim = load_dataframe_from_csv(aim_feature_file, True)
aim.head()

## Train SVM and KNN classifier both on the single features and on the full features to see which classifier perform the best

### Acoustic Features

In [None]:
column_subsets = {
      'mfcc': 'mfcc.*',
      'ssd': 'ssd.*',
      'rh': 'rh.*',
      'mvd': 'mvd.*',
      'tssd': 'tssd.*',
      'trh': 'trh.*'
  }

print('Single features approach:')
print('SVM results:')
train_on_single_features(af, column_subsets, 'SVM')

print('\nKNN results:')
train_on_single_features(af, column_subsets, 'KNN')

print('\n\nFull features approach:')
print('SVM results:')
train_on_all_features(af, 'SVM')

print('\nKNN results:')
train_on_all_features(af, 'KNN')

Single features approach:
SVM results:
Accuracy for mfcc: 0.5545927209705372
Accuracy for ssd: 0.622472559214327
Accuracy for rh: 0.6051415366839977
Accuracy for mvd: 0.5410167533217793
Accuracy for tssd: 0.6192952050837666
Accuracy for trh: 0.6028307336799538

KNN results:
Accuracy for mfcc: 0.6395147313691508
Accuracy for ssd: 0.7715193529751588
Accuracy for rh: 0.6510687463893703
Accuracy for mvd: 0.4832466782206817
Accuracy for tssd: 0.7718082033506644
Accuracy for trh: 0.6510687463893703


Full features approach:
SVM results:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.89      0.78      1100
           1       0.43      0.78      0.56       747
           2       0.00      0.00      0.00       101
           3       0.00      0.00      0.00       175
           4       0.00      0.00      0.00        42
           5       0.00      0.00      0.00        55
           6       0.00      0.00      0.00       399
           7       0.54      0.46      0.50       843

    accuracy                           0.56      3462
   macro avg       0.21      0.27      0.23      3462
weighted avg       0.45      0.56      0.49      3462


KNN results:
Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.89      0.80      1100
           1       0.63      0.70      0.66       747
           2       0.46      0.33      0.38       101
           3       0.63      0.50      0.56       175
           4    

### Spectrum Features

In [None]:
column_subsets = {
      'LBP': 'LBP.*',
      'LBPHF': 'LBPHF.*',
      'RICLBP': 'RICLBP.*',
      'LPQ': 'LPQ.*'
  }

print('Single features approach:')
print('SVM results:')
train_on_single_features(sf, column_subsets, 'SVM')

print('\nKNN results:')
train_on_single_features(sf, column_subsets, 'KNN')

print('\n\nFull features approach:')
print('SVM results:')
train_on_all_features(sf, 'SVM')

print('\nKNN results:')
train_on_all_features(sf, 'KNN')

Single features approach:
SVM results:
Accuracy for LBP: 0.591892820336654
Accuracy for LBPHF: 0.4991411885949845
Accuracy for RICLBP: 0.5929233940226726
Accuracy for LPQ: 0.36585365853658536

KNN results:
Accuracy for LBP: 0.7320508416351769
Accuracy for LBPHF: 0.6032291308828581
Accuracy for RICLBP: 0.7320508416351769
Accuracy for LPQ: 0.7299896942631399


Full features approach:
SVM results:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.71      0.97      0.82      1024
           2       0.56      0.73      0.63       767
           3       0.00      0.00      0.00       101
           4       0.00      0.00      0.00       174
           5       0.00      0.00      0.00        35
           6       0.00      0.00      0.00        50
           7       0.32      0.33      0.32       419
           8       0.55      0.10      0.17       340

    accuracy                           0.59      2911
   macro avg       0.24      0.24      0.22      2911
weighted avg       0.50      0.59      0.52      2911


KNN results:
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.78      0.96      0.86      1024
           2       0.77      0.74      0.75       767
           3    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Aim features

In [None]:
column_subsets = {
      'LBP': 'LBP.*',
      'LBPHF': 'LBPHF.*',
      'RICLBP': 'RICLBP.*',
      'LPQ': 'LPQ.*'
  }

print('Single features approach:')
print('SVM results:')
train_on_single_features(aim, column_subsets, 'SVM')

print('\nKNN results:')
train_on_single_features(aim, column_subsets, 'KNN')

print('\n\nFull features approach:')
print('SVM results:')
train_on_all_features(aim, 'SVM')

print('\nKNN results:')
train_on_all_features(aim, 'KNN')

### Features combined

In [None]:
full_df = pd.merge(af, sf, on='label_1', how='inner')
full_df = full_df.drop(columns=['label_y'])
full_df = full_df.rename(columns={'label_x': 'label'})

full_df = pd.merge(full_df, aim, on='label_1', how='inner')
full_df = full_df.drop(columns=['label_y'])
full_df = full_df.rename(columns={'label_x': 'label'})

In [None]:
print('SVM results:')
train_on_all_features(full_df, 'SVM')

print('\nKNN results:')
train_on_all_features(full_df, 'KNN')

SVM results:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report:
               precision    recall  f1-score   support

           0       0.28      0.43      0.34        67
           1       0.62      0.20      0.31        64
           2       0.00      0.00      0.00        63
           3       0.00      0.00      0.00        58
           4       0.00      0.00      0.00        58
           5       0.00      0.00      0.00        51
           6       0.29      0.15      0.20        59
           7       0.30      1.00      0.46        72
           8       0.00      0.00      0.00        56
           9       0.12      0.44      0.19        50

    accuracy                           0.24       598
   macro avg       0.16      0.22      0.15       598
weighted avg       0.17      0.24      0.16       598


KNN results:
Classification Report:
               precision    recall  f1-score   support

           0       0.29      0.40      0.34        67
           1       0.44      0.47      0.45        64
           2    

In [None]:
acoustic_feature_file = '/content/drive/MyDrive/Università/NAML_project/datasets_processed/ismir2004_acoustic_features.csv'
spectrum_feature_file = '/content/drive/MyDrive/Università/NAML_project/datasets_processed/ismir2004_spectrum_texture_features.csv'
aim_feature_file = '/content/drive/MyDrive/Università/NAML_project/datasets_processed/gtzan_aim_texture_features.csv'

import csv

def add_counter_to_csv(input_file, output_file):
    with open(input_file, 'r') as infile, open(output_file, 'w', newline='') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)

        current_second_element = None
        counter = 0

        for row in reader:
            second_element = row[0]  # Assuming second element is at index 1

            if second_element != current_second_element:
                current_second_element = second_element
                counter = 1
            else:
                counter += 1

            new_row = [f"{second_element}_{counter}"] + row
            writer.writerow(new_row)


output_file = '/content/drive/MyDrive/Università/NAML_project/datasets_processed/ismir2004_acoustic_features_1.csv'
output_file1 = '/content/drive/MyDrive/Università/NAML_project/datasets_processed/ismir2004_spectrum_texture_features_1.csv'
output_file2 = '/content/drive/MyDrive/Università/NAML_project/datasets_processed/gtzan_aim_texture_features_1.csv'

# add_counter_to_csv(acoustic_feature_file, output_file)
# add_counter_to_csv(spectrum_feature_file, output_file1)
add_counter_to_csv(aim_feature_file, output_file2)