In [1]:
import sklearn
import numpy as np
import pandas as pd
import os
from SoundLights.features_groups import  ARAUS_features

def normalise(X):
    """Standardise each column of an (n, p) array to mean 0 and variance 1.

    Parameters
    ----------
    X : array-like of shape (n, p)
        Data with samples along axis 0.

    Returns
    -------
    numpy.ndarray
        ``(X - mean) / std`` computed per column. Columns whose standard
        deviation is exactly 0 are returned as all zeros instead of NaN/inf.
    """
    mean = np.mean(X, axis=0, keepdims=True)
    std = np.std(X, axis=0, keepdims=True)
    # A constant column has std == 0; divide by 1 there so the centred values (all 0) pass through unchanged.
    std = np.where(std == 0, 1.0, std)
    return (X - mean) / std


def clip(x, x_min=-1, x_max=1):
    """Clip an array to values between x_min and x_max (defaults: -1 and 1)."""
    return np.clip(x, x_min, x_max)

## 0) PREPARE DATA

In [2]:
# Load the responses table and discard the two identifier columns in one step.
responses = pd.read_csv(
    os.path.join('..','data','SoundLights_ARAUS.csv'),
    dtype={'info.participant': str},
)
responses = responses.drop(columns=["info.file", "info.participant"])

# Remove every column whose values are all zero, remembering the column
# names on both sides of the filter so the removed ones can be identified.
columns_before = list(responses.columns)
_has_nonzero = responses.ne(0).any(axis=0)
responses = responses.loc[:, _has_nonzero]
columns_after = list(responses.columns)

# Columns present before the filter but not after it.
_surviving = set(columns_after)
columns_dropped = [name for name in columns_before if name not in _surviving]

# Keep the feature list consistent with the columns that still exist.
_removed = set(columns_dropped)
ARAUS_features = [name for name in ARAUS_features if name not in _removed]


responses.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24440 entries, 0 to 24439
Columns: 138 entries, info.fold to ARAUS.energy_frequency.20000_0
dtypes: float64(118), int64(18), object(2)
memory usage: 25.7+ MB


## 1) PLEASANTNESS

In [22]:
from sklearn.neighbors import KNeighborsRegressor

# Evaluate a k-nearest-neighbours regressor on 'info.P_ground_truth': folds 1-5 take turns as the
# validation fold, fold 0 is the held-out test set.
# NOTE(review): the features are fed to this distance-based model unscaled although `normalise` is
# defined in the first cell — confirm this is intentional before comparing against other models.
model = KNeighborsRegressor(n_neighbors=150)
print(f'Investigating performance of {model} model...')

# Per-fold metrics (one entry per validation fold).
MSEs_train = []
MSEs_val = []
MSEs_test = []
MEs_train = []
MEs_val = []
MEs_test = []


def _clipped_errors(fitted_model, X, Y):
    """Return (mean squared error, mean absolute error) of the clipped predictions of `fitted_model` on X against Y.

    Predictions are passed through `clip` (default bounds) before the errors are computed, and `predict` is
    called only once per split.
    """
    residuals = clip(fitted_model.predict(X)) - Y
    return np.mean(residuals**2), np.mean(np.abs(residuals))


# The test set does not depend on the validation fold, so build it once.
# For the test set, the same 48 stimuli were shown to all participants so we take the mean of their ratings as the ground truth.
# numeric_only=True keeps the averaging restricted to numeric columns, so a stray text column cannot make .mean() fail.
df_test  = responses[responses['info.fold'] == 0].groupby(['info.soundscape','info.masker','info.smr']).mean(numeric_only=True)
Y_test = df_test['info.P_ground_truth'].values
X_test = df_test[ARAUS_features].values

# Table header: column widths match the per-fold rows printed inside the loop.
print('     |    Mean squared error    |    Mean absolute error   ||       # samples      | #     |')
print('Fold |--------+--------+--------|--------+--------+--------||-------+-------+------| feat- |')
print('     | Train  |   Val  |  Test  | Train  |   Val  |  Test  || Train |  Val  | Test | ures  |')
print('-----+--------+--------+--------+--------+--------+--------++-------+-------+------+-------+')
for val_fold in [1,2,3,4,5]:

    # Extract dataframes
    df_train = responses[(responses['info.fold'] != val_fold) & (responses['info.fold'] > 0)] # For the training set, use all samples that are not in the test set (fold 0) and current validation fold.
    df_val   = responses[responses['info.fold'] == val_fold]

    # Get ground-truth labels
    Y_train = df_train['info.P_ground_truth'].values
    Y_val = df_val['info.P_ground_truth'].values

    # Get features
    X_train = df_train[ARAUS_features].values
    X_val = df_val[ARAUS_features].values

    # Fit model (the name X_LR is kept for any later cell that refers to it; it holds whatever model.fit returns)
    X_LR = model.fit(X_train, Y_train)

    # Get MSEs and mean absolute errors
    MSE_train, ME_train = _clipped_errors(X_LR, X_train, Y_train)
    MSE_val, ME_val = _clipped_errors(X_LR, X_val, Y_val)
    MSE_test, ME_test = _clipped_errors(X_LR, X_test, Y_test)

    # Add metrics
    MSEs_train.append(MSE_train)
    MSEs_val.append(MSE_val)
    MSEs_test.append(MSE_test)
    MEs_train.append(ME_train)
    MEs_val.append(ME_val)
    MEs_test.append(ME_test)

    print(f'{val_fold:4d} | {MSE_train:.4f} | {MSE_val:.4f} | {MSE_test:.4f} | {ME_train:.4f} | {ME_val:.4f} | {ME_test:.4f} || {X_train.shape[0]:5d} | {X_val.shape[0]:5d} | {X_test.shape[0]:^4d} | {X_train.shape[1]:^5d} |')

print(f'Mean | {np.mean(MSEs_train):.4f} | {np.mean(MSEs_val):.4f} | {np.mean(MSEs_test):.4f} | {np.mean(MEs_train):.4f} | {np.mean(MEs_val):.4f} | {np.mean(MEs_test):.4f} |')
print()

Investigating performance of KNeighborsRegressor(n_neighbors=150) model...
     |    Mean squared error    |    Mean absolute error   ||       # samples      | #     | # NZ 
Fold |--------+--------+--------|--------+--------+--------||-------+-------+------| feat- | feat-
     | Train  |   Val  |  Test  | Train  |   Val  |  Test  || Train |  Val  | Test | ures  | ures 
-----+--------+--------+--------+--------+--------+--------+---------+-------+-------+------+-------+------
   1 | 0.1253 | 0.1392 | 0.0870 | 0.2880 | 0.3043 | 0.2387 || 19160 |  5040 |  48  |  113  |
   2 | 0.1267 | 0.1301 | 0.0941 | 0.2899 | 0.2921 | 0.2462 || 19160 |  5040 |  48  |  113  |
   3 | 0.1237 | 0.1471 | 0.1009 | 0.2861 | 0.3120 | 0.2575 || 20160 |  4040 |  48  |  113  |
   4 | 0.1237 | 0.1418 | 0.0991 | 0.2853 | 0.3115 | 0.2562 || 19160 |  5040 |  48  |  113  |
   5 | 0.1254 | 0.1367 | 0.0964 | 0.2878 | 0.3020 | 0.2578 || 19160 |  5040 |  48  |  113  |
Mean | 0.1250 | 0.1390 | 0.0955 | 0.2874 | 0.3044 | 0.2513 |

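Mean over the 5 folds for each number of neighbours K (columns: MSE train | val | test | MAE train | val | test):

K=5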
Mean | 0.0912 | 0.1634 | 0.1202 | 0.2412 | 0.3279 | 0.2796 |
K=10
Mean | 0.1010 | 0.1496 | 0.1088 | 0.2553 | 0.3144 | 0.2653 |
K=15
Mean | 0.1056 | 0.1465 | 0.1071 | 0.2614 | 0.3116 | 0.2638 |
K=20
Mean | 0.1087 | 0.1441 | 0.1028 | 0.2655 | 0.3090 | 0.2596 |
K=25
Mean | 0.1107 | 0.1426 | 0.1038 | 0.2682 | 0.3076 | 0.2600 |
K=30
Mean | 0.1123 | 0.1418 | 0.1022 | 0.2704 | 0.3069 | 0.2606 |
K=40
Mean | 0.1146 | 0.1406 | 0.1011 | 0.2734 | 0.3054 | 0.2588 |
K=50
Mean | 0.1163 | 0.1401 | 0.1036 | 0.2756 | 0.3051 | 0.2625 |
K=60
Mean | 0.1178 | 0.1397 | 0.1030 | 0.2774 | 0.3049 | 0.2622 |
K=70
Mean | 0.1190 | 0.1396 | 0.1015 | 0.2791 | 0.3047 | 0.2603 |
K=80
Mean | 0.1201 | 0.1394 | 0.0994 | 0.2805 | 0.3045 | 0.2569 |
K=90
Mean | 0.1209 | 0.1395 | 0.0969 | 0.2817 | 0.3046 | 0.2539 |
K=100
Mean | 0.1218 | 0.1395 | 0.0963 | 0.2829 | 0.3046 | 0.2536 |
K=120
Mean | 0.1232 | 0.1393 | 0.0952 | 0.2849 | 0.3046 | 0.2523 | --- best test MSE
K=150
Mean | 0.1250 | 0.1390 | 0.0955 | 0.2874 | 0.3044 | 0.2513 | --- or this: best validation MSE/MAE and best test MAE

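K=10, weights="distance"
Mean | 0.0142 | 0.1496 | 0.1086 | 0.0385 | 0.3144 | 0.2652 | --- overfitting: training error collapses while validation/test errors are essentially unchanged from uniform-weight K=10
K=100, weights="distance"
Mean | 0.0136 | 0.1393 | 0.0966 | 0.0378 | 0.3045 | 0.2538 | --- overfitting: training error collapses while validation/test errors are essentially unchanged from uniform-weight K=100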
