In [2]:
import sklearn.linear_model
import numpy as np
import pandas as pd
import os
from SoundLights.features_groups import  ARAUS_features

normalise = lambda X: (X-np.mean(X,axis=0,keepdims=True))/np.std(X,axis=0,keepdims=True) # Normalise an (n,p) numpy array to mean 0, variance 1.
clip = lambda x, x_min = -1, x_max = 1: np.where(np.where(x < x_min,x_min,x) > x_max, x_max, np.where(x < x_min,x_min,x)) # Clip an array to values between x_min and x_max.

## 0) PREPARE DATA

In [3]:
responses= pd.read_csv(os.path.join('..','data','SoundLights_ARAUS.csv'), dtype = {'info.participant':str}) #, dtype = {'participant':str}
responses=responses.drop("info.file", axis=1)
responses=responses.drop("info.participant", axis=1)

# Drop columns that contain all zero values
# Store column names before dropping
columns_before = responses.columns.tolist()
# Drop zero-columns
responses = responses.loc[:, (responses != 0).any(axis=0)]
# Store column names after dropping
columns_after = responses.columns.tolist()
# Determine which columns were dropped
columns_dropped = [col for col in columns_before if col not in columns_after]
# Drop those columns from ARAUS_features
ARAUS_features = [col for col in ARAUS_features if col not in columns_dropped]


responses.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24440 entries, 0 to 24439
Columns: 138 entries, info.fold to ARAUS.energy_frequency.20000_0
dtypes: float64(118), int64(18), object(2)
memory usage: 25.7+ MB


## 1) PLEASANTNESS

### 1.1) Trying parameters - version 1

In [8]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=0, oob_score=True)
print(f'Investigating performance of {model} model...')
MSEs_train = []
MSEs_val = []
MSEs_test = []
MEs_train = []
MEs_val = []
MEs_test = []

print('     |    Mean squared error    |    Mean absolute error   ||       # samples      | #     | # NZ ')
print('Fold |--------+--------+--------|--------+--------+--------||-------+-------+------| feat- | feat-')
print('     | Train  |   Val  |  Test  | Train  |   Val  |  Test  || Train |  Val  | Test | ures  | ures ')
print('-----+--------+--------+--------+--------+--------+--------+---------+-------+-------+------+-------+------')
for val_fold in [1,2,3,4,5]:

    # Extract dataframes
    df_train = responses[(responses['info.fold'] != val_fold) & (responses['info.fold'] > 0)] # For the training set, use all samples that are not in the test set (fold 0) and current validation fold.
    df_val   = responses[responses['info.fold'] == val_fold]
    df_test  = responses[responses['info.fold'] == 0].groupby(['info.soundscape','info.masker','info.smr']).mean() # For the test set, the same 48 stimuli were shown to all participants so we take the mean of their ratings as the ground truth

    # Get ground-truth labels
    Y_train = df_train['info.P_ground_truth'].values
    Y_val = df_val['info.P_ground_truth'].values
    Y_test = df_test['info.P_ground_truth'].values

    # Get features
    X_train = df_train[ARAUS_features].values
    X_val =df_val[ARAUS_features].values
    X_test = df_test[ARAUS_features].values    

    # Fit model
    X_LR = model.fit(X_train, Y_train)


    # Get MSEs
    MSE_train = np.mean((clip(X_LR.predict(X_train)) - Y_train)**2)
    MSE_val = np.mean((clip(X_LR.predict(X_val)) - Y_val)**2)
    MSE_test = np.mean((clip(X_LR.predict(X_test)) - Y_test)**2)
    ME_train = np.mean(np.abs(clip(X_LR.predict(X_train)) - Y_train))
    ME_val = np.mean(np.abs(clip(X_LR.predict(X_val)) - Y_val))
    ME_test = np.mean(np.abs(clip(X_LR.predict(X_test)) - Y_test))

    # Add metrics
    MSEs_train.append(MSE_train)
    MSEs_val.append(MSE_val)
    MSEs_test.append(MSE_test)
    MEs_train.append(ME_train)
    MEs_val.append(ME_val)
    MEs_test.append(ME_test)

    print(f'{val_fold:4d} | {MSE_train:.4f} | {MSE_val:.4f} | {MSE_test:.4f} | {ME_train:.4f} | {ME_val:.4f} | {ME_test:.4f} || {X_train.shape[0]:5d} | {X_val.shape[0]:5d} | {X_test.shape[0]:^4d} | {X_train.shape[1]:^5d} |')

print(f'Mean | {np.mean(MSEs_train):.4f} | {np.mean(MSEs_val):.4f} | {np.mean(MSEs_test):.4f} | {np.mean(MEs_train):.4f} | {np.mean(MEs_val):.4f} | {np.mean(MEs_test):.4f} |')
print()

Investigating performance of RandomForestRegressor(oob_score=True, random_state=0) model...
     |    Mean squared error    |    Mean absolute error   ||       # samples      | #     | # NZ 
Fold |--------+--------+--------|--------+--------+--------||-------+-------+------| feat- | feat-
     | Train  |   Val  |  Test  | Train  |   Val  |  Test  || Train |  Val  | Test | ures  | ures 
-----+--------+--------+--------+--------+--------+--------+---------+-------+-------+------+-------+------
   1 | 0.0282 | 0.1403 | 0.0661 | 0.1255 | 0.3058 | 0.2193 || 19160 |  5040 |  48  |  113  |
   2 | 0.0281 | 0.1286 | 0.0837 | 0.1256 | 0.2896 | 0.2364 || 19160 |  5040 |  48  |  113  |
   3 | 0.0274 | 0.1415 | 0.0766 | 0.1246 | 0.3045 | 0.2334 || 20160 |  4040 |  48  |  113  |
   4 | 0.0276 | 0.1403 | 0.0778 | 0.1241 | 0.3108 | 0.2312 || 19160 |  5040 |  48  |  113  |
   5 | 0.0276 | 0.1417 | 0.0653 | 0.1248 | 0.3070 | 0.2080 || 19160 |  5040 |  48  |  113  |
Mean | 0.0278 | 0.1385 | 0.0739 | 0.12

### 1.2) Trying parameters - version 2

In [10]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=15, random_state=0, oob_score=True)
print(f'Investigating performance of {model} model...')
MSEs_train = []
MSEs_val = []
MSEs_test = []
MEs_train = []
MEs_val = []
MEs_test = []

print('     |    Mean squared error    |    Mean absolute error   ||       # samples      | #     | # NZ ')
print('Fold |--------+--------+--------|--------+--------+--------||-------+-------+------| feat- | feat-')
print('     | Train  |   Val  |  Test  | Train  |   Val  |  Test  || Train |  Val  | Test | ures  | ures ')
print('-----+--------+--------+--------+--------+--------+--------+---------+-------+-------+------+-------+------')
for val_fold in [1,2,3,4,5]:

    # Extract dataframes
    df_train = responses[(responses['info.fold'] != val_fold) & (responses['info.fold'] > 0)] # For the training set, use all samples that are not in the test set (fold 0) and current validation fold.
    df_val   = responses[responses['info.fold'] == val_fold]
    df_test  = responses[responses['info.fold'] == 0].groupby(['info.soundscape','info.masker','info.smr']).mean() # For the test set, the same 48 stimuli were shown to all participants so we take the mean of their ratings as the ground truth

    # Get ground-truth labels
    Y_train = df_train['info.P_ground_truth'].values
    Y_val = df_val['info.P_ground_truth'].values
    Y_test = df_test['info.P_ground_truth'].values

    # Get features
    X_train = df_train[ARAUS_features].values
    X_val =df_val[ARAUS_features].values
    X_test = df_test[ARAUS_features].values    

    # Fit model
    X_LR = model.fit(X_train, Y_train)


    # Get MSEs
    MSE_train = np.mean((clip(X_LR.predict(X_train)) - Y_train)**2)
    MSE_val = np.mean((clip(X_LR.predict(X_val)) - Y_val)**2)
    MSE_test = np.mean((clip(X_LR.predict(X_test)) - Y_test)**2)
    ME_train = np.mean(np.abs(clip(X_LR.predict(X_train)) - Y_train))
    ME_val = np.mean(np.abs(clip(X_LR.predict(X_val)) - Y_val))
    ME_test = np.mean(np.abs(clip(X_LR.predict(X_test)) - Y_test))

    # Add metrics
    MSEs_train.append(MSE_train)
    MSEs_val.append(MSE_val)
    MSEs_test.append(MSE_test)
    MEs_train.append(ME_train)
    MEs_val.append(ME_val)
    MEs_test.append(ME_test)

    print(f'{val_fold:4d} | {MSE_train:.4f} | {MSE_val:.4f} | {MSE_test:.4f} | {ME_train:.4f} | {ME_val:.4f} | {ME_test:.4f} || {X_train.shape[0]:5d} | {X_val.shape[0]:5d} | {X_test.shape[0]:^4d} | {X_train.shape[1]:^5d} |')

print(f'Mean | {np.mean(MSEs_train):.4f} | {np.mean(MSEs_val):.4f} | {np.mean(MSEs_test):.4f} | {np.mean(MEs_train):.4f} | {np.mean(MEs_val):.4f} | {np.mean(MEs_test):.4f} |')
print()

Investigating performance of RandomForestRegressor(n_estimators=15, oob_score=True, random_state=0) model...
     |    Mean squared error    |    Mean absolute error   ||       # samples      | #     | # NZ 
Fold |--------+--------+--------|--------+--------+--------||-------+-------+------| feat- | feat-
     | Train  |   Val  |  Test  | Train  |   Val  |  Test  || Train |  Val  | Test | ures  | ures 
-----+--------+--------+--------+--------+--------+--------+---------+-------+-------+------+-------+------


  warn(


   1 | 0.0317 | 0.1493 | 0.0713 | 0.1316 | 0.3146 | 0.2271 || 19160 |  5040 |  48  |  113  |


  warn(


   2 | 0.0318 | 0.1348 | 0.0932 | 0.1317 | 0.2967 | 0.2368 || 19160 |  5040 |  48  |  113  |


  warn(


   3 | 0.0310 | 0.1486 | 0.0829 | 0.1307 | 0.3112 | 0.2405 || 20160 |  4040 |  48  |  113  |


  warn(


   4 | 0.0311 | 0.1502 | 0.0971 | 0.1300 | 0.3201 | 0.2510 || 19160 |  5040 |  48  |  113  |
   5 | 0.0311 | 0.1462 | 0.0963 | 0.1304 | 0.3112 | 0.2604 || 19160 |  5040 |  48  |  113  |
Mean | 0.0313 | 0.1458 | 0.0881 | 0.1309 | 0.3108 | 0.2431 |



  warn(


## 2) EVENTFULNESS

In [11]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=15, random_state=0, oob_score=True)
print(f'Investigating performance of {model} model...')
MSEs_train = []
MSEs_val = []
MSEs_test = []
MEs_train = []
MEs_val = []
MEs_test = []

print('     |    Mean squared error    |        Mean  error       |         |       # samples      | #     | # NZ ')
print('Fold |--------+--------+--------|--------+--------+--------| Inter-  |-------+-------+------| feat- | feat-')
print('     | Train  |   Val  |  Test  | Train  |   Val  |  Test  |  cept   | Train |  Val  | Test | ures  | ures ')
print('-----+--------+--------+--------+--------+--------+--------+---------+-------+-------+------+-------+------')
for val_fold in [1,2,3,4,5]:

    # Extract dataframes
    df_train = responses[(responses['info.fold'] != val_fold) & (responses['info.fold'] > 0)] # For the training set, use all samples that are not in the test set (fold 0) and current validation fold.
    df_val   = responses[responses['info.fold'] == val_fold]
    df_test  = responses[responses['info.fold'] == 0].groupby(['info.soundscape','info.masker','info.smr']).mean() # For the test set, the same 48 stimuli were shown to all participants so we take the mean of their ratings as the ground truth

    # Get ground-truth labels
    Y_train = df_train['info.E_ground_truth'].values
    Y_val = df_val['info.E_ground_truth'].values
    Y_test = df_test['info.E_ground_truth'].values

    # Get features
    X_train = df_train[ARAUS_features].values
    X_val =df_val[ARAUS_features].values
    X_test = df_test[ARAUS_features].values    

    # Fit model
    X_LR = model.fit(X_train, Y_train)


    # Get MSEs
    MSE_train = np.mean((clip(X_LR.predict(X_train)) - Y_train)**2)
    MSE_val = np.mean((clip(X_LR.predict(X_val)) - Y_val)**2)
    MSE_test = np.mean((clip(X_LR.predict(X_test)) - Y_test)**2)
    ME_train = np.mean(np.abs(clip(X_LR.predict(X_train)) - Y_train))
    ME_val = np.mean(np.abs(clip(X_LR.predict(X_val)) - Y_val))
    ME_test = np.mean(np.abs(clip(X_LR.predict(X_test)) - Y_test))

    # Add metrics
    MSEs_train.append(MSE_train)
    MSEs_val.append(MSE_val)
    MSEs_test.append(MSE_test)
    MEs_train.append(ME_train)
    MEs_val.append(ME_val)
    MEs_test.append(ME_test)

    print(f'{val_fold:4d} | {MSE_train:.4f} | {MSE_val:.4f} | {MSE_test:.4f} | {ME_train:.4f} | {ME_val:.4f} | {ME_test:.4f}  | {X_train.shape[0]:5d} | {X_val.shape[0]:5d} | {X_test.shape[0]:^4d} | {X_train.shape[1]:^5d}|')

print(f'Mean | {np.mean(MSEs_train):.4f} | {np.mean(MSEs_val):.4f} | {np.mean(MSEs_test):.4f} | {np.mean(MEs_train):.4f} | {np.mean(MEs_val):.4f} | {np.mean(MEs_test):.4f} |')
print()

Investigating performance of RandomForestRegressor(n_estimators=15, oob_score=True, random_state=0) model...
     |    Mean squared error    |        Mean  error       |         |       # samples      | #     | # NZ 
Fold |--------+--------+--------|--------+--------+--------| Inter-  |-------+-------+------| feat- | feat-
     | Train  |   Val  |  Test  | Train  |   Val  |  Test  |  cept   | Train |  Val  | Test | ures  | ures 
-----+--------+--------+--------+--------+--------+--------+---------+-------+-------+------+-------+------


  warn(


   1 | 0.0337 | 0.1403 | 0.0437 | 0.1345 | 0.3060 | 0.1740  | 19160 |  5040 |  48  |  113 |


  warn(


   2 | 0.0340 | 0.1401 | 0.0472 | 0.1347 | 0.3046 | 0.1748  | 19160 |  5040 |  48  |  113 |


  warn(


   3 | 0.0329 | 0.1375 | 0.0419 | 0.1324 | 0.3042 | 0.1628  | 20160 |  4040 |  48  |  113 |


  warn(


   4 | 0.0330 | 0.1408 | 0.0528 | 0.1332 | 0.3083 | 0.1840  | 19160 |  5040 |  48  |  113 |
   5 | 0.0331 | 0.1532 | 0.0548 | 0.1332 | 0.3228 | 0.1951  | 19160 |  5040 |  48  |  113 |
Mean | 0.0333 | 0.1424 | 0.0481 | 0.1336 | 0.3092 | 0.1782 |



  warn(
