### Import relevant packages

In [1]:
import sklearn.linear_model
import numpy as np
import pandas as pd
import os
from SoundLights.features_groups import  ARAUS_features

def clip(x, x_min=-1, x_max=1):
    """Clip an array to values between x_min and x_max.

    Parameters
    ----------
    x : array_like
        Values to clip.
    x_min : scalar, optional
        Lower bound; entries of ``x`` below it are replaced by ``x_min``.
        Defaults to -1.
    x_max : scalar, optional
        Upper bound; entries above it are replaced by ``x_max``.
        Defaults to 1.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``x`` with every entry limited to
        ``[x_min, x_max]``.
    """
    # Apply the lower bound once and reuse the result for the upper bound,
    # instead of evaluating the same np.where expression twice.
    floored = np.where(x < x_min, x_min, x)
    return np.where(floored > x_max, x_max, floored)

### Load data files for responses

In [2]:
# Read the response table ('info.participant' is parsed as a string), then
# remove the 'info.file' and 'info.participant' columns.
responses_ARAUS = pd.read_csv(
    os.path.join('..', 'data', 'SoundLights_ARAUS.csv'),
    dtype={'info.participant': str},
)
responses_ARAUS = responses_ARAUS.drop(columns=['info.file', 'info.participant'])

# Remove every column whose values are all zero, and remember which ones went
# so the feature list can be kept in sync with the dataframe.
columns_before = list(responses_ARAUS.columns)
responses_ARAUS = responses_ARAUS.loc[:, responses_ARAUS.ne(0).any(axis=0)]
columns_after = list(responses_ARAUS.columns)

# Columns present before the filter but missing afterwards (original order kept).
kept_columns = set(columns_after)
columns_dropped = [name for name in columns_before if name not in kept_columns]

# Restrict the feature list to columns that still exist in the dataframe.
dropped_lookup = set(columns_dropped)
ARAUS_features = [name for name in ARAUS_features if name not in dropped_lookup]
print(ARAUS_features)

['ARAUS.sharpness.avg', 'ARAUS.sharpness.max', 'ARAUS.sharpness.p05', 'ARAUS.sharpness.p10', 'ARAUS.sharpness.p20', 'ARAUS.sharpness.p30', 'ARAUS.sharpness.p40', 'ARAUS.sharpness.p50', 'ARAUS.sharpness.p60', 'ARAUS.sharpness.p70', 'ARAUS.sharpness.p80', 'ARAUS.sharpness.p90', 'ARAUS.sharpness.p95', 'ARAUS.loudness.avg', 'ARAUS.loudness.max', 'ARAUS.loudness.p05', 'ARAUS.loudness.p10', 'ARAUS.loudness.p20', 'ARAUS.loudness.p30', 'ARAUS.loudness.p40', 'ARAUS.loudness.p50', 'ARAUS.loudness.p60', 'ARAUS.loudness.p70', 'ARAUS.loudness.p80', 'ARAUS.loudness.p90', 'ARAUS.loudness.p95', 'ARAUS.fluctuation.avg', 'ARAUS.fluctuation.max', 'ARAUS.fluctuation.p05', 'ARAUS.fluctuation.p10', 'ARAUS.fluctuation.p20', 'ARAUS.fluctuation.p30', 'ARAUS.fluctuation.p40', 'ARAUS.fluctuation.p50', 'ARAUS.fluctuation.p60', 'ARAUS.fluctuation.p70', 'ARAUS.fluctuation.p80', 'ARAUS.fluctuation.p90', 'ARAUS.fluctuation.p95', 'ARAUS.LA.avg', 'ARAUS.LA.min', 'ARAUS.LA.max', 'ARAUS.LA.p05', 'ARAUS.LA.p10', 'ARAUS.LA

## Dummy "label mean" model

Now we compute the relevant metrics for a dummy "label mean" model. Regardless of the input features, this model always predicts the same value: the mean of the ground-truth labels in its training set.

In [3]:
# Evaluate a dummy "label mean" baseline with 5-fold cross-validation.
# For each validation fold the prediction is a single constant: the mean of
# 'info.E_ground_truth' over the training folds. Mean squared error and mean
# absolute error of that constant are reported on the train, validation and
# test sets.

# NOTE(review): `model` is never fitted or used in this cell; it is kept only
# in case later cells read the name -- confirm and remove if not.
model = sklearn.linear_model.ElasticNet()
print('Investigating performance of dummy "label mean" model...')
MSEs_train = []
MSEs_val = []
MSEs_test = []
MEs_train = []  # the ME_* / MEs_* variables hold mean ABSOLUTE errors
MEs_val = []
MEs_test = []

print('     |    Mean squared error    |    Mean absolute error   |       # samples      | #     |')
print('Fold |--------+--------+--------|--------+--------+--------|-------+-------+------| feat- |')
print('     | Train  |   Val  |  Test  | Train  |   Val  |  Test  | Train |  Val  | Test | ures  |')
print('-----+--------+--------+--------+--------+--------+--------+-------+-------+------+-------+')

# The test set (fold 0) does not depend on the validation fold, so build it once.
# For the test set, the same 48 stimuli were shown to all participants so we take
# the mean of their ratings as the ground truth. numeric_only=True keeps the
# aggregation from raising on any remaining non-numeric column (pandas >= 2.0).
df_test = (
    responses_ARAUS[responses_ARAUS['info.fold'] == 0]
    .groupby(['info.soundscape', 'info.masker', 'info.smr'])
    .mean(numeric_only=True)
)
Y_test = df_test['info.E_ground_truth'].values
X_test = df_test[ARAUS_features].values

for val_fold in [1,2,3,4,5]:

    # Extract dataframes
    # For the training set, use all samples that are not in the test set (fold 0)
    # and not in the current validation fold.
    df_train = responses_ARAUS[(responses_ARAUS['info.fold'] != val_fold) & (responses_ARAUS['info.fold'] > 0)]
    df_val   = responses_ARAUS[responses_ARAUS['info.fold'] == val_fold]

    # Get ground-truth labels
    Y_train = df_train['info.E_ground_truth'].values
    Y_val = df_val['info.E_ground_truth'].values

    # Get features (only their shapes are reported; the dummy model ignores them)
    X_train = df_train[ARAUS_features].values
    X_val = df_val[ARAUS_features].values

    # This will be the model's prediction for all unseen data (output for any input)
    y_hat = Y_train.mean()

    # Get mean squared errors and mean absolute errors
    MSE_train = np.mean((y_hat - Y_train)**2)
    MSE_val = np.mean((y_hat - Y_val)**2)
    MSE_test = np.mean((y_hat - Y_test)**2)
    ME_train = np.mean(np.abs(y_hat - Y_train))
    ME_val = np.mean(np.abs(y_hat - Y_val))
    ME_test = np.mean(np.abs(y_hat - Y_test))

    # Add metrics
    MSEs_train.append(MSE_train)
    MSEs_val.append(MSE_val)
    MSEs_test.append(MSE_test)
    MEs_train.append(ME_train)
    MEs_val.append(ME_val)
    MEs_test.append(ME_test)

    print(f'{val_fold:4d} | {MSE_train:.4f} | {MSE_val:.4f} | {MSE_test:.4f} | {ME_train:.4f} | {ME_val:.4f} | {ME_test:.4f} | {X_train.shape[0]:5d} | {X_val.shape[0]:5d} | {X_test.shape[0]:^4d} | {X_train.shape[1]:^5d} |')

print(f'Mean | {np.mean(MSEs_train):.4f} | {np.mean(MSEs_val):.4f} | {np.mean(MSEs_test):.4f} | {np.mean(MEs_train):.4f} | {np.mean(MEs_val):.4f} | {np.mean(MEs_test):.4f} |')


Investigating performance of ElasticNet() model...
     |    Mean squared error    |        Mean  error       |       # samples      | #     |
Fold |--------+--------+--------|--------+--------+--------|-------+-------+------| feat- |
     | Train  |   Val  |  Test  | Train  |   Val  |  Test  | Train |  Val  | Test | ures  |
-----+--------+--------+--------+--------+--------+--------+---------+-------+-------+------+-------+------
   1 | 0.1664 | 0.1504 | 0.0966 | 0.3417 | 0.3225 | 0.2632 | 20160 |  5040 |  48  |  113  |
   2 | 0.1620 | 0.1676 | 0.0972 | 0.3366 | 0.3429 | 0.2642 | 20160 |  5040 |  48  |  113  |
   3 | 0.1674 | 0.1463 | 0.0971 | 0.3423 | 0.3201 | 0.2641 | 20160 |  5040 |  48  |  113  |
   4 | 0.1621 | 0.1676 | 0.0968 | 0.3369 | 0.3417 | 0.2635 | 20160 |  5040 |  48  |  113  |
   5 | 0.1578 | 0.1848 | 0.0979 | 0.3316 | 0.3627 | 0.2654 | 20160 |  5040 |  48  |  113  |
Mean | 0.1631 | 0.1633 | 0.0971 | 0.3378 | 0.3380 | 0.2641 |
