In [1]:
import sklearn.linear_model
import numpy as np
import pandas as pd
import os
from SoundLights.features_groups import  ARAUS_features

def normalise(X):
    """Standardise each column of an (n, p) array to mean 0 and variance 1.

    Parameters
    ----------
    X : array_like
        Data whose first axis (axis 0) indexes the samples.

    Returns
    -------
    numpy.ndarray
        ``(X - mean) / std`` computed column-wise along axis 0.

    Notes
    -----
    Columns with zero standard deviation (constant columns) are only centred,
    so they come out as all zeros instead of NaN from a 0/0 division.
    """
    centre = np.mean(X, axis=0, keepdims=True)
    spread = np.std(X, axis=0, keepdims=True)
    # Avoid dividing by zero for constant columns: a divisor of 1 leaves the centred zeros untouched.
    spread = np.where(spread == 0, 1.0, spread)
    return (X - centre) / spread


def clip(x, x_min=-1, x_max=1):
    """Clip an array to values between x_min and x_max (both inclusive).

    Parameters
    ----------
    x : array_like
        Values to clip.
    x_min, x_max : scalar, optional
        Lower and upper bounds; default to -1 and 1.

    Returns
    -------
    numpy.ndarray
        Copy of ``x`` with values below ``x_min`` replaced by ``x_min`` and
        values above ``x_max`` replaced by ``x_max``.
    """
    return np.clip(x, x_min, x_max)

## 0) Prepare data

In [15]:
# Load the responses table; the participant id column is parsed as text.
responses = pd.read_csv(
    os.path.join('..','data','SoundLights_ARAUS.csv'),
    dtype={'info.participant': str},
)
# The file name and participant id columns are removed before modelling.
responses = responses.drop(columns=["info.file", "info.participant"])

# Remove every column that contains only zeros, keeping track of which ones disappear.
columns_before = responses.columns.tolist()
has_nonzero_value = (responses != 0).any(axis=0)
responses = responses.loc[:, has_nonzero_value]
columns_after = responses.columns.tolist()
surviving_columns = set(columns_after)
columns_dropped = [name for name in columns_before if name not in surviving_columns]

# Keep the feature list in sync with the columns that are still present.
removed_columns = set(columns_dropped)
ARAUS_features = [name for name in ARAUS_features if name not in removed_columns]

responses.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24440 entries, 0 to 24439
Columns: 138 entries, info.fold to ARAUS.energy_frequency.20000_0
dtypes: float64(118), int64(18), object(2)
memory usage: 25.7+ MB


## 1) PLEASANTNESS

### 1.1) With default parameters

In [21]:
# 5-fold cross-validation of an ElasticNet with default hyperparameters on
# pleasantness ('info.P_ground_truth').
model = sklearn.linear_model.ElasticNet()
print(f'Investigating performance of {model} model...')
MSEs_train = []
MSEs_val = []
MSEs_test = []
# NOTE: the "Mean error" columns hold mean *absolute* errors.
MEs_train = []
MEs_val = []
MEs_test = []

# The test set (fold 0) does not depend on the validation fold, so it is built once, outside the loop.
# The same 48 stimuli were shown to all participants, so the mean of their ratings is taken as the ground truth.
df_test = responses[responses['info.fold'] == 0].groupby(['info.soundscape','info.masker','info.smr']).mean()
Y_test = df_test['info.P_ground_truth'].values
X_test = df_test[ARAUS_features].values

print('     |    Mean squared error    |        Mean  error       |         |       # samples      | #     | # NZ ')
print('Fold |--------+--------+--------|--------+--------+--------| Inter-  |-------+-------+------| feat- | feat-')
print('     | Train  |   Val  |  Test  | Train  |   Val  |  Test  |  cept   | Train |  Val  | Test | ures  | ures ')
print('-----+--------+--------+--------+--------+--------+--------+---------+-------+-------+------+-------+------')
for val_fold in [1,2,3,4,5]:

    # Training set: every sample that is neither in the test set (fold 0) nor in the current validation fold.
    df_train = responses[(responses['info.fold'] != val_fold) & (responses['info.fold'] > 0)]
    df_val   = responses[responses['info.fold'] == val_fold]

    # Get ground-truth labels
    Y_train = df_train['info.P_ground_truth'].values
    Y_val = df_val['info.P_ground_truth'].values

    # Get features
    X_train = df_train[ARAUS_features].values
    X_val = df_val[ARAUS_features].values

    # Fit model (the same estimator object is refitted on every fold)
    X_LR = model.fit(X_train, Y_train)

    # Predict each split once; predictions are clipped with clip()'s default bounds before computing residuals.
    err_train = clip(X_LR.predict(X_train)) - Y_train
    err_val = clip(X_LR.predict(X_val)) - Y_val
    err_test = clip(X_LR.predict(X_test)) - Y_test

    # Mean squared errors and mean absolute errors
    MSE_train = np.mean(err_train**2)
    MSE_val = np.mean(err_val**2)
    MSE_test = np.mean(err_test**2)
    ME_train = np.mean(np.abs(err_train))
    ME_val = np.mean(np.abs(err_val))
    ME_test = np.mean(np.abs(err_test))

    # Add metrics
    MSEs_train.append(MSE_train)
    MSEs_val.append(MSE_val)
    MSEs_test.append(MSE_test)
    MEs_train.append(ME_train)
    MEs_val.append(ME_val)
    MEs_test.append(ME_test)

    print(f'{val_fold:4d} | {MSE_train:.4f} | {MSE_val:.4f} | {MSE_test:.4f} | {ME_train:.4f} | {ME_val:.4f} | {ME_test:.4f} | {X_LR.intercept_:7.4f} | {X_train.shape[0]:5d} | {X_val.shape[0]:5d} | {X_test.shape[0]:^4d} | {X_train.shape[1]:^5d} | {np.sum(np.abs(X_LR.coef_) > 0):^5d} |')

print(f'Mean | {np.mean(MSEs_train):.4f} | {np.mean(MSEs_val):.4f} | {np.mean(MSEs_test):.4f} | {np.mean(MEs_train):.4f} | {np.mean(MEs_val):.4f} | {np.mean(MEs_test):.4f} |')
print()


# Non-zero coefficients, largest magnitude first.
# NOTE: X_LR holds the fit from the last loop iteration (val_fold = 5) only.
coefficients = X_LR.coef_
feature_importances = list(zip(ARAUS_features, coefficients))
feature_importances.sort(key=lambda x: abs(x[1]), reverse=True)
for feature, importance in feature_importances:
    if float(importance)!=0:
        print(f"{feature}: {importance}")

Investigating performance of ElasticNet() model...
     |    Mean squared error    |        Mean  error       |         |       # samples      | #     | # NZ 
Fold |--------+--------+--------|--------+--------+--------| Inter-  |-------+-------+------| feat- | feat-
     | Train  |   Val  |  Test  | Train  |   Val  |  Test  |  cept   | Train |  Val  | Test | ures  | ures 
-----+--------+--------+--------+--------+--------+--------+---------+-------+-------+------+-------+------
   1 | 0.1448 | 0.1416 | 0.0890 | 0.3142 | 0.3104 | 0.2451 |  0.2143 | 19160 |  5040 |  48  |  113  |   2   |
   2 | 0.1458 | 0.1375 | 0.0920 | 0.3155 | 0.3043 | 0.2497 |  0.2317 | 19160 |  5040 |  48  |  113  |   2   |
   3 | 0.1421 | 0.1569 | 0.0928 | 0.3110 | 0.3262 | 0.2509 |  0.1883 | 20160 |  4040 |  48  |  113  |   2   |
   4 | 0.1431 | 0.1501 | 0.0923 | 0.3115 | 0.3234 | 0.2501 |  0.2044 | 19160 |  5040 |  48  |  113  |   2   |
   5 | 0.1455 | 0.1404 | 0.0893 | 0.3150 | 0.3087 | 0.2456 |  0.1957 | 19160 

### 1.2) Defining parameters

In [54]:
from sklearn.linear_model import ElasticNet
# 5-fold cross-validation of an ElasticNet with hand-picked hyperparameters on
# pleasantness ('info.P_ground_truth').
alpha = 0.1  # Example value, adjust as needed
l1_ratio = 0.7  # Example value, adjust as needed
# selection="random" updates a randomly chosen coefficient at each iteration, so the
# seed must be fixed or the table below changes on every run.
random_state = 0
model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, selection="random", random_state=random_state)

MSEs_train = []
MSEs_val = []
MSEs_test = []
# NOTE: the "Mean error" columns hold mean *absolute* errors.
MEs_train = []
MEs_val = []
MEs_test = []

# The test set (fold 0) does not depend on the validation fold, so it is built once, outside the loop.
# The same 48 stimuli were shown to all participants, so the mean of their ratings is taken as the ground truth.
df_test = responses[responses['info.fold'] == 0].groupby(['info.soundscape','info.masker','info.smr']).mean()
Y_test = df_test['info.P_ground_truth'].values
X_test = df_test[ARAUS_features].values

print('     |    Mean squared error    |        Mean  error       |         |       # samples      | #     | # NZ ')
print('Fold |--------+--------+--------|--------+--------+--------| Inter-  |-------+-------+------| feat- | feat-')
print('     | Train  |   Val  |  Test  | Train  |   Val  |  Test  |  cept   | Train |  Val  | Test | ures  | ures ')
print('-----+--------+--------+--------+--------+--------+--------+---------+-------+-------+------+-------+------')
for val_fold in [1,2,3,4,5]:

    # Training set: every sample that is neither in the test set (fold 0) nor in the current validation fold.
    df_train = responses[(responses['info.fold'] != val_fold) & (responses['info.fold'] > 0)]
    df_val   = responses[responses['info.fold'] == val_fold]

    # Get ground-truth labels
    Y_train = df_train['info.P_ground_truth'].values
    Y_val = df_val['info.P_ground_truth'].values

    # Get features
    X_train = df_train[ARAUS_features].values
    X_val = df_val[ARAUS_features].values

    # Fit model (the same estimator object is refitted on every fold)
    X_LR = model.fit(X_train, Y_train)

    # Predict each split once; predictions are clipped with clip()'s default bounds before computing residuals.
    err_train = clip(X_LR.predict(X_train)) - Y_train
    err_val = clip(X_LR.predict(X_val)) - Y_val
    err_test = clip(X_LR.predict(X_test)) - Y_test

    # Mean squared errors and mean absolute errors
    MSE_train = np.mean(err_train**2)
    MSE_val = np.mean(err_val**2)
    MSE_test = np.mean(err_test**2)
    ME_train = np.mean(np.abs(err_train))
    ME_val = np.mean(np.abs(err_val))
    ME_test = np.mean(np.abs(err_test))

    # Add metrics
    MSEs_train.append(MSE_train)
    MSEs_val.append(MSE_val)
    MSEs_test.append(MSE_test)
    MEs_train.append(ME_train)
    MEs_val.append(ME_val)
    MEs_test.append(ME_test)

    print(f'{val_fold:4d} | {MSE_train:.4f} | {MSE_val:.4f} | {MSE_test:.4f} | {ME_train:.4f} | {ME_val:.4f} | {ME_test:.4f} | {X_LR.intercept_:7.4f} | {X_train.shape[0]:5d} | {X_val.shape[0]:5d} | {X_test.shape[0]:^4d} | {X_train.shape[1]:^5d} | {np.sum(np.abs(X_LR.coef_) > 0):^5d} |')

print(f'Mean | {np.mean(MSEs_train):.4f} | {np.mean(MSEs_val):.4f} | {np.mean(MSEs_test):.4f} | {np.mean(MEs_train):.4f} | {np.mean(MEs_val):.4f} | {np.mean(MEs_test):.4f} |')
print()


# Non-zero coefficients, largest magnitude first.
# NOTE: X_LR holds the fit from the last loop iteration (val_fold = 5) only.
coefficients = X_LR.coef_
feature_importances = list(zip(ARAUS_features, coefficients))
feature_importances.sort(key=lambda x: abs(x[1]), reverse=True)
for feature, importance in feature_importances:
    if float(importance)!=0:
        print(f"{feature}: {importance}")

     |    Mean squared error    |        Mean  error       |         |       # samples      | #     | # NZ 
Fold |--------+--------+--------|--------+--------+--------| Inter-  |-------+-------+------| feat- | feat-
     | Train  |   Val  |  Test  | Train  |   Val  |  Test  |  cept   | Train |  Val  | Test | ures  | ures 
-----+--------+--------+--------+--------+--------+--------+---------+-------+-------+------+-------+------
   1 | 0.1346 | 0.1333 | 0.0727 | 0.3002 | 0.2976 | 0.2285 |  0.5439 | 19160 |  5040 |  48  |  113  |  12   |
   2 | 0.1356 | 0.1317 | 0.0756 | 0.3014 | 0.2935 | 0.2313 |  0.1596 | 19160 |  5040 |  48  |  113  |  10   |
   3 | 0.1336 | 0.1430 | 0.0747 | 0.2992 | 0.3076 | 0.2307 |  0.7282 | 20160 |  4040 |  48  |  113  |  10   |
   4 | 0.1321 | 0.1424 | 0.0784 | 0.2963 | 0.3129 | 0.2384 |  0.4333 | 19160 |  5040 |  48  |  113  |  11   |
   5 | 0.1352 | 0.1341 | 0.0696 | 0.3006 | 0.3006 | 0.2212 |  0.4728 | 19160 |  5040 |  48  |  113  |  10   |
Mean | 0.1342 | 0.

## 2) EVENTFULNESS

### 2.1) With default parameters

In [49]:
from sklearn.linear_model import ElasticNet
# 5-fold cross-validation of an ElasticNet with default hyperparameters on
# eventfulness ('info.E_ground_truth').
model = ElasticNet()
print(f'Investigating performance of {model} model...')
MSEs_train = []
MSEs_val = []
MSEs_test = []
# NOTE: the "Mean error" columns hold mean *absolute* errors.
MEs_train = []
MEs_val = []
MEs_test = []

# The test set (fold 0) does not depend on the validation fold, so it is built once, outside the loop.
# The same 48 stimuli were shown to all participants, so the mean of their ratings is taken as the ground truth.
df_test = responses[responses['info.fold'] == 0].groupby(['info.soundscape','info.masker','info.smr']).mean()
Y_test = df_test['info.E_ground_truth'].values
X_test = df_test[ARAUS_features].values

print('     |    Mean squared error    |        Mean  error       |         |       # samples      | #     | # NZ ')
print('Fold |--------+--------+--------|--------+--------+--------| Inter-  |-------+-------+------| feat- | feat-')
print('     | Train  |   Val  |  Test  | Train  |   Val  |  Test  |  cept   | Train |  Val  | Test | ures  | ures ')
print('-----+--------+--------+--------+--------+--------+--------+---------+-------+-------+------+-------+------')
for val_fold in [1,2,3,4,5]:

    # Training set: every sample that is neither in the test set (fold 0) nor in the current validation fold.
    df_train = responses[(responses['info.fold'] != val_fold) & (responses['info.fold'] > 0)]
    df_val   = responses[responses['info.fold'] == val_fold]

    # Get ground-truth labels
    Y_train = df_train['info.E_ground_truth'].values
    Y_val = df_val['info.E_ground_truth'].values

    # Get features
    X_train = df_train[ARAUS_features].values
    X_val = df_val[ARAUS_features].values

    # Fit model (the same estimator object is refitted on every fold)
    X_LR = model.fit(X_train, Y_train)

    # Predict each split once; predictions are clipped with clip()'s default bounds before computing residuals.
    err_train = clip(X_LR.predict(X_train)) - Y_train
    err_val = clip(X_LR.predict(X_val)) - Y_val
    err_test = clip(X_LR.predict(X_test)) - Y_test

    # Mean squared errors and mean absolute errors
    MSE_train = np.mean(err_train**2)
    MSE_val = np.mean(err_val**2)
    MSE_test = np.mean(err_test**2)
    ME_train = np.mean(np.abs(err_train))
    ME_val = np.mean(np.abs(err_val))
    ME_test = np.mean(np.abs(err_test))

    # Add metrics
    MSEs_train.append(MSE_train)
    MSEs_val.append(MSE_val)
    MSEs_test.append(MSE_test)
    MEs_train.append(ME_train)
    MEs_val.append(ME_val)
    MEs_test.append(ME_test)

    print(f'{val_fold:4d} | {MSE_train:.4f} | {MSE_val:.4f} | {MSE_test:.4f} | {ME_train:.4f} | {ME_val:.4f} | {ME_test:.4f} | {X_LR.intercept_:7.4f} | {X_train.shape[0]:5d} | {X_val.shape[0]:5d} | {X_test.shape[0]:^4d} | {X_train.shape[1]:^5d} | {np.sum(np.abs(X_LR.coef_) > 0):^5d} |')

print(f'Mean | {np.mean(MSEs_train):.4f} | {np.mean(MSEs_val):.4f} | {np.mean(MSEs_test):.4f} | {np.mean(MEs_train):.4f} | {np.mean(MEs_val):.4f} | {np.mean(MEs_test):.4f} |')
print()

# Coefficients of the fitted model.
# NOTE: X_LR holds the fit from the last loop iteration (val_fold = 5) only.
coefficients = X_LR.coef_

# Pairing coefficients with feature names
feature_importances = list(zip(ARAUS_features, coefficients))

# Sorting feature importances by absolute value of coefficient
feature_importances.sort(key=lambda x: abs(x[1]), reverse=True)

# Displaying only the non-zero coefficients, as the pleasantness cells do;
# the zeroed-out ones are already summarised by the "# NZ features" column.
for feature, importance in feature_importances:
    if float(importance)!=0:
        print(f"{feature}: {importance}")

Investigating performance of ElasticNet() model...
     |    Mean squared error    |        Mean  error       |         |       # samples      | #     | # NZ 
Fold |--------+--------+--------|--------+--------+--------| Inter-  |-------+-------+------| feat- | feat-
     | Train  |   Val  |  Test  | Train  |   Val  |  Test  |  cept   | Train |  Val  | Test | ures  | ures 
-----+--------+--------+--------+--------+--------+--------+---------+-------+-------+------+-------+------
   1 | 0.1412 | 0.1287 | 0.0388 | 0.3132 | 0.2961 | 0.1677 | -0.2973 | 19160 |  5040 |  48  |  113  |   1   |
   2 | 0.1395 | 0.1365 | 0.0405 | 0.3111 | 0.3066 | 0.1716 | -0.3033 | 19160 |  5040 |  48  |  113  |   2   |
   3 | 0.1390 | 0.1343 | 0.0394 | 0.3102 | 0.3043 | 0.1688 | -0.3629 | 20160 |  4040 |  48  |  113  |   2   |
   4 | 0.1391 | 0.1400 | 0.0414 | 0.3107 | 0.3109 | 0.1736 | -0.2811 | 19160 |  5040 |  48  |  113  |   1   |
   5 | 0.1352 | 0.1563 | 0.0437 | 0.3052 | 0.3338 | 0.1793 | -0.2872 | 19160 

### 2.2) Defining parameters

In [50]:
from sklearn.linear_model import ElasticNet
# 5-fold cross-validation of an ElasticNet with hand-picked hyperparameters on
# eventfulness ('info.E_ground_truth').
alpha = 0.1  # Example value, adjust as needed
l1_ratio = 0.7  # Example value, adjust as needed
model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
print(f'Investigating performance of {model} model...')
MSEs_train = []
MSEs_val = []
MSEs_test = []
# NOTE: the "Mean error" columns hold mean *absolute* errors.
MEs_train = []
MEs_val = []
MEs_test = []

# The test set (fold 0) does not depend on the validation fold, so it is built once, outside the loop.
# The same 48 stimuli were shown to all participants, so the mean of their ratings is taken as the ground truth.
df_test = responses[responses['info.fold'] == 0].groupby(['info.soundscape','info.masker','info.smr']).mean()
Y_test = df_test['info.E_ground_truth'].values
X_test = df_test[ARAUS_features].values

print('     |    Mean squared error    |        Mean  error       |         |       # samples      | #     | # NZ ')
print('Fold |--------+--------+--------|--------+--------+--------| Inter-  |-------+-------+------| feat- | feat-')
print('     | Train  |   Val  |  Test  | Train  |   Val  |  Test  |  cept   | Train |  Val  | Test | ures  | ures ')
print('-----+--------+--------+--------+--------+--------+--------+---------+-------+-------+------+-------+------')
for val_fold in [1,2,3,4,5]:

    # Training set: every sample that is neither in the test set (fold 0) nor in the current validation fold.
    df_train = responses[(responses['info.fold'] != val_fold) & (responses['info.fold'] > 0)]
    df_val   = responses[responses['info.fold'] == val_fold]

    # Get ground-truth labels
    Y_train = df_train['info.E_ground_truth'].values
    Y_val = df_val['info.E_ground_truth'].values

    # Get features
    X_train = df_train[ARAUS_features].values
    X_val = df_val[ARAUS_features].values

    # Fit model (the same estimator object is refitted on every fold)
    X_LR = model.fit(X_train, Y_train)

    # Predict each split once; predictions are clipped with clip()'s default bounds before computing residuals.
    err_train = clip(X_LR.predict(X_train)) - Y_train
    err_val = clip(X_LR.predict(X_val)) - Y_val
    err_test = clip(X_LR.predict(X_test)) - Y_test

    # Mean squared errors and mean absolute errors
    MSE_train = np.mean(err_train**2)
    MSE_val = np.mean(err_val**2)
    MSE_test = np.mean(err_test**2)
    ME_train = np.mean(np.abs(err_train))
    ME_val = np.mean(np.abs(err_val))
    ME_test = np.mean(np.abs(err_test))

    # Add metrics
    MSEs_train.append(MSE_train)
    MSEs_val.append(MSE_val)
    MSEs_test.append(MSE_test)
    MEs_train.append(ME_train)
    MEs_val.append(ME_val)
    MEs_test.append(ME_test)

    print(f'{val_fold:4d} | {MSE_train:.4f} | {MSE_val:.4f} | {MSE_test:.4f} | {ME_train:.4f} | {ME_val:.4f} | {ME_test:.4f} | {X_LR.intercept_:7.4f} | {X_train.shape[0]:5d} | {X_val.shape[0]:5d} | {X_test.shape[0]:^4d} | {X_train.shape[1]:^5d} | {np.sum(np.abs(X_LR.coef_) > 0):^5d} |')

print(f'Mean | {np.mean(MSEs_train):.4f} | {np.mean(MSEs_val):.4f} | {np.mean(MSEs_test):.4f} | {np.mean(MEs_train):.4f} | {np.mean(MEs_val):.4f} | {np.mean(MEs_test):.4f} |')
print()

# Coefficients of the fitted model.
# NOTE: X_LR holds the fit from the last loop iteration (val_fold = 5) only.
coefficients = X_LR.coef_

# Pairing coefficients with feature names
feature_importances = list(zip(ARAUS_features, coefficients))

# Sorting feature importances by absolute value of coefficient
feature_importances.sort(key=lambda x: abs(x[1]), reverse=True)

# Displaying only the non-zero coefficients, as the pleasantness cells do;
# the zeroed-out ones are already summarised by the "# NZ features" column.
for feature, importance in feature_importances:
    if float(importance)!=0:
        print(f"{feature}: {importance}")

Investigating performance of ElasticNet(alpha=0.1, l1_ratio=0.7) model...
     |    Mean squared error    |        Mean  error       |         |       # samples      | #     | # NZ 
Fold |--------+--------+--------|--------+--------+--------| Inter-  |-------+-------+------| feat- | feat-
     | Train  |   Val  |  Test  | Train  |   Val  |  Test  |  cept   | Train |  Val  | Test | ures  | ures 
-----+--------+--------+--------+--------+--------+--------+---------+-------+-------+------+-------+------
   1 | 0.1340 | 0.1232 | 0.0311 | 0.3024 | 0.2873 | 0.1411 | -1.3419 | 19160 |  5040 |  48  |  113  |   9   |
   2 | 0.1309 | 0.1343 | 0.0370 | 0.2985 | 0.2991 | 0.1505 | -1.6071 | 19160 |  5040 |  48  |  113  |   7   |
   3 | 0.1317 | 0.1335 | 0.0352 | 0.2991 | 0.2999 | 0.1469 | -1.5710 | 20160 |  4040 |  48  |  113  |   9   |
   4 | 0.1319 | 0.1313 | 0.0333 | 0.2999 | 0.2991 | 0.1439 | -1.4248 | 19160 |  5040 |  48  |  113  |   9   |
   5 | 0.1274 | 0.1479 | 0.0346 | 0.2937 | 0.3232 | 0.