In [223]:
import sklearn.linear_model
from sklearn.preprocessing import RobustScaler
import numpy as np
import pandas as pd
import os
from SoundLights.dataset.features_groups import  general_info, ARAUS_features, Freesound_features, mix_features, masker_features
from SoundLights.models.models_functions import clip, normalize_columns, normalize_columns_minmax

In [224]:
def prepare_data_models(dataframe, features_evaluated, masker_transform:str="None", maskers_gain: float = 1):

    # Drop string columns
    """dataframe = dataframe.drop("info.file", axis=1)
    dataframe = dataframe.drop("info.participant", axis=1)"""

    # Maskers colum, increase values
    if(masker_transform=="-1,1"):
        dataframe["info.masker_bird"] = (dataframe["info.masker_bird"]*2-1) * maskers_gain
        dataframe["info.masker_construction"] = (
            (dataframe["info.masker_construction"]*2-1) * maskers_gain
        )
        dataframe["info.masker_traffic"] = (dataframe["info.masker_traffic"]*2-1) * maskers_gain
        dataframe["info.masker_silence"] = (dataframe["info.masker_silence"]*2-1) * maskers_gain
        dataframe["info.masker_water"] = (dataframe["info.masker_water"]*2-1) * maskers_gain
        dataframe["info.masker_wind"] = (dataframe["info.masker_wind"]*2-1) * maskers_gain
    else:
        dataframe["info.masker_bird"] = (dataframe["info.masker_bird"]) * maskers_gain
        dataframe["info.masker_construction"] = (
            dataframe["info.masker_construction"] * maskers_gain
        )
        dataframe["info.masker_traffic"] = dataframe["info.masker_traffic"] * maskers_gain
        dataframe["info.masker_silence"] = dataframe["info.masker_silence"] * maskers_gain
        dataframe["info.masker_water"] = dataframe["info.masker_water"] * maskers_gain
        dataframe["info.masker_wind"] = dataframe["info.masker_wind"] * maskers_gain

    # For fold 0, group data
    dataframe_fold0 = dataframe[dataframe["info.fold"] == 0]
    # Drop string columns
    dataframe_fold0 = dataframe_fold0.drop("info.file", axis=1)
    dataframe_fold0 = dataframe_fold0.drop("info.participant", axis=1)
    dataframe_fold0 = dataframe_fold0.groupby(
        ["info.soundscape", "info.masker", "info.smr"]
    ).mean()  # For the test set, the same 48 stimuli were shown to all participants so we take the mean of their ratings as the ground truth
    dataframe_filtered = dataframe[
        dataframe["info.fold"] != 0
    ]  # Filter rows where 'fold' column is not equal to 0
    dataframe = pd.concat(
        [dataframe_fold0, dataframe_filtered], ignore_index=True
    )  # Join together

    # Drop columns with all equal values or std=0
    std = np.std(dataframe[features_evaluated], axis=0)
    columns_to_mantain_arg = np.where(std >= 0.00001)[0]
    columns_to_drop_arg = np.where(std <= 0.00001)[0]
    columns_to_mantain = [features_evaluated[i] for i in columns_to_mantain_arg]
    columns_to_drop = [features_evaluated[i] for i in columns_to_drop_arg]
    # print(features_evaluated[np.where(std == 0)[0]])
    dataframe.drop(columns=columns_to_drop, inplace=True)

    return dataframe, columns_to_mantain



## PREPARE DATA 

#### Input general dataframe (folds 0,1,2,3,4,5)

In [225]:
df= pd.read_csv('../data/main_files/SoundLights_complete.csv')

print(df.info())
""" responses_Freesound, features=prepare_data_models(responses_Freesound, Freesound_features)
print(responses_Freesound) """


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25440 entries, 0 to 25439
Columns: 285 entries, CLAP to freesound.rhythm.bpm
dtypes: float64(261), int64(19), object(5)
memory usage: 55.3+ MB
None


' responses_Freesound, features=prepare_data_models(responses_Freesound, Freesound_features)\nprint(responses_Freesound) '

In [226]:
df_ARAUS=df[general_info+ARAUS_features]
df_Freesound=df[general_info+Freesound_features]
df_clap=df[general_info+["CLAP"]]

#### Input dataframe of new audios to validate (fold 6)

In [227]:
df_real= pd.read_csv('../data/main_files/SoundLights_fold6.csv')
only_important_features=ARAUS_features+Freesound_features+["CLAP", "info.P_ground_truth", "info.E_ground_truth"]+["info.masker_bird","info.masker_construction","info.masker_silence","info.masker_traffic", "info.masker_water","info.masker_wind"]
df_fold6=df_real[only_important_features]

print(df_fold6.columns)

Index(['ARAUS.sharpness.avg', 'ARAUS.sharpness.max', 'ARAUS.sharpness.p05',
       'ARAUS.sharpness.p10', 'ARAUS.sharpness.p20', 'ARAUS.sharpness.p30',
       'ARAUS.sharpness.p40', 'ARAUS.sharpness.p50', 'ARAUS.sharpness.p60',
       'ARAUS.sharpness.p70',
       ...
       'freesound.rhythm.bpm', 'CLAP', 'info.P_ground_truth',
       'info.E_ground_truth', 'info.masker_bird', 'info.masker_construction',
       'info.masker_silence', 'info.masker_traffic', 'info.masker_water',
       'info.masker_wind'],
      dtype='object', length=265)


## Check for best parameters

In [228]:
masker_gain=20
masker_transform="None"
df_to_use,features_to_use=prepare_data_models(df_ARAUS.copy(), ARAUS_features,masker_transform, masker_gain)
features_to_use=features_to_use+["info.masker_bird","info.masker_construction","info.masker_silence","info.masker_traffic", "info.masker_water","info.masker_wind"]

if(masker_transform=="-1,1"):
    df_fold6["info.masker_bird"]=(df_fold6["info.masker_bird"]*2-1)*masker_gain
    df_fold6["info.masker_construction"]=(df_fold6["info.masker_construction"]*2-1)*masker_gain
    df_fold6["info.masker_silence"]=(df_fold6["info.masker_silence"]*2-1)*masker_gain
    df_fold6["info.masker_traffic"]=(df_fold6["info.masker_traffic"]*2-1)*masker_gain
    df_fold6["info.masker_water"]=(df_fold6["info.masker_water"]*2-1)*masker_gain
    df_fold6["info.masker_wind"]=(df_fold6["info.masker_wind"]*2-1)*masker_gain
else:
    df_fold6["info.masker_bird"]=df_fold6["info.masker_bird"]*masker_gain
    df_fold6["info.masker_construction"]=df_fold6["info.masker_construction"]*masker_gain
    df_fold6["info.masker_silence"]=df_fold6["info.masker_silence"]*masker_gain
    df_fold6["info.masker_traffic"]=df_fold6["info.masker_traffic"]*masker_gain
    df_fold6["info.masker_water"]=df_fold6["info.masker_water"]*masker_gain
    df_fold6["info.masker_wind"]=df_fold6["info.masker_wind"]*masker_gain
#print(df_to_use[["info.masker_bird","info.masker_construction","info.masker_silence","info.masker_traffic", "info.masker_water","info.masker_wind"]])
#features_to_use=ARAUS_features

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fold6["info.masker_bird"]=df_fold6["info.masker_bird"]*masker_gain
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fold6["info.masker_construction"]=df_fold6["info.masker_construction"]*masker_gain
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fold6["info.masker_silence"]=df_fold6["info.ma

### Adjust alpha

In [229]:
import warnings
from sklearn.exceptions import ConvergenceWarning
# Suppress ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

from sklearn.linear_model import ElasticNet
# Define your ElasticNet model with specific hyperparameters
alpha = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1] 
l1_ratio = 0.5
print('     |         Mean squared error        |             Mean  error            |')
print('Fold |--------+--------+--------+--------|--------+--------+--------|---------|')
print('     | Train  |   Val  |  Test  |Test(f6)| Train  |   Val  |  Test  | Test(f6)|')
print('-----+--------+--------+--------+--------+--------+--------+--------+----------')


prev_mean=9999
for value in alpha:

    model = ElasticNet(alpha=value, l1_ratio=l1_ratio, selection="random")
    #print(f'Investigating performance of {model} model...')

    MSEs_train = []
    MSEs_val = []
    MSEs_test = []
    MSEs_fold6 = []
    MEs_train = []
    MEs_val = []
    MEs_test = []
    MEs_fold6 = []

    
    for val_fold in [1,2,3,4,5]:
        
        # Extract dataframes
        df_train = df_to_use[(df_to_use['info.fold'] != val_fold) & (df_to_use['info.fold'] > 0)] # For the training set, use all samples that are not in the test set (fold 0) and current validation fold.
        df_val   = df_to_use[df_to_use['info.fold'] == val_fold]
        df_test  = df_to_use[df_to_use['info.fold'] == 0] 

        # Get ground-truth labels
        Y_train = df_train['info.P_ground_truth'].values
        Y_val = df_val['info.P_ground_truth'].values
        Y_test = df_test['info.P_ground_truth'].values
        Y_fold6 = df_fold6['info.P_ground_truth'].values


        # Get feature matrices
        X_train = df_train[features_to_use].values
        X_val =df_val[features_to_use].values
        X_test = df_test[features_to_use].values
        X_fold6 = df_fold6[features_to_use].values

        # Get features normalized_data = (data - mean) / (std)
        """ X_train, mean, std=normalize_columns(X_train)
        X_val= (X_val - mean) / (std)
        X_test= (X_test - mean) / (std)
        X_fold6= (X_fold6 - mean) / (std) """
        # Get features normalized_data = (data - min) / (max-min)
        """ X_train, min, max=normalize_columns_minmax(X_train)
        X_val= (X_val - min) / (max - min)
        X_test= (X_test - min) / (max - min)
        X_fold6= (X_fold6 - min) / (max - min) """

        # Fit model
        X_LR = model.fit(X_train, Y_train)

        # Get MSEs
        MSE_train = np.mean((clip(X_LR.predict(X_train)) - Y_train)**2)
        MSE_val = np.mean((clip(X_LR.predict(X_val)) - Y_val)**2)
        MSE_test = np.mean((clip(X_LR.predict(X_test)) - Y_test)**2)
        MSE_fold6 = np.mean((clip(X_LR.predict(X_fold6)) - Y_fold6)**2)
        ME_train = np.mean(np.abs(clip(X_LR.predict(X_train)) - Y_train))
        ME_val = np.mean(np.abs(clip(X_LR.predict(X_val)) - Y_val))
        ME_test = np.mean(np.abs(clip(X_LR.predict(X_test)) - Y_test))
        ME_fold6 = np.mean(np.abs(clip(X_LR.predict(X_fold6)) - Y_fold6))

        # Add metrics
        MSEs_train.append(MSE_train)
        MSEs_val.append(MSE_val)
        MSEs_test.append(MSE_test)
        MSEs_fold6.append(MSE_fold6)
        MEs_train.append(ME_train)
        MEs_val.append(ME_val)
        MEs_test.append(ME_test)
        MEs_fold6.append(ME_fold6)

        #print(f'{val_fold:4d} | {MSE_train:.4f} | {MSE_val:.4f} | {MSE_test:.4f} | {ME_train:.4f} | {ME_val:.4f} | {ME_test:.4f} | {X_LR.intercept_:7.4f} | {X_train.shape[0]:5d} | {X_val.shape[0]:5d} | {X_test.shape[0]:^4d} | {X_train.shape[1]:^5d} | {np.sum(np.abs(X_LR.coef_) > 0):^5d} |')
    print("Parameters ",value, l1_ratio )
    print(f'Mean | {np.mean(MSEs_train):.4f} | {np.mean(MSEs_val):.4f} | {np.mean(MSEs_test):.4f} | {np.mean(MSEs_fold6):.4f} | {np.mean(MEs_train):.4f} | {np.mean(MEs_val):.4f} | {np.mean(MEs_test):.4f} | {np.mean(MEs_fold6):.4f} |')

    current_mean=(np.mean(MEs_test)+np.mean(MEs_fold6))/2
    if current_mean<prev_mean:
        prev_mean=current_mean
        chosen=(value, l1_ratio)

    
print("Best parameters were ", chosen, " giving a mean of ", prev_mean)

     |         Mean squared error        |             Mean  error            |
Fold |--------+--------+--------+--------|--------+--------+--------|---------|
     | Train  |   Val  |  Test  |Test(f6)| Train  |   Val  |  Test  | Test(f6)|
-----+--------+--------+--------+--------+--------+--------+--------+----------
Parameters  0.1 0.5
Mean | 0.1239 | 0.1264 | 0.0822 | 0.1594 | 0.2863 | 0.2892 | 0.2486 | 0.3150 |
Parameters  0.2 0.5
Mean | 0.1250 | 0.1272 | 0.0809 | 0.1229 | 0.2882 | 0.2907 | 0.2445 | 0.2820 |
Parameters  0.3 0.5
Mean | 0.1266 | 0.1287 | 0.0809 | 0.0960 | 0.2907 | 0.2930 | 0.2419 | 0.2574 |
Parameters  0.4 0.5
Mean | 0.1286 | 0.1304 | 0.0815 | 0.0754 | 0.2936 | 0.2957 | 0.2406 | 0.2358 |
Parameters  0.5 0.5
Mean | 0.1307 | 0.1325 | 0.0825 | 0.0621 | 0.2966 | 0.2987 | 0.2401 | 0.2150 |
Parameters  0.6 0.5
Mean | 0.1330 | 0.1345 | 0.0837 | 0.0571 | 0.2997 | 0.3014 | 0.2402 | 0.2037 |
Parameters  0.7 0.5
Mean | 0.1354 | 0.1364 | 0.0852 | 0.0578 | 0.3028 | 0.3039 | 0.241

### Adjust l1_ratio

In [230]:
import warnings
from sklearn.exceptions import ConvergenceWarning
# Suppress ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

from sklearn.linear_model import ElasticNet
# Define your ElasticNet model with specific hyperparameters
alpha = 0.7
l1_ratio = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9] #0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
print('     |         Mean squared error        |             Mean  error            |')
print('Fold |--------+--------+--------+--------|--------+--------+--------|---------|')
print('     | Train  |   Val  |  Test  |Test(f6)| Train  |   Val  |  Test  | Test(f6)|')
print('-----+--------+--------+--------+--------+--------+--------+--------+----------')


prev_mean=9999
for value in l1_ratio:

    model = ElasticNet(alpha=alpha, l1_ratio=value, selection="random")
    #print(f'Investigating performance of {model} model...')

    MSEs_train = []
    MSEs_val = []
    MSEs_test = []
    MSEs_fold6 = []
    MEs_train = []
    MEs_val = []
    MEs_test = []
    MEs_fold6 = []

    
    for val_fold in [1,2,3,4,5]:

        
        # Extract dataframes
        df_train = df_to_use[(df_to_use['info.fold'] != val_fold) & (df_to_use['info.fold'] > 0)] # For the training set, use all samples that are not in the test set (fold 0) and current validation fold.
        df_val   = df_to_use[df_to_use['info.fold'] == val_fold]
        df_test  = df_to_use[df_to_use['info.fold'] == 0] 

        # Get ground-truth labels
        Y_train = df_train['info.P_ground_truth'].values
        Y_val = df_val['info.P_ground_truth'].values
        Y_test = df_test['info.P_ground_truth'].values
        Y_fold6 = df_fold6['info.P_ground_truth'].values


        # Get feature matrices
        X_train = df_train[features_to_use].values
        X_val =df_val[features_to_use].values
        X_test = df_test[features_to_use].values
        X_fold6 = df_fold6[features_to_use].values

       # Get features normalized_data = (data - mean) / (std)
        """ X_train, mean, std=normalize_columns(X_train)
        X_val= (X_val - mean) / (std)
        X_test= (X_test - mean) / (std)
        X_fold6= (X_fold6 - mean) / (std) """
        # Get features normalized_data = (data - min) / (max-min)
        """ X_train, min, max=normalize_columns_minmax(X_train)
        X_val= (X_val - min) / (max - min)
        X_test= (X_test - min) / (max - min)
        X_fold6= (X_fold6 - min) / (max - min) """

        # Fit model
        X_LR = model.fit(X_train, Y_train)

        # Get MSEs
        MSE_train = np.mean((clip(X_LR.predict(X_train)) - Y_train)**2)
        MSE_val = np.mean((clip(X_LR.predict(X_val)) - Y_val)**2)
        MSE_test = np.mean((clip(X_LR.predict(X_test)) - Y_test)**2)
        MSE_fold6 = np.mean((clip(X_LR.predict(X_fold6)) - Y_fold6)**2)
        ME_train = np.mean(np.abs(clip(X_LR.predict(X_train)) - Y_train))
        ME_val = np.mean(np.abs(clip(X_LR.predict(X_val)) - Y_val))
        ME_test = np.mean(np.abs(clip(X_LR.predict(X_test)) - Y_test))
        ME_fold6 = np.mean(np.abs(clip(X_LR.predict(X_fold6)) - Y_fold6))

        # Add metrics
        MSEs_train.append(MSE_train)
        MSEs_val.append(MSE_val)
        MSEs_test.append(MSE_test)
        MSEs_fold6.append(MSE_fold6)
        MEs_train.append(ME_train)
        MEs_val.append(ME_val)
        MEs_test.append(ME_test)
        MEs_fold6.append(ME_fold6)

        #print(f'{val_fold:4d} | {MSE_train:.4f} | {MSE_val:.4f} | {MSE_test:.4f} | {ME_train:.4f} | {ME_val:.4f} | {ME_test:.4f} | {X_LR.intercept_:7.4f} | {X_train.shape[0]:5d} | {X_val.shape[0]:5d} | {X_test.shape[0]:^4d} | {X_train.shape[1]:^5d} | {np.sum(np.abs(X_LR.coef_) > 0):^5d} |')
    print("Parameters ",alpha, value )
    print(f'Mean | {np.mean(MSEs_train):.4f} | {np.mean(MSEs_val):.4f} | {np.mean(MSEs_test):.4f} | {np.mean(MSEs_fold6):.4f} | {np.mean(MEs_train):.4f} | {np.mean(MEs_val):.4f} | {np.mean(MEs_test):.4f} | {np.mean(MEs_fold6):.4f} |')
    
    current_mean=(np.mean(MEs_test)+np.mean(MEs_fold6))/2
    if current_mean<prev_mean:
        prev_mean=current_mean
        chosen=(alpha, value)

    
print("Best parameters were ", chosen, " giving a mean of ", prev_mean)

     |         Mean squared error        |             Mean  error            |
Fold |--------+--------+--------+--------|--------+--------+--------|---------|
     | Train  |   Val  |  Test  |Test(f6)| Train  |   Val  |  Test  | Test(f6)|
-----+--------+--------+--------+--------+--------+--------+--------+----------
Parameters  0.7 0.1
Mean | 0.1244 | 0.1267 | 0.0812 | 0.1398 | 0.2871 | 0.2898 | 0.2460 | 0.2977 |
Parameters  0.7 0.2
Mean | 0.1263 | 0.1284 | 0.0808 | 0.0992 | 0.2903 | 0.2926 | 0.2422 | 0.2605 |
Parameters  0.7 0.3
Mean | 0.1290 | 0.1308 | 0.0817 | 0.0717 | 0.2943 | 0.2963 | 0.2405 | 0.2310 |
Parameters  0.7 0.4
Mean | 0.1321 | 0.1337 | 0.0831 | 0.0582 | 0.2985 | 0.3003 | 0.2400 | 0.2076 |
Parameters  0.7 0.5
Mean | 0.1354 | 0.1364 | 0.0852 | 0.0578 | 0.3028 | 0.3039 | 0.2415 | 0.2008 |
Parameters  0.7 0.6
Mean | 0.1376 | 0.1384 | 0.0869 | 0.0613 | 0.3058 | 0.3066 | 0.2440 | 0.2057 |
Parameters  0.7 0.7
Mean | 0.1394 | 0.1400 | 0.0892 | 0.0639 | 0.3081 | 0.3088 | 0.247