In [103]:
import sklearn.linear_model
from sklearn.preprocessing import RobustScaler
import numpy as np
import pandas as pd
import os
from SoundLights.dataset.features_groups import  general_info, ARAUS_features, Freesound_features, mix_features, masker_features, clap_features
from SoundLights.models.models_functions import clip, normalize_columns, normalize_columns_minmax

In [104]:
def prepare_data_models(dataframe, features_evaluated, masker_transform:str="None", maskers_gain: float = 1, verbose: bool = True):
    """Rescale masker columns, average fold-0 rows per stimulus and drop constant features.

    Steps, in order:

    1. The six ``info.masker_*`` columns are rescaled. With
       ``masker_transform == "-1,1"`` every value ``v`` becomes
       ``(v * 2 - 1) * maskers_gain``; with any other value it becomes
       ``v * maskers_gain``.
    2. Rows with ``info.fold == 0`` lose ``info.file`` and
       ``info.participant`` and are averaged per
       (``info.soundscape``, ``info.masker``, ``info.smr``) group.
    3. The averaged fold-0 rows are stacked on top of all remaining rows
       under a fresh integer index. The grouping keys end up in the groupby
       index and are discarded by that re-indexing, so columns that only
       exist in the remaining rows are NaN for the fold-0 rows.
    4. Every column of ``features_evaluated`` whose ``np.std`` over the
       stacked frame is below 1e-5 is removed from the frame.

    Args:
        dataframe: Table containing the ``info.*`` columns used above plus
            every column named in ``features_evaluated``. Its masker columns
            are overwritten in place (step 1); pass a copy to keep the
            caller's frame intact.
        features_evaluated: List of candidate feature column names.
        masker_transform: ``"-1,1"`` selects the ``v * 2 - 1`` mapping before
            the gain is applied; anything else applies the gain only.
        maskers_gain: Multiplier applied to the masker columns.
        verbose: When True (default) print diagnostic summaries of the
            intermediate frames.

    Returns:
        Tuple ``(dataframe, columns_to_mantain)``: the stacked frame without
        the near-constant feature columns, and the feature names (in their
        original order) whose standard deviation is at least 1e-5.
    """
    separator = " ----------------------------- "

    def _report(label, frame):
        # DataFrame.info() prints its summary itself and returns None, so it
        # must not be passed to print() (that used to print a stray "None").
        if verbose:
            print(label)
            frame.info()
            print(separator)

    # Maskers columns, increase values
    masker_columns = (
        "info.masker_bird",
        "info.masker_construction",
        "info.masker_traffic",
        "info.masker_silence",
        "info.masker_water",
        "info.masker_wind",
    )
    to_signed_range = masker_transform == "-1,1"
    for column in masker_columns:
        values = dataframe[column]
        if to_signed_range:
            values = values * 2 - 1
        dataframe[column] = values * maskers_gain

    # For fold 0, group data.
    # For the test set, the same 48 stimuli were shown to all participants so
    # we take the mean of their ratings as the ground truth.
    is_fold0 = dataframe["info.fold"] == 0
    dataframe_fold0 = dataframe[is_fold0]
    _report("\n dataframe fold 0 before anything", dataframe_fold0)
    dataframe_fold0 = (
        dataframe_fold0
        .drop(columns=["info.file", "info.participant"])  # string columns cannot be averaged
        .groupby(["info.soundscape", "info.masker", "info.smr"])
        .mean()
    )
    _report("\n dataframe fold 0 after drop and groupby", dataframe_fold0)

    # Every row whose fold is not 0 is kept as-is
    dataframe_filtered = dataframe[~is_fold0]
    _report("\n dataframe fildered info", dataframe_filtered)

    # Join together
    dataframe = pd.concat([dataframe_fold0, dataframe_filtered], ignore_index=True)
    if verbose:
        print("\n dataframe concat", dataframe.columns)
        print(separator)

    # Drop columns with all equal values or std=0
    min_std = 0.00001
    std = np.std(dataframe[features_evaluated], axis=0)
    columns_to_mantain = [features_evaluated[i] for i in np.where(std >= min_std)[0]]
    columns_to_drop = [features_evaluated[i] for i in np.where(std < min_std)[0]]
    if verbose:
        print("columns to drop ", columns_to_drop)
        print(separator)
    dataframe = dataframe.drop(columns=columns_to_drop)

    return dataframe, columns_to_mantain



## PREPARE DATA 

#### Input general dataframe (folds 0,1,2,3,4,5)

In [105]:
# Load the complete table (the section heading describes it as folds 0-5).
df = pd.read_csv('../data/main_files/SoundLights_complete.csv')

# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25440 entries, 0 to 25439
Columns: 285 entries, CLAP to freesound.rhythm.bpm
dtypes: float64(261), int64(19), object(5)
memory usage: 55.3+ MB
None


### Split into sections of data to use (ARAUS, Freesound or CLAP)

In [106]:
# One view per feature family: the shared metadata columns plus that family's columns.
df_ARAUS = df[general_info + ARAUS_features]
df_Freesound = df[general_info + Freesound_features]
df_clap = df[general_info + ["CLAP"]]

# Each "CLAP" cell is a string; the numbers are the comma-separated text that
# follows the second "[" up to the next "]". Expand them into one float column
# per value, named after clap_features.
all_columns = general_info + clap_features
full_list = []
for _, clap_row in df_clap.iterrows():
    embedding_text = clap_row["CLAP"].split("[")[2].split("]")[0]
    clap_values = [float(token) for token in embedding_text.split(",")]
    # To experiment with a shorter embedding, slice clap_values here (e.g. clap_values[0:101]).
    full_list.append(list(clap_row[general_info].values) + clap_values)
df_clap = pd.DataFrame(data=full_list, columns=all_columns)


#### Input dataframe of new audios to validate (fold 6)

In [107]:
df_real = pd.read_csv('../data/main_files/SoundLights_fold6.csv')

# Columns copied over unchanged; "CLAP" is handled separately below.
fold6_plain_columns = (
    ARAUS_features
    + Freesound_features
    + masker_features
    + ["info.P_ground_truth", "info.E_ground_truth"]
)
df_fold6 = df_real[fold6_plain_columns + ["CLAP"]]
all_columns = fold6_plain_columns + clap_features

# Adapt CLAP features: in this file the numbers follow the first "[" of the
# string and run up to the next "]"; expand them into one float column each.
full_list = []
for _, fold6_row in df_fold6.iterrows():
    embedding_text = fold6_row["CLAP"].split("[")[1].split("]")[0]
    clap_values = [float(token) for token in embedding_text.split(",")]
    # To experiment with a shorter embedding, slice clap_values here (e.g. clap_values[0:101]).
    full_list.append(list(fold6_row[fold6_plain_columns].values) + clap_values)
df_fold6 = pd.DataFrame(data=full_list, columns=all_columns)


### Select data to evaluate and adapt masker features if desired

Here is where changes have to be made to try different configurations

1) Which dataframe/set of features to evaluate. Change input df and features to prepare_data_models()
2) Decide if maskers are used or not by adding them to features_to_use
3) Decide if maskers are transformed by changing masker_gain and/or masker_transform
4) To add normalization, uncomment the normalization code in the evaluation cells

In [117]:
#!!!!!! CHANGE
masker_gain=1
masker_transform="None" #"-1,1"
use_masker_features=False  # True appends the six masker columns to the model inputs
#!!!!!! CHANGE

masker_columns=["info.masker_bird","info.masker_construction","info.masker_silence","info.masker_traffic", "info.masker_water","info.masker_wind"]

df_to_use,features_to_use=prepare_data_models(df_clap.copy(), clap_features,masker_transform, masker_gain) #!!!!!! CHANGE DATAFRAME AND FEATURES

if use_masker_features:
    features_to_use=features_to_use+masker_columns

pd.options.mode.chained_assignment = None  # Ignore warning, default='warn'

# Apply the same masker scaling to the fold-6 frame. The values are always
# derived from df_real (the CSV frame df_fold6 was built from, row for row,
# and which is never modified) instead of from df_fold6 itself. Rescaling
# df_fold6 from its own current values compounded the transform every time
# this cell was re-run with a gain != 1 or with the "-1,1" mapping.
for column in masker_columns:
    raw_values=df_real[column].to_numpy()  # positional, df_fold6 has a fresh index
    if(masker_transform=="-1,1"):
        raw_values=raw_values*2-1
    df_fold6[column]=raw_values*masker_gain

<class 'pandas.core.frame.DataFrame'>
Int64Index: 240 entries, 0 to 239
Columns: 540 entries, info.file to clap_511
dtypes: float64(517), int64(19), object(4)
memory usage: 1014.4+ KB

 dataframe fold 0 before anything None
 ----------------------------- 
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 48 entries, ('R1001_segment_binaural_44100.wav', 'bird_10001.wav', 0) to ('R1008_segment_binaural_44100.wav', 'wind_10001.wav', 0)
Columns: 535 entries, info.fold to clap_511
dtypes: float64(535)
memory usage: 201.1+ KB

 dataframe fold 0 after drop and groupby None
 ----------------------------- 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 25200 entries, 240 to 25439
Columns: 540 entries, info.file to clap_511
dtypes: float64(517), int64(19), object(4)
memory usage: 104.0+ MB

 dataframe fildered info None
 ----------------------------- 

 dataframe concat Index(['info.fold', 'info.stimulus_index', 'info.wav_gain', 'info.time_taken',
       'info.is_attention', 'info.pleasant', 

## Check for best parameters - ARAUS or Freesound features

### Adjust n_neighbors

In [118]:
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.neighbors import KNeighborsRegressor

# Suppress ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)


def _clipped_errors(fitted_model, X, Y):
    """Return (mean squared error, mean absolute error) of the clipped predictions.

    The model predicts once and both metrics are computed from the same
    residuals; predicting separately per metric doubled the KNN query cost.
    Assumes clip() is a pure function of its input -- TODO confirm.
    """
    residuals = clip(fitted_model.predict(X)) - Y
    return np.mean(residuals**2), np.mean(np.abs(residuals))


print('     |         Mean squared error        |             Mean  error            |')
print('Fold |--------+--------+--------+--------|--------+--------+--------|---------|')
print('     | Train  |   Val  |  Test  |Test(f6)| Train  |   Val  |  Test  | Test(f6)|')
print('-----+--------+--------+--------+--------+--------+--------+--------+----------')
n_neighbors=[10,20,50,100, 150, 180, 200, 250, 300, 350, 400, 500]

prev_mean=9999
chosen=None  # stays None if no candidate scores below the initial prev_mean
for value in n_neighbors:

    model = KNeighborsRegressor(n_neighbors=value) #, weights="distance"

    MSEs_train = []
    MSEs_val = []
    MSEs_test = []
    MSEs_fold6 = []
    MEs_train = []
    MEs_val = []
    MEs_test = []
    MEs_fold6 = []

    for val_fold in [1,2,3,4,5]:

        # Extract dataframes
        df_train = df_to_use[(df_to_use['info.fold'] != val_fold) & (df_to_use['info.fold'] > 0)] # For the training set, use all samples that are not in the test set (fold 0) and current validation fold.
        df_val   = df_to_use[df_to_use['info.fold'] == val_fold]
        df_test  = df_to_use[df_to_use['info.fold'] == 0]

        # Get ground-truth labels
        Y_train = df_train['info.P_ground_truth'].values
        Y_val = df_val['info.P_ground_truth'].values
        Y_test = df_test['info.P_ground_truth'].values
        Y_fold6 = df_fold6['info.P_ground_truth'].values

        # Get feature matrices (re-extracted for every fold so that the
        # optional per-fold normalization below always starts from raw values)
        X_train = df_train[features_to_use].values
        X_val = df_val[features_to_use].values
        X_test = df_test[features_to_use].values
        X_fold6 = df_fold6[features_to_use].values

        # Optional: get features normalized_data = (data - mean) / (std)
        # X_train, mean, std=normalize_columns(X_train)
        # X_val= (X_val - mean) / (std)
        # X_test= (X_test - mean) / (std)
        # X_fold6= (X_fold6 - mean) / (std)
        # Optional: get features normalized_data = (data - min) / (max-min)
        # X_train, min, max=normalize_columns_minmax(X_train)
        # X_val= (X_val - min) / (max - min)
        # X_test= (X_test - min) / (max - min)
        # X_fold6= (X_fold6 - min) / (max - min)

        # Fit model
        model.fit(X_train, Y_train)
        print(".")

        # Get MSEs and mean absolute errors
        MSE_train, ME_train = _clipped_errors(model, X_train, Y_train)
        MSE_val, ME_val = _clipped_errors(model, X_val, Y_val)
        MSE_test, ME_test = _clipped_errors(model, X_test, Y_test)
        MSE_fold6, ME_fold6 = _clipped_errors(model, X_fold6, Y_fold6)

        # Add metrics
        MSEs_train.append(MSE_train)
        MSEs_val.append(MSE_val)
        MSEs_test.append(MSE_test)
        MSEs_fold6.append(MSE_fold6)
        MEs_train.append(ME_train)
        MEs_val.append(ME_val)
        MEs_test.append(ME_test)
        MEs_fold6.append(ME_fold6)

    print("Parameters ",value)
    print(f'Mean | {np.mean(MSEs_train):.4f} | {np.mean(MSEs_val):.4f} | {np.mean(MSEs_test):.4f} | {np.mean(MSEs_fold6):.4f} | {np.mean(MEs_train):.4f} | {np.mean(MEs_val):.4f} | {np.mean(MEs_test):.4f} | {np.mean(MEs_fold6):.4f} |')

    # NOTE(review): this selection score averages validation, test (fold 0)
    # and fold-6 errors, so the chosen value is tuned on the test data and
    # the reported test errors are optimistic. Confirm this is intended.
    current_mean=(np.mean(MEs_val)+np.mean(MEs_test)+np.mean(MEs_fold6))/3
    if current_mean<prev_mean:
        prev_mean=current_mean
        chosen=(value)


print("Best parameters were ", chosen, " giving a mean of ", prev_mean)

     |         Mean squared error        |             Mean  error            |
Fold |--------+--------+--------+--------|--------+--------+--------|---------|
     | Train  |   Val  |  Test  |Test(f6)| Train  |   Val  |  Test  | Test(f6)|
-----+--------+--------+--------+--------+--------+--------+--------+----------
.
.
.
.
.
Parameters  10
Mean | 0.0939 | 0.1329 | 0.1320 | 0.0342 | 0.2453 | 0.2939 | 0.3061 | 0.1555 |
.
.
.
.
.
Parameters  20
Mean | 0.1018 | 0.1271 | 0.1166 | 0.0288 | 0.2567 | 0.2878 | 0.2903 | 0.1383 |
.
.
.
.
.
Parameters  50
Mean | 0.1094 | 0.1235 | 0.1044 | 0.0291 | 0.2671 | 0.2849 | 0.2759 | 0.1439 |
.
.
.
.
.
Parameters  100
Mean | 0.1147 | 0.1239 | 0.1030 | 0.0327 | 0.2748 | 0.2860 | 0.2745 | 0.1540 |
.
.
.
.
.
Parameters  150
Mean | 0.1173 | 0.1245 | 0.1046 | 0.0352 | 0.2785 | 0.2872 | 0.2773 | 0.1607 |
.
.
.
.
.
Parameters  180
Mean | 0.1185 | 0.1249 | 0.1051 | 0.0364 | 0.2801 | 0.2878 | 0.2776 | 0.1633 |
.
.
.
.
.
Parameters  200
Mean | 0.1192 | 0.1251 | 0.