# REFMAP laboratory listening test 1 analysis: Exploratory data analysis — Part A parameter selection

## Setup

In [76]:
# import statements
import sys
import os
import numpy as np
import pandas as pd
from PyQt5.QtWidgets import QFileDialog, QApplication
from scipy import stats
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import pingouin as pg
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, ExtraTreesRegressor
from sklearn import preprocessing, feature_selection 
from sklearn.model_selection import train_test_split
import joblib


In [77]:
# set plot parameters
sns.set_style('white')

# global matplotlib defaults (applied in the order listed)
mpl.rcParams.update({'font.family': 'serif',
                     'font.serif': 'Times New Roman',
                     'font.size': 16,
                     'figure.autolayout': True,
                     'mathtext.fontset': 'stix'})

SMALL_SIZE = 9
MEDIUM_SIZE = 12
BIGGER_SIZE = 16

# per-group font sizes; note the 'font' entry replaces the font.size of 16
# set just above with SMALL_SIZE
fontSizeGroups = (
    ('font', {'size': SMALL_SIZE}),              # default text sizes
    ('axes', {'titlesize': SMALL_SIZE,
              'labelsize': MEDIUM_SIZE}),        # axes title and x and y labels
    ('xtick', {'labelsize': SMALL_SIZE}),        # x tick labels
    ('ytick', {'labelsize': SMALL_SIZE}),        # y tick labels
    ('legend', {'fontsize': SMALL_SIZE}),        # legend text
    ('figure', {'titlesize': BIGGER_SIZE}),      # figure title
)
for rcGroup, rcSettings in fontSizeGroups:
    plt.rc(rcGroup, **rcSettings)

# custom colour palette, specified as 8-bit RGB triplets and rescaled to the
# 0-1 range
_mycolours255 = [(0, 102, 255), (0, 204, 153), (255, 0, 102), (74, 111, 152),
                 (251, 164, 49), (204, 153, 255), (90, 192, 255), (80, 245, 233),
                 (255, 90, 192), (164, 201, 242), (255, 254, 139), (255, 243, 255)]
mycolours = [tuple(channel/255 for channel in rgb) for rgb in _mycolours255]

# enable copy-on-write mode for Pandas (will be default from Pandas 3.0)
pd.options.mode.copy_on_write = True

# number of physical CPU cores, used for parallel model fitting
N_CORES = joblib.cpu_count(only_physical_cores=True)

## Import data and organise

In [78]:
# import data
# Reuse the existing QApplication when this cell is re-run in the same kernel,
# rather than constructing a second one
app = QApplication.instance() or QApplication(sys.argv)
fileExts = "*.csv"


def _chooseCSVFile(caption):
    """Show an open-file dialog and return the path of the selected file.

    Parameters
    ----------
    caption : str
        Text displayed in the dialog title, telling the user which file to pick.

    Returns
    -------
    str
        Path of the file selected by the user.

    Raises
    ------
    FileNotFoundError
        If the dialog is closed without selecting a file (the dialog then
        returns an empty path, which would otherwise fail inside read_csv).
    """
    # getOpenFileName returns (path, selected filter); only the path is needed
    filePath = QFileDialog.getOpenFileName(filter=fileExts, caption=caption)[0]
    if not filePath:
        raise FileNotFoundError(f"No file selected for dialog: {caption}")
    return filePath


# Part A, aggregated by stimulus (first column used as the index)
dataByStimAFilePath = _chooseCSVFile(r"Open refmap_listest1_testdataA_ByStim.csv in: \03 Experiment\Experiment 1\Analysis\PostProcess")
dataByStimTestA = pd.read_csv(dataByStimAFilePath, index_col=0)

# Part A notice data subselection, aggregated by stimulus
dataByStimANoticeFilePath = _chooseCSVFile(r"Open refmap_listest1_testdataANoticeFilt_ByStim.csv in: \03 Experiment\Experiment 1\Analysis\PostProcess")
dataByStimTestANotice = pd.read_csv(dataByStimANoticeFilePath, index_col=0)

# Part A, by subject (no column used as the index)
partADataFilePath = _chooseCSVFile(r"Open refmap_listest1_testdataA_BySubj.csv in: \03 Experiment\Experiment 1\Analysis\PostProcess")
partADataBySubj = pd.read_csv(partADataFilePath, index_col=False)

# Part A notice data subselection, by subject
partANoticeDataFilePath = _chooseCSVFile(r"Open refmap_listest1_testdataANoticeFilt_BySubj.csv in: \03 Experiment\Experiment 1\Analysis\PostProcess")
partANoticeDataBySubj = pd.read_csv(partANoticeDataFilePath, index_col=False)


In [79]:
# categorise columns
SNRCats = ["No UAS", "-16", "-10", "-4", "2", "8"]
UASLAeqCats = ["No UAS", "42", "48", "54", "60"]
opCats = ["No UAS", "Landing", "Flyby", "Takeoff"]
vehicleCats = ["No UAS", "H520", "M300", "T150"]

# column name -> category labels to apply to that column
catSchemes = {'SNRlevel': SNRCats,
              'UASLAeq': UASLAeqCats,
              'UASOperation': opCats,
              'UASType': vehicleCats}

# convert the same four columns to categorical type in every dataset
for dataset in (dataByStimTestA, dataByStimTestANotice,
                partADataBySubj, partANoticeDataBySubj):
    for colName, catLabels in catSchemes.items():
        dataset[colName] = pd.Categorical(dataset[colName], catLabels)

## Exploratory data analysis

In this section, parameter comparisons are made to identify the most important parametric features to be used in further modelling.

### Random forest approach

In [80]:
# select subsection from aggregated data by stimulus:
# the block of columns from 'UASLAeq' to 'UASPartLoudGMSTPowAvg', joined on
# the index with the selected response columns
colSelect = ['ValenceMedian', 'ArousalMedian', 'AnnoyMedian', 'HighAnnoyProportion']
predictorBlock = dataByStimTestA.loc[:, 'UASLAeq':'UASPartLoudGMSTPowAvg']
responseBlock = dataByStimTestA.loc[:, colSelect]
partASubDataByStim = predictorBlock.merge(responseBlock,
                                          left_index=True, right_index=True)

# numerically-encoded working copy (the unencoded frame is kept as-is)
partASubDataByStimNum = partASubDataByStim.copy()

# preprocess categories
# ordinal: replace each label with its position in the category list
ordCats = [SNRCats, UASLAeqCats]
for ordCol, ordLevels in zip(['SNRlevel', 'UASLAeq'], ordCats):
    encOrdinal = preprocessing.OrdinalEncoder(categories=[ordLevels])
    partASubDataByStimNum[ordCol] = encOrdinal.fit_transform(partASubDataByStimNum[[ordCol]])

# nominal: one indicator column per observed label, appended after the
# existing columns, with the original nominal columns then removed
encOneHot = preprocessing.OneHotEncoder(sparse_output=False).set_output(transform='pandas')
nomCats = ['UASOperation', 'UASType', 'AmbientEnv']
oheFrames = [encOneHot.fit_transform(partASubDataByStimNum[[nomCol]])
             for nomCol in nomCats]
partASubDataByStimNum = pd.concat([partASubDataByStimNum, *oheFrames], axis=1)
partASubDataByStimNum = partASubDataByStimNum.drop(columns=nomCats)


In [85]:
# run with numerical data only

# aggregated annoyance
# Predictors: every column up to 'UASPartLoudGMSTPowAvg' plus the one-hot
# columns from 'UASOperation_Flyby' onwards. The response columns that sit
# between those two ranges are deliberately left out.
predictorData = pd.concat([partASubDataByStimNum.loc[:, :'UASPartLoudGMSTPowAvg'],
                           partASubDataByStimNum.loc[:, 'UASOperation_Flyby':]],
                          axis=1)
X = predictorData.to_numpy()
y = partASubDataByStimNum['AnnoyMedian'].to_numpy()

# Feature names are taken from the predictor frame itself so that each name
# lines up with its column in X (and hence with rf.feature_importances_).
# Using all columns of partASubDataByStimNum here would include the response
# columns and shift the labels of every one-hot feature.
features = predictorData.columns

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                    random_state=99)

rf = RandomForestRegressor(random_state=0, n_jobs=N_CORES)
rf.fit(X_train, y_train)

# (feature name, importance) pairs in ascending order of importance
rf_f_i = sorted(zip(features, rf.feature_importances_),
                key=lambda pair: pair[1])

fig, ax = plt.subplots()
ax.barh([name for name, _ in rf_f_i], [importance for _, importance in rf_f_i])
ax.set(xlabel="Feature importance", ylabel="Feature",
       title="Random forest feature importance: AnnoyMedian")
plt.show()


TypeError: __cinit__() takes exactly 5 positional arguments (6 given)

In [101]:
# diagnostic run: fit the random forest on synthetic random data, so that the
# fit can be checked independently of the listening test data

# aggregated annoyance (real-data selection, currently disabled)
#X = np.array(partASubDataByStim.loc[~partASubDataByStim.index.isin(['A1_CALBIN_Pa.wav', 'A2_CALBIN_Pa.wav']),
             #'UASLoudECMAHMSPowAvgBin':'UASImpulsHMSMaxMaxLR'].values)
#y = partASubDataByStim.loc[~partASubDataByStim.index.isin(['A1_CALBIN_Pa.wav', 'A2_CALBIN_Pa.wav']), 'AnnoyMedian'].values

# seeded generator so that the synthetic data (and the plot) are reproducible
rng = np.random.default_rng(101)
X = rng.random(size=(80, 12))
y = rng.random(size=80)

# generic labels, one per synthetic column: the columns are random numbers,
# so labelling them with names from the real data would be misleading
features = [f"randomFeature{ii + 1}" for ii in range(X.shape[1])]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                    random_state=42)

rf = RandomForestRegressor(random_state=0)
rf.fit(X_train, y_train)

# (feature name, importance) pairs in ascending order of importance
rf_f_i = sorted(zip(features, rf.feature_importances_),
                key=lambda pair: pair[1])

fig, ax = plt.subplots()
ax.barh([name for name, _ in rf_f_i], [importance for _, importance in rf_f_i])
ax.set(xlabel="Feature importance", ylabel="Feature",
       title="Random forest feature importance: synthetic random data")
plt.show()


TypeError: __cinit__() takes exactly 5 positional arguments (6 given)

In [102]:
from sklearn.datasets import load_breast_cancer


In [103]:
# diagnostic run: fit the random forest on a dataset bundled with scikit-learn
X0, y0 = load_breast_cancer(return_X_y=True)

X0_train, X0_test, y0_train, y0_test = train_test_split(X0, y0, test_size=0.33,
                                                        random_state=42)

N_CORES = joblib.cpu_count(only_physical_cores=True)

rf = RandomForestRegressor(random_state=0, n_jobs=N_CORES)
# fit on the split created in this cell; fitting on X_train/y_train would
# silently reuse whatever data an earlier cell left in the kernel
rf.fit(X0_train, y0_train)

TypeError: __cinit__() takes exactly 5 positional arguments (6 given)