## 1 - Questionnaire Data Preparation

##### Imports

In [2]:
import os
import pandas as pd
import numpy as np
import dask.dataframe as dd
import dask.array as da     #scalable parallel computing
import json
import warnings

##### Data Loading

In [3]:
# Load ddfs into dictionary (exception for 'all_surf_pos', df)
root_dir = './aggregated_data'
def load_data(root_dir, exception=False):    # Get csv file reads into one dictionary
    data = {}
    for subdir, _, files in os.walk(root_dir):
        for file in files:
            file_path = os.path.join(subdir, file)
            file_name = os.path.splitext(file)[0]   #file name without '.csv'
            if file.endswith('.csv') and file != 'all_surf_positions_HiDrive_Studie2.csv':
                data[file_name] = dd.read_csv(file_path)    #read and attach to dict
            elif exception and file == 'all_surf_positions_HiDrive_Studie2.csv':
                data [file_name] = pd.read_csv(file_path, converters=converters)    #read as normal and attach to dict
    return data
def parse(filedata): # Manually read the column
    output = []
    for line in filedata.split('\n'): # split into lines
        line = line.strip().rstrip(']').lstrip('[') #remove whitespace and brackets
        if not line:  
            continue    #skip empty lines
        line = line.split() #split into cell
        row = []
        for cell in line:
            cell = cell.strip()     #remove whitespace
            if not cell.strip():
                continue    #skip empty cells
            row.append(float(cell)) #convert to float and add
        output.append(row)
    return output
converters = {
    "img_to_surf_trans": parse,
    "surf_to_img_trans": parse,
    "dist_img_to_surf_trans": parse,
    "surf_to_dist_img_trans": parse,
}

data = load_data(root_dir)

In [4]:
participant_ids = [1,2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]

### 0. File Exploration

##### File reading

In [5]:
data_df = pd.read_csv("./questionnaire_data/data_hidrives2db_2024-06-19_11-43.csv", encoding='UTF-16', delimiter='\t')
variables_df = pd.read_csv("./questionnaire_data/variables_hidrives2db_2024-03-22_09-58.csv", encoding='UTF-16', delimiter='\t')
values_df = pd.read_csv("./questionnaire_data/values_hidrives2db_2024-03-22_09-59.csv", encoding='UTF-16', delimiter='\t')

In [6]:
data_df.head(5)

Unnamed: 0,CASE,SERIAL,REF,QUESTNNR,MODE,STARTED,AS01_01,AS01_02,AS01_03,AS01_04,...,MAILSENT,LASTDATA,FINISHED,Q_VIEWER,LASTPAGE,MAXPAGE,MISSING,MISSREL,TIME_RSI,DEG_TIME
0,Interview-Nummer (fortlaufend),Seriennummer (sofern verwendet),Referenz (sofern im Link angegeben),"Fragebogen, der im Interview verwendet wurde",Interview-Modus,Zeitpunkt zu dem das Interview begonnen hat (E...,vdl: Nützlich/Nutzlos,vdl: Angenehm/Unangenehm,vdl: Schlecht/Gut,vdl: Nett/Nervig,...,Versandzeitpunkt der Einladungsmail (nur für n...,Zeitpunkt als der Datensatz das letzte mal geä...,Wurde die Befragung abgeschlossen (letzte Seit...,Hat der Teilnehmer den Fragebogen nur angesehe...,"Seite, die der Teilnehmer zuletzt bearbeitet hat","Letzte Seite, die im Fragebogen bearbeitet wurde",Anteil fehlender Antworten in Prozent,Anteil fehlender Antworten (gewichtet nach Rel...,Maluspunkte für schnelles Ausfüllen,Maluspunkte für schnelles Ausfüllen
1,16,,,qnr2,interview,2024-03-13 09:50:40,2,3,3,4,...,,2024-03-13 11:51:47,1,0,26,26,0,0,135,71
2,19,,,qnr2,interview,2024-03-13 12:18:14,2,2,4,3,...,,2024-03-13 13:53:50,1,0,26,26,0,0,163,90
3,21,,,qnr2,interview,2024-03-14 08:44:34,1,1,5,2,...,,2024-03-14 11:21:45,1,0,26,26,0,0,108,26
4,22,,,qnr2,interview,2024-03-14 12:01:54,1,1,4,2,...,,2024-03-14 13:51:46,1,0,26,26,0,0,099,27


In [7]:
variables_df

Unnamed: 0,VAR,LABEL,TYPE,INPUT,QUESTION
0,CASE,Interview-Nummer (fortlaufend),METRIC,SYSTEM,
1,SERIAL,Seriennummer (sofern verwendet),TEXT,SYSTEM,
2,REF,Referenz (sofern im Link angegeben),TEXT,SYSTEM,
3,QUESTNNR,"Fragebogen, der im Interview verwendet wurde",TEXT,SYSTEM,
4,MODE,Interview-Modus,TEXT,SYSTEM,
...,...,...,...,...,...
185,MAXPAGE,"Letzte Seite, die im Fragebogen bearbeitet wurde",METRIC,SYSTEM,
186,MISSING,Anteil fehlender Antworten in Prozent,METRIC,SYSTEM,
187,MISSREL,Anteil fehlender Antworten (gewichtet nach Rel...,METRIC,SYSTEM,
188,TIME_RSI,Maluspunkte für schnelles Ausfüllen,METRIC,SYSTEM,


In [8]:
values_df

Unnamed: 0,VAR,RESPONSE,MEANING
0,AS01_01,1,Nützlich
1,AS01_01,5,Nutzlos
2,AS01_01,-9,nicht beantwortet
3,AS01_02,1,Angenehm
4,AS01_02,5,Unangenehm
...,...,...,...
620,T110_06,-9,nicht beantwortet
621,FINISHED,0,abgebrochen
622,FINISHED,1,ausgefüllt
623,Q_VIEWER,0,Teilnehmer


##### Columns and Values

In [9]:
data_df.columns.tolist()

['CASE',
 'SERIAL',
 'REF',
 'QUESTNNR',
 'MODE',
 'STARTED',
 'AS01_01',
 'AS01_02',
 'AS01_03',
 'AS01_04',
 'AS01_05',
 'AS01_06',
 'AS01_07',
 'AS01_08',
 'AS01_09',
 'AT02_01',
 'AT02_02',
 'AT02_03',
 'AT02_04',
 'AT02_05',
 'AT02_06',
 'AT02_07',
 'AT02_08',
 'AT02_09',
 'C103_01',
 'C103_02',
 'C103_03',
 'C103_04',
 'C103_05',
 'C103_06',
 'C103_07',
 'C103_08',
 'C103_09',
 'C103_10',
 'C103_11',
 'C103_12',
 'C103_13',
 'C103_14',
 'C103_15',
 'C103_16',
 'C103_17',
 'DF01_01',
 'DF01_02',
 'DQ02_01',
 'DQ02_02',
 'DQ02_03',
 'DQ02_04',
 'DQ02_05',
 'DQ02_06',
 'DQ02_07',
 'DQ02_08',
 'DQ02_09',
 'DQ02_10',
 'DQ02_11',
 'DQ02_12',
 'DQ02_13',
 'DQ02_14',
 'DQ02_15',
 'DQ02_16',
 'DQ02_17',
 'DQ02_18',
 'DQ02_19',
 'DQ02_20',
 'DQ02_21',
 'DQ02_22',
 'DQ02_23',
 'DQ02_24',
 'FL04_01',
 'FL05_01',
 'FL06_01',
 'FL07_01',
 'FL08_01',
 'FL09_01',
 'FL10_01',
 'FL11_01',
 'SA01_01',
 'SA02',
 'SA03',
 'SA04',
 'SA04_01',
 'SA04_02',
 'SA05',
 'SA05_09',
 'SA07',
 'SA22_01',
 'SA2

In [10]:
data_df.iloc[0]

for item in data_df.iloc[0]:
    print(item)

Interview-Nummer (fortlaufend)
Seriennummer (sofern verwendet)
Referenz (sofern im Link angegeben)
Fragebogen, der im Interview verwendet wurde
Interview-Modus
Zeitpunkt zu dem das Interview begonnen hat (Europe/Berlin)
vdl: Nützlich/Nutzlos
vdl: Angenehm/Unangenehm
vdl: Schlecht/Gut
vdl: Nett/Nervig
vdl: Effektiv/Unnötig
vdl: Ärgerlich/Erfreulich
vdl: Hilfreich/Wertlos
vdl: Nicht wünschenswert/Wünschenswert
vdl: Aktivierend/Einschläfernd
ATI: Ich beschäftige mich gerne genauer mit technischen Systemen.
ATI: Ich probiere gerne die Funktionen neuer technischer Systeme aus.
ATI: In erster Linie beschäftige ich mich mit technischen Systemen, weil ich muss.
ATI: Wenn ich ein neues technisches System vor mir habe, probiere ich es intensiv aus.
ATI: Ich verbringe sehr gerne Zeit mit dem Kennenlernen eines neuen technischen Systems.
ATI: Es genügt mir, dass ein technisches System funktioniert, mir ist es egal, wie oder warum.
ATI: Ich versuche zu verstehen, wie ein technisches System genau fu

Questionnaire parts:
- AS: Acceptance Scale (9 items), Van der Laan (-2; +2) [https://www.hfes-europe.org/accept/accept.htm]
- AT: Affinity for Technology Interaaction scale (ATI) (compl. disagree; compl. agree) [https://ati-scale.org/]
- CGAS: Computer game Attitude Scale (CGAS) (1, 5)
- DQ: Driver behavior Questionnaire (DBQ) (1, 6)
- (?) FL: Requirements (1, 9)
- SA01-07: Demographic
- SA22: Videogame frequency (1, 6)
- (?) SA08-12: AV and RO
- SU: System Usability Scale (SUS) (1, 5)
- T1: NASA Task-Load Index (NASA-TLX) (1, 21)

### 1. File Reading and Formatting

In [11]:
# Read csv files into dataframes
data_df = pd.read_csv("./questionnaire_data/data_hidrives2db_2024-06-19_11-43.csv", encoding='UTF-16', delimiter='\t')
variables_df = pd.read_csv("./questionnaire_data/variables_hidrives2db_2024-03-22_09-58.csv", encoding='UTF-16', delimiter='\t')
values_df = pd.read_csv("./questionnaire_data/values_hidrives2db_2024-03-22_09-59.csv", encoding='UTF-16', delimiter='\t')

In [12]:
data_df.head(5)

Unnamed: 0,CASE,SERIAL,REF,QUESTNNR,MODE,STARTED,AS01_01,AS01_02,AS01_03,AS01_04,...,MAILSENT,LASTDATA,FINISHED,Q_VIEWER,LASTPAGE,MAXPAGE,MISSING,MISSREL,TIME_RSI,DEG_TIME
0,Interview-Nummer (fortlaufend),Seriennummer (sofern verwendet),Referenz (sofern im Link angegeben),"Fragebogen, der im Interview verwendet wurde",Interview-Modus,Zeitpunkt zu dem das Interview begonnen hat (E...,vdl: Nützlich/Nutzlos,vdl: Angenehm/Unangenehm,vdl: Schlecht/Gut,vdl: Nett/Nervig,...,Versandzeitpunkt der Einladungsmail (nur für n...,Zeitpunkt als der Datensatz das letzte mal geä...,Wurde die Befragung abgeschlossen (letzte Seit...,Hat der Teilnehmer den Fragebogen nur angesehe...,"Seite, die der Teilnehmer zuletzt bearbeitet hat","Letzte Seite, die im Fragebogen bearbeitet wurde",Anteil fehlender Antworten in Prozent,Anteil fehlender Antworten (gewichtet nach Rel...,Maluspunkte für schnelles Ausfüllen,Maluspunkte für schnelles Ausfüllen
1,16,,,qnr2,interview,2024-03-13 09:50:40,2,3,3,4,...,,2024-03-13 11:51:47,1,0,26,26,0,0,135,71
2,19,,,qnr2,interview,2024-03-13 12:18:14,2,2,4,3,...,,2024-03-13 13:53:50,1,0,26,26,0,0,163,90
3,21,,,qnr2,interview,2024-03-14 08:44:34,1,1,5,2,...,,2024-03-14 11:21:45,1,0,26,26,0,0,108,26
4,22,,,qnr2,interview,2024-03-14 12:01:54,1,1,4,2,...,,2024-03-14 13:51:46,1,0,26,26,0,0,099,27


In [13]:
##### Questionnaire correction
data_df.loc[data_df['DF01_01']=='1', 'DF01_02'] = '41235768'
    
# wrong name encoding added for VP 3
data_df.loc[data_df['DF01_01']=="03", 'DF01_01'] = '3'
      
# First entry was incorrectly filled for VP 12, shifts all others by 1
data_df.loc[data_df['DF01_01']=='12', ['T103_01', 'T104_01','T105_01','T106_01','T107_01','T108_01','T109_01',
                             'T103_04', 'T104_04','T105_04','T106_04','T107_04','T108_04','T109_04',
                             'T103_05', 'T104_05','T105_05','T106_05','T107_05','T108_05','T109_05',
                             'T103_06', 'T104_06','T105_06','T106_06','T107_06','T108_06','T109_06',
                             'T103_07', 'T104_07','T105_07','T106_07','T107_07','T108_07','T109_07',
                             'FL04_01', 'FL05_01', 'FL06_01', 'FL07_01', 'FL08_01','FL09_01','FL10_01'
                             ]] = np.array(data_df.loc[data_df['DF01_01']=='12', [
                                 'T104_01','T105_01','T106_01','T107_01','T108_01','T109_01','T110_01',
                                 'T104_04','T105_04','T106_04','T107_04','T108_04','T109_04','T110_04',
                                 'T104_05','T105_05','T106_05','T107_05','T108_05','T109_05','T110_05',
                                 'T104_06','T105_06','T106_06','T107_06','T108_06','T109_06','T110_06',
                                 'T104_07','T105_07','T106_07','T107_07','T108_07','T109_07','T110_07',
                                 'FL05_01', 'FL06_01', 'FL07_01', 'FL08_01','FL09_01','FL10_01','FL11_01'
                             ]].astype(int)) 
      
# adding last entry from manual filling of paper sheet for VP 12             
data_df.loc[data_df['DF01_01']=='12', ['T110_01', 'T110_04', 'T110_05', 'T110_06', 'T110_07','FL11_01']] = [8, 3, 8, 18, 2, 4] 

# VP 15 misinterpreted NASA-TLX Frustration polarity on first three runs
data_df.loc[data_df['DF01_01']=='15', 'T103_06'] = 22 - data_df.loc[data_df['DF01_01']=='15', 'T103_06'].astype(int)
data_df.loc[data_df['DF01_01']=='15', 'T104_06'] = 22 - data_df.loc[data_df['DF01_01']=='15', 'T104_06'].astype(int)
data_df.loc[data_df['DF01_01']=='15', 'T105_06'] = 22 - data_df.loc[data_df['DF01_01']=='15', 'T105_06'].astype(int)

In [14]:
# Remove and rename columns (+ add subcolumns)
cols_to_remove = [
    'CASE',
    'SERIAL',
    'REF',
    'QUESTNNR',
    'MODE',
    'STARTED',
    'DF01_01',
    'DF01_02',
    'SA16',
    'SA16_01',
    'SA16_02',
    'SA16_03',
    'SA18',
    'SA18_01',
    'SA18_02',
    'SA18_03',
    'SA18_04',
    'SA20',
    'SA20_01',
    'TIME001',
    'TIME002',
    'TIME003',
    'TIME004',
    'TIME005',
    'TIME006',
    'TIME007',
    'TIME008',
    'TIME009',
    'TIME010',
    'TIME011',
    'TIME012',
    'TIME013',
    'TIME014',
    'TIME015',
    'TIME016',
    'TIME017',
    'TIME018',
    'TIME019',
    'TIME020',
    'TIME021',
    'TIME022',
    'TIME023',
    'TIME024',
    'TIME025',
    'TIME026',
    'TIME_SUM',
    'MAILSENT',
    'LASTDATA',
    'FINISHED',
    'Q_VIEWER',
    'LASTPAGE',
    'MAXPAGE',
    'MISSING',
    'MISSREL',
    'TIME_RSI',
    'DEG_TIME',
]
new_cols = {
    'AS01_01': 'AS_01',
    'AS01_02': 'AS_02',
    'AS01_03': 'AS_03',
    'AS01_04': 'AS_04',
    'AS01_05': 'AS_05',
    'AS01_06': 'AS_06',
    'AS01_07': 'AS_07',
    'AS01_08': 'AS_08',
    'AS01_09': 'AS_09',
    'AT02_01': 'ATI_01',
    'AT02_02': 'ATI_02',
    'AT02_03': 'ATI_03',
    'AT02_04': 'ATI_04',
    'AT02_05': 'ATI_05',
    'AT02_06': 'ATI_06',
    'AT02_07': 'ATI_07',
    'AT02_08': 'ATI_08',
    'AT02_09': 'ATI_09',
    'C103_01': 'CGAS_01',
    'C103_02': 'CGAS_02',
    'C103_03': 'CGAS_03',
    'C103_04': 'CGAS_04',
    'C103_05': 'CGAS_05',
    'C103_06': 'CGAS_06',
    'C103_07': 'CGAS_07',
    'C103_08': 'CGAS_08',
    'C103_09': 'CGAS_09',
    'C103_10': 'CGAS_10',
    'C103_11': 'CGAS_11',
    'C103_12': 'CGAS_12',
    'C103_13': 'CGAS_13',
    'C103_14': 'CGAS_14',
    'C103_15': 'CGAS_15',
    'C103_16': 'CGAS_16',
    'C103_17': 'CGAS_17',
    'DQ02_01': 'DBQ_01',
    'DQ02_02': 'DBQ_02',
    'DQ02_03': 'DBQ_03',
    'DQ02_04': 'DBQ_04',
    'DQ02_05': 'DBQ_05',
    'DQ02_06': 'DBQ_06',
    'DQ02_07': 'DBQ_07',
    'DQ02_08': 'DBQ_08',
    'DQ02_09': 'DBQ_09',
    'DQ02_10': 'DBQ_10',
    'DQ02_11': 'DBQ_11',
    'DQ02_12': 'DBQ_12',
    'DQ02_13': 'DBQ_13',
    'DQ02_14': 'DBQ_14',
    'DQ02_15': 'DBQ_15',
    'DQ02_16': 'DBQ_16',
    'DQ02_17': 'DBQ_17',
    'DQ02_18': 'DBQ_18',
    'DQ02_19': 'DBQ_19',
    'DQ02_20': 'DBQ_20',
    'DQ02_21': 'DBQ_21',
    'DQ02_22': 'DBQ_22',
    'DQ02_23': 'DBQ_23',
    'DQ02_24': 'DBQ_24',
    'FL04_01': 'FL_01',
    'FL05_01': 'FL_02',
    'FL06_01': 'FL_03',
    'FL07_01': 'FL_04',
    'FL08_01': 'FL_05',
    'FL09_01': 'FL_06',
    'FL10_01': 'FL_07',
    'FL11_01': 'FL_08',
    'SA01_01': 'DEM_01',
    'SA02': 'DEM_02',
    'SA03': 'DEM_03',
    'SA04': 'DEM_04',
    'SA04_01': 'DEM_05',
    'SA04_02': 'DEM_06',
    'SA05': 'DEM_07',
    'SA05_09': 'DEM_08',
    'SA07': 'DEM_09',
    'SA22_01': 'VGF_01',
    'SA22_02': 'VGF_02',
    'SA22_03': 'VGF_03',
    'SA22_04': 'VGF_04',
    'SA22_05': 'VGF_05',
    'SA22_06': 'VGF_06',
    'SA22_07': 'VGF_07',
    'SA22_08': 'VGF_08',
    'SA22_09': 'VGF_09',
    'SA22_10': 'VGF_10',
    'SA22_11': 'VGF_11',
    'SA08_01': 'DEM_10',
    'SA09': 'DEM_11',
    'SA10_01': 'DEM_12',
    'SA11': 'DEM_13',
    'SA12': 'DEM_14',
    'SA12_01': 'DEM_15',
    'SU02_01': 'SUS_01',
    'SU02_02': 'SUS_02',
    'SU02_03': 'SUS_03',
    'SU02_04': 'SUS_04',
    'SU02_05': 'SUS_05',
    'SU02_06': 'SUS_06',
    'SU02_07': 'SUS_07',
    'SU02_08': 'SUS_08',
    'SU02_09': 'SUS_09',
    'SU02_10': 'SUS_10',
    'T103_01': 'NASA-TLX_01_01',
    'T103_04': 'NASA-TLX_01_02',
    'T103_05': 'NASA-TLX_01_03',
    'T103_06': 'NASA-TLX_01_04',
    'T103_07': 'NASA-TLX_01_05',
    'T104_01': 'NASA-TLX_02_01',
    'T104_04': 'NASA-TLX_02_02',
    'T104_05': 'NASA-TLX_02_03',
    'T104_06': 'NASA-TLX_02_04',
    'T104_07': 'NASA-TLX_02_05',
    'T105_01': 'NASA-TLX_03_01',
    'T105_04': 'NASA-TLX_03_02',
    'T105_05': 'NASA-TLX_03_03',
    'T105_06': 'NASA-TLX_03_04',
    'T105_07': 'NASA-TLX_03_05',
    'T106_01': 'NASA-TLX_04_01',
    'T106_04': 'NASA-TLX_04_02',
    'T106_05': 'NASA-TLX_04_03',
    'T106_06': 'NASA-TLX_04_04',
    'T106_07': 'NASA-TLX_04_05',
    'T107_01': 'NASA-TLX_05_01',
    'T107_04': 'NASA-TLX_05_02',
    'T107_05': 'NASA-TLX_05_03',
    'T107_06': 'NASA-TLX_05_04',
    'T107_07': 'NASA-TLX_05_05',
    'T108_01': 'NASA-TLX_06_01',
    'T108_04': 'NASA-TLX_06_02',
    'T108_05': 'NASA-TLX_06_03',
    'T108_06': 'NASA-TLX_06_04',
    'T108_07': 'NASA-TLX_06_05',
    'T109_01': 'NASA-TLX_07_01',
    'T109_04': 'NASA-TLX_07_02',
    'T109_05': 'NASA-TLX_07_03',
    'T109_06': 'NASA-TLX_07_04',
    'T109_07': 'NASA-TLX_07_05',
    'T110_01': 'NASA-TLX_08_01',
    'T110_04': 'NASA-TLX_08_02',
    'T110_05': 'NASA-TLX_08_03',
    'T110_06': 'NASA-TLX_08_04',
    'T110_07': 'NASA-TLX_08_05'
}
sub_cols = [
    'Useful/Useless',
    'Pleasant/Unpleasant',
    'Bad/Good',
    'Nice/Annoying',
    'Effective/Superfluous',
    'Irritating/Likeable',
    'Assisting/Worthless',
    'Undesirable/Desirable',
    'Raising Alertness/Sleep-inducing' ,
    'I like to occupy myself in greater detail with technical systems.',
    'I like testing the functions of new technical systems.',
    'I predominantly deal with technical systems because I have to.',
    'When I have a new technical system in front of me, I try it out intensively.' ,
    'I enjoy spending time becoming acquainted with a new technical system.',
    'It is enough for me that a technical system works; I don’t care how or why.' ,
    'I try to understand how a technical system exactly works.',
    'It is enough for me to know the basic functions of a technical system.' ,
    'I try to make full use of the capabilities of a technical system.',
    'I am good at playing computer games.',
    'Playing computer games is easy for me.',
    'I understand and play computer games well.',
    'I am skilled at playing computer games.',
    'I like taking courses that use computers.',
    'Using computer games in school is a good way to learn.',
    'Playing computer games improves my eye and hand coordination.' ,
    'Playing computer games enhances my imagination.',
    'I like it when people talk about computer games.',
    'I feel comfortable while playing computer games.',
    'I am very interested in solving quest/questions/missions in computer games.',
    'I always try to solve the current quest/question/mission in the computer game.',
    'Playing computer games makes me happy.',
    'Playing computer games is part of my life.',
    'When I have free time, I play computer games.',
    'I talk about computer games with my friends.',
    'I am not alone in a computer game as I can make friends there.',
    'You try to start at the traffic lights in the wrong gear.',
    'You are annoyed by a slow vehicle driving on the left on the highway and overtake it on the right.',
    'You drive close to a vehicle in front of you to signal to the driver that he should drive faster or leave your lane.',
    'You try to overtake someone and do not notice that he is already signaling left and wants to turn.',
    'You have forgotten where you parked the car in the parking garage or parking lot.',
    'You accidentally press a switch (e.g. for the indicator) even though you actually wanted to press another one (e.g. for the windshield wipers).',
    'You realize that you don’t actually know exactly what the route you have just driven looked like.',
    'You drive through a traffic light even though you know that you should actually stop.',
    'You fail to notice pedestrians crossing the road when turning.',
    'You get annoyed with another driver and chase after them to show them what you think of them.',
    'You take the wrong exit at a roundabout.',
    'You ignore speed limits at night or when traffic is light.',
    'You turn right and almost collide with a cyclist going straight ahead.',
    'You turn into a priority road so attentively that you almost hit the car in front of you in your lane.',
    'You drive despite knowing that you may have drunk more alcohol than the legal limit.',
    'You dislike a certain type of driver and you show it to them whenever you can.',
    'You underestimate the speed of an oncoming vehicle when overtaking.',
    'You back into something you didn’t see before.',
    'You want to drive to A and suddenly realize that you are on the way to B, e.g. because you usually drive to B.',
    'You get into the wrong lane at an intersection.',
    'You overlook a "give way" sign and almost collide with a road user who has the right of way.',
    'You fail to look in the rear-view mirror when changing lanes, before getting out of the car, etc.',
    'You get involved in races with other drivers.',
    'You brake too sharply on a slippery road or do not steer properly, causing you to skid.',
    'For me personally, the current requirements are...',
    'For me personally, the current requirements are...',
    'For me personally, the current requirements are...',
    'For me personally, the current requirements are...',
    'For me personally, the current requirements are...',
    'For me personally, the current requirements are...',
    'For me personally, the current requirements are...',
    'For me personally, the current requirements are...',
    'Age',
    'Gender',
    'Nationality',
    'Occupation / course of study',
    'Study',
    'Profession',
    'Degree',
    'Degree: Other (please specify)',
    'Car frequency',
    'Video games in general',
    'Adventure',
    'Action',
    'Casual game',
    'Indie',
    'MMO (Massively Multiplayer Online)',
    'Racing game',
    'Role-playing game',
    'Simulation',
    'Sports',
    'Strategy',
    'Car mileage (last 12 months).',
    'AV knowledge',
    'AV interest',
    'AV ride-along',
    'RO experience',
    'RO experience: Yes [Please specify]',
    'I think that I would like to use this system frequently for a job as a remote operator.',
    'I found the system unnecessarily complex.',
    'I thought the system was easy to use.',
    'I think that I would need the support of a technical person to be able to use this system.',
    'I found the various functions of the system were well integrated.',
    'I thought there was too much inconsistency in this system.',
    'I would imagine that most people would learn to use this system very quickly.',
    'I found the system very cumbersome to use.',
    'I felt very confident using the system.',
    'I needed to learn a lot of things before I could get going with this system.',
    'How much mental and perceptual activity was required? Was the task easy or demanding, simple or complex?',
    'How successful were you in performing the task? How satisfied were you with your performance?',
    'How hard did you have to work (mentally and physically) to accomplish your level of performance?',
    'How irritated, stressed, and annoyed versus content, relaxed, and complacent did you feel during the task?',
    'How much time pressure did you feel due to the pace at which the tasks or task elements occurred? Was the pace slow or rapid?',
    'How much mental and perceptual activity was required? Was the task easy or demanding, simple or complex?',
    'How successful were you in performing the task? How satisfied were you with your performance?',
    'How hard did you have to work (mentally and physically) to accomplish your level of performance?',
    'How irritated, stressed, and annoyed versus content, relaxed, and complacent did you feel during the task?',
    'How much time pressure did you feel due to the pace at which the tasks or task elements occurred? Was the pace slow or rapid?',
    'How much mental and perceptual activity was required? Was the task easy or demanding, simple or complex?',
    'How successful were you in performing the task? How satisfied were you with your performance?',
    'How hard did you have to work (mentally and physically) to accomplish your level of performance?',
    'How irritated, stressed, and annoyed versus content, relaxed, and complacent did you feel during the task?',
    'How much time pressure did you feel due to the pace at which the tasks or task elements occurred? Was the pace slow or rapid?',
    'How much mental and perceptual activity was required? Was the task easy or demanding, simple or complex?',
    'How successful were you in performing the task? How satisfied were you with your performance?',
    'How hard did you have to work (mentally and physically) to accomplish your level of performance?',
    'How irritated, stressed, and annoyed versus content, relaxed, and complacent did you feel during the task?',
    'How much time pressure did you feel due to the pace at which the tasks or task elements occurred? Was the pace slow or rapid?',
    'How much mental and perceptual activity was required? Was the task easy or demanding, simple or complex?',
    'How successful were you in performing the task? How satisfied were you with your performance?',
    'How hard did you have to work (mentally and physically) to accomplish your level of performance?',
    'How irritated, stressed, and annoyed versus content, relaxed, and complacent did you feel during the task?',
    'How much time pressure did you feel due to the pace at which the tasks or task elements occurred? Was the pace slow or rapid?',
    'How much mental and perceptual activity was required? Was the task easy or demanding, simple or complex?',
    'How successful were you in performing the task? How satisfied were you with your performance?',
    'How hard did you have to work (mentally and physically) to accomplish your level of performance?',
    'How irritated, stressed, and annoyed versus content, relaxed, and complacent did you feel during the task?',
    'How much time pressure did you feel due to the pace at which the tasks or task elements occurred? Was the pace slow or rapid?',
    'How much mental and perceptual activity was required? Was the task easy or demanding, simple or complex?',
    'How successful were you in performing the task? How satisfied were you with your performance?',
    'How hard did you have to work (mentally and physically) to accomplish your level of performance?',
    'How irritated, stressed, and annoyed versus content, relaxed, and complacent did you feel during the task?',
    'How much time pressure did you feel due to the pace at which the tasks or task elements occurred? Was the pace slow or rapid?',
    'How much mental and perceptual activity was required? Was the task easy or demanding, simple or complex?',
    'How successful were you in performing the task? How satisfied were you with your performance?',
    'How hard did you have to work (mentally and physically) to accomplish your level of performance?',
    'How irritated, stressed, and annoyed versus content, relaxed, and complacent did you feel during the task?',
    'How much time pressure did you feel due to the pace at which the tasks or task elements occurred? Was the pace slow or rapid?',
]

df = data_df.drop(columns=cols_to_remove).drop(0)
df = df.rename(columns=new_cols)

# Add subcolumns (?)
# df.columns = pd.MultiIndex.from_tuples([(col, sub) for col, sub in zip(df.columns, sub_cols)])

In [15]:
# Add participant_id and reorder columns
participant_ids = [1,2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]
new_order = [
    'participant_id',

    'DEM_01',
    'DEM_02',
    'DEM_03',
    'DEM_04',
    'DEM_05',
    'DEM_06',
    'DEM_07',
    'DEM_08',
    'DEM_09',
    'DEM_10',
    'DEM_11',
    'DEM_12',
    'DEM_13',
    'DEM_14',
    'DEM_15',

    'VGF_01',
    'VGF_02',
    'VGF_03',
    'VGF_04',
    'VGF_05',
    'VGF_06',
    'VGF_07',
    'VGF_08',
    'VGF_09',
    'VGF_10',
    'VGF_11',

    'ATI_01',
    'ATI_02',
    'ATI_03',
    'ATI_04',
    'ATI_05',
    'ATI_06',
    'ATI_07',
    'ATI_08',
    'ATI_09',

    'CGAS_01',
    'CGAS_02',
    'CGAS_03',
    'CGAS_04',
    'CGAS_05',
    'CGAS_06',
    'CGAS_07',
    'CGAS_08',
    'CGAS_09',
    'CGAS_10',
    'CGAS_11',
    'CGAS_12',
    'CGAS_13',
    'CGAS_14',
    'CGAS_15',
    'CGAS_16',
    'CGAS_17',

    'DBQ_01',
    'DBQ_02',
    'DBQ_03',
    'DBQ_04',
    'DBQ_05',
    'DBQ_06',
    'DBQ_07',
    'DBQ_08',
    'DBQ_09',
    'DBQ_10',
    'DBQ_11',
    'DBQ_12',
    'DBQ_13',
    'DBQ_14',
    'DBQ_15',
    'DBQ_16',
    'DBQ_17',
    'DBQ_18',
    'DBQ_19',
    'DBQ_20',
    'DBQ_21',
    'DBQ_22',
    'DBQ_23',
    'DBQ_24',

    'FL_01',
    'FL_02',
    'FL_03',
    'FL_04',
    'FL_05',
    'FL_06',
    'FL_07',
    'FL_08',

    'NASA-TLX_01_01',
    'NASA-TLX_01_02',
    'NASA-TLX_01_03',
    'NASA-TLX_01_04',
    'NASA-TLX_01_05',
    'NASA-TLX_02_01',
    'NASA-TLX_02_02',
    'NASA-TLX_02_03',
    'NASA-TLX_02_04',
    'NASA-TLX_02_05',
    'NASA-TLX_03_01',
    'NASA-TLX_03_02',
    'NASA-TLX_03_03',
    'NASA-TLX_03_04',
    'NASA-TLX_03_05',
    'NASA-TLX_04_01',
    'NASA-TLX_04_02',
    'NASA-TLX_04_03',
    'NASA-TLX_04_04',
    'NASA-TLX_04_05',
    'NASA-TLX_05_01',
    'NASA-TLX_05_02',
    'NASA-TLX_05_03',
    'NASA-TLX_05_04',
    'NASA-TLX_05_05',
    'NASA-TLX_06_01',
    'NASA-TLX_06_02',
    'NASA-TLX_06_03',
    'NASA-TLX_06_04',
    'NASA-TLX_06_05',
    'NASA-TLX_07_01',
    'NASA-TLX_07_02',
    'NASA-TLX_07_03',
    'NASA-TLX_07_04',
    'NASA-TLX_07_05',
    'NASA-TLX_08_01',
    'NASA-TLX_08_02',
    'NASA-TLX_08_03',
    'NASA-TLX_08_04',
    'NASA-TLX_08_05',
    
    'AS_01',
    'AS_02',
    'AS_03',
    'AS_04',
    'AS_05',
    'AS_06',
    'AS_07',
    'AS_08',
    'AS_09',

    'SUS_01',
    'SUS_02',
    'SUS_03',
    'SUS_04',
    'SUS_05',
    'SUS_06',
    'SUS_07',
    'SUS_08',
    'SUS_09',
    'SUS_10',
]

df.insert(0, 'participant_id', participant_ids)
df = df[new_order]

In [16]:
# Convert values into numbers
cols_not_to_convert = [
    'DEM_05',
    'DEM_06',
    'DEM_07',
    'DEM_08',
    'DEM_09',
    'DEM_15',
]
# cols_to_convert = [col for col in df.columns if col[0] not in cols_not_to_convert]
cols_to_convert = [col for col in df.columns if col not in cols_not_to_convert]

for col in cols_to_convert:
    df[col] = df[col].apply(lambda x: eval(x) if isinstance(x, str) else x)

In [17]:
df.head(5)

Unnamed: 0,participant_id,DEM_01,DEM_02,DEM_03,DEM_04,DEM_05,DEM_06,DEM_07,DEM_08,DEM_09,...,SUS_01,SUS_02,SUS_03,SUS_04,SUS_05,SUS_06,SUS_07,SUS_08,SUS_09,SUS_10
1,1,43,1,1,2,,Wissenschaftlicher Mitarbeiter,7,,1,...,3,1,4,1,2,4,4,2,3,2
2,2,40,2,1,2,,"Hausmann, Rentner",11,,4,...,4,2,4,2,4,2,4,2,4,2
3,4,22,2,1,1,Verkehrsingenieurwesen,,12,,4,...,4,2,4,2,4,1,5,2,4,3
4,5,42,1,1,2,,Lagerhelferin,11,,5,...,5,2,4,3,4,2,4,1,4,5
5,6,30,2,1,1,Wirtschaftsingenieur Maschinenbau,,12,,1,...,3,2,5,2,4,2,5,1,3,1


### 2. Questionnaire-based Preparation and Scoring

##### !!! NASA Task-Load-Index (NASA-TLX)

In [18]:
# Get NASA-TLX df
tlx_cols = ['participant_id'] + [col for col in df.columns.tolist() if col.startswith('NASA-TLX')]
tlx_df = df[tlx_cols]

In [19]:
# Get block type orders from GUI data
gui_ddf = data['all_gui_data']
gui_df = gui_ddf.compute()

block_orders = []
for pid in participant_ids:
    participant_gui_df = gui_df.query(f"participant_id == {pid}")
    participant_order = participant_gui_df['block_type'].unique()
    block_orders.append(participant_order)


In [20]:
# Restructure data
restructured_data = []
for i in range(len(block_orders)):
    participant_data = tlx_df.iloc[i]
    participant_id = participant_data['participant_id']
    
    block_order = block_orders[i]
    for j, block_type in enumerate(block_order):
        block_data = participant_data[[f'NASA-TLX_0{j+1}_0{k}' for k in range(1, 6)]].values
        block_data_list = [int(val) if not pd.isna(val) else np.nan for val in block_data]
        new_row = [int(participant_id), int(j+1), int(block_type)] + block_data_list
        restructured_data.append(new_row)

columns = ['participant_id', 'block_id', 'block_type'] + [f'NASA_TLX_{k:02d}' for k in range(1, 6)]
quest_df = pd.DataFrame(restructured_data, columns=columns)

In [21]:
# Compute scoring
adjusted_values = quest_df.iloc[:, 3:8] - 1   #subtract 1
adjusted_values.iloc[:, 1] = 20 - adjusted_values.iloc[:, 1]  # Reversing the values
mean_adjusted_values = adjusted_values.mean(axis=1)
quest_df['NASA_TLX_Score'] = mean_adjusted_values * 5

##### !!! NASA Task-Load-Index (NASA-TLX) [no temporal demand]

In [22]:
# Compute scoring
adjusted_values2 = quest_df.iloc[:, 3:7] - 1   #subtract 1
adjusted_values2.iloc[:, 1] = 20 - adjusted_values2.iloc[:, 1]  # Reversing the values
mean_adjusted_values2 = adjusted_values2.mean(axis=1)
quest_df['NASA_TLX_Score2'] = mean_adjusted_values2 * 5

##### !!! NASA Task-Load-Index (NASA-TLX) [only cognitive demand]

In [23]:
# Compute scoring
cogdem_values = quest_df.iloc[:, 3] - 1   #subtract 1
quest_df['NASA_TLX_Score3'] = cogdem_values * 5

##### !!! Flow State (FL) *

In [24]:
# Get Flow State question df
flow_cols = ['participant_id'] + [col for col in df.columns.tolist() if col.startswith('FL_')]
flow_df = df[flow_cols]

In [25]:
# Get block type orders from GUI data
gui_ddf = data['all_gui_data']
gui_df = gui_ddf.compute()

block_orders = []
for pid in participant_ids:
    participant_gui_df = gui_df.query(f"participant_id == {pid}")
    participant_order = participant_gui_df['block_type'].unique()
    block_orders.append(participant_order)


In [26]:
# Restructure data
flow_scores = []
for i in range(len(block_orders)):
    participant_data = flow_df.iloc[i]
    block_order = block_orders[i]
    
    for j, block_type in enumerate(block_order):
        flow_value = participant_data[f'FL_0{j+1}']
        flow_value = int(flow_value) if not pd.isna(flow_value) else np.nan
        flow_scores.append(flow_value)

In [27]:
# Add to questionnaire data
flow_scores_array = np.array(flow_scores)
quest_df['FL_Score'] = flow_scores_array

##### Videogame Frequency (VGF) (?) *

##### Affinity for Technology Interaction (ATI)

In [28]:
# Get ATI df
ati_cols = ['participant_id'] + [col for col in df.columns.tolist() if col.startswith('ATI_')]
ati_df = df[ati_cols]

In [29]:
# Compute scoring
positive_items = ['ATI_01', 'ATI_02', 'ATI_04', 'ATI_05', 'ATI_07', 'ATI_09']
negative_items = ['ATI_03', 'ATI_06', 'ATI_08']

ati_df[positive_items] = ati_df[positive_items].apply(lambda x: x-1)    #subtract 1
ati_df[negative_items] = ati_df[negative_items].apply(lambda x: 5 - x)  #subtract from 5
ati_score = ati_df.iloc[:, 1:].mean(axis=1) * 25
ati_score = ati_score.round(2)

ati_df['ATI_Score'] = ati_score
df['ATI_Score'] = ati_score

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ati_df[positive_items] = ati_df[positive_items].apply(lambda x: x-1)    #subtract 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ati_df[negative_items] = ati_df[negative_items].apply(lambda x: 5 - x)  #subtract from 5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ati_df['ATI_Score'] = ati_score


In [30]:
# Cronbach's alpha calculation
items = ati_df.iloc[:, 1:10] 
n_items = len(items.columns)
item_variances = items.var(axis=0, ddof=1)
total_variance = items.sum(axis=1).var(ddof=1)
ati_alpha = n_items / (n_items - 1) * (1 - item_variances.sum() / total_variance)
print(f"Cronbach's Alpha : {ati_alpha:.2f}")

Cronbach's Alpha : 0.86


##### Computer Games Attitude Scale (CGAS) [skip]

##### Driver Behavior Questonnaire (DBQ) *

##### Acceptance Scale (AS)

In [31]:
# Get ATI df
as_cols = ['participant_id'] + [col for col in df.columns.tolist() if col.startswith('AS_')]
as_df = df[as_cols]

In [32]:
# Compute scoring
positive_items = ['AS_03', 'AS_06', 'AS_08']
negative_items = ['AS_01', 'AS_02', 'AS_04', 'AS_05', 'AS_07', 'AS_09']

as_df[positive_items] = as_df[positive_items].apply(lambda x: x-1)    #subtract 1
as_df[negative_items] = as_df[negative_items].apply(lambda x: 5 - x)  #subtract from 5
as_score = as_df.iloc[:, 1:].mean(axis=1) * 25
as_score = as_score.round(2)

as_df['AS_Score'] = as_score
df['AS_Score'] = as_score

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  as_df[positive_items] = as_df[positive_items].apply(lambda x: x-1)    #subtract 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  as_df[negative_items] = as_df[negative_items].apply(lambda x: 5 - x)  #subtract from 5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  as_df['AS_Score'] = as_score
  df['A

##### System Usability Scale (SUS)

In [33]:
# Get SUS df
sus_cols = [
    'participant_id',
    'SUS_01',
    'SUS_02',
    'SUS_03',
    'SUS_04',
    'SUS_05',
    'SUS_06',
    'SUS_07',
    'SUS_08',
    'SUS_09',
    'SUS_10',
]
sus_df = df[sus_cols]

In [34]:
# Compute SUS score
odd_items = ['SUS_01', 'SUS_03', 'SUS_05', 'SUS_07', 'SUS_09']
even_items = ['SUS_02', 'SUS_04', 'SUS_06', 'SUS_08', 'SUS_10']
sus_df['odd_sum'] = sus_df[odd_items].sum(axis=1) - len(odd_items)           #subtract 1
sus_df['even_sum'] = 5 * len(even_items) - sus_df[even_items].sum(axis=1)    #subtract from 5
sus_score = (sus_df['odd_sum'] + sus_df['even_sum']) * 2.5                          #sum and multiply by 2.5
sus_df = sus_df.drop(columns=['odd_sum', 'even_sum'])

sus_df['SUS_Score'] = sus_score
df['SUS_Score'] = sus_score

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sus_df['odd_sum'] = sus_df[odd_items].sum(axis=1) - len(odd_items)           #subtract 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sus_df['even_sum'] = 5 * len(even_items) - sus_df[even_items].sum(axis=1)    #subtract from 5
  df['SUS_Score'] = sus_score


### Export Relevant Questionnaire Data

In [35]:
quest_df

Unnamed: 0,participant_id,block_id,block_type,NASA_TLX_01,NASA_TLX_02,NASA_TLX_03,NASA_TLX_04,NASA_TLX_05,NASA_TLX_Score,NASA_TLX_Score2,NASA_TLX_Score3,FL_Score
0,1,1,4,11,19,11,2,,28.75,28.75,50,
1,1,2,1,8,19,11,2,,25.00,25.00,35,
2,1,3,2,12,19,12,2,,31.25,31.25,55,
3,1,4,3,14,17,15,4,,42.50,42.50,65,
4,1,5,5,9,20,9,2,,22.50,22.50,40,
...,...,...,...,...,...,...,...,...,...,...,...,...
147,20,4,3,7,18,6,2,2.0,16.00,18.75,30,5.0
148,20,5,8,14,17,13,3,5.0,35.00,38.75,65,5.0
149,20,6,7,16,15,16,4,1.0,39.00,48.75,75,6.0
150,20,7,6,16,11,13,4,4.0,43.00,50.00,75,5.0


In [36]:
# Export to csv file
quest_df.to_csv('./aggregated_data/all_quest_data.csv')
quest_df.to_csv('./cleaned_data/quest_data.csv')
quest_df.to_csv('./aoi_data/quest.csv')