# Imports

In [1]:
import pandas as pd
import os
import shutil

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Open all of the relevant data files/directories

**Note**: Remember to download the latest data and Qualtrics surveys before running this notebook! Emily last downloaded the Qualtrics export and the data at the end of the day on 4/12 (2024).

Qualtrics export settings:

<img src="qualtrics_export_settings.png" alt="Qualtrics Export Settings" width="500px">

In [2]:
# Root folder that holds conditions.csv, exit_survey.csv and the per-participant folders.
# Set the DRONE_FEEDBACK_DATA_DIR environment variable to point at a different copy of
# the data; the fallback is the original hard-coded OneDrive location.
data_dir = os.environ.get(
    'DRONE_FEEDBACK_DATA_DIR',
    'C:/Users/Emily Jensen/OneDrive - UCB-O365/Drone Feedback Data/data/',
)
# later cells build paths as `data_dir + name`, so the value must end with a separator
if not data_dir.endswith(('/', '\\')):
    data_dir += '/'

# condition assignments (columns: time, user_id, condition)
conditions_file = data_dir + 'conditions.csv'
conditions = pd.read_csv(conditions_file)

# raw Qualtrics export of the exit survey
exit_survey_file = data_dir + 'exit_survey.csv'
exit_survey = pd.read_csv(exit_survey_file)

# Data Cleaning

## Set participant info data types and remove test data

In [3]:
# list each column and its data type
# (shows how read_csv parsed the file, before any conversion is applied)
print(conditions.dtypes)

time         object
user_id      object
condition    object
dtype: object


In [4]:
# convert both columns in one step:
#   time      -> datetime64 (so it can be compared against timestamps)
#   condition -> category
conditions = conditions.assign(
    time=pd.to_datetime(conditions['time']),
    condition=conditions['condition'].astype('category'),
)

# confirm the conversion took effect
print(conditions.dtypes)

time         datetime64[ns]
user_id              object
condition          category
dtype: object


In [5]:
# keep only entries logged at or after 11:15am on 2024-04-10
# this is when we launched the first experiment condition
after_launch = conditions['time'] >= '2024-04-10 11:15:00'

# flag test runs: any user_id that contains the literal text 'emily'
# - na=False: a missing user_id counts as "not a test run"; without it the mask
#   contains NaN and the `~` below raises TypeError
# - regex=False: 'emily' is plain text, not a pattern
is_test_run = conditions['user_id'].str.contains('emily', na=False, regex=False)

conditions = conditions[after_launch & ~is_test_run]

# number of condition entries that survive both filters
print(len(conditions))

191


In [6]:
# looks like we retain just prolific ids
# (prints every distinct user_id left after filtering, as a visual check)
# NOTE(review): the participant ids end up in the saved cell output -- consider
# clearing this output before sharing the notebook
print(conditions['user_id'].unique())

['5f84d512acba571a2bdda680' '5760a995f371330006a47cb3'
 '5eb4955dab41e130f24d08c9' '640156a5f2395bf80ca0c451'
 '6413599c09e145dd93c7aeac' '5f2dc46d6fa1250ee8a1a15d'
 '60fde5e29e585481874a9d16' '6526a14c3f9823cccedb7687'
 '62d1228fcd446896ce7c9ec0' '63e688adbc8788de6d596c93'
 '61036bbd791964fafe65236a' '63d79e5ecdcf4d0dbd646bf6'
 '5dccb82a76eab294aa4837ff' '5d34d17089232600011ade3a'
 '6113a1bd2592fc45dff695a2' '6108614d6f2cdb85bd396d6e'
 '56bae08f30d6b30005f8537a' '612e41fb25de530ea83df0bc'
 '5f888877136ad50208b48b47' '65cba99c92b362b45e414da7'
 '62c50bf9b7587ff5073cd7fb' '63b6dfb29118fec2d923f8c3'
 '654d0bd1f4ba143e0503a02f' '5f5fa5d24b9f98028f4090e7'
 '5dce3ccc32ccbf0cd54263db' '611291090e626fdfde536f38'
 '62e185484154c451882a8a3d' '60d76140337e60ae26f7fce9'
 '6333c1f5756acfabfde457ed' '60b7bd75af8c92afa748324f'
 '60255901704fd208ecdcf32b' '63626a68cf44b4184483c8e8'
 '614e664d1657383cbf801e52' '610803b9ce5f71efbd6e1722'
 '5700be5c8a49c7000e0c768f' '5d9b866189c03c001540eff1'
 '626966a1

In [7]:
# Qualtrics export column name -> analysis-friendly column name
qualtrics_columns = {
    'StartDate': 'start_date',
    'EndDate': 'end_date',
    'Duration (in seconds)': 'duration',
    'Finished': 'is_finished', # 1 (or True) is finished
    'Q9': 'prolific_id', # user inputted prolific id
    'Q1': 'gender', # mapping below
    'Q8': 'age', # number input
    'Q3': 'drone_experience', # mapping below
    'Q4': 'video_game_experience', # mapping below
    'Q5': 'feedback_helped', # likert mapping below
    'Q6': 'change_from_feedback', # open text response
    'Q7': 'comments' # open text response, optional
}

# rename columns
exit_survey = exit_survey.rename(columns=qualtrics_columns)

# remove extra columns and rows
exit_survey = exit_survey.drop(columns=['Status', 'Progress', 'RecordedDate', 'ResponseId', 'DistributionChannel', 'UserLanguage'])
# rows 0 and 1 are presumably the extra header rows Qualtrics writes under the
# column names -- confirm against the raw export
exit_survey = exit_survey.drop([0,1])

# set data types
exit_survey['start_date'] = pd.to_datetime(exit_survey['start_date'])
exit_survey['end_date'] = pd.to_datetime(exit_survey['end_date'])
exit_survey['duration'] = pd.to_numeric(exit_survey['duration'])
# Do NOT use astype(bool) here: on string values every non-empty string
# (including "0" and "False") becomes True, so unfinished responses would be
# marked as finished. Parse the text explicitly instead; anything that is not
# a recognised "finished" value (including missing) becomes False.
exit_survey['is_finished'] = (
    exit_survey['is_finished']
    .astype(str)
    .str.strip()
    .str.lower()
    .isin(['1', '1.0', 'true'])
)
exit_survey['age'] = pd.to_numeric(exit_survey['age'])

In [8]:
# keep only responses started at or after 11:15am on 2024-04-10
# this is when we launched the first experiment condition
after_launch = exit_survey['start_date'] >= '2024-04-10 11:15:00'

# flag test responses: any prolific_id that contains the literal text 'emily'
# - na=False: prolific_id is typed in by the participant and may be blank; a
#   missing id counts as "not a test response" (without it the mask contains
#   NaN and the `~` below raises TypeError)
# - regex=False: 'emily' is plain text, not a pattern
is_test_response = exit_survey['prolific_id'].str.contains('emily', na=False, regex=False)

exit_survey = exit_survey[after_launch & ~is_test_response]

# number of survey responses that survive both filters
print(len(exit_survey))

127


In [9]:
# looks like we retain just prolific ids
# (prints every distinct prolific_id left after filtering, as a visual check)
# NOTE(review): the participant ids end up in the saved cell output -- consider
# clearing this output before sharing the notebook
print(exit_survey['prolific_id'].unique())

['5eb4955dab41e130f24d08c9' '640156a5f2395bf80ca0c451'
 '5760a995f371330006a47cb3' '5f2dc46d6fa1250ee8a1a15d'
 '60fde5e29e585481874a9d16' '62d1228fcd446896ce7c9ec0'
 '6526a14c3f9823cccedb7687' '63e688adbc8788de6d596c93'
 '5f888877136ad50208b48b47' '5d34d17089232600011ade3a'
 '62c50bf9b7587ff5073cd7fb' '63d79e5ecdcf4d0dbd646bf6'
 '60d76140337e60ae26f7fce9' '6333c1f5756acfabfde457ed'
 '610803b9ce5f71efbd6e1722' '60b7bd75af8c92afa748324f'
 '63626a68cf44b4184483c8e8' '63b6dfb29118fec2d923f8c3'
 '5dccb82a76eab294aa4837ff' '5efcaaee74e56207d16db007'
 '5dce3ccc32ccbf0cd54263db' '5d9b866189c03c001540eff1'
 '5f513339b2c26c338771f1d0' '614e664d1657383cbf801e52'
 '60255901704fd208ecdcf32b' '5de6eca0a91be366cbb5ffe2'
 '643b53b6aeaabf186d24e099' '5b75b2ebc5e14d00013669fc'
 '62d9e46fded6a6209a518499' '63ed0ed001893f9028e06e41'
 '5a6cc406d5d4cb0001d664aa' '629658baad2881aba974c6c3'
 '5c4684826a7dbc00017c0a87' '56bae08f30d6b30005f8537a'
 '65cba99c92b362b45e414da7' '5b824fb6cc06660001a302d4'
 '60580179

In [10]:
def _as_ordered_category(responses, label_map):
    """Relabel survey answers with `label_map` and return an ordered categorical.

    The category order is the order of the values in `label_map`, so each map
    below must list its answers from lowest to highest. Answers that are not
    one of the resulting labels become missing values.
    """
    return pd.Categorical(
        responses.replace(label_map),
        categories=list(label_map.values()),
        ordered=True,
    )


# gender has no natural order, so it is a plain (unordered) categorical
exit_survey['gender'] = pd.Categorical(exit_survey['gender'])

# full survey answer text -> short label, listed from lowest to highest level
# NOTE(review): the label "None" is read back as a missing value by pd.read_csv
# with default settings -- keep that in mind when reloading any saved CSV
drone_map = {
    'I have never flown a drone': "None",
    'I have tried flying a drone a few times': "Some",
    'I regularly fly drones': "Regularly",
    'I am an expert or professional drone pilot': "Professional",
}
game_map = {
    'I do not play video games': "None",
    'I play video games at least once per month': "Monthly",
    'I play video games at least once per week': "Weekly",
    'I play video games almost every day': "Daily",
}
likert_map = {
    '1 - Strongly Disagree': "Strongly Disagree",
    '2': "Disagree",
    '3 - Neither agree nor disagree': "Neutral",
    '4': "Agree",
    '5 - Strongly Agree': "Strongly Agree",
}

exit_survey['drone_experience'] = _as_ordered_category(exit_survey['drone_experience'], drone_map)
exit_survey['video_game_experience'] = _as_ordered_category(exit_survey['video_game_experience'], game_map)
exit_survey['feedback_helped'] = _as_ordered_category(exit_survey['feedback_helped'], likert_map)

# preview the cleaned survey
exit_survey.head(10)

Unnamed: 0,start_date,end_date,duration,is_finished,prolific_id,gender,age,drone_experience,video_game_experience,feedback_helped,change_from_feedback,comments
3,2024-04-10 11:35:59,2024-04-10 11:59:47,1428,True,5eb4955dab41e130f24d08c9,Woman,48,,Weekly,Strongly Agree,"I did change my technique, I started going up ...",
4,2024-04-10 11:39:03,2024-04-10 12:11:03,1919,True,640156a5f2395bf80ca0c451,Man,33,,Daily,Disagree,I tried to take into account what it was sayin...,
5,2024-04-10 11:21:24,2024-04-10 12:15:07,3222,True,5760a995f371330006a47cb3,Woman,33,,Daily,Neutral,Yes. I began to tap buttons instead of holding...,
6,2024-04-10 11:58:31,2024-04-10 12:18:50,1218,True,5f2dc46d6fa1250ee8a1a15d,Woman,37,,,Neutral,It was easier for me to adjust based on my con...,
7,2024-04-10 11:59:20,2024-04-10 12:21:20,1320,True,60fde5e29e585481874a9d16,Non-binary,41,,Monthly,Agree,i figured out how to slow the drone down befor...,
8,2024-04-10 12:03:02,2024-04-10 12:24:52,1310,True,62d1228fcd446896ce7c9ec0,Man,31,,Weekly,Neutral,No,no
9,2024-04-10 12:00:10,2024-04-10 12:34:04,2034,True,6526a14c3f9823cccedb7687,Woman,46,,,Strongly Agree,Showing where I overcompensated and suggesting...,"This was fun, thanks. If this will actually be..."
10,2024-04-10 12:07:18,2024-04-10 12:38:48,1889,True,63e688adbc8788de6d596c93,Woman,45,,Monthly,Agree,The feedback helped me know what to work on to...,
11,2024-04-10 12:20:59,2024-04-10 12:43:45,1366,True,5f888877136ad50208b48b47,Man,23,,Monthly,Agree,Feedbacks were very similar,
12,2024-04-10 12:12:43,2024-04-10 12:44:32,1909,True,5d34d17089232600011ade3a,Man,35,,Weekly,Strongly Agree,"Towards the end, i started getting too confide...",No technical issues with this survey


In [11]:
# row count of the cleaned exit survey, for comparison with the merged frame below
len(exit_survey)

127

In [12]:
# inner join: keep only participants that appear both in the conditions file
# (user_id) and in the exit survey (prolific_id)
merged = conditions.merge(
    exit_survey,
    how='inner',
    left_on='user_id',
    right_on='prolific_id',
)

# user_id and prolific_id now carry the same value; keep prolific_id only
merged = merged.drop(columns='user_id')

print(len(merged)) # why are there more rows than exit_survey?

146


In [15]:
# participants with more than one row in the merged dataframe
# (they restarted the experiment, so the conditions file has several entries for them)
appears_more_than_once = merged['prolific_id'].duplicated(keep=False)
duplicate_ids = merged.loc[appears_more_than_once, 'prolific_id'].unique()
duplicate_ids.sort()  # in-place sort so the printed list is easy to scan
print(duplicate_ids)

['5bd49bcc25db7b0001794063' '5c90094e71f3100016181ea9'
 '5ef9f528c7ae587afa25fe9b' '60fcc292d13ae9614d4a77a7'
 '6105c41aa4fe602501d5a8cc' '610796f1301fccdca446af57'
 '629658baad2881aba974c6c3' '63026a8fd8429b224cd2a134'
 '631f1b608af38f654d2a3b1f' '637d4196c70a66e28ecede34'
 '63ba10de73415d047e1d6731' '643c6175d46d41e74033994f'
 '652ab7948cb59f4c50c7972a' '6596a5cad60ef105b6c18897'
 '65cba99c92b362b45e414da7']


In [17]:
# keep one row per participant: the first occurrence of each prolific_id
# NOTE(review): for participants who restarted, this keeps whichever conditions
# entry comes first in the merged frame -- confirm that entry is the condition
# they actually completed the experiment under
is_repeat_row = merged.duplicated(subset='prolific_id', keep='first')
merged = merged[~is_repeat_row]
print(len(merged)) # better number!

127


In [18]:
# write the one-row-per-participant table next to the raw data (row index omitted)
# NOTE(review): pd.read_csv treats the text "None" as a missing value by default,
# so any column holding a literal "None" label needs keep_default_na=False (or
# explicit na_values) when this file is read back
participant_file = f'{data_dir}participant_info.csv'
merged.to_csv(participant_file, index=False)

Notes on participants who appear to have restarted the experiment:
- `5bd49bcc25db7b0001794063` restarted after trial 1
- `5c90094e71f3100016181ea9` restarted after trial 3 and again after trial 1
- `5ef9f528c7ae587afa25fe9b` restarted after trial 6
- `60fcc292d13ae9614d4a77a7` restarted after trial 3
- `6105c41aa4fe602501d5a8cc` restarted after trial 8
- `610796f1301fccdca446af57` restarted after trial 7
- `629658baad2881aba974c6c3` restarted after trial 2
- `63026a8fd8429b224cd2a134` restarted before completing trial 1
- `631f1b608af38f654d2a3b1f` restarted after trial 15
- `637d4196c70a66e28ecede34` restarted before completing trial 1
- `63ba10de73415d047e1d6731` restarted after trial 5 and again after trial 1
- `643c6175d46d41e74033994f` restarted after trial 8 and again after trial 1
- `652ab7948cb59f4c50c7972a` restarted after trial 1
- `6596a5cad60ef105b6c18897` restarted after trial 1
- `65cba99c92b362b45e414da7` restarted after trial 17

# Distributions of demographic data

In [22]:
# gender distribution: raw counts, then share of all participants
gender_counts = merged['gender'].value_counts()
print(gender_counts)
print((gender_counts / len(merged)).round(2))


gender
Woman         67
Man           53
Non-binary     7
Name: count, dtype: int64
gender
Woman         0.53
Man           0.42
Non-binary    0.06
Name: count, dtype: float64


In [23]:
# age distribution: summary statistics rounded to one decimal
age_summary = merged['age'].describe()
print(age_summary.round(1))

count    127.0
mean      37.8
std       12.2
min       18.0
25%       29.0
50%       35.0
75%       47.0
max       74.0
Name: age, dtype: float64


In [25]:
# previous experience with flying drones: counts in category order,
# then share of all participants
drone_counts = merged['drone_experience'].value_counts().sort_index()
print(drone_counts)
print((drone_counts / len(merged)).round(2))

drone_experience
None            103
Some             18
Regularly         5
Professional      1
Name: count, dtype: int64
drone_experience
None            0.81
Some            0.14
Regularly       0.04
Professional    0.01
Name: count, dtype: float64


In [26]:
# video game experience: counts in category order, then share of all participants
game_counts = merged['video_game_experience'].value_counts().sort_index()
print(game_counts)
print((game_counts / len(merged)).round(2))

video_game_experience
None       22
Monthly    41
Weekly     32
Daily      32
Name: count, dtype: int64
video_game_experience
None       0.17
Monthly    0.32
Weekly     0.25
Daily      0.25
Name: count, dtype: float64


In [27]:
# overall view of feedback perception: counts in Likert order,
# then share of all participants
feedback_counts = merged['feedback_helped'].value_counts().sort_index()
print(feedback_counts)
print((feedback_counts / len(merged)).round(2))

feedback_helped
Strongly Disagree    18
Disagree             14
Neutral              23
Agree                53
Strongly Agree       19
Name: count, dtype: int64
feedback_helped
Strongly Disagree    0.14
Disagree             0.11
Neutral              0.18
Agree                0.42
Strongly Agree       0.15
Name: count, dtype: float64


# Start looking at trajectory data

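Copy each participant's raw and feedback-annotated trajectory images from every trial folder into a shared `images/` folder.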

In [None]:
# Collect the trajectory images of every trial into data_dir/images/raw and
# data_dir/images/processed, named <user>_<trial>_raw.png / <user>_<trial>_processed.png.

# make the output folders if they don't exist
# exist_ok=True keeps this safe to re-run and also repairs a partially created
# tree (e.g. images/ exists but images/raw does not)
raw_out = data_dir + 'images/raw'
processed_out = data_dir + 'images/processed'
os.makedirs(raw_out, exist_ok=True)
os.makedirs(processed_out, exist_ok=True)

# for each participant, save image from each trial
user_dirs = os.listdir(data_dir)
# unique(): participants who restarted have several rows in conditions, and
# their folder only needs to be processed once
for user in conditions['user_id'].unique():
    print(user)
    # skip participants with no folder in data_dir
    if user not in user_dirs:
        continue
    user_path = data_dir + user
    # skip a plain file that happens to be named like a user id
    if not os.path.isdir(user_path):
        continue
    for trials in os.listdir(user_path):
        trial_path = user_path + '/' + trials
        # only sub-folders are searched for images; loose files are ignored
        if not os.path.isdir(trial_path):
            continue
        raw_image = trial_path + '/trajectory.png'
        processed_image = trial_path + '/trajectory_with_feedback.png'
        # copy images to images folder, but only when both images are present
        if not (os.path.exists(raw_image) and os.path.exists(processed_image)):
            continue
        shutil.copy(raw_image, raw_out + '/' + user + '_' + trials + '_raw.png')
        shutil.copy(processed_image, processed_out + '/' + user + '_' + trials + '_processed.png')

# Research Questions

## How do learners perceive the feedback along each dimension?

## Which feedback modality leads to higher performance improvements?