# Analysis

## Environment Preparation

In [14]:
# import libraries
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
from functools import reduce
from datetime import datetime

In [15]:
# Function to swap 'lose' with 'win' and vice versa
def swap_lose_win(identifier):
    if 'lose' in identifier:
        return identifier.replace('lose', 'win')
    elif 'win' in identifier:
        return identifier.replace('win', 'lose')
    return identifier

# Function to rewrite choice location
def map_choice_location(value):
    if value == 'g' or value == '-0.3':
        return 'left'
    elif value == 'k' or value == '0.3':
        return 'right'
    else:
        return 'unknown'
    
# Function to parse the custom datetime format
def parse_custom_datetime(dt_str):
    return datetime.strptime(dt_str, '%Y-%m-%d_%Hh%M.%S.%f')

## Data Preparation

In [16]:
# Reading the files and adding na columns in case columns are missing
file_names = ["fresh_data.csv"]
directory = '/Users/anja/Desktop/data_mt/'

desired_columns = ['ident_block_trial', # identifier to determine trial, because dataset is off
                    'yc_resp.keys', #keys for color choice in yc condition
                    'too_slow_choice.started', # in yc and cc condition when selecting or confirming color
                    'wrong_answer_choice.started', # in yc and cc condition when selecting or confirming color
                    'participant', # participant id
                    'session', # session id
                    'date', # date when data collection happened
                    'owner_confirm_2.keys', # yc owner confirmation with which key?,
                    'owner_confirm_2.rt', # relevant reaction time
                    'block', # number of block
                    'trial', # number of trial in block
                    'chooser', # cc or yc trial?
                    'stim1', # left color square
                    'stim2', # right color square
                    'choice_frame_location', # cozmos color choice
                    'owner', # predefined outcome owner
                    'value', # predefined outcome value
                    'value_distribution', # identifies chance trials
                    'identifier_chooser_owner_value', # identifier of condition set, must be corrected for chance trials
                    'choice_confirmation.keys', # in cc condition what key was used to confirm color choice
                    'owner_confirm.keys', # in cc condition what was confirmed
                    'owner_confirm.rt', # relevant reaction time
                    'wrong_answer.started', # in cc and yc condition when confirming owner
                    'too_slow.started', # in cc and yc condition when confirming owner
                    ] 

# Initialize an empty list to store DataFrames
dfs = []

# Read the csv files and create columns in case some are missing
for file_name in file_names:
    file_path = os.path.join(directory, file_name)
    
    # Get the available columns
    df_initial = pd.read_csv(file_path, nrows=0) 
    available_columns = df_initial.columns.tolist()

    # Determine which desired columns are available in csv
    existing_columns = [col for col in desired_columns if col in available_columns]

    # Read the CSV file, using only the existing columns
    df = pd.read_csv(file_path, usecols=existing_columns)

    # Add the missing columns with empty values
    for col in desired_columns:
        if col not in df.columns:
            df[col] = pd.NA
    
    # Append the DataFrame to the list of dataframes
    dfs.append(df)

# Concatenating files together
combined_df = pd.concat(dfs, ignore_index=True)

In [17]:
# Creating subsets of data to perform left join
first_subset = df[['ident_block_trial','participant','session','date']].drop_duplicates() # auto create
second_subset = df[['ident_block_trial','block','trial','chooser','stim1','stim2','choice_frame_location','owner','value','value_distribution','identifier_chooser_owner_value']].dropna() # condition file
third_subset = df[['ident_block_trial','yc_resp.keys']].dropna() # you choose color choice
fourth_subset = df[['ident_block_trial', 'owner_confirm_2.keys', 'owner_confirm_2.rt']].dropna() # yc owner confirmation
fifth_subset = df[['ident_block_trial','choice_confirmation.keys']].dropna() # cc chooses color choice confirmation
sixth_subset = df[['ident_block_trial','owner_confirm.keys', 'owner_confirm.rt']].dropna() # cc chooses owner confirmation
seventh_subset = df[['ident_block_trial','wrong_answer.started']].dropna()
eighth_subset = df[['ident_block_trial','too_slow.started']].dropna()
nineth_subset = df[['ident_block_trial','too_slow_choice.started']].dropna()
tenth_subset = df[['ident_block_trial','wrong_answer_choice.started']].dropna()

In [18]:
# Merging dataframes together so that there is only one line per trial
dfs = [first_subset, second_subset, third_subset, fourth_subset, fifth_subset, sixth_subset, seventh_subset, eighth_subset, nineth_subset, tenth_subset]
merged_df = reduce(lambda left, right: pd.merge(left, right, on=['ident_block_trial', 'participant'], how='left'), dfs)

In [19]:
# Detele practice trials
merged_df = merged_df[merged_df['block'] != 'Test']

In [20]:
# Putting the reaction times in one column
merged_df['owner_confirm_rt'] = merged_df['owner_confirm_2.rt'].combine_first(merged_df['owner_confirm.rt'])

# Putting the owner confirm keys in one column
merged_df['owner_confirm_keys'] = merged_df['owner_confirm_2.keys'].combine_first(merged_df['owner_confirm.keys'])

# Putting choice_frame_location and yc_resp.keys together and change their values in left and right
merged_df['choice_location'] = merged_df['yc_resp.keys'].combine_first(merged_df['choice_frame_location'])
merged_df['choice_location'] = merged_df['choice_location'].apply(map_choice_location)

In [21]:
# Create bool columns for too slow and wrong responses
merged_df['bool_wrong_color_confirm'] = ~merged_df['wrong_answer_choice.started'].isna()
merged_df['bool_slow_color_choice_or_confirm'] = ~merged_df['too_slow_choice.started'].isna() # for yc and cc same column used
merged_df['bool_wrong_owner_confirm'] = ~merged_df['wrong_answer.started'].isna()
merged_df['bool_slow_owner_confirm'] = ~merged_df['too_slow.started'].isna()

In [22]:
# Correct identifier for random samples (10%) in the original cleaned_df
merged_df.loc[merged_df['value_distribution'] == 10, 'identifier_chooser_owner_value'] = merged_df.loc[merged_df['value_distribution'] == 10, 'identifier_chooser_owner_value'].apply(swap_lose_win)

In [23]:
# rename identifier column, stim1, stim2, choice_confirmation.keys, 
new_names = {'stim1': 'left_color', 'stim2': 'right_color', 'choice_confirmation.keys': 'choice_confirm_keys', 'identifier_chooser_owner_value':'identifier_chooser_owner_value_corr'}
merged_df = merged_df.rename(columns=new_names)

In [24]:
# Create date and time column from date
# Apply the parsing function to the 'combined_datetime' column
merged_df['parsed_datetime'] = merged_df['date'].apply(parse_custom_datetime)

# Extract the date and time components
merged_df['date_new'] = merged_df['parsed_datetime'].dt.date
merged_df['time'] = merged_df['parsed_datetime'].dt.time


In [25]:
relevant_columns = ['date_new', # date when data collection happened
                    'time',
                    'session', # session id
                    'participant', # participant id
                    'block', # number of block
                    'trial', # number of trial in block
                    'identifier_chooser_owner_value_corr', # distribution corrected identifier from: identifier_chooser_owner_value
                    'chooser', # cc or yc trial?
                    'left_color', # left color square
                    'right_color', # right color square
                    'choice_location',# from 'choice_frame_location' and 'yc_resp.keys'
                    'choice_confirm_keys', # from .keys; in cc condition what key was used to confirm color choice
                    'value', # predefined outcome value
                    'value_distribution', # identifies chance trials
                    'owner', # predefined outcome owner
                    'owner_confirm_keys', 
                    'owner_confirm_rt', # relevant reaction time
                    'bool_slow_color_choice_or_confirm',
                    'bool_wrong_color_confirm', 
                    'bool_slow_owner_confirm',
                    'bool_wrong_owner_confirm'
                    ] 

In [26]:
merged_df = merged_df[relevant_columns]

In [33]:
# Create index column
merged_df['index_corr'] = merged_df.groupby('participant').cumcount() + 1

## Data Cleaning

In [35]:
# Clean from rows where response was too slow or wrong
data_df = merged_df[(merged_df['bool_slow_color_choice_or_confirm'] == False) & 
                      (merged_df['bool_wrong_color_confirm'] == False) & 
                      (merged_df['bool_slow_owner_confirm'] == False) & 
                      (merged_df['bool_wrong_owner_confirm'] == False)]

In [28]:
print(merged_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 64 entries, 4 to 67
Data columns (total 21 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   date_new                             64 non-null     object 
 1   time                                 64 non-null     object 
 2   session                              64 non-null     int64  
 3   participant                          64 non-null     int64  
 4   block                                64 non-null     object 
 5   trial                                64 non-null     float64
 6   identifier_chooser_owner_value_corr  64 non-null     object 
 7   chooser                              64 non-null     object 
 8   left_color                           64 non-null     object 
 9   right_color                          64 non-null     object 
 10  choice_location                      64 non-null     object 
 11  choice_confirm_keys                  32

## Analysis

In [29]:
# Mean age, handedness, gender?

In [30]:
# How many trials per particpant do we have? How many got sorted out in total? 

# How many trials per condition group?

In [31]:
# Normal distribution of reaction time? Any outliers?

In [32]:
# Reaction time per condition group #TODO:change to violin plot when more data is available https://python-graph-gallery.com/39-hidden-data-under-boxplot/
plt.figure(figsize=(15, 6)) 
sns.boxplot( x=cleaned_df["identifier_chooser_owner_value"], y=cleaned_df["reaction_time"] )

KeyError: 'reaction_time'

<Figure size 1500x600 with 0 Axes>

In [None]:
# how does reaction time develope over time? learing effects? do get people faster? 

In [None]:
# do people prefer one choice? i mean by side? 