In [1]:
%load_ext autoreload
%autoreload 2
%reload_ext rpy2.ipython

import os, sys
import pandas as pd

sys.path.insert(0, 'src')
import utils
import visualizations
import analysis

## 1. Pre-processing and Exclusions

In [2]:
# Display names of the four conditions
conditions = ["NoVideo", "Machines", "Tools", "Companions"]

# Raw data files: one for the baseline group, one for the experimental groups
baseline_df_path = '../data/raw/baseline_raw_data.csv'
experimental_df_path = '../data/raw/experimental_raw_data.csv'

# Load each file, upper-case its 'Finished' flag, and keep only the completed rows
baseline_df = (
    utils.read_file(baseline_df_path)
    .assign(Finished=lambda raw: raw['Finished'].str.upper())
    .loc[lambda raw: raw['Finished'] == 'TRUE']
)
experimental_df = (
    utils.read_file(experimental_df_path)
    .assign(Finished=lambda raw: raw['Finished'].str.upper())
    .loc[lambda raw: raw['Finished'] == 'TRUE']
)

# Combine baseline and experimental rows into one dataframe
# (outer merge on the columns the two frames share, so no row is dropped)
df = pd.merge(baseline_df, experimental_df, how='outer')

utils.informal_log("In total: {} rows".format(len(df)))

# Row count per condition. The baseline file is the NoVideo condition; the
# experimental file labels its conditions Mechanistic / Functional / Intentional.
experimental_labels = experimental_df['CONDITION']
row_count_messages = [
    ("NoVideo: {} rows", len(baseline_df)),
    ("Machines: {} rows", len(experimental_df[experimental_labels == 'Mechanistic'])),
    ("Tools: {} rows", len(experimental_df[experimental_labels == 'Functional'])),
    ("Companions: {} rows", len(experimental_df[experimental_labels == 'Intentional'])),
]
for message, n_rows in row_count_messages:
    utils.informal_log(message.format(n_rows))

[0624_145343] In total: 489 rows
[0624_145343] NoVideo: 124 rows
[0624_145343] Machines: 121 rows
[0624_145343] Tools: 122 rows
[0624_145343] Companions: 122 rows


## Make Save Directories

In [3]:
# Exclusion parameters (thresholds are expressed in minutes)
min_survey_time = round(80 / 60, 4) # ~80 seconds
min_median_per_page_time = round(1 / 60, 4) # ~1 second
post_attention_check = True

# Commonly used Paths/Items
PATH_TO_ROOT = '..'
overwrite = False
items_path = os.path.join(PATH_TO_ROOT, 'data/files/mental_capacity_items.txt')
categories_path = os.path.join(PATH_TO_ROOT, 'data/files/category_groupings.json')
save_ext = 'pdf' # File extension for saving visualizations
separate_groups = False  # If True, create separate CSV for items of different groups for analysis

# Save dirs

# Make save directory (parent of every output directory below)
save_dir = os.path.join(PATH_TO_ROOT, 'analysis')
utils.ensure_dir(save_dir)
utils.informal_log("Parent save directory: {}".format(save_dir))

# Timing save dir
time_save_dir = os.path.join(save_dir, 'timings')
utils.ensure_dir(time_save_dir)
utils.informal_log("Timing save directory: {}".format(time_save_dir))

# Demographics post-exclusions
demographics_save_dir = os.path.join(save_dir, 'demographics')
utils.ensure_dir(demographics_save_dir)
utils.informal_log("Demographics save directory: {}".format(demographics_save_dir))

# Ratings CSV save dir
ratings_save_dir = os.path.join(save_dir, 'mental_capacities')
ratings_path = os.path.join(ratings_save_dir, 'ratings.csv')
utils.ensure_dir(ratings_save_dir)
utils.informal_log("Mental capacity save directory: {}".format(ratings_save_dir))

# Rating statistics save directory
rating_stats_save_dir = os.path.join(ratings_save_dir, 'statistics')
utils.ensure_dir(rating_stats_save_dir)
utils.informal_log("Mental capacity ratings statistics save directory: {}".format(rating_stats_save_dir))

# Participant Correlation save dir
corr_save_dir = os.path.join(save_dir, 'correlations')
utils.ensure_dir(corr_save_dir)
# Log the correlations directory itself; this previously printed ratings_save_dir by mistake
utils.informal_log("Correlations save directory: {}".format(corr_save_dir))

# --- Build every remaining output path first, then create them in one pass ---

# Body Heart Mind: CSV inputs for R and the R results
_bhm_R_dir = os.path.join(ratings_save_dir, 'body-heart-mind', 'R')
R_input_dir = os.path.join(_bhm_R_dir, 'input_files')
R_results_save_dir = os.path.join(_bhm_R_dir, 'results')

# Factor analysis: root, k-fold runs, results, and the R inputs/outputs
# based on factor loading groupings
fa_save_dir = os.path.join(ratings_save_dir, 'factor_analysis')
fa_kfold_save_dir = os.path.join(fa_save_dir, 'kfold')
fa_results_save_dir = os.path.join(fa_save_dir, 'results')
fa_R_input_dir = os.path.join(fa_results_save_dir, 'R', 'input_files')
fa_R_results_dir = os.path.join(fa_results_save_dir, 'R', 'results')

# Item-level analyses with R
item_level_save_dir = os.path.join(ratings_save_dir, 'item_level')

# Attitudes: root, CSV inputs for R, and R results
attitudes_save_dir = os.path.join(save_dir, 'attitudes')
addit_dv_r_csv_save_dir = os.path.join(attitudes_save_dir, 'R', 'input_files')
addit_dv_r_result_save_dir = os.path.join(attitudes_save_dir, 'R', 'results')

# Exploratory analyses (this parent path is only used to build the paths below)
exploratory_save_dir = os.path.join(save_dir, 'exploratory')

# Mentioned/Unmentioned analysis: root, CSV inputs for R, and R results
mentioned_save_dir = os.path.join(exploratory_save_dir, 'mentioned_analysis')
mentioned_R_csv_save_dir = os.path.join(mentioned_save_dir, 'R', 'input_files')
mentioned_R_result_save_dir = os.path.join(mentioned_save_dir, 'R', 'results')

# Attitudes correlations and reliability
correlation_dir = os.path.join(exploratory_save_dir, 'correlations')
reliability_save_dir = os.path.join(exploratory_save_dir, 'reliability')

# Create the directories (parents listed before their children)
for _dir_path in (
    R_input_dir,
    R_results_save_dir,
    fa_save_dir,
    fa_kfold_save_dir,
    fa_results_save_dir,
    fa_R_input_dir,
    fa_R_results_dir,
    item_level_save_dir,
    attitudes_save_dir,
    addit_dv_r_csv_save_dir,
    addit_dv_r_result_save_dir,
    mentioned_save_dir,
    mentioned_R_csv_save_dir,
    mentioned_R_result_save_dir,
    correlation_dir,
    reliability_save_dir,
):
    utils.ensure_dir(_dir_path)

[0624_145413] Parent save directory: ../analysis
[0624_145413] Timing save directory: ../analysis/timings
[0624_145413] Demographics save directory: ../analysis/demographics
[0624_145413] Mental capacity save directory: ../analysis/mental_capacities
[0624_145413] Mental capacity ratings statistics save directory: ../analysis/mental_capacities/statistics
[0624_145413] Correlations save directory: ../analysis/mental_capacities


### Timing Analysis

We use these timings to filter out participants who spent too little time on the survey.

In [4]:
# Survey sections, each mapping a raw question ID to a readable timing name.
# Sections are declared one at a time and assembled into `time_mapping` below.

_intro_pages = {
    'ConsentTime1': 'consent_time',
    'PS4': 'screening_time',
    'Q10': 'instructions_time',
    'Q12': 'check_time',
}

# Videos: three entries each for the mech / func / intent prefixes
_video_pages = {
    'Q14': 'mech1_time',
    'Q16': 'mech2_time',
    'Q18': 'mech3_time',
    'Q20': 'func1_time',
    'Q22': 'func2_time',
    'Q24': 'func3_time',
    'Q26': 'intent1_time',
    'Q28': 'intent2_time',
    'Q30': 'intent3_time',
}

# Video -> Survey Transition
_transition_pages = {
    'Q32': 'takeaways_time',
    'Q34': 'transition_time',
}

# Survey
_survey_pages = {'Q36': 'survey_time'}

# Everything after the survey ('CTime1' is intentionally left disabled)
_post_pages = {
    # 'CTime1': 'confidence_time',
    'ACTime1': 'attention_check_time',
    'DVTime1': 'addit_dvs1_time',
    'DVTime2': 'addit_dvs2_time',
    'Q42': 'mech_conceptual_time',
    'DQTime1': 'demographic_time',
    'Q44': 'intent_eos_time',
    'Q46': 'eos_time',
}

# Data structure to map individual questions to sections of the survey
time_mapping = {
    'intro': _intro_pages,
    'video': _video_pages,
    'video_transition': _transition_pages,
    'survey': _survey_pages,
    'post': _post_pages,
}

# Raw metadata column name -> name used in the analysis
metadata_column_mapping = dict([
    ('Duration (in seconds)', 'total_time'),
    ('CONDITION', 'condition'),
    ('ResponseId', 'response_id'),
    ('PROLIFIC_PID', 'prolific_id'),
])

In [10]:
# Keyword arguments shared by every time_analysis / time_stats call in this cell
_time_analysis_kwargs = dict(
    mapping=time_mapping,
    metadata_mapping=metadata_column_mapping,
    save_dir=time_save_dir,
    overwrite=overwrite)
_time_stats_kwargs = dict(save_dir=time_save_dir, overwrite=overwrite)

# All data (baseline + experimental)
time_df = analysis.time_analysis(df=df, condition='all', **_time_analysis_kwargs)
time_stats_df = analysis.time_stats(time_df=time_df, condition='all', **_time_stats_kwargs)

# Baseline data only
baseline_time_df = analysis.time_analysis(
    df=baseline_df, condition='baseline', **_time_analysis_kwargs)
baseline_time_stats_df = analysis.time_stats(
    time_df=baseline_time_df, condition='baseline', **_time_stats_kwargs)

# Experimental data only
experimental_time_df = analysis.time_analysis(
    df=experimental_df, condition='exp', **_time_analysis_kwargs)
experimental_time_stats_df = analysis.time_stats(
    time_df=experimental_time_df, condition='exp', **_time_stats_kwargs)

# Attach each participant's survey timing columns to the raw dataframe.
# Guarded so that re-running the cell does not add the columns twice.
# NOTE: the column-wise concat aligns rows by index, so `df` and `time_df`
# are assumed to share the same index -- TODO confirm.
if 'survey_time' not in df.columns:
    _survey_timing_columns = [
        'survey',
        'median_time_per_survey_page',
        'min_time_per_survey_page',
        'max_time_per_survey_page',
    ]
    df = (
        pd.concat([df, time_df[_survey_timing_columns]], axis=1)
        .rename(columns={'survey': 'survey_time'})
    )

    utils.informal_log("Added time columns to DF ({} rows)".format(len(df)))

# Save the raw dataframe (with timing columns) next to the other outputs
raw_df_save_path = os.path.join(save_dir, 'raw_df.csv')
utils.write_file(df, raw_df_save_path, overwrite=overwrite)

[0624_152608] Time analysis exists at ../analysis/timings/time_analysis_all_minutes.csv
[0624_152608] Timing statistics exists at ../analysis/timings/timing_stats_all_minutes.csv
[0624_152608] Time analysis exists at ../analysis/timings/time_analysis_baseline_minutes.csv
[0624_152608] Timing statistics exists at ../analysis/timings/timing_stats_baseline_minutes.csv
[0624_152608] Time analysis exists at ../analysis/timings/time_analysis_exp_minutes.csv
[0624_152608] Timing statistics exists at ../analysis/timings/timing_stats_exp_minutes.csv
Saved file to ../analysis/raw_df.csv


## Perform Exclusions
Exclude participants who meet any of the pre-registered criteria:

- Fail the attention check
- Spend less than 80 seconds on the mental capacity survey
- Spend a median of less than 1 second per mental capacity question

In [9]:
# Where the filtered dataframe and the parameters that produced it are stored
post_exclusion_df_save_path = os.path.join(ratings_save_dir, 'post_exclusion_df.csv')
exclusion_param_save_path = os.path.join(ratings_save_dir, 'exclusion_params.json')

# Apply the exclusion criteria to the raw dataframe (no manual exclusions)
post_exclusion_df = analysis.perform_exclusions(
    df=df,
    min_survey_time=min_survey_time,
    min_median_per_page_time=min_median_per_page_time,
    manual_exclusions=None,
    post_attention_check=post_attention_check)

# Record the thresholds used together with the number of remaining rows
exclusion_params = dict(
    min_survey_time=min_survey_time,
    min_median_per_page_time=min_median_per_page_time,
    n_rows=len(post_exclusion_df),
)

utils.write_file(post_exclusion_df, post_exclusion_df_save_path, overwrite=overwrite)

# Never replace an existing parameter file: it marks this directory as already
# holding results for some exclusion configuration.
if not os.path.exists(exclusion_param_save_path):
    utils.write_file(exclusion_params, exclusion_param_save_path)
else:
    utils.informal_log("Exclusion parameters already exist at {}. Meaning this directory contains some exclusion configuration. Save to another directory".format(
        exclusion_param_save_path
    ))

[0624_150535] 476 rows remaining post filtering for minimum of 1.3333 minutes on survey
[0624_150535] 476 rows remaining post filtering for minimum median of 0.0167 minutes on each survey page
[0624_150535] 470 rows remaining post filtering for post-survey attention check correctness
[0624_150535] No manual exclusions performed
[0624_150535] Number of rows: 470
[0624_150535] Number of rows in novideo (baseline): 118
[0624_150535] Number of rows in machines: 116
[0624_150535] Number of rows in tools: 119
[0624_150535] Number of rows in companions: 117
Saved file to ../analysis/mental_capacities/post_exclusion_df.csv
[0624_150535] Exclusion parameters already exist at ../analysis/mental_capacities/exclusion_params.json. Meaning this directory contains some exclusion configuration. Save to another directory


## Demographic Data
Obtain demographic data post exclusions

In [7]:
# Raw demographic question ID -> readable variable name
demographic_qs = {
    'DQ2': 'age',
    'DQ3': 'education',
    'DQ4': 'gender',
    'DQ5': 'race',
    'DQ6': 'ethnicity',
    'DQ7': 'religion',
    'DQ9': 'programming',
}

# DQ10_1 ... DQ10_7 are the familiarity sub-questions, one per product, in this order
_familiarity_products = (
    'chatgpt', 'gemini', 'claude', 'copilot', 'replika', 'nomi', 'characterai')
demographic_qs.update(
    ('DQ10_{}'.format(position), 'familiarity_{}'.format(product))
    for position, product in enumerate(_familiarity_products, start=1)
)

# Kept as the last expression so the returned table is displayed as the cell output
analysis.get_demographics(
    df=post_exclusion_df,
    demographic_qs=demographic_qs,
    save_dir=demographics_save_dir,
    overwrite=overwrite)

[0624_145758] Demographics exists at ../analysis/demographics/demographics.csv


Unnamed: 0,variable,level,count,percent
0,age,25-34,137,29.148936
1,age,35-44,119,25.319149
2,age,45-54,109,23.191489
3,age,55-64,65,13.829787
4,age,65+,23,4.893617
...,...,...,...,...
88,familiarity_characterai,"Heard of it, but never used it",159,33.829787
89,familiarity_characterai,"Have used it a few times, but not regularly",20,4.255319
90,familiarity_characterai,Use 1-2x times a month,6,1.276596
91,familiarity_characterai,Use 1-2x a week,2,0.425532


## Clean data to obtain dataframe of ratings
Keep only each participant's ratings on the 40 mental capacity items, along with their condition and Prolific ID; all other information is filtered out.

In [8]:
# Survey layout: 40 mental capacity pages with one item per page, under question ID Q35
n_pages, n_items_per_page = 40, 1
q_id = 'Q35'
n_total = n_pages * n_items_per_page

# Item list, category groupings, and the raw dataframe's column names
items = utils.read_file(items_path)
groupings = utils.read_file(categories_path)
columns = df.columns

# MC = mental capacity
mc_mapping = analysis.get_mc_mapping(
    items=items, n_pages=n_pages, n_items_per_page=n_items_per_page,
    q_id=q_id, columns=columns)

# Ratings dataframe (rows = participants; columns = mental states)
rating_df = analysis.get_ratings(
    df=post_exclusion_df, mc_mapping=mc_mapping, groupings=groupings,
    n_total=n_total, save_dir=ratings_save_dir, overwrite=overwrite)

# Both statistics steps save into the same directory with the same overwrite policy
_stats_io_kwargs = dict(save_dir=rating_stats_save_dir, overwrite=overwrite)

# Mean (and std) rating of each mental state, one dataframe per condition
stats_dfs = analysis.mean_ratings(
    rating_df=rating_df, conditions=conditions, save_conditions=True,
    **_stats_io_kwargs)

# Combine the per-condition stats into one large DF
master_stats = analysis.create_master_stats(stats_dfs=stats_dfs, **_stats_io_kwargs)

[0624_145832] Ratings exists at ../analysis/mental_capacities/ratings.csv
[0624_145832] Rating statistics exists at ../analysis/mental_capacities/statistics/rating_stats.csv
[0624_145832] Rating statistics exists at ../analysis/mental_capacities/statistics/novideo_rating_stats.csv
[0624_145832] Rating statistics exists at ../analysis/mental_capacities/statistics/machines_rating_stats.csv
[0624_145832] Rating statistics exists at ../analysis/mental_capacities/statistics/tools_rating_stats.csv
[0624_145832] Rating statistics exists at ../analysis/mental_capacities/statistics/companions_rating_stats.csv
[0624_145832] Master ratings exists at ../analysis/mental_capacities/statistics/all_conditions_ratings_stats.csv
