In [152]:
import pandas as pd
import seaborn as sns
import numpy as np
import os.path as op
import glob
import matplotlib.pyplot as plt
import ast
# import moss
import csv
import random
%matplotlib inline
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.tools.sm_exceptions import ConvergenceWarning
import scipy
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.tools.sm_exceptions import ConvergenceWarning
import os
from pandas import Timestamp
from datetime import timedelta
from sklearn.preprocessing import StandardScaler


In [153]:
home_dir = op.abspath('./')

data_files = glob.glob(op.join(home_dir,
                        'data',
                        '*.csv'))
sns.set_context('talk')

In [154]:
column_name = 'image_recall_response.keys'
threshold = 0.4

In [155]:
creation_dates = []
for file_path in data_files:
    try:
        df = pd.read_csv(file_path)
        if column_name in df.columns:
            creation_date = df['date']
            creation_dates.append(creation_date)
    except Exception as e:
#             print(f"Error reading {file_path}: {e}")
        continue

In [156]:
def parse_dates(series):
    date_str = series.iloc[0]
    if "24h" in date_str:
        corrected_date_str = date_str.replace("24h", "00h")
        dt = pd.to_datetime(corrected_date_str, format='%Y-%m-%d_%Hh%M.%S.%f')
        dt += timedelta(days=1)
    else:
        dt = pd.to_datetime(date_str, format='%Y-%m-%d_%Hh%M.%S.%f')
    return dt

dates = [parse_dates(series) for series in creation_dates]

min_date = min(dates)


In [157]:
def remove_unit_variance(df, col, unit, group=None, suffix="_within"):
    """Remove variance between sampling units.

    This is useful for plotting repeated-measures data using within-unit
    error bars.

    Parameters
    ----------
    df : DataFrame
        Input data. Will have a new column added.
    col : column name
        Column in dataframe with quantitative measure to modify.
    unit : column name
        Column in dataframe defining sampling units (e.g., subjects).
    group : column name(s), optional
        Columns defining groups to remove unit variance within.
    suffix : string, optional
        Suffix appended to ``col`` name to create new column.

    Returns
    -------
    df : DataFrame
        Returns modified dataframe.

    """
    new_col = col + suffix

    def demean(x):
        return x - x.mean()

    if group is None:
        new = df.groupby(unit)[col].transform(demean)
        new += df[col].mean()
        df.loc[:, new_col] = new
    else:
        df.loc[:, new_col] = np.nan
        for level, df_level in df.groupby(group):
            new = df_level.groupby(unit)[col].transform(demean)
            new += df_level[col].mean()
            df.loc[new.index, new_col] = new

    return df


In [158]:
def df_demean(df, list_of_variables):
    for l in list_of_variables:
        df[l] = df[l] - np.mean(df[l])
        z_scored = f"{l}_z"
        df[z_scored] = scaler.fit_transform(df[[l]])

    return df

In [159]:
def df_square_and_mean(df, list_of_variables):
    for l in list_of_variables:
        squared_col = f"{l}_sq"
        df[squared_col] = df[l]**2
        df[squared_col] = df[squared_col] - np.mean(df[squared_col])
        z_scored = f"{squared_col}_z"
        df[z_scored] = scaler.fit_transform(df[[squared_col]])
    return df




In [160]:
def convert_string(value):
    try:
        # Check if value is a string and looks like a list
        if isinstance(value, str) and value.startswith('[') and value.endswith(']'):
            # Remove brackets and split by comma
            cleaned = value.strip('[]')
            if cleaned:  # Check if the string is not empty
                # Convert each part to float and calculate average
                numbers = [float(num) for num in cleaned.split(',')]
                return sum(numbers) / len(numbers)
            else:
                return None  # Return None or another placeholder for empty lists
        else:
            return None
    except Exception as e:
        print(f"Error converting value {value}: {e}")
        return None


In [161]:
def calculate_differences(dataframe):
    # Determine attended and unattended values for IT and V2
    dataframe['it_sim_dis_attend'] = np.where(dataframe['attend'] == 'img1', dataframe['IT_root_im1'], dataframe['IT_root_im2'])
    dataframe['v2_sim_dis_attend'] = np.where(dataframe['attend'] == 'img1', dataframe['V2_root_im1'], dataframe['V2_root_im2'])

    dataframe['it_sim_dis_test'] = np.where(dataframe['test_item'] == 'img1', dataframe['IT_root_im1'], dataframe['IT_root_im2'])
    dataframe['v2_sim_dis_test'] = np.where(dataframe['test_item'] == 'img1', dataframe['V2_root_im1'], dataframe['V2_root_im2'])

    
    dataframe['it_sim_dis_unattend'] = np.where(dataframe['attend'] != 'img1', dataframe['IT_root_im1'], dataframe['IT_root_im2'])
    dataframe['v2_sim_dis_unattend'] = np.where(dataframe['attend'] != 'img1', dataframe['V2_root_im1'], dataframe['V2_root_im2'])

    
    dataframe['it_sim_dis_untest'] = np.where(dataframe['test_item'] != 'img1', dataframe['IT_root_im1'], dataframe['IT_root_im2'])
    dataframe['v2_sim_dis_untest'] = np.where(dataframe['test_item'] != 'img1', dataframe['V2_root_im1'], dataframe['V2_root_im2'])

    # Calculate differences
    dataframe['it_sim_dis_diff'] = np.where(dataframe['attend'] == 'img1', dataframe['IT_root_im1'] - dataframe['IT_root_im2'], dataframe['IT_root_im2'] - dataframe['IT_root_im1'])
    dataframe['v2_sim_dis_diff'] = np.where(dataframe['attend'] == 'img1', dataframe['V2_root_im1'] - dataframe['V2_root_im2'], dataframe['V2_root_im2'] - dataframe['V2_root_im1'])

    dataframe['it_sim_dis_diff_test'] = np.where(dataframe['test_item'] == 'img1', dataframe['IT_root_im1'] - dataframe['IT_root_im2'], dataframe['IT_root_im2'] - dataframe['IT_root_im1'])
    dataframe['v2_sim_dis_diff_test'] = np.where(dataframe['test_item'] == 'img1', dataframe['V2_root_im1'] - dataframe['V2_root_im2'], dataframe['V2_root_im2'] - dataframe['V2_root_im1'])

    # Determine convergence and preferences
    dataframe['v2_converges'] = np.where((dataframe['it_sim_dis_diff'] > 0) & (dataframe['v2_sim_dis_diff'] > 0) | (dataframe['it_sim_dis_diff'] < 0) & (dataframe['v2_sim_dis_diff'] < 0), 'V2/IT agree', 'V2/IT disagree')

    dataframe['v2_prefers'] = np.where(dataframe['v2_sim_dis_diff'] > 0, 'Prioritized', 'Deprioritized')
    dataframe['it_prefers'] = np.where(dataframe['it_sim_dis_diff'] > 0, 'Prioritized', 'Deprioritized')
    
    
    dataframe['v2_prefers_test'] = np.where(dataframe['v2_sim_dis_diff_test'] > 0, 'Tested', 'Untested')
    dataframe['it_prefers_test'] = np.where(dataframe['it_sim_dis_diff_test'] > 0, 'Tested', 'Untested')
    
    
    dataframe['Distractor V2 Similarity Preference Tested'] = dataframe['v2_prefers_test'] 
    dataframe['Distractor IT Similarity Preference Tested'] = dataframe['it_prefers_test']
    
    
    
    dataframe['Distractor V2 Similarity Preference'] = dataframe['v2_prefers'] 
    dataframe['Distractor IT Similarity Preference'] = dataframe['it_prefers']
    
    dataframe['it_im1_im2'] = dataframe['IT_im1_im2']
    dataframe['v2_im1_im2'] = dataframe['V2_im1_im2']
    
    dataframe['IT_diff_binned'] = pd.qcut(dataframe['it_sim_dis_diff'], 5, duplicates='drop')
    dataframe['V2_diff_binned'] = pd.qcut(dataframe['v2_sim_dis_diff'], 5, duplicates='drop')
    
    dataframe['IT_diff_binned_test'] = pd.qcut(dataframe['it_sim_dis_diff_test'], 5, duplicates='drop')
    dataframe['V2_diff_binned_test'] = pd.qcut(dataframe['v2_sim_dis_diff_test'], 5, duplicates='drop')
    
    
    # dataframe = df_demean(dataframe, ['it_sim_dis_diff', 'v2_sim_dis_diff', 'it_sim_dis_attend', 'v2_sim_dis_attend', 'it_sim_dis_unattend', 'v2_sim_dis_unattend', 'it_sim_dis_test', 'v2_sim_dis_test', 'it_sim_dis_untest', 'v2_sim_dis_untest', 'it_sim_dis_diff_test', 'v2_sim_dis_diff_test' ])

    # dataframe = df_square_and_mean(dataframe, ['it_sim_dis_diff', 'v2_sim_dis_diff', 'it_sim_dis_attend', 'v2_sim_dis_attend', 'it_sim_dis_unattend', 'v2_sim_dis_unattend', 'it_sim_dis_test', 'v2_sim_dis_test', 'it_sim_dis_untest', 'v2_sim_dis_untest', 'it_sim_dis_diff_test', 'v2_sim_dis_diff_test' ])

    
    
    return dataframe

In [162]:
def categorize_columns(dataframe, column_params):
    """
    Categorizes specified columns in a DataFrame into discrete categories based on quantiles.

    Parameters:
    - dataframe: The DataFrame to process.
    - column_params: A dictionary specifying the number of categories and labels for each column prefix.

    Returns:
    - The modified DataFrame with additional columns for categorized data.
    """
    for label in ['it_sim_dis_attend', 'v2_sim_dis_attend', 'it_sim_dis_unattend', 'v2_sim_dis_unattend', 'it_sim_dis_diff', 'v2_sim_dis_diff', 'it_im1_im2', 'v2_im1_im2', 'it_sim_dis_test', 'v2_sim_dis_test', 'it_sim_dis_untest', 'v2_sim_dis_untest', 'it_sim_dis_diff_test', 'v2_sim_dis_diff_test']:
        # Determine the column prefix to decide which set of parameters to use
        # 'v2_sim_dis_diff_sq', 'it_sim_dis_diff_sq', 'v2_sim_dis_diff_test_sq','it_sim_dis_diff_test_sq'
        column_prefix = 'v2' if 'v2' in label else 'it'
        
        # Extract the number of categories and labels from the parameters dictionary
        n_cats = column_params[column_prefix]['n_cats']
        labels = column_params[column_prefix]['labels']
        
        # Categorize the column data
        dataframe[label + '_cat'] = pd.qcut(dataframe[label], q=n_cats, labels=labels, duplicates='drop')
    
    return dataframe

In [163]:
def validity_assignment(df):
    x = []
    for i in df['validity']:
        if i == 'valid':
            x.append('prioritized')
        else:
            x.append('deprioritized')

    df['Tested Item'] = x

In [164]:
def df_column_addition(df):
    
    df['V2 Distractor Similarity\nto Prioritized Item'] = df['v2_sim_dis_attend_cat']
    df['IT Distractor Similarity\nto Prioritized Item'] = df['it_sim_dis_attend_cat']
    df['V2 Distractor Similarity\nto Deprioritized Item'] = df['v2_sim_dis_unattend_cat']
    df['IT Distractor Similarity\nto Deprioritized Item'] = df['it_sim_dis_unattend_cat']
    df['Prioritized - Deprioritized IT Distractor Similarity'] = df['it_sim_dis_diff_cat'] 
    df['Prioritized - Deprioritized V2 Distractor Similarity'] = df['v2_sim_dis_diff_cat'] 
    # df['Prioritized - Deprioritized IT Distractor Similarity Squared'] = df['v2_sim_dis_diff_sq_cat'] 
    # df['Prioritized - Deprioritized V2 Distractor Similarity Squared'] = df['it_sim_dis_diff_sq_cat'] 

    
    df['V2 Distractor Similarity\nto Tested Item'] = df['v2_sim_dis_test_cat']
    df['IT Distractor Similarity\nto Tested Item'] = df['it_sim_dis_test_cat']
    df['V2 Distractor Similarity\nto Untested Item'] = df['v2_sim_dis_untest_cat']
    df['IT Distractor Similarity\nto Untested Item'] = df['it_sim_dis_untest_cat']
    df['Tested - Untested IT Distractor Similarity'] = df['it_sim_dis_diff_test_cat'] 
    df['Tested - Untested V2 Distractor Similarity'] = df['v2_sim_dis_diff_test_cat'] 
    # df['Tested - Untested IT Distractor Similarity Squared'] = df['v2_sim_dis_diff_test_sq_cat'] 
    # df['Tested - Untested V2 Distractor Similarity Squared'] = df['it_sim_dis_diff_test_sq_cat'] 

   
    df['IT_diff_binned'] = pd.qcut(df['it_sim_dis_diff'], 5, duplicates='drop')
    df['V2_diff_binned'] = pd.qcut(df['v2_sim_dis_diff'], 5, duplicates='drop')
    # df['IT_diff_binned_sq'] = pd.qcut(df['it_sim_dis_diff_sq'], 5, duplicates='drop')
    # df['V2_diff_binned_sq'] = pd.qcut(df['v2_sim_dis_diff_sq'], 5, duplicates='drop')


    df['IT_diff_binned_test'] = pd.qcut(df['it_sim_dis_diff_test'], 5, duplicates='drop')
    df['V2_diff_binned_test'] = pd.qcut(df['v2_sim_dis_diff_test'], 5, duplicates='drop')
    # df['IT_diff_binned_test_sq'] = pd.qcut(df['it_sim_dis_diff_test_sq'], 5, duplicates='drop')
    # df['V2_diff_binned_test_sq'] = pd.qcut(df['v2_sim_dis_diff_test_sq'], 5, duplicates='drop')

    def round_to_significant_figures(x, sig_figs=2):
        if x == 0:
            return 0
        else:
            return round(x, sig_figs - int(np.floor(np.log10(abs(x)))) - 1)


    def process_interval(interval):
        # Round both bounds
        lower_rounded = round_to_significant_figures(interval.left)
        upper_rounded = round_to_significant_figures(interval.right)
        # Return a new interval with the rounded bounds
        return pd.Interval(lower_rounded, upper_rounded)

    df['IT_diff_binned'] = df['IT_diff_binned'].apply(process_interval)
    df['V2_diff_binned'] = df['V2_diff_binned'].apply(process_interval)
    # df['IT_diff_binned_sq'] = df['IT_diff_binned_sq'].apply(process_interval)
    # df['V2_diff_binned_sq'] = df['V2_diff_binned_sq'].apply(process_interval)

    df['IT_diff_binned_test'] = df['IT_diff_binned_test'].apply(process_interval)
    df['V2_diff_binned_test'] = df['V2_diff_binned_test'].apply(process_interval)
    # df['IT_diff_binned_test_sq'] = df['IT_diff_binned_test_sq'].apply(process_interval)
    # df['V2_diff_binned_test_sq'] = df['V2_diff_binned_test_sq'].apply(process_interval)

    df['Prioritized - Deprioritized V2 Distractor Similarity Ranges'] = df['V2_diff_binned']
    df['Prioritized - Deprioritized IT Distractor Similarity Ranges'] = df['IT_diff_binned']
    # df['Prioritized - Deprioritized V2 Distractor Similarity Squared Ranges'] = df['V2_diff_binned_sq']
    # df['Prioritized - Deprioritized IT Distractor Similarity Squared Ranges'] = df['IT_diff_binned_sq']

    df['Tested - Untested V2 Distractor Similarity Ranges'] = df['V2_diff_binned_test']
    df['Tested - Untested IT Distractor Similarity Ranges'] = df['IT_diff_binned_test']
    # df['Tested - Untested V2 Distractor Similarity Squared Ranges'] = df['V2_diff_binned_test_sq']
    # df['Tested - Untested IT Distractor Similarity Squared Ranges'] = df['IT_diff_binned_test_sq']
    
    
    df['tested_item'] = df['Tested Item']
    df['ret_rel'] = df['Retrocue Reliability']
    
    df['validity_binary'] = df['Tested Item'].apply(lambda x: 1 if x == 'prioritized' else 0)
    df['reliability_binary'] = df['Retrocue Reliability'].apply(lambda x: 1 if x == 'high' else 0)
    # df['validity_binary_z'] = scaler.fit_transform(df[['validity_binary']])
    # df['reliability_binary_z'] = scaler.fit_transform(df[['reliability_binary']])



    
    # df['V2 Distractor Similarity to Tested Item'] = df['v2_sim_dis_test_z']
    # df['IT Distractor Similarity to Tested Item'] = df['it_sim_dis_test_z']
    # df['Tested - Untested V2 Distractor Similarity'] = df['v2_sim_dis_diff_test_z']
    # df['Tested - Untested IT Distractor Similarity'] = df['it_sim_dis_diff_test_z']


    # df['V2 Distractor Similarity\nto Prioritized Item'] = df['v2_sim_dis_attend_z']
    # df['IT Distractor Similarity\nto Prioritized Item'] = df['it_sim_dis_attend_z']
    # df['V2 Distractor Similarity\nto Deprioritized Item'] = df['v2_sim_dis_unattend_z']
    # df['IT Distractor Similarity\nto Deprioritized Item'] = df['it_sim_dis_unattend_z']

    # df['Prioritized - Deprioritized IT Distractor Similarity'] = df['it_sim_dis_diff_z'] 
    # df['Prioritized - Deprioritized V2 Distractor Similarity'] = df['v2_sim_dis_diff_z'] 
    


In [165]:
def df_creation(data_files, start_date, end_date):
    processed_dfs = []
    date_column = 'date'

    for file_path in data_files:
        try:
            # Read the CSV file and append to the list of DataFrames
            temp_df = pd.read_csv(file_path)
            temp_df['filename'] = file_path
            processed_dfs.append(temp_df)
        except Exception as e:
            # Optionally, print or log the error
#             print(f"Failed to read {file_path}: {e}")
            continue

    if processed_dfs:
        df = pd.concat(processed_dfs, ignore_index=True)
        df[date_column] = pd.to_datetime(df[date_column], format='%Y-%m-%d_%Hh%M.%S.%f', errors='coerce')
        df.dropna(subset=[date_column], inplace=True)
        df = df[(df[date_column] >= pd.to_datetime(start_date)) & (df[date_column] <= pd.to_datetime(end_date))]
        
        # # Uncomment and adapt the following lines as needed:
        df = df.loc[df['V2_diff'].notnull()].reset_index(drop=True)
        df['reliability'] = df['reliability'].astype(float)
        df['Retrocue Reliability'] = np.where(df['reliability'] > 0.75, 'high', 'low')
        non_numeric_values = df['resp_correct'][~df['resp_correct'].apply(np.isreal)]
        numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
        result = df[numeric_columns].groupby('participant').mean()
        
        

    else:
        # Return an empty DataFrame if no files were processed successfully
        df = pd.DataFrame()
    
    return df, numeric_columns


   

In [166]:
# df, numeric_columns = df_creation(data_files,'2024-11-22', '2025-01-30')

### pilot5 '2024-10-08', '2024-10-30'
### pilot6 '2024-11-22', '2025-01-30'

In [167]:
df1, numeric_columns1 = df_creation(data_files,'2024-10-08', '2024-10-30')
df2, numeric_columns2 = df_creation(data_files,'2024-11-22', '2025-01-30')
df = pd.concat([df1, df2], axis=0)

numeric_columns = numeric_columns1
# 
### pilot5 '2024-10-08', '2024-10-30'
### pilot6 '2024-11-22', '2025-01-30'

In [168]:
def add_trial_info(df, participant_col, trials_per_batch=30):
    
    df['Trial_Number'] = df.groupby(participant_col).cumcount() + 1
    
    df['Trial_Batch'] = ((df['Trial_Number'] - 1) // trials_per_batch) + 1
    
    return df

add_trial_info(df, participant_col='participant')  

Unnamed: 0,setup_js.started,setup_js.stopped,participant,date,expName,psychopyVersion,OS,frameRate,Instructions1.started,Instructions1.stopped,...,trials_3.thisIndex,trials_3.ran,trials_3.order,taskPhases,conditions_choice,repeat_prac.order,prac_trials.order,Retrocue Reliability,Trial_Number,Trial_Batch
0,,,156250,2024-10-17 17:20:21.128,WM_Deepgen,2023.2.1,MacIntel,29.411765,,,...,,,,,,,,high,1,1
1,,,156250,2024-10-17 17:20:21.128,WM_Deepgen,2023.2.1,MacIntel,29.411765,,,...,,,,,,,,low,2,1
2,,,156250,2024-10-17 17:20:21.128,WM_Deepgen,2023.2.1,MacIntel,29.411765,,,...,,,,,,,,high,3,1
3,,,156250,2024-10-17 17:20:21.128,WM_Deepgen,2023.2.1,MacIntel,29.411765,,,...,,,,,,,,high,4,1
4,,,156250,2024-10-17 17:20:21.128,WM_Deepgen,2023.2.1,MacIntel,29.411765,,,...,,,,,,,,low,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51801,,,167431,2024-11-24 14:54:08.781,WM_Deepgen,2023.2.1,MacIntel,62.500000,,,...,,,,,,,,high,296,10
51802,,,167431,2024-11-24 14:54:08.781,WM_Deepgen,2023.2.1,MacIntel,62.500000,,,...,,,,,,,,high,297,10
51803,,,167431,2024-11-24 14:54:08.781,WM_Deepgen,2023.2.1,MacIntel,62.500000,,,...,,,,,,,,low,298,10
51804,,,167431,2024-11-24 14:54:08.781,WM_Deepgen,2023.2.1,MacIntel,62.500000,,,...,,,,,,,,high,299,10


In [169]:
df_trial_182 = df[df['trial'] == 182]
df_trial_182 = df_trial_182[['participant', 'trial', 'cond_file', 'root', 'IT_diff']]
df_trial_182['conditions_batch'] = df_trial_182.groupby(['cond_file', 'root', 'IT_diff']).ngroup() + 1

batch_info = df_trial_182.groupby('conditions_batch')['participant'].agg(
    participants_count='nunique',  # Count of unique participants
    participants_list='unique'  # List of participants in the batch
).reset_index()

df_trial_182 = df_trial_182.merge(batch_info, on='conditions_batch', how='left')



In [170]:
df = df.merge(df_trial_182[['participant', 'conditions_batch', 'participants_count', 'participants_list']], 
                     on='participant', 
                     how='left')


In [171]:
df['date'] = pd.to_datetime(df['date'], errors='coerce')


In [172]:
column_params = {
    'v2': {'n_cats': 5, 'labels': ['Least Similar', '', '  ', '   ', 'Most Similar']},
    'it': {'n_cats': 5, 'labels': ['Least Similar', '', '  ', '   ', 'Most Similar']}
}
column_name = 'image_recall_response.keys'
threshold = 0.4

In [173]:
scaler = StandardScaler()


In [174]:
validity_assignment(df)
df = calculate_differences(df)
df = categorize_columns(df, column_params)
df_column_addition(df)
# label_addition(df)

  dataframe['Distractor IT Similarity Preference Tested'] = dataframe['it_prefers_test']
  dataframe['Distractor V2 Similarity Preference'] = dataframe['v2_prefers']
  dataframe['Distractor IT Similarity Preference'] = dataframe['it_prefers']
  dataframe['it_im1_im2'] = dataframe['IT_im1_im2']
  dataframe['v2_im1_im2'] = dataframe['V2_im1_im2']
  dataframe['IT_diff_binned'] = pd.qcut(dataframe['it_sim_dis_diff'], 5, duplicates='drop')
  dataframe['V2_diff_binned'] = pd.qcut(dataframe['v2_sim_dis_diff'], 5, duplicates='drop')
  dataframe['IT_diff_binned_test'] = pd.qcut(dataframe['it_sim_dis_diff_test'], 5, duplicates='drop')
  dataframe['V2_diff_binned_test'] = pd.qcut(dataframe['v2_sim_dis_diff_test'], 5, duplicates='drop')
  dataframe[label + '_cat'] = pd.qcut(dataframe[label], q=n_cats, labels=labels, duplicates='drop')
  dataframe[label + '_cat'] = pd.qcut(dataframe[label], q=n_cats, labels=labels, duplicates='drop')
  dataframe[label + '_cat'] = pd.qcut(dataframe[label], q=n_cats,

In [175]:
df['reliability_binary'] = (df['reliability'] > 0.7).astype(int)
df['validity_binary'] = (df['validity'] == 'valid').astype(int)


In [176]:
df = df[df['taskPhase'] == 'mainTask']

In [177]:


def process_values_time(value):
    try:
        # Strip square brackets and split by comma, filtering out empty values
        if isinstance(value, str) and value.startswith('[') and value.endswith(']'):
            cleaned = value.strip('[]').split(',')
            if cleaned == ['']:  # Check if the list after stripping is empty
                return None
            numbers = [float(num) for num in cleaned if num.strip()]
            return numbers
        return None
    except ValueError:
        return None

def process_values_click(value):
    try:
        # Strip square brackets and split by comma, filtering out empty values
        if isinstance(value, str) and value.startswith('[') and value.endswith(']'):
            cleaned = value.strip('[]').split(',')
            if cleaned == ['']:  # Check if the list after stripping is empty
                return None
            numbers = [num for num in cleaned if num.strip()]
            return numbers
        return None
    except ValueError:
        return None

    
df['processed_mouse.time'] = df['mouse.time'].apply(process_values_time)
df['processed_mouse.click'] = df['mouse.clicked_name'].apply(process_values_click)

def get_latest_value(entry):
    if isinstance(entry, list) and len(entry) > 1:
        return [entry[-1]]  # Return a list with the latest value
    return entry  # Return unchanged if not a list or list is empty/single element

df['processed_mouse.time'] = df['processed_mouse.time'].apply(get_latest_value)



In [178]:

df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Find the earliest date for each participant
earliest_dates = df.groupby('participant')['date'].min().reset_index()

df = pd.merge(df, earliest_dates, on=['participant', 'date'])



participant_summary = df.groupby('participant').agg(
    total_entries=pd.NamedAgg(column='participant', aggfunc='size')
).reset_index()

participant_file_counts = df.groupby(['participant', 'filename']).agg(
    entry_count=pd.NamedAgg(column='participant', aggfunc='size'),
    
    none_mouse=pd.NamedAgg(column='processed_mouse.time', aggfunc=lambda x: x.isna().sum())
).reset_index()

participant_file_counts


Unnamed: 0,participant,filename,entry_count,none_mouse
0,85174,/Users/lana/Desktop/psychoPyExperiments/wm_dee...,300,187
1,116851,/Users/lana/Desktop/psychoPyExperiments/wm_dee...,300,0
2,121021,/Users/lana/Desktop/psychoPyExperiments/wm_dee...,300,2
3,123262,/Users/lana/Desktop/psychoPyExperiments/wm_dee...,300,8
4,123790,/Users/lana/Desktop/psychoPyExperiments/wm_dee...,300,3
...,...,...,...,...
325,170536,/Users/lana/Desktop/psychoPyExperiments/wm_dee...,300,53
326,170665,/Users/lana/Desktop/psychoPyExperiments/wm_dee...,300,5
327,171031,/Users/lana/Desktop/psychoPyExperiments/wm_dee...,300,6
328,171253,/Users/lana/Desktop/psychoPyExperiments/wm_dee...,300,20


In [151]:


files_with_high_none_mouse = participant_file_counts[participant_file_counts['none_mouse'] > 100]

df_cleaned = df[~df.set_index(['participant', 'filename']).index.isin(files_with_high_none_mouse.set_index(['participant', 'filename']).index)]

participant_file_counts_cleaned = df_cleaned.groupby(['participant', 'filename']).agg(
    entry_count=pd.NamedAgg(column='participant', aggfunc='size')
).reset_index()

files_with_low_entries = participant_file_counts_cleaned[participant_file_counts_cleaned['entry_count'] < 200]

df_cleaned = df_cleaned[~df_cleaned.set_index(['participant', 'filename']).index.isin(files_with_low_entries.set_index(['participant', 'filename']).index)]

cleaned_participant_summary = df_cleaned.groupby('participant').agg(
    total_entries=pd.NamedAgg(column='participant', aggfunc='size')
).reset_index()


none_mouse_summary = df_cleaned.groupby('participant').agg(
    none_mouse=pd.NamedAgg(column='processed_mouse.time', aggfunc=lambda x: x.isna().sum())
).reset_index()

df = df_cleaned

Empty DataFrame
Columns: [participant, filename, entry_count]
Index: []


In [113]:
duplicate_participants = df.groupby('participant')['filename'].nunique().reset_index()
duplicate_participants = duplicate_participants[duplicate_participants['filename'] > 1]

if not duplicate_participants.empty:
    print("\nParticipants associated with more than one file:")
    print(duplicate_participants)
else:
    print("\nNo participants associated with more than one file.")


No participants associated with more than one file.


In [118]:
none_mouse_summary = df.groupby('participant').agg(
    none_mouse=pd.NamedAgg(column='processed_mouse.time', aggfunc=lambda x: x.isna().sum())
).reset_index()

# Check for participants with none_mouse more than 2/3 of total_entries
none_mouse_summary = none_mouse_summary.merge(cleaned_participant_summary, on='participant')
none_mouse_summary['ratio'] = none_mouse_summary['none_mouse'] / none_mouse_summary['total_entries']
participants_high_none_mouse = none_mouse_summary[none_mouse_summary['ratio'] > (2/3)]

# Print the results
print("Remaining participants with more than 300 entries:")
print(cleaned_participant_summary)

print("\nNone mouse summary for each participant:")
print(none_mouse_summary)

if not participants_high_none_mouse.empty:
    print("\nParticipants with none_mouse more than 2/3 of total entries:")
    print(participants_high_none_mouse)
else:
    print("\nNo participants have none_mouse more than 2/3 of total entries.")

Remaining participants with more than 300 entries:
     participant  total_entries
0         116851            300
1         121021            300
2         123262            300
3         123790            300
4         123985            300
..           ...            ...
288       170536            300
289       170665            300
290       171031            300
291       171253            300
292       171304            300

[293 rows x 2 columns]

None mouse summary for each participant:
     participant  none_mouse  total_entries     ratio
0         116851           0            300  0.000000
1         121021           2            300  0.006667
2         123262           8            300  0.026667
3         123790           3            300  0.010000
4         123985           7            300  0.023333
..           ...         ...            ...       ...
288       170536          53            300  0.176667
289       170665           5            300  0.016667
290       171

In [117]:
participant_counts = df['participant'].value_counts()
participants_less_than_300 = participant_counts[participant_counts < 300]
participants_less_than_300


Series([], Name: count, dtype: int64)

In [49]:
participant_counts = df['participant'].value_counts()

participants_over_300 = participant_counts[participant_counts > 300]
rows_to_keep = []

# These participants must be appearing in multiple files
if not participants_over_300.empty:
    print(f"Found {len(participants_over_300)} participants appearing in multiple files:")
    print(participants_over_300)
    
    # To verify, let's check which files contain these participants
    for participant in participants_over_300.index:
        files = df[df['participant'] == participant]['filename'].unique()
        print(f"\nParticipant {participant} appears in {len(files)} files:")
        # df_x = df[df['participant'] == participant]
        # print(df_x)
        for file in files:
            count = df[(df['participant'] == participant) & (df['filename'] == file)].shape[0]
            print(f"  - {file}: {count} times")

     
        participant_data = df[df['participant'] == participant]
        participant_data_limited = participant_data.head(300)
        rows_to_keep.extend(participant_data_limited.index.tolist())
        
        print(f"Keeping first 300 rows for participant {participant}, dropping {len(participant_data) - 300} rows")

# For all other participants, keep all their rows
for participant in participant_counts[participant_counts <= 300].index:
    participant_data = df[df['participant'] == participant]
    rows_to_keep.extend(participant_data.index.tolist())

# Create the cleaned dataframe by selecting only the rows to keep
df = df.loc[rows_to_keep]


Found 4 participants appearing in multiple files:
participant
143725    600
155275    600
168412    600
158164    600
Name: count, dtype: int64

Participant 143725 appears in 1 files:
  - /Users/lana/Desktop/psychoPyExperiments/wm_deepgen/data/143725_WM_Deepgen_2024-10-19_20h34.19.469.csv: 600 times
Keeping first 300 rows for participant 143725, dropping 300 rows

Participant 155275 appears in 1 files:
  - /Users/lana/Desktop/psychoPyExperiments/wm_deepgen/data/155275_WM_Deepgen_2024-11-26_23h34.42.272.csv: 600 times
Keeping first 300 rows for participant 155275, dropping 300 rows

Participant 168412 appears in 1 files:
  - /Users/lana/Desktop/psychoPyExperiments/wm_deepgen/data/168412_WM_Deepgen_2024-11-26_11h47.03.874.csv: 600 times
Keeping first 300 rows for participant 168412, dropping 300 rows

Participant 158164 appears in 1 files:
  - /Users/lana/Desktop/psychoPyExperiments/wm_deepgen/data/158164_WM_Deepgen_2024-12-03_19h16.38.399.csv: 600 times
Keeping first 300 rows for partic

In [50]:
participant_counts = df['participant'].value_counts()

participants_over_300 = participant_counts[participant_counts < 300]
if not participants_over_300.empty:
    print(f"Found {len(participants_over_300)} participants appearing in multiple files:")
    print(participants_over_300)
    
    # To verify, let's check which files contain these participants
    for participant in participants_over_300.index:
        files = df[df['participant'] == participant]['filename'].unique()
        print(f"\nParticipant {participant} appears in {len(files)} files:")
        # df_x = df[df['participant'] == participant]
        # print(df_x)
        for file in files:
            count = df[(df['participant'] == participant) & (df['filename'] == file)].shape[0]
            print(f"  - {file}: {count} times")


In [51]:
def add_length_column(df, column_name):
    if column_name in df.columns:
        # Clean and convert the values in the specified column
        def clean_and_get_length(value):
            if isinstance(value, str) and value.startswith('[') and value.endswith(']'):
                cleaned = value.strip('[]').split(',')
                return len(cleaned)
            elif isinstance(value, list):
                return 0
            else:
                return 0  # Handle unexpected data types

        # Apply the cleaning and length calculation
        df.loc[:, column_name + '_length'] = df[column_name].apply(clean_and_get_length)
    else:
        print(f"Column '{column_name}' not found in DataFrame.")
    return df

def add_nthresp_column(df, column_name, n):
    if column_name in df.columns:
        # Clean and convert the values in the specified column
        def clean_and_get_nthresp(value):
            if isinstance(value, str) and value.startswith('[') and value.endswith(']'):
                cleaned = value.strip('[]').split(',')
                return cleaned[n]
            elif isinstance(value, list):
                return 
            else:
                return  # Handle unexpected data types

        # Apply the cleaning and length calculation
        if n == 0:
            temp_str = '_first'
        elif n == -1:
            temp_str = '_last'
        df.loc[:, column_name + temp_str] = df[column_name].apply(clean_and_get_nthresp)
    else:
        print(f"Column '{column_name}' not found in DataFrame.")
    return df

add_length_column(df, 'mouse.time')
add_nthresp_column(df, 'mouse.time', 0)
add_nthresp_column(df, 'mouse.time', -1)
add_nthresp_column(df, 'mouse.clicked_name', 0)
add_nthresp_column(df, 'mouse.clicked_name', -1)

# long_instances = df[df['mouse.time_length'] > 1]
# print(long_instances)
# unique_mouse_time_length_values = df['mouse.time_length'].unique()
# unique_mouse_time_length_values

Unnamed: 0,setup_js.started,setup_js.stopped,participant,date,expName,psychopyVersion,OS,frameRate,Instructions1.started,Instructions1.stopped,...,ret_rel,validity_binary,reliability_binary,processed_mouse.time,processed_mouse.click,mouse.time_length,mouse.time_first,mouse.time_last,mouse.clicked_name_first,mouse.clicked_name_last
18708,,,143725,2024-10-19 20:34:19.469,WM_Deepgen,2023.2.1,Win32,142.857143,,,...,high,1,1,[1.6584000000357264],"[""memoranda_stim""]",1,1.6584000000357264,1.6584000000357264,"""memoranda_stim""","""memoranda_stim"""
18709,,,143725,2024-10-19 20:34:19.469,WM_Deepgen,2023.2.1,Win32,142.857143,,,...,high,1,1,[1.6584000000357264],"[""memoranda_stim""]",1,1.6584000000357264,1.6584000000357264,"""memoranda_stim""","""memoranda_stim"""
18710,,,143725,2024-10-19 20:34:19.469,WM_Deepgen,2023.2.1,Win32,142.857143,,,...,low,1,0,[2.4225000000001273],"[""memoranda_stim""]",1,2.4225000000001273,2.4225000000001273,"""memoranda_stim""","""memoranda_stim"""
18711,,,143725,2024-10-19 20:34:19.469,WM_Deepgen,2023.2.1,Win32,142.857143,,,...,low,1,0,[2.4225000000001273],"[""memoranda_stim""]",1,2.4225000000001273,2.4225000000001273,"""memoranda_stim""","""memoranda_stim"""
18712,,,143725,2024-10-19 20:34:19.469,WM_Deepgen,2023.2.1,Win32,142.857143,,,...,high,1,1,[2.945399999976189],"[""distractor_im_2""]",1,2.945399999976189,2.945399999976189,"""distractor_im_2""","""distractor_im_2"""
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99009,,,167431,2024-11-24 14:54:08.781,WM_Deepgen,2023.2.1,MacIntel,62.500000,,,...,high,0,1,[2.561999999999898],"[""memoranda_stim""]",2,1.5450000000000728,2.561999999999898,"""memoranda_stim""","""memoranda_stim"""
99010,,,167431,2024-11-24 14:54:08.781,WM_Deepgen,2023.2.1,MacIntel,62.500000,,,...,high,1,1,[2.030999999999949],"[""memoranda_stim""]",2,1.0139999999992142,2.030999999999949,"""memoranda_stim""","""memoranda_stim"""
99011,,,167431,2024-11-24 14:54:08.781,WM_Deepgen,2023.2.1,MacIntel,62.500000,,,...,low,0,0,[2.0809999999992215],"[""distractor_im_1""]",1,2.0809999999992215,2.0809999999992215,"""distractor_im_1""","""distractor_im_1"""
99012,,,167431,2024-11-24 14:54:08.781,WM_Deepgen,2023.2.1,MacIntel,62.500000,,,...,high,1,1,[2.16399999999976],"[""memoranda_stim""]",2,1.1299999999991996,2.16399999999976,"""memoranda_stim""","""memoranda_stim"""


In [52]:
if 'participant' in df.columns and not df['participant'].isnull().all():
    print("'participant' column exists and has values.")
else:
    raise KeyError("The 'participant' column is missing or has no values.")

'participant' column exists and has values.


In [53]:
def clean_dataframe_from_nonresp(df):
    # Convert the column entries from string lists to actual lists of floats, or None for empty lists

    # Define a mask to filter out rows where the list is empty or any number is out of the specified range
    def filter_ranges(numbers):
        if numbers is None:
            return False
        return all(num for num in numbers)
    df = df[df['processed_mouse.time'].apply(filter_ranges)]
    df = df[df['processed_mouse.click'].apply(filter_ranges)]
    # Apply the mask
    return df

In [54]:
def df_with_threshold(df, numeric_columns, threshold):
    sub_df = df[numeric_columns].groupby('participant').mean().reset_index()
    above04_subs = sub_df.loc[sub_df['resp_correct'] >= threshold, 'participant']
    df_ret = pd.DataFrame()
    for s in above04_subs:
        df_ret = pd.concat([df_ret, df.loc[df['participant'] == s]])
    df_ret = remove_unit_variance(df_ret,'resp_correct','participant')
    df_ret['Accuracy'] = df_ret['resp_correct_within']
    return df_ret

df = df_with_threshold(df, numeric_columns, 0.4)

In [55]:
df_nonresp = clean_dataframe_from_nonresp(df)

participant_counts = df_nonresp['participant'].value_counts()
participant_counts_df = participant_counts.reset_index()
participant_counts_df.columns = ['participant', 'count']

print(min(participant_counts_df['count']))

201


In [56]:
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns


In [57]:
df['mouse.time_last'] = pd.to_numeric(df['mouse.time_last'], errors='coerce')


In [58]:
df = remove_unit_variance(df,'mouse.time_last','participant')


In [59]:
df['analysis_rt'] = df['mouse.time_last_within']

In [107]:
# df_memorability = pd.read_csv('predictions_pilot5.csv')
# df_memorability['filename'] = df_memorability['filename'].str.replace('new_stimuli', 'stimuli')
# memorability_dict = dict(zip(df_memorability['filename'], df_memorability['predictions']))
# df['tested_memorability_resmem'] = np.where(
#     df['test_item'] == 'img1',
#     df['img1'].astype(str).map(memorability_dict),
#     df['img2'].astype(str).map(memorability_dict)
# )

# df['untested_memorability_resmem'] = np.where(
#     df['test_item'] == 'img1',
#     df['img2'].astype(str).map(memorability_dict),
#     df['img1'].astype(str).map(memorability_dict)
# )

# df['attended_memorability_resmem'] = np.where(
#     df['attend'] == 'img1',
#     df['img1'].astype(str).map(memorability_dict),
#     df['img2'].astype(str).map(memorability_dict)
# )

# df['unattended_memorability_resmem'] = np.where(
#     df['attend'] == 'img1',
#     df['img2'].astype(str).map(memorability_dict),
#     df['img1'].astype(str).map(memorability_dict)
# )


# df['distractor_memorability'] = df['ping_img'].astype(str).map(memorability_dict)



In [60]:
df['pilot_number'] = np.where(df['date'] <= '2024-10-31', 1, 2)
df_1 = df[df['pilot_number'] == 1]
df_2 = df[df['pilot_number'] == 2]

len(set(df_1['participant'])), len(set(df_2['participant']))


(134, 137)

In [61]:
df

Unnamed: 0,setup_js.started,setup_js.stopped,participant,date,expName,psychopyVersion,OS,frameRate,Instructions1.started,Instructions1.stopped,...,mouse.time_length,mouse.time_first,mouse.time_last,mouse.clicked_name_first,mouse.clicked_name_last,resp_correct_within,Accuracy,mouse.time_last_within,analysis_rt,pilot_number
23508,,,116851,2024-10-12 15:22:21.820,WM_Deepgen,2024.1.5,Win32,59.52381,,,...,1,1.9971000000236927,1.9971,"""memoranda_stim""","""memoranda_stim""",0.826519,0.826519,2.132520,2.132520,1
23509,,,116851,2024-10-12 15:22:21.820,WM_Deepgen,2024.1.5,Win32,59.52381,,,...,1,0.8651999999881355,0.8652,"""memoranda_stim""","""memoranda_stim""",0.826519,0.826519,1.000620,1.000620,1
23510,,,116851,2024-10-12 15:22:21.820,WM_Deepgen,2024.1.5,Win32,59.52381,,,...,1,1.6270999999940159,1.6271,"""memoranda_stim""","""memoranda_stim""",0.826519,0.826519,1.762520,1.762520,1
23511,,,116851,2024-10-12 15:22:21.820,WM_Deepgen,2024.1.5,Win32,59.52381,,,...,1,1.5147999999821877,1.5148,"""memoranda_stim""","""memoranda_stim""",0.826519,0.826519,1.650220,1.650220,1
23512,,,116851,2024-10-12 15:22:21.820,WM_Deepgen,2024.1.5,Win32,59.52381,,,...,1,1.848400000005995,1.8484,"""memoranda_stim""","""memoranda_stim""",0.826519,0.826519,1.983820,1.983820,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78016,,,171304,2025-01-29 19:25:41.828,WM_Deepgen,2023.2.1,MacIntel,59.88024,,,...,1,1.4794999999994616,1.4795,"""memoranda_stim""","""memoranda_stim""",0.913186,0.913186,1.549268,1.549268,2
78017,,,171304,2025-01-29 19:25:41.828,WM_Deepgen,2023.2.1,MacIntel,59.88024,,,...,1,2.1292999999968742,2.1293,"""memoranda_stim""","""memoranda_stim""",0.913186,0.913186,2.199068,2.199068,2
78018,,,171304,2025-01-29 19:25:41.828,WM_Deepgen,2023.2.1,MacIntel,59.88024,,,...,1,1.262499999999818,1.2625,"""distractor_im_2""","""distractor_im_2""",-0.086814,-0.086814,1.332268,1.332268,2
78019,,,171304,2025-01-29 19:25:41.828,WM_Deepgen,2023.2.1,MacIntel,59.88024,,,...,1,1.4624999999996362,1.4625,"""memoranda_stim""","""memoranda_stim""",0.913186,0.913186,1.532268,1.532268,2


In [62]:

def add_memorability(df, mem_file):
    
    df_memorability = pd.read_csv(mem_file)
    df_memorability['filename'] = df_memorability['filename'].str.replace('new_stimuli', 'stimuli')
    memorability_dict = dict(zip(df_memorability['filename'], df_memorability['predictions']))
    df['tested_memorability_resmem'] = np.where(
        df['test_item'] == 'img1',
        df['img1'].astype(str).map(memorability_dict),
        df['img2'].astype(str).map(memorability_dict)
    )
    
    df['untested_memorability_resmem'] = np.where(
        df['test_item'] == 'img1',
        df['img2'].astype(str).map(memorability_dict),
        df['img1'].astype(str).map(memorability_dict)
    )
    
    df['attended_memorability_resmem'] = np.where(
        df['attend'] == 'img1',
        df['img1'].astype(str).map(memorability_dict),
        df['img2'].astype(str).map(memorability_dict)
    )
    
    df['unattended_memorability_resmem'] = np.where(
        df['attend'] == 'img1',
        df['img2'].astype(str).map(memorability_dict),
        df['img1'].astype(str).map(memorability_dict)
    )
    

    df['distractor_memorability'] = df['ping_img'].astype(str).map(memorability_dict)
    df['tested_memorability_resmem_z'] = scaler.fit_transform(df[['tested_memorability_resmem']])
add_memorability(df_1, 'predictions_pilot5.csv')
add_memorability(df_2, 'predictions_pilot6.csv')
df = pd.concat([df_1, df_2], ignore_index=True)
# add_memorability(df, 'predictions_pilot6.csv')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tested_memorability_resmem'] = np.where(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['untested_memorability_resmem'] = np.where(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['attended_memorability_resmem'] = np.where(
A value is trying to be set on a copy of a slice from a DataFrame.

In [63]:
df = remove_unit_variance(df,'mouse.time_last','participant')
df['analysis_rt'] = df['mouse.time_last_within']


df  = df_demean(df, ['it_sim_dis_diff', 'v2_sim_dis_diff', 'it_sim_dis_attend', 'v2_sim_dis_attend', 'it_sim_dis_unattend', 'v2_sim_dis_unattend', 'it_sim_dis_test', 'v2_sim_dis_test', 'it_sim_dis_untest', 'v2_sim_dis_untest', 'it_sim_dis_diff_test', 'v2_sim_dis_diff_test' ])

df = df_square_and_mean(df, ['it_sim_dis_diff', 'v2_sim_dis_diff', 'it_sim_dis_attend', 'v2_sim_dis_attend', 'it_sim_dis_unattend', 'v2_sim_dis_unattend', 'it_sim_dis_test', 'v2_sim_dis_test', 'it_sim_dis_untest', 'v2_sim_dis_untest', 'it_sim_dis_diff_test', 'v2_sim_dis_diff_test' ])

df['validity_binary_z'] = scaler.fit_transform(df[['validity_binary']])
df['reliability_binary_z'] = scaler.fit_transform(df[['reliability_binary']])




df['V2 Distractor Similarity to Tested Item'] = df['v2_sim_dis_test_z']
df['IT Distractor Similarity to Tested Item'] = df['it_sim_dis_test_z']
df['Tested - Untested V2 Distractor Similarity'] = df['v2_sim_dis_diff_test_z']
df['Tested - Untested IT Distractor Similarity'] = df['it_sim_dis_diff_test_z']


df['V2 Distractor Similarity\nto Prioritized Item'] = df['v2_sim_dis_attend_z']
df['IT Distractor Similarity\nto Prioritized Item'] = df['it_sim_dis_attend_z']
df['V2 Distractor Similarity\nto Deprioritized Item'] = df['v2_sim_dis_unattend_z']
df['IT Distractor Similarity\nto Deprioritized Item'] = df['it_sim_dis_unattend_z']

df['Prioritized - Deprioritized IT Distractor Similarity'] = df['it_sim_dis_diff_z'] 
df['Prioritized - Deprioritized V2 Distractor Similarity'] = df['v2_sim_dis_diff_z'] 



  df[z_scored] = scaler.fit_transform(df[[l]])
  df[z_scored] = scaler.fit_transform(df[[l]])
  df[z_scored] = scaler.fit_transform(df[[l]])
  df[z_scored] = scaler.fit_transform(df[[l]])
  df[z_scored] = scaler.fit_transform(df[[l]])
  df[z_scored] = scaler.fit_transform(df[[l]])
  df[z_scored] = scaler.fit_transform(df[[l]])
  df[z_scored] = scaler.fit_transform(df[[l]])
  df[z_scored] = scaler.fit_transform(df[[l]])
  df[z_scored] = scaler.fit_transform(df[[l]])
  df[z_scored] = scaler.fit_transform(df[[l]])
  df[z_scored] = scaler.fit_transform(df[[l]])
  df[squared_col] = df[l]**2
  df[z_scored] = scaler.fit_transform(df[[squared_col]])
  df[squared_col] = df[l]**2
  df[z_scored] = scaler.fit_transform(df[[squared_col]])
  df[squared_col] = df[l]**2
  df[z_scored] = scaler.fit_transform(df[[squared_col]])
  df[squared_col] = df[l]**2
  df[z_scored] = scaler.fit_transform(df[[squared_col]])
  df[squared_col] = df[l]**2
  df[z_scored] = scaler.fit_transform(df[[squared_col]])
  df[s

In [64]:
# df_int = df[
#     (df['Tested - Untested IT Distractor Similarity'] <= 0.15) &
#     (df['Tested - Untested IT Distractor Similarity'] >= -0.15)
# ]
# df_int


<!-- # df['tested_memorability_resmem_z'] = scaler.fit_transform(df[['tested_memorability_resmem']]) -->

In [65]:
df['it_pos_neg'] =  np.where(df['it_sim_dis_diff_test'] <= 0, -1, 1)
df['v2_pos_neg'] =  np.where(df['v2_sim_dis_diff_test'] <= 0, -1, 1)
df['it_pos_neg_abs'] =  np.where(df['it_sim_dis_test'] <= 0, -1, 1)
df['v2_pos_neg_abs'] =  np.where(df['v2_sim_dis_test'] <= 0, -1, 1)

  df['it_pos_neg'] =  np.where(df['it_sim_dis_diff_test'] <= 0, -1, 1)
  df['v2_pos_neg'] =  np.where(df['v2_sim_dis_diff_test'] <= 0, -1, 1)
  df['it_pos_neg_abs'] =  np.where(df['it_sim_dis_test'] <= 0, -1, 1)
  df['v2_pos_neg_abs'] =  np.where(df['v2_sim_dis_test'] <= 0, -1, 1)


In [66]:
from sklearn.preprocessing import StandardScaler

def flip_z_sq_z(df, column_name):
    scaler = StandardScaler()
    
    df[column_name + '_sign'] = np.where(df[column_name] <= 0, -1, 1) * df[column_name]
    
    df[column_name + '_sign_z'] = scaler.fit_transform(df[[column_name + '_sign']])
    
    df[column_name + '_sign_sq'] = df[column_name + '_sign_z'] ** 2
    
    df[column_name + '_sign_sq_z'] = scaler.fit_transform(df[[column_name + '_sign_sq']])
    
    return df

# Apply the function to each column in the list
for l in ['it_sim_dis_diff_test', 'v2_sim_dis_diff_test', 'it_sim_dis_test', 'v2_sim_dis_test']:
    df = flip_z_sq_z(df, l)

   

  df[column_name + '_sign'] = np.where(df[column_name] <= 0, -1, 1) * df[column_name]
  df[column_name + '_sign_z'] = scaler.fit_transform(df[[column_name + '_sign']])
  df[column_name + '_sign_sq'] = df[column_name + '_sign_z'] ** 2
  df[column_name + '_sign_sq_z'] = scaler.fit_transform(df[[column_name + '_sign_sq']])
  df[column_name + '_sign'] = np.where(df[column_name] <= 0, -1, 1) * df[column_name]
  df[column_name + '_sign_z'] = scaler.fit_transform(df[[column_name + '_sign']])
  df[column_name + '_sign_sq'] = df[column_name + '_sign_z'] ** 2
  df[column_name + '_sign_sq_z'] = scaler.fit_transform(df[[column_name + '_sign_sq']])
  df[column_name + '_sign'] = np.where(df[column_name] <= 0, -1, 1) * df[column_name]
  df[column_name + '_sign_z'] = scaler.fit_transform(df[[column_name + '_sign']])
  df[column_name + '_sign_sq'] = df[column_name + '_sign_z'] ** 2
  df[column_name + '_sign_sq_z'] = scaler.fit_transform(df[[column_name + '_sign_sq']])
  df[column_name + '_sign'] = np.w

In [67]:
df['rt'] = df['mouse.time_last']

  df['rt'] = df['mouse.time_last']


In [68]:
###z scoring  - make sure not to override it in analysis, and do sq z scorings

In [69]:

df['it_pos_neg_abs_z'] =  scaler.fit_transform(df[['it_pos_neg_abs']])
df['v2_pos_neg_abs_z'] = scaler.fit_transform(df[['v2_pos_neg_abs']])
df['it_pos_neg_z'] = scaler.fit_transform(df[['it_pos_neg']])
df['v2_pos_neg_z'] = scaler.fit_transform(df[['v2_pos_neg']])

  df['it_pos_neg_abs_z'] =  scaler.fit_transform(df[['it_pos_neg_abs']])
  df['v2_pos_neg_abs_z'] = scaler.fit_transform(df[['v2_pos_neg_abs']])
  df['it_pos_neg_z'] = scaler.fit_transform(df[['it_pos_neg']])
  df['v2_pos_neg_z'] = scaler.fit_transform(df[['v2_pos_neg']])


In [70]:
df['it_pos_neg']

0       -1
1        1
2        1
3        1
4       -1
        ..
81295    1
81296    1
81297    1
81298    1
81299    1
Name: it_pos_neg, Length: 81300, dtype: int64

In [71]:
df['it_int_rel'] = df['it_pos_neg'] * df['it_sim_dis_diff_test_sign_z'] 
df['v2_int_rel'] = df['v2_pos_neg'] * df['v2_sim_dis_diff_test_sign_z'] 
df['it_int_abs'] = df['it_pos_neg_abs'] * df['it_sim_dis_test_sign_z'] 
df['v2_int_abs'] = df['v2_pos_neg_abs'] * df['v2_sim_dis_test_sign_z'] 
df['it_int_rel_sq'] = df['it_pos_neg'] * df['it_sim_dis_diff_test_sign_sq_z'] 
df['v2_int_rel_sq'] = df['v2_pos_neg'] * df['v2_sim_dis_diff_test_sign_sq_z'] 
df['it_int_abs_sq'] = df['it_pos_neg_abs'] * df['it_sim_dis_test_sign_sq_z'] 
df['v2_int_abs_sq'] = df['v2_pos_neg_abs'] * df['v2_sim_dis_test_sign_sq_z'] 

  df['it_int_rel'] = df['it_pos_neg'] * df['it_sim_dis_diff_test_sign_z']
  df['v2_int_rel'] = df['v2_pos_neg'] * df['v2_sim_dis_diff_test_sign_z']
  df['it_int_abs'] = df['it_pos_neg_abs'] * df['it_sim_dis_test_sign_z']
  df['v2_int_abs'] = df['v2_pos_neg_abs'] * df['v2_sim_dis_test_sign_z']
  df['it_int_rel_sq'] = df['it_pos_neg'] * df['it_sim_dis_diff_test_sign_sq_z']
  df['v2_int_rel_sq'] = df['v2_pos_neg'] * df['v2_sim_dis_diff_test_sign_sq_z']
  df['it_int_abs_sq'] = df['it_pos_neg_abs'] * df['it_sim_dis_test_sign_sq_z']
  df['v2_int_abs_sq'] = df['v2_pos_neg_abs'] * df['v2_sim_dis_test_sign_sq_z']


In [72]:
len(set(df['participant']))
# df1 = pd.read_csv('pilot5.csv')
# df2 = pd.read_csv('pilot6.csv')

# Concatenate the dataframes
# combined_df = pd.concat([df1, df2], ignore_index=True)

# Save to a new CSV if needed
df.to_csv('pilot_total_old.csv', index=False)

In [73]:
df_quest = pd.read_csv('wm_questionnaires_preprocessed.csv')
# df_quest = df_quest.dropna(subset = ['vviq_sum'])
df_quest

Unnamed: 0.1,Unnamed: 0,subject_id,irq-catch-1,irq-manipulation-1,irq-manipulation-2,irq-manipulation-3,irq-manipulation-4,irq-manipulation-5,irq-manipulation-6,irq-manipulation-7,...,osivq_visual_mean,osivq_spatial_mean,irq_verbal_mean,irq_visual_mean,z_osivq_verbal_mean,z_osivq_visual_mean,z_osivq_spatial_mean,z_irq_verbal_mean,z_irq_visual_mean,failed_check_count
0,0,85174,1,3,3,3,1,3,3,3,...,2.666667,2.933333,2.750000,2.9,0.272313,-0.946143,0.132862,-1.369943,-0.856266,0
1,1,111154,1,4,2,4,1,4,1,4,...,3.466667,3.066667,3.333333,2.9,0.419940,0.341235,0.381374,-0.453404,-0.856266,0
2,2,111754,1,3,3,3,3,4,3,3,...,2.733333,3.000000,3.000000,3.0,-0.022942,-0.838862,0.257118,-0.977141,-0.708436,1
3,3,114154,4,4,4,4,3,2,4,3,...,2.333333,2.800000,3.583333,3.4,-0.465825,-1.482551,-0.115651,-0.060601,-0.117115,2
4,4,114541,1,2,4,4,4,2,2,3,...,2.800000,3.600000,4.083333,3.6,-1.646845,-0.731580,1.375424,0.725004,0.178545,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
587,587,170536,1,2,2,2,1,5,2,3,...,2.266667,3.200000,3.333333,2.2,-0.170570,-1.589833,0.629887,-0.453404,-1.891076,0
588,588,170665,1,2,3,1,4,3,2,2,...,3.266667,2.400000,3.666667,3.5,0.715195,0.019390,-0.861188,0.070333,0.030715,0
589,589,171031,1,1,1,1,1,5,1,1,...,3.200000,2.533333,2.083333,2.4,-0.170570,-0.087891,-0.612676,-2.417417,-1.595416,1
590,590,171253,1,3,3,4,4,1,3,3,...,3.466667,3.000000,3.750000,3.6,0.715195,0.341235,0.257118,0.201267,0.178545,1


In [74]:


# df_quest = pd.read_csv('wm_questionnaires_preprocessed.csv')

print("DataFrame info:")
print(df_quest.info())

print("\nFirst 5 rows:")
print(df_quest.head())
unique_subject_ids = df_quest['subject_id'].unique()
print(f"\nNumber of unique subject_ids: {len(unique_subject_ids)}")
print(f"Sample of unique subject_ids: {unique_subject_ids[:5]}")

if 'participant' in df.columns:
    unique_participants_wm = df['participant'].unique()
    print(f"\nNumber of unique participants: {len(unique_participants_wm)}")
    print(f"Sample of unique participants: {unique_participants_wm[:5]}")
    
common_participants = np.intersect1d(unique_participants_wm, unique_subject_ids)
print(f"Number of participants in both datasets: {len(common_participants)}")
print(f"Percentage of questionnaire participants in existing data: {len(common_participants)/len(unique_participants_wm)*100:.2f}%")


DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 592 entries, 0 to 591
Columns: 118 entries, Unnamed: 0 to failed_check_count
dtypes: float64(11), int64(107)
memory usage: 545.9 KB
None

First 5 rows:
   Unnamed: 0  subject_id  irq-catch-1  irq-manipulation-1  \
0           0       85174            1                   3   
1           1      111154            1                   4   
2           2      111754            1                   3   
3           3      114154            4                   4   
4           4      114541            1                   2   

   irq-manipulation-2  irq-manipulation-3  irq-manipulation-4  \
0                   3                   3                   1   
1                   2                   4                   1   
2                   3                   3                   3   
3                   4                   4                   3   
4                   4                   4                   4   

   irq-manipulatio

In [179]:
merged_df['subject_id']

0        116851.0
1        116851.0
2        116851.0
3        116851.0
4        116851.0
           ...   
81295    171304.0
81296    171304.0
81297    171304.0
81298    171304.0
81299    171304.0
Name: subject_id, Length: 78600, dtype: float64

In [75]:
merged_df = df.merge(
    df_quest,
    left_on='participant',  # Column in df
    right_on='subject_id',  # Column in df_quest
    how='left'              # Keep all rows from df
)


merged_df

Unnamed: 0,setup_js.started,setup_js.stopped,participant,date,expName,psychopyVersion,OS,frameRate,Instructions1.started,Instructions1.stopped,...,osivq_visual_mean,osivq_spatial_mean,irq_verbal_mean,irq_visual_mean,z_osivq_verbal_mean,z_osivq_visual_mean,z_osivq_spatial_mean,z_irq_verbal_mean,z_irq_visual_mean,failed_check_count
0,,,116851,2024-10-12 15:22:21.820,WM_Deepgen,2024.1.5,Win32,59.52381,,,...,4.533333,3.933333,2.166667,4.2,0.272313,2.057739,1.996705,-2.286482,1.065525,0.0
1,,,116851,2024-10-12 15:22:21.820,WM_Deepgen,2024.1.5,Win32,59.52381,,,...,4.533333,3.933333,2.166667,4.2,0.272313,2.057739,1.996705,-2.286482,1.065525,0.0
2,,,116851,2024-10-12 15:22:21.820,WM_Deepgen,2024.1.5,Win32,59.52381,,,...,4.533333,3.933333,2.166667,4.2,0.272313,2.057739,1.996705,-2.286482,1.065525,0.0
3,,,116851,2024-10-12 15:22:21.820,WM_Deepgen,2024.1.5,Win32,59.52381,,,...,4.533333,3.933333,2.166667,4.2,0.272313,2.057739,1.996705,-2.286482,1.065525,0.0
4,,,116851,2024-10-12 15:22:21.820,WM_Deepgen,2024.1.5,Win32,59.52381,,,...,4.533333,3.933333,2.166667,4.2,0.272313,2.057739,1.996705,-2.286482,1.065525,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81295,,,171304,2025-01-29 19:25:41.828,WM_Deepgen,2023.2.1,MacIntel,59.88024,,,...,3.000000,2.000000,3.750000,3.6,-1.646845,-0.409736,-1.606726,0.201267,0.178545,0.0
81296,,,171304,2025-01-29 19:25:41.828,WM_Deepgen,2023.2.1,MacIntel,59.88024,,,...,3.000000,2.000000,3.750000,3.6,-1.646845,-0.409736,-1.606726,0.201267,0.178545,0.0
81297,,,171304,2025-01-29 19:25:41.828,WM_Deepgen,2023.2.1,MacIntel,59.88024,,,...,3.000000,2.000000,3.750000,3.6,-1.646845,-0.409736,-1.606726,0.201267,0.178545,0.0
81298,,,171304,2025-01-29 19:25:41.828,WM_Deepgen,2023.2.1,MacIntel,59.88024,,,...,3.000000,2.000000,3.750000,3.6,-1.646845,-0.409736,-1.606726,0.201267,0.178545,0.0


In [76]:
na_count = merged_df['vviq_sum'].isna().sum()

# Display the result
print(f"Number of NA values in vviq_sum column: {na_count}")
merged_df = merged_df.dropna(subset = ['vviq_sum'])
na_count = merged_df['vviq_sum'].isna().sum()
merged_df

Number of NA values in vviq_sum column: 2700


Unnamed: 0,setup_js.started,setup_js.stopped,participant,date,expName,psychopyVersion,OS,frameRate,Instructions1.started,Instructions1.stopped,...,osivq_visual_mean,osivq_spatial_mean,irq_verbal_mean,irq_visual_mean,z_osivq_verbal_mean,z_osivq_visual_mean,z_osivq_spatial_mean,z_irq_verbal_mean,z_irq_visual_mean,failed_check_count
0,,,116851,2024-10-12 15:22:21.820,WM_Deepgen,2024.1.5,Win32,59.52381,,,...,4.533333,3.933333,2.166667,4.2,0.272313,2.057739,1.996705,-2.286482,1.065525,0.0
1,,,116851,2024-10-12 15:22:21.820,WM_Deepgen,2024.1.5,Win32,59.52381,,,...,4.533333,3.933333,2.166667,4.2,0.272313,2.057739,1.996705,-2.286482,1.065525,0.0
2,,,116851,2024-10-12 15:22:21.820,WM_Deepgen,2024.1.5,Win32,59.52381,,,...,4.533333,3.933333,2.166667,4.2,0.272313,2.057739,1.996705,-2.286482,1.065525,0.0
3,,,116851,2024-10-12 15:22:21.820,WM_Deepgen,2024.1.5,Win32,59.52381,,,...,4.533333,3.933333,2.166667,4.2,0.272313,2.057739,1.996705,-2.286482,1.065525,0.0
4,,,116851,2024-10-12 15:22:21.820,WM_Deepgen,2024.1.5,Win32,59.52381,,,...,4.533333,3.933333,2.166667,4.2,0.272313,2.057739,1.996705,-2.286482,1.065525,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81295,,,171304,2025-01-29 19:25:41.828,WM_Deepgen,2023.2.1,MacIntel,59.88024,,,...,3.000000,2.000000,3.750000,3.6,-1.646845,-0.409736,-1.606726,0.201267,0.178545,0.0
81296,,,171304,2025-01-29 19:25:41.828,WM_Deepgen,2023.2.1,MacIntel,59.88024,,,...,3.000000,2.000000,3.750000,3.6,-1.646845,-0.409736,-1.606726,0.201267,0.178545,0.0
81297,,,171304,2025-01-29 19:25:41.828,WM_Deepgen,2023.2.1,MacIntel,59.88024,,,...,3.000000,2.000000,3.750000,3.6,-1.646845,-0.409736,-1.606726,0.201267,0.178545,0.0
81298,,,171304,2025-01-29 19:25:41.828,WM_Deepgen,2023.2.1,MacIntel,59.88024,,,...,3.000000,2.000000,3.750000,3.6,-1.646845,-0.409736,-1.606726,0.201267,0.178545,0.0


In [77]:
def is_z_scored(df, column_name, tolerance=0.01):
    """
    Check if a column is already z-scored (standardized).
    
    Parameters:
    -----------
    df : pandas DataFrame
        The DataFrame containing the column to check
    column_name : str
        The name of the column to check
    tolerance : float, default=0.01
        The tolerance for considering mean close to 0 and std close to 1
        
    Returns:
    --------
    bool
        True if the column appears to be z-scored, False otherwise
    """
    values = df[column_name].dropna()
    
    mean_val = values.mean()
    std_val = values.std()
    
    is_mean_zero = abs(mean_val) < tolerance
    is_std_one = abs(std_val - 1) < tolerance
    
    print(f"Column: {column_name}")
    print(f"Mean: {mean_val:.6f} (should be ~0 for z-scored data)")
    print(f"Std: {std_val:.6f} (should be ~1 for z-scored data)")
    print(f"Is z-scored: {is_mean_zero and is_std_one}")
    
    return is_mean_zero and is_std_one



In [78]:
merged_df = merged_df.dropna(subset = ['osivq_visual_mean'])
merged_df

Unnamed: 0,setup_js.started,setup_js.stopped,participant,date,expName,psychopyVersion,OS,frameRate,Instructions1.started,Instructions1.stopped,...,osivq_visual_mean,osivq_spatial_mean,irq_verbal_mean,irq_visual_mean,z_osivq_verbal_mean,z_osivq_visual_mean,z_osivq_spatial_mean,z_irq_verbal_mean,z_irq_visual_mean,failed_check_count
0,,,116851,2024-10-12 15:22:21.820,WM_Deepgen,2024.1.5,Win32,59.52381,,,...,4.533333,3.933333,2.166667,4.2,0.272313,2.057739,1.996705,-2.286482,1.065525,0.0
1,,,116851,2024-10-12 15:22:21.820,WM_Deepgen,2024.1.5,Win32,59.52381,,,...,4.533333,3.933333,2.166667,4.2,0.272313,2.057739,1.996705,-2.286482,1.065525,0.0
2,,,116851,2024-10-12 15:22:21.820,WM_Deepgen,2024.1.5,Win32,59.52381,,,...,4.533333,3.933333,2.166667,4.2,0.272313,2.057739,1.996705,-2.286482,1.065525,0.0
3,,,116851,2024-10-12 15:22:21.820,WM_Deepgen,2024.1.5,Win32,59.52381,,,...,4.533333,3.933333,2.166667,4.2,0.272313,2.057739,1.996705,-2.286482,1.065525,0.0
4,,,116851,2024-10-12 15:22:21.820,WM_Deepgen,2024.1.5,Win32,59.52381,,,...,4.533333,3.933333,2.166667,4.2,0.272313,2.057739,1.996705,-2.286482,1.065525,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81295,,,171304,2025-01-29 19:25:41.828,WM_Deepgen,2023.2.1,MacIntel,59.88024,,,...,3.000000,2.000000,3.750000,3.6,-1.646845,-0.409736,-1.606726,0.201267,0.178545,0.0
81296,,,171304,2025-01-29 19:25:41.828,WM_Deepgen,2023.2.1,MacIntel,59.88024,,,...,3.000000,2.000000,3.750000,3.6,-1.646845,-0.409736,-1.606726,0.201267,0.178545,0.0
81297,,,171304,2025-01-29 19:25:41.828,WM_Deepgen,2023.2.1,MacIntel,59.88024,,,...,3.000000,2.000000,3.750000,3.6,-1.646845,-0.409736,-1.606726,0.201267,0.178545,0.0
81298,,,171304,2025-01-29 19:25:41.828,WM_Deepgen,2023.2.1,MacIntel,59.88024,,,...,3.000000,2.000000,3.750000,3.6,-1.646845,-0.409736,-1.606726,0.201267,0.178545,0.0


In [79]:
participant_counts = merged_df['participant'].value_counts()

# Check if any participant has more than 300 rows
participants_over_300 = participant_counts[participant_counts > 300]

if len(participants_over_300) > 0:
    print(f"Found {len(participants_over_300)} participants with more than 300 rows:")
    # print(participants

In [80]:
# merged_df.to_csv('pilot_total_old_merged_questionnaires.csv', index=False)

In [81]:
len(set(merged_df['participant']))

262

In [82]:
merged_df['distractor_0']

0          ./stimuli/distractors/hatbox_01b.jpg
1           ./stimuli/distractors/chaps_10s.jpg
2        ./stimuli/distractors/aardvark_06s.jpg
3           ./stimuli/distractors/navel_03s.jpg
4            ./stimuli/distractors/tick_05s.jpg
                          ...                  
81295      ./stimuli/distractors/poodle_07s.jpg
81296     ./stimuli/distractors/counter_10s.jpg
81297     ./stimuli/distractors/bunkbed_09s.jpg
81298        ./stimuli/distractors/slug_04s.jpg
81299        ./stimuli/distractors/gate_05s.jpg
Name: distractor_0, Length: 78600, dtype: object

In [83]:
# merged_df['pilot_number'] = np.where(df['date'] <= '2024-10-31', 1, 2)
df_1 = merged_df[merged_df['pilot_number'] == 1]
df_2 = merged_df[merged_df['pilot_number'] == 2]

len(set(df_1['participant'])), len(set(df_2['participant']))


(130, 132)

In [84]:
df_1 = df[df['pilot_number'] == 1]
df_2 = df[df['pilot_number'] == 2]

len(set(df_1['participant'])), len(set(df_2['participant']))


(134, 137)

| Term                                                         | Old Stimuli | New Stimuli | Total Data |
| :----------------------------------------------------------- | :---------: | :---------: | :--------: |
| (Intercept)                                                  | ***         | ***         | ***        |
| it_sim_dis_diff_test_sign_z                                  | ***         | *           | ***        |
| it_sim_dis_diff_test_sign_sq_z                               | *           | *           |            |
| it_pos_neg                                                   | **          | ***         | ***        |
| reliability_binary_z                                         | *           | *           | **         |
| validity_binary_z                                            | ***         | ***         | ***        |
| v2_sim_dis_diff_test_sign_z                                  | ***         | ***         | ***        |
| v2_sim_dis_diff_test_sign_sq_z                               | **          |             |            |
| v2_pos_neg                                                   | **          | ***         |            |
| it_sim_dis_diff_test_sign_z:it_pos_neg                       |             | ***         | ***        |
| it_sim_dis_diff_test_sign_sq_z:it_pos_neg                    | **          | ***         | *          |
| reliability_binary_z:validity_binary_z                       | ***         | ***         | ***        |
| v2_sim_dis_diff_test_sign_sq_z:v2_pos_neg                    |             | *           | *          |
| reliability_binary_z:v2_sim_dis_diff_test_sign_sq_z          | .           |             |            |
| validity_binary_z:v2_sim_dis_diff_test_sign_sq_z             |             | .           |            |
| it_sim_dis_diff_test_sign_sq_z:it_pos_neg:validity_binary_z  | **          |             | **         |
| it_sim_dis_diff_test_sign_sq_z:it_pos_neg:reliability_binary_z:validity_binary_z |             | .           | *          |
| reliability_binary_z:validity_binary_z:v2_sim_dis_diff_test_sign_z:v2_pos_neg    |             |             | .          |


| Term                                                         | Old Stimuli | New Stimuli | Total Data |
| :----------------------------------------------------------- | :---------: | :---------: | :--------: |
| (Intercept)                                                  | ***         | ***         | ***        |
| it_sim_dis_diff_test_sign_z                                  | ***         | *           | ***        |
| it_sim_dis_diff_test_sign_sq_z                               | *           | *           |            |
| it_pos_neg                                                   | **          | ***         | ***        |
| reliability_binary_z                                         | *           | *           | **         |
| validity_binary_z                                            | ***         | ***         | ***        |
| v2_sim_dis_diff_test_sign_z                                  | ***         | ***         | ***        |
| v2_sim_dis_diff_test_sign_sq_z                               | **          |             |            |
| v2_pos_neg                                                   | **          | ***         |            |
| it_sim_dis_diff_test_sign_z:it_pos_neg                       |             | ***         | ***        |
| it_sim_dis_diff_test_sign_sq_z:it_pos_neg                    | **          | ***         | *          |
| reliability_binary_z:validity_binary_z                       | ***         | ***         | ***        |
| v2_sim_dis_diff_test_sign_sq_z:v2_pos_neg                    |             | *           | *          |
| reliability_binary_z:v2_sim_dis_diff_test_sign_sq_z          | .           |             |            |
| validity_binary_z:v2_sim_dis_diff_test_sign_sq_z             |             | .           |            |
| it_sim_dis_diff_test_sign_sq_z:it_pos_neg:validity_binary_z  | **          |             | **         |
| it_sim_dis_diff_test_sign_sq_z:it_pos_neg:reliability_binary_z:validity_binary_z |             | .           | *          |
| reliability_binary_z:validity_binary_z:v2_sim_dis_diff_test_sign_z:v2_pos_neg    |             |             | .          |
