# HCI PixelPal Experimental Study | Survey Data Post-Processing

The PixelPal study used a mixed design, so the data are required in both wide format and long format to test the proposed hypotheses.

Please refer to the PixelPal Project Report for more details. The three hypotheses under consideration are:
- **H1**: Higher user engagement and emotional disclosure will be seen in multimodal AI interactions with animated avatars than in text-only interactions.
- **H2**: Users’ personality traits and technology comfort levels will determine the effectiveness with which interaction modalities and avatar types promote emotional disclosure and perceived empathy.
- **H3**: Users will find greater improvement in mental well-being and social connectedness when interacting with a multimodal AI companion compared to traditional text-based systems.

# Importing Libraries

In [None]:
import pandas as pd
import numpy as np

# Converting Likert to Numeric

In [None]:
def convert_likert_to_numeric(value):
    """Convert a frequency-style Likert response to its 0-3 numeric code.

    Labelled responses such as ``'2 - More than half the days'`` are matched
    ignoring letter case and surrounding/repeated whitespace, so small
    differences in how the survey tool exported the label do not silently
    turn a real answer into 0. Bare numeric codes (``2``, ``'2'``, ``'2.0'``)
    are accepted as well.

    Args:
        value: The raw cell value; it is converted with ``str`` before matching.

    Returns:
        The integer code 0, 1, 2 or 3. Anything unrecognised (including
        missing values such as ``None`` or NaN) yields 0.
    """
    mapping = {
        '0 - Not at all': 0,
        '1 - Several Days': 1,
        '2 - More than half the days': 2,
        '3 - Nearly every day': 3
    }
    # Case-folded lookup table so 'Several Days' and 'Several days' both match.
    lookup = {label.casefold(): code for label, code in mapping.items()}

    text = str(value).strip()
    normalized = ' '.join(text.casefold().split())
    if normalized in lookup:
        return lookup[normalized]

    # Fall back to a bare numeric code, e.g. when answers were exported as numbers.
    try:
        number = float(text)
    except ValueError:
        return 0  # Default to 0 if not found
    if number in (0, 1, 2, 3):
        return int(number)
    return 0  # Out-of-range numbers and NaN are treated as not found

# PHQ 9 Calculation

In [None]:
def calculate_phq9_score(row):
    """Sum the PHQ-9 item responses found in one survey row.

    A column counts as a PHQ-9 item when its name contains one of the item
    keywords below. Each matched response is converted with
    ``convert_likert_to_numeric`` and the converted values are added up.

    Args:
        row: One respondent's answers, indexed by question text
            (presumably a pandas Series from ``DataFrame.apply(axis=1)``).

    Returns:
        The total of the converted responses; 0 when no column matches.
    """
    item_markers = (
        'Little interest',
        'Feeling down',
        'Trouble falling',
        'Feeling tired',
        'Poor appetite',
        'Feeling bad about yourself',
        'Trouble concentrating',
        'Moving or speaking',
        'Thoughts that you would',
    )
    item_columns = [
        name for name in row.index
        if any(marker in name for marker in item_markers)
    ]
    # Only the first nine matching columns contribute to the total.
    return sum(convert_likert_to_numeric(row[name]) for name in item_columns[:9])

# GAD 7 Calculation

In [None]:
def calculate_gad7_score(row):
    """Sum the GAD-7 item responses found in one survey row.

    A column counts as a GAD-7 item when its name contains one of the item
    keywords below. Each matched response is converted with
    ``convert_likert_to_numeric`` and the converted values are added up.

    Args:
        row: One respondent's answers, indexed by question text
            (presumably a pandas Series from ``DataFrame.apply(axis=1)``).

    Returns:
        The total of the converted responses; 0 when no column matches.
    """
    item_markers = (
        'Feeling nervous',
        'Not being able to stop',
        'Worrying too much',
        'Trouble relaxing',
        'Being so restless',
        'Becoming easily annoyed',
        'Feeling afraid',
    )
    item_columns = [
        name for name in row.index
        if any(marker in name for marker in item_markers)
    ]
    # Only the first seven matching columns contribute to the total.
    return sum(convert_likert_to_numeric(row[name]) for name in item_columns[:7])

# System Usability Score Calculation

In [None]:
def calculate_sus_score(row):
    """Compute a System Usability Scale (SUS) score for one survey row.

    Item columns are the first ten columns whose name starts with one of the
    prefixes below, taken in column order. Items at odd positions (1, 3, 5,
    7, 9) are treated as positively worded and contribute ``value - 1``;
    items at even positions contribute ``5 - value``.

    Answers that cannot be converted to ``float`` are left out, and the total
    is rescaled by the number of usable items so that a skipped answer does
    not silently deflate the score. With all ten items present the result is
    the usual ``sum * 2.5``. A NaN answer converts successfully and therefore
    makes the result NaN.

    Args:
        row: One respondent's answers, indexed by question text
            (presumably a pandas Series from ``DataFrame.apply(axis=1)``).

    Returns:
        The score as a float, or ``np.nan`` when no item could be scored.
    """
    item_prefixes = ('I think', 'I found', 'I thought', 'I imagine', 'I felt')
    sus_cols = [col for col in row.index if col.startswith(item_prefixes)][:10]

    scores = []
    for i, col in enumerate(sus_cols):
        try:
            value = float(row[col])
        except (ValueError, TypeError):
            # Unparseable answer: skip it; the rescaling below accounts for it.
            continue
        if i % 2 == 0:  # Positive items (1,3,5,7,9)
            scores.append(value - 1)
        else:  # Negative items (2,4,6,8,10)
            scores.append(5 - value)

    if not scores:
        return np.nan

    # Each item contributes at most 4 points; dividing by the attainable
    # maximum maps the total onto 0-100 (exactly x2.5 when ten items scored).
    return sum(scores) * (100.0 / (4 * len(scores)))

# Social Connectedness Score Calculation

In [None]:
def calculate_social_connectedness(row):
    """Average the social-connectedness item responses in one survey row.

    Item columns are the first twenty columns whose name starts with
    ``'I feel'``, ``'I am'``, ``'I see'`` or ``"I don't"``, taken in column
    order. Items at the 1-based positions listed in ``reversed_positions``
    are reverse-scored as ``7 - value``; the rest are used as given.
    Answers that cannot be converted to ``float`` are left out of the mean.

    Args:
        row: One respondent's answers, indexed by question text
            (presumably a pandas Series from ``DataFrame.apply(axis=1)``).

    Returns:
        The mean of the scored items, or ``np.nan`` when none could be scored.
    """
    item_prefixes = ('I feel', 'I am', 'I see', "I don't")
    item_columns = [name for name in row.index if name.startswith(item_prefixes)][:20]

    # 1-based positions of the reverse-scored items.
    reversed_positions = {3, 6, 7, 9, 11, 13, 15, 17, 18, 20}

    item_scores = []
    for position, name in enumerate(item_columns, start=1):
        try:
            answer = float(row[name])
        except (ValueError, TypeError):
            continue
        item_scores.append(7 - answer if position in reversed_positions else answer)

    if not item_scores:
        return np.nan
    return np.mean(item_scores)

# JASP Data Preparation

In [None]:
def prepare_data_for_jasp(pre_df, post_df):
    """Build the wide-format participant table from the pre and post surveys.

    The result has one row per participant with the columns
    ``participant_id``, ``age``, ``group``, ``pre_phq9``, ``post_phq9``,
    ``pre_gad7``, ``post_gad7``, ``sus_score`` and ``social_connectedness``.

    NOTE(review): pre and post responses are combined by DataFrame index, not
    by a participant key -- this assumes both files list the same
    participants in the same order; confirm against the survey exports.
    NOTE(review): ``group`` is derived from the stated communication
    preference in the post survey; answers other than the two mapped labels
    become NaN.

    Args:
        pre_df: Pre-study survey responses; must contain an ``'Age'`` column.
        post_df: Post-study survey responses; must contain the
            communication-preference question column.

    Returns:
        A new DataFrame with the columns listed above.
    """
    preference_labels = {
        'Avatar based': 'avatar',
        'Text/ Voice/ Simple Journaling': 'text'
    }

    # Sequential IDs starting at 1, one per pre-survey row.
    participants = pd.DataFrame({
        'participant_id': range(1, len(pre_df) + 1),
        'age': pre_df['Age'],
        'group': post_df['What model of communication did you prefer? '].map(preference_labels)
    })

    # (output column, source survey, row-wise scoring function), in output order.
    score_plan = (
        ('pre_phq9', pre_df, calculate_phq9_score),
        ('post_phq9', post_df, calculate_phq9_score),
        ('pre_gad7', pre_df, calculate_gad7_score),
        ('post_gad7', post_df, calculate_gad7_score),
        ('sus_score', post_df, calculate_sus_score),
        ('social_connectedness', post_df, calculate_social_connectedness),
    )
    for column_name, survey, scorer in score_plan:
        participants[column_name] = survey.apply(scorer, axis=1)

    return participants

# Long Format CSV Generation

In [None]:
def create_long_format(wide_df):
    """Reshape the wide participant table into long format for repeated measures.

    The four score columns (``pre_phq9``, ``post_phq9``, ``pre_gad7``,
    ``post_gad7``) are melted into one ``score`` column, giving four rows per
    participant. Each original column name is split at its first underscore:
    the part before it (``pre``/``post``) goes to ``time`` and the part after
    it (``phq9``/``gad7``) goes to ``measure``.

    Args:
        wide_df: Wide-format table containing the id columns
            ``participant_id``, ``age``, ``group``, ``sus_score``,
            ``social_connectedness`` and the four score columns.

    Returns:
        A new DataFrame with the id columns followed by ``score``,
        ``measure`` and ``time``.
    """
    # Create long format for repeated measures
    long_df = pd.melt(wide_df,
                      id_vars=['participant_id', 'age', 'group', 'sus_score', 'social_connectedness'],
                      value_vars=['pre_phq9', 'post_phq9', 'pre_gad7', 'post_gad7'],
                      var_name='measure_time',
                      value_name='score')

    # Names look like '<time>_<measure>', so the first piece is the time point
    # and the second is the questionnaire; assign them accordingly.
    parts = long_df['measure_time'].str.split('_', n=1, expand=True)
    long_df['measure'] = parts[1]
    long_df['time'] = parts[0]
    long_df = long_df.drop('measure_time', axis=1)

    return long_df

# Wide Format and Main Function to Orchestrate Everything!

In [None]:
def main():
    """Run the full pipeline: read both surveys, then write wide and long CSVs.

    Reads ``pre_survey_responses.csv`` and ``post_survey_responses.csv`` from
    the current working directory, writes ``jasp_analysis_data_wide.csv`` and
    ``jasp_analysis_data_long.csv`` (without the index column), and prints a
    completion message.
    """
    # Load the raw survey exports.
    pre_responses = pd.read_csv('pre_survey_responses.csv')
    post_responses = pd.read_csv('post_survey_responses.csv')

    # One row per participant.
    wide_table = prepare_data_for_jasp(pre_responses, post_responses)
    wide_table.to_csv('jasp_analysis_data_wide.csv', index=False)

    # One row per participant, measure and time point.
    long_table = create_long_format(wide_table)
    long_table.to_csv('jasp_analysis_data_long.csv', index=False)

    print("Data processing complete. Files saved as 'jasp_analysis_data_wide.csv' and 'jasp_analysis_data_long.csv'")


# Run the pipeline only when executed directly, not when imported.
if __name__ == "__main__":
    main()