## Data analysis
- Data Analysis for: Nudging Healthy Choices: Leveraging LLM-Generated Hashtags and Explanations in Personalized Food Recommendations
- UMAP2025

In [26]:
## import libraries
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import math

import warnings
warnings.filterwarnings('ignore')
%load_ext autoreload 
%autoreload

import sys
sys.path.insert(0,'./rd_data')
from rd_data import *

palette = 'Set2'
sns.set_style("darkgrid")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Pre-processing 

In [27]:
## Extract approved
def approved(prolific_export_pth, personal_info_pth,profile_pth, selected_pth, evaluation_pth, condition_pth, n_condition):
    DF = pd.read_csv(prolific_export_pth)
    perInfo = pd.read_csv(personal_info_pth)
    
    profile = pd.read_csv(profile_pth)
    profile = profile.drop(columns=['id','title'])
    
    selected = pd.read_csv(selected_pth)
    selected = selected.drop(columns=['id','created','session_id'])
    
    evaluation = pd.read_csv(evaluation_pth)
    evaluation = evaluation.drop(columns=['id','title','created','session_id'])
    
    
    approved = DF.loc[DF.Status == 'APPROVED']
    approved_Info = pd.merge(approved['Participant id'],perInfo, left_on='Participant id', right_on='session_id')
    
    approved_Info = approved_Info.drop(columns=['Participant id','title', 'created'])
    approved_Info.rename(columns={'id':'person'}, inplace=True)
    
    
    Info_profile = pd.merge(approved_Info, profile, on='person')
    Info_profile_selected = pd.merge(Info_profile, selected, on='person')
    Info_profile_selected_eval = pd.merge(Info_profile_selected, evaluation, on='person')
    
    Info_profile_selected_eval.replace({'Strongly_Disagree':1, 'Disagree':2, 'Neutral':3, 'Agree':4, 'Strongly_Agree':5}, inplace=True)
    Info_profile_selected_eval['condition'] = condition_pth
    Info_profile_selected_eval['n_condition'] = n_condition
    
    return Info_profile_selected_eval

### Merge condition data

In [28]:
## no label approved 
prolific_export = './noLabel/No_ProlificExport.csv'
personal_info = './noLabel/NoPersonalInfo.csv'
profile = './noLabel/NoProfile.csv'
selected = './noLabel/NoRecipe.csv'
evaluation ='./noLabel/NoEvaluateChoices.csv'

No_label_condition = approved(prolific_export, personal_info,profile, selected, evaluation, 'noLabel', 0)
No_label_condition.person = No_label_condition.person + 1000
No_label_condition.rename(
    columns={
        'understandability':'under_1',
        'effectiveness':'under_2',
        'nudge_eval':'under_3',
    }, inplace=True    
)

No_label_condition[0:60].to_csv('./cnd_csv/Nolabel.csv', index=False)



## label condi
prolific_export = './Labels/No_ProlificExport.csv'
personal_info = './Labels/NoPersonalInfo.csv'
profile = './Labels/NoProfile.csv'
selected = './Labels/NoRecipe.csv'
evaluation ='./Labels/NoEvaluateChoices.csv'

Labels_condition = approved(prolific_export, personal_info,profile, selected, evaluation, 'Label',1)
Labels_condition.person = Labels_condition.person + 2000
Labels_condition = Labels_condition[0:60]
Labels_condition.rename(
    columns={
        'understandability':'under_1',
        'satisfaction':'under_2',
        'effectiveness':'under_3',
        
        'persuasiveness':'use_1',
        'nudge_eval':'use_2'
    }, inplace=True    
)

Labels_condition.to_csv('./cnd_csv/labels.csv', index=False)

## hash
prolific_export = './Hashtags/No_ProlificExport.csv'
personal_info = './Hashtags/NoPersonalInfo.csv'
profile = './Hashtags/NoProfile.csv'
selected = './Hashtags/NoRecipe.csv'
evaluation ='./Hashtags/NoEvaluateChoices.csv'

Hashtags_condition = approved(prolific_export, personal_info,profile, selected, evaluation, 'Hashtags',3)
Hashtags_condition.person = Hashtags_condition.person + 3000
Hashtags_condition = Hashtags_condition[0:60]
Hashtags_condition.rename(
    columns={
        'understandability':'under_1',
        'satisfaction':'under_2',
        'effectiveness':'under_3',
        
        'persuasiveness':'use_1',
        'nudge_eval':'use_2'
    }, inplace=True    
)
Hashtags_condition.to_csv('./cnd_csv/hashtags.csv', index=False)

## Explanation
prolific_export = './Explanation/No_ProlificExport.csv'
personal_info = './Explanation/NoPersonalInfo.csv'
profile = './Explanation/NoProfile.csv'
selected = './Explanation/NoRecipe.csv'
evaluation ='./Explanation/NoEvaluateChoices.csv'

Explanation_condition = approved(prolific_export, personal_info,profile, selected, evaluation, 'Explanation',4)
Explanation_condition.person = Explanation_condition.person + 5000
Explanation_condition =  Explanation_condition[0:60]
Explanation_condition.rename(
    columns={
        'understandability':'under_1',
        'satisfaction':'under_2',
        'effectiveness':'under_3',
        
        'persuasiveness':'use_1',
        'nudge_eval':'use_2'
    }, inplace=True    
)
Explanation_condition.to_csv('./cnd_csv/explanation.csv', index=False)


allCondition = pd.concat([No_label_condition, Labels_condition, Hashtags_condition, Explanation_condition], axis=0)
allCondition.to_csv('./cnd_csv/allCondition.csv', index=False)
allCondition.shape

(240, 49)

In [29]:
allCondition.condition.value_counts()

condition
noLabel        60
Label          60
Hashtags       60
Explanation    60
Name: count, dtype: int64

In [30]:
allCondition.age.value_counts()

age
b25_35      83
b35_45      60
b45_55      41
b18_24      31
bover_55    25
Name: count, dtype: int64

In [31]:
allCondition.gender.value_counts()

gender
Female            145
Male               93
refuse_to_disc      2
Name: count, dtype: int64

In [32]:
allCondition.education.value_counts()

education
BA                  100
High_school          81
MSc                  50
Doctorate             4
Not                   3
Less_high_school      2
Name: count, dtype: int64

In [33]:
# Group by 'condition' and 'Healthiness'
grouped = allCondition.groupby(['condition', 'healthiness']).size().reset_index(name='count')
# Calculate total counts per condition
total_per_condition = grouped.groupby('condition')['count'].transform('sum')

# Add a new column for percentage
grouped['percentage'] = (grouped['count'] / total_per_condition) * 100
# Display the result
print(grouped)

     condition healthiness  count  percentage
0  Explanation     healthy     50   83.333333
1  Explanation   unhealthy     10   16.666667
2     Hashtags     healthy     45   75.000000
3     Hashtags   unhealthy     15   25.000000
4        Label     healthy     51   85.000000
5        Label   unhealthy      9   15.000000
6      noLabel     healthy     37   61.666667
7      noLabel   unhealthy     23   38.333333


In [34]:
allCondition.fsa_score.value_counts()

fsa_score
6     101
10     55
5      52
4      30
11      2
Name: count, dtype: int64

### ANOVA across all conditions 

In [35]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# Fit the OLS model for 'Labelling Condition' only (no interactions with other factors)
model = ols('fsa_score ~ C(condition)', data=allCondition).fit()

# Perform the ANOVA
anova_results = anova_lm(model)

# Display the ANOVA table
print(anova_results)

                 df      sum_sq    mean_sq         F    PR(>F)
C(condition)    3.0   58.750000  19.583333  4.690936  0.003346
Residual      236.0  985.233333   4.174718       NaN       NaN


In [36]:
# Recode the 'Condition' column to set 'nolabel' as the baseline
# The 'nolabel' category will be the reference category (baseline)
df_condition = allCondition
df_condition['condition'] = pd.Categorical(df_condition['condition'], categories=['noLabel', 'Label','Explanation', 'Hashtags'], ordered=False)
# Create dummy variables for 'Condition', excluding 'nolabel' to be the reference category
data_dummies = pd.get_dummies(df_condition['condition'], drop_first=True)

# Add the dummy variables to the original data
data_with_dummies = pd.concat([df_condition, data_dummies], axis=1)

# Fit the OLS model using the dummy variables (label1, label2, label3) as predictors
model = ols('fsa_score ~ Label + Explanation+Hashtags', data=data_with_dummies).fit()

# Perform the ANOVA to check the significance of the model
anova_results = anova_lm(model)

# Display the ANOVA table
print("ANOVA results for the model with dummy variables:")
print(anova_results)

ANOVA results for the model with dummy variables:
                df      sum_sq    mean_sq         F    PR(>F)
Label          1.0   16.805556  16.805556  4.025555  0.045957
Explanation    1.0   25.069444  25.069444  6.005064  0.014992
Hashtags       1.0   16.875000  16.875000  4.042190  0.045514
Residual     236.0  985.233333   4.174718       NaN       NaN


- F-statistics: These show how much each condition (relative to "nolabel") affects the variance in the FSA_score.

        - The F-statistic for explanation is 6.00, which indicates the amount of variance explained by this condition compared to nolabel.
        - The F-statistic for hashtags is  4.042190, and labels is 4.025555, both of which show the amount of variance explained by these conditions compared to nolabel.

In [37]:
# Model summary to check the overall F-statistic
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              fsa_score   R-squared:                       0.056
Model:                            OLS   Adj. R-squared:                  0.044
Method:                 Least Squares   F-statistic:                     4.691
Date:                Mon, 27 Jan 2025   Prob (F-statistic):            0.00335
Time:                        15:40:50   Log-Likelihood:                -510.01
No. Observations:                 240   AIC:                             1028.
Df Residuals:                     236   BIC:                             1042.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept               7.2833    

####  LLM and Labels intervention

In [38]:
label_DF = allCondition.loc[(allCondition['condition'] == 'noLabel') | (allCondition['condition'] == 'Label')]
llm_DF = allCondition.loc[(allCondition['condition'] == 'Hashtags') | (allCondition['condition'] == 'Explanation')]

label_DF.to_csv('./cnd_csv/labels_data.csv', index=False)
llm_DF.to_csv('./cnd_csv/llm_data.csv', index=False)

In [39]:
llm_DF.condition.value_counts()

condition
Explanation    60
Hashtags       60
noLabel         0
Label           0
Name: count, dtype: int64

#### Labels intervention

In [40]:
# Recode the 'Condition' column to set 'nolabel' as the baseline
# The 'nolabel' category will be the reference category (baseline)

label_DF['condition'] = pd.Categorical(label_DF['condition'], categories=['noLabel', 'Label'], ordered=False)
# Create dummy variables for 'Condition', excluding 'nolabel' to be the reference category
label_data_dummies = pd.get_dummies(label_DF['condition'], drop_first=True)

# Add the dummy variables to the original data
label_data_with_dummies = pd.concat([label_DF, label_data_dummies], axis=1)

# Fit the OLS model using the dummy variables (label1, label2, label3) as predictors
model = ols('fsa_score ~ Label', data=label_data_with_dummies).fit()

# Perform the ANOVA to check the significance of the model
anova_results = anova_lm(model)

# Display the ANOVA table
print("ANOVA results for the model with dummy variables:")
print(anova_results)

ANOVA results for the model with dummy variables:
             df      sum_sq    mean_sq          F    PR(>F)
Label       1.0   46.875000  46.875000  11.194219  0.001101
Residual  118.0  494.116667   4.187429        NaN       NaN


In [41]:
# Model summary to check the overall F-statistic
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              fsa_score   R-squared:                       0.087
Model:                            OLS   Adj. R-squared:                  0.079
Method:                 Least Squares   F-statistic:                     11.19
Date:                Mon, 27 Jan 2025   Prob (F-statistic):            0.00110
Time:                        15:40:50   Log-Likelihood:                -255.19
No. Observations:                 120   AIC:                             514.4
Df Residuals:                     118   BIC:                             520.0
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept         7.2833      0.264     27.570

#### LLM interventions


In [42]:
# Recode the 'Condition' column to set 'nolabel' as the baseline
# The 'nolabel' category will be the reference category (baseline)

llm_DF['condition'] = pd.Categorical(llm_DF['condition'], categories=['Hashtags','Explanation' ], ordered=False)
# Create dummy variables for 'Condition', excluding 'nolabel' to be the reference category
llm_data_dummies = pd.get_dummies(llm_DF['condition'], drop_first=True)

# Add the dummy variables to the original data
llm_data_with_dummies = pd.concat([llm_DF, llm_data_dummies], axis=1)

# Fit the OLS model using the dummy variables (llm1, llm2, llm3) as predictors
model = ols('fsa_score ~ Explanation', data=llm_data_with_dummies).fit()

# Perform the ANOVA to check the significance of the model
anova_results = anova_lm(model)

# Display the ANOVA table
print("ANOVA results for the model with dummy variables:")
print(anova_results)

ANOVA results for the model with dummy variables:
                df      sum_sq   mean_sq       F    PR(>F)
Explanation    1.0    5.208333  5.208333  1.2514  0.265557
Residual     118.0  491.116667  4.162006     NaN       NaN


In [43]:
# Model summary to check the overall F-statistic
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              fsa_score   R-squared:                       0.010
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     1.251
Date:                Mon, 27 Jan 2025   Prob (F-statistic):              0.266
Time:                        15:40:50   Log-Likelihood:                -254.82
No. Observations:                 120   AIC:                             513.6
Df Residuals:                     118   BIC:                             519.2
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept               6.5333    

In [44]:
allCondition.condition.value_counts()

condition
noLabel        60
Label          60
Explanation    60
Hashtags       60
Name: count, dtype: int64

### Prepare Data for CFA

In [45]:
allCondition.condition.value_counts()

condition
noLabel        60
Label          60
Explanation    60
Hashtags       60
Name: count, dtype: int64

In [46]:
 # Transform the negatively formulate questions
allCondition[['FK_12']] = 6 -  allCondition[['FK_12']]
allCondition[['sus_4']] = 6 -  allCondition[['sus_4']]
allCondition.know_many = 6 - allCondition.know_many
allCondition.easy_choice = 6 - allCondition.easy_choice
allCondition.unders_sys = 6 - allCondition.unders_sys




In [47]:
## User evaluation mean
allCondition['choice_satisfaction'] = allCondition[['liked_recipes','prepare_recipes','fit_preference','recommend_recipe']].mean(axis=1)
allCondition['choice_difficulty'] = allCondition[['many_to_choose','easy_choice','choice_overwhelming']].mean(axis=1)
allCondition['perceived_effort'] = allCondition[['sys_time','unders_sys','many_actions']].mean(axis=1)
allCondition['SFD'] = allCondition[['FK_9','FK_10','FK_11','FK_12']].mean(axis=1)
allCondition['Sustain'] = allCondition[['sus_1', 'sus_2', 'sus_3', 'sus_4']].mean(axis=1)



In [48]:
allCondition['understand'] = allCondition[['under_1','under_2','under_3']].mean(axis=1)
allCondition['usability'] = allCondition[['use_1','use_2']].mean(axis=1)

In [49]:
allCondition.to_csv('./cnd_csv/cfa_DF.csv', index=False)

### Usability test

In [50]:
## Nudging conditions
nudge_df = allCondition.loc[allCondition.condition != 'noLabel']
nudge_df.to_csv('./cnd_csv/nudge_df.csv')

In [51]:
nudge_df = pd.read_csv('./cnd_csv/nudge_df.csv')

In [52]:
nudge_df.condition.value_counts()

condition
Label          60
Hashtags       60
Explanation    60
Name: count, dtype: int64

In [61]:
# Recode the 'Condition' column to set 'nolabel' as the baseline
# The 'nolabel' category will be the reference category (baseline)
df_condition = nudge_df
df_condition['condition'] = pd.Categorical(df_condition['condition'], categories=['Label','Explanation', 'Hashtags'], ordered=False)
# Create dummy variables for 'Condition', excluding 'nolabel' to be the reference category
data_dummies = pd.get_dummies(df_condition['condition'], drop_first=True)

# Add the dummy variables to the original data
data_with_dummies = pd.concat([df_condition, data_dummies], axis=1)

# Fit the OLS model using the dummy variables (label1, label2, label3) as predictors
model = ols('usability ~Explanation+Hashtags', data=data_with_dummies).fit()

# Perform the ANOVA to check the significance of the model
anova_results = anova_lm(model)

# Display the ANOVA table
print("ANOVA results for the model with dummy variables:")
print(anova_results)

ANOVA results for the model with dummy variables:
                df      sum_sq    mean_sq          F        PR(>F)
Explanation    1.0    1.002778   1.002778   1.132594  2.886724e-01
Hashtags       1.0   24.300000  24.300000  27.445800  4.554111e-07
Residual     177.0  156.712500   0.885381        NaN           NaN


In [54]:
# Model summary to check the overall F-statistic
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              usability   R-squared:                       0.139
Model:                            OLS   Adj. R-squared:                  0.129
Method:                 Least Squares   F-statistic:                     14.29
Date:                Mon, 27 Jan 2025   Prob (F-statistic):           1.77e-06
Time:                        15:40:51   Log-Likelihood:                -242.94
No. Observations:                 180   AIC:                             491.9
Df Residuals:                     177   BIC:                             501.5
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept               2.7646    

In [55]:
allcondition_no_use = allCondition.drop(columns=['use_1','use_2','usability'])
allcondition_no_use.to_csv('./cnd_csv/allcondition_no_use.csv', index=False)

In [56]:
allCondition[['usability']]

Unnamed: 0,usability
0,
1,
2,
3,
4,
...,...
55,4.0
56,4.5
57,4.0
58,2.5


In [57]:
mean = allCondition.understand.mean()
allCondition['under_level'] = 'High'
allCondition.loc[allCondition.understand <= mean, 'under_level' ] = 'Low'

mean = allCondition.SFD.mean()
allCondition['SFD_level'] = 'High'
allCondition.loc[allCondition.understand <= mean, 'SFD_level' ] = 'Low'

In [58]:
allCondition.to_csv('cnd_csv/allCond.csv', index=False)