In [33]:
# Optional one-time setup: uncomment to install the packages this notebook imports.
# %pip install numpy pandas scikit-learn matplotlib seaborn scipy statsmodels

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import shapiro, levene
from statsmodels.formula.api import ols
import statsmodels.api as sm
from sklearn.utils import resample

------------- Q1–2: Load and Prepare Data --------------------

In [2]:
# Load the raw responses, discard incomplete rows, and renumber the index.
# NOTE(review): dropna() without `subset` removes a row when ANY column is
# missing, including the `Notes` column — confirm that is intended.
df = (
    pd.read_csv("data/Chandler.csv")
    .dropna()                 # drop every row that contains at least one NaN
    .reset_index(drop=True)   # contiguous 0..n-1 index after the drop
)

In [3]:
# Sanity check: count the NaNs remaining in each column of df.
df.isna().sum()  # per-column count of missing values

DROP            0
Participant     0
Age             0
Gender          0
Year            0
Anchortype      0
magnitude       0
Condition       0
pen             0
Proteindrink    0
lebron          0
slidy           0
Cheese          0
Figurine        0
TV              0
beachhouse      0
number          0
Notes           0
dtype: int64

In [4]:
# Numeric coding of anchor precision: 'round' -> 0, 'precise' -> 1.
# Any label outside this mapping becomes NaN.
precision_codes = {'round': 0, 'precise': 1}
df['Anchortype_num'] = df['Anchortype'].map(precision_codes)

# Keep only rows with DROP == 0 (removes excluded participants). The copy
# lets later cells add columns to df_clean without touching df.
not_excluded = df['DROP'].eq(0)
df_clean = df.loc[not_excluded].copy()

------------- Q3: Reconstruct Anchors from Janiszewski Materials -------------

In [5]:
# Anchor value for every item, keyed by precision code (0 = round, 1 = precise).
# Written as (item, round anchor, precise anchor) rows and expanded into the
# nested {item: {code: anchor}} lookup.
_anchor_pairs = [
    ('pen', 4.000, 3.998),
    ('Proteindrink', 10.0, 9.8),
    ('lebron', 0.500, 0.498),
    ('slidy', 40.0, 39.75),
    ('Cheese', 5.0, 4.85),
    ('Figurine', 50.0, 49.0),
    ('TV', 5000.0, 4998.0),
    ('beachhouse', 800000.0, 799800.0),
    ('number', 10000.0, 9989.0),
]
anchor_values = {
    name: {0: round_anchor, 1: precise_anchor}
    for name, round_anchor, precise_anchor in _anchor_pairs
}

In [6]:
# Relative underestimation per item: (anchor - estimate) / anchor.
rel_diffs = []
for item_name, anchors_by_code in anchor_values.items():
    # Anchor value for each row, looked up from its precision code (0/1).
    anchor_series = df_clean['Anchortype_num'].map(anchors_by_code)
    df_clean[f'{item_name}_anchor'] = anchor_series
    rel_diffs.append((anchor_series - df_clean[item_name]) / anchor_series)

# Row-wise mean over all items. A plain mean (not nanmean) is used, so a row
# with a NaN in any item gets a NaN mean rather than a mean over fewer items.
per_item_matrix = np.column_stack(rel_diffs)
df_clean['mean_underestimation'] = per_item_matrix.mean(axis=1)

------------- Q4–5: Prepare Data for ANOVA and Check Assumptions -------------

In [7]:
# Analysis frame for the ANOVA: the four needed columns, complete rows only,
# plus readable factor labels derived from the numeric codes.
anova_columns = ['Participant', 'mean_underestimation', 'Anchortype_num', 'magnitude']

anova_df = (
    df_clean[anova_columns]
    .dropna()
    .assign(
        # 0/1 precision code -> 'Round'/'Precise'
        Anchortype=lambda frame: frame['Anchortype_num'].map({0: 'Round', 1: 'Precise'}),
        # 0/1 magnitude code -> 'Weak'/'Strong'
        Motivation=lambda frame: frame['magnitude'].map({0: 'Weak', 1: 'Strong'}),
    )
)

In [8]:
# Shapiro–Wilk normality test, run separately within each
# Anchortype × Motivation group; only the p-value is kept.
def _shapiro_p(values):
    """Return the Shapiro–Wilk p-value for one group of observations."""
    _, p_value = shapiro(values)
    return p_value


shapiro_results = (
    anova_df
    .groupby(['Anchortype', 'Motivation'])['mean_underestimation']
    .apply(_shapiro_p)
    .reset_index(name='Shapiro_p')
)

In [9]:
# Show the Shapiro–Wilk p-value for each Anchortype × Motivation group.
shapiro_results

Unnamed: 0,Anchortype,Motivation,Shapiro_p
0,Precise,Strong,4.969372e-06
1,Precise,Weak,0.02959953
2,Round,Strong,6.015536e-11
3,Round,Weak,9.877863e-07


In [10]:
# Levene's test for homogeneity of variance across the
# Anchortype × Motivation groups. SciPy's default center='median' is used.
grouped_scores = anova_df.groupby(['Anchortype', 'Motivation'])['mean_underestimation']
group_samples = [scores.values for _, scores in grouped_scores]

levene_stat, levene_p = levene(*group_samples)

In [11]:
# Test statistic returned by Levene's test.
levene_stat

np.float64(1.1184990544563833)

In [12]:
# p-value returned by Levene's test for equal group variances.
levene_p

np.float64(0.34510724942684956)

------------- Q6–7: Two-Way ANOVA and Effect Sizes -----------------

In [13]:
# Two-way ANOVA: both main effects plus their interaction (`*` expands to
# A + B + A:B), fitted by OLS and summarised with Type II sums of squares.
anova_formula = 'mean_underestimation ~ C(Anchortype) * C(Motivation)'
model = ols(anova_formula, data=anova_df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)

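ANOVA table (Type II sums of squares) for `mean_underestimation`. Note: the dependent variable is the raw mean of the per-item relative differences; no standardisation step is applied in the cells above.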

In [14]:
# Display the Type II ANOVA table (one row per model term plus Residual).
anova_table

Unnamed: 0,sum_sq,df,F,PR(>F)
C(Anchortype),0.820406,1.0,0.466572,0.496087
C(Motivation),0.092634,1.0,0.052682,0.818911
C(Anchortype):C(Motivation),1.254627,1.0,0.713517,0.400218
Residual,182.870475,104.0,,


In [15]:
# Eta squared (effect size): each row's sum of squares as a share of the
# column total. The Residual row is part of both numerator and denominator.
# NOTE(review): with Type II sums of squares in an unbalanced design the column
# total need not equal the total sum of squares — confirm this denominator.
sum_of_squares = anova_table['sum_sq']
eta_sq = sum_of_squares.div(sum_of_squares.sum())

In [16]:
# Display eta squared for each ANOVA term (the Residual row is included).
eta_sq

C(Anchortype)                  0.004434
C(Motivation)                  0.000501
C(Anchortype):C(Motivation)    0.006780
Residual                       0.988285
Name: sum_sq, dtype: float64