In [1]:
import pandas as pd

# Read the Excel file
file_path = "Confusion_data.xlsx"
df = pd.read_excel(file_path)

# (Optional) Preview the first few rows
df


Unnamed: 0,Participant,Hypothesis,TimeWindow,TP,TN,FP,FN
0,ID01,H1,20min,57,23,11,1
1,ID01,H1,40min,17,19,3,1
2,ID01,H1,60min,10,11,1,2
3,ID01,H2,20min,68,22,9,4
4,ID01,H2,40min,24,19,3,3
5,ID01,H2,60min,9,17,2,2
6,ID02,H1,20min,77,16,8,3
7,ID02,H1,40min,18,22,2,2
8,ID02,H1,60min,17,9,1,1
9,ID02,H2,20min,76,24,17,5


In [2]:
import pandas as pd
from scipy.stats import wilcoxon

# --- Filter to Hypothesis 1 only ---
df_h1 = df[df["Hypothesis"] == "H1"]

# --- Per-participant descriptive stats (TP, TN, FP, FN) ---
participant_stats = (
    df_h1
    .groupby("Participant")[["TP", "TN", "FP", "FN"]]
    .agg(["mean", "std", "sum", "count"])
)

print(participant_stats)


                    TP                               TN                        \
                  mean        std  sum count       mean        std  sum count   
Participant                                                                     
ID01         28.000000  25.357445   84     3  17.666667   6.110101   53     3   
ID02         37.333333  34.355980  112     3  15.666667   6.506407   47     3   
ID03         43.666667  27.209067  131     3  16.000000   9.848858   48     3   
ID04         32.333333  22.479620   97     3  21.333333  12.858201   64     3   
ID05         33.666667  22.501852  101     3  38.666667  25.403412  116     3   
ID06         46.000000  35.552778  138     3  14.666667   2.516611   44     3   
ID07         41.333333  32.929217  124     3  25.666667  12.583057   77     3   
ID08         31.000000  20.000000   93     3  24.000000  19.924859   72     3   

                   FP                            FN                      
                 mean       std su

In [3]:
# --- Aggregate counts per participant ---
summary = (
    df_h1
    .groupby("Participant")[["TP", "TN", "FP", "FN"]]
    .sum()
    .reset_index()
)

# Correct vs incorrect classifications
summary["Correct"] = summary["TP"] + summary["TN"]
summary["Incorrect"] = summary["FP"] + summary["FN"]

# Difference (what we test)
summary["Diff"] = summary["Correct"] - summary["Incorrect"]

print(summary)


  Participant   TP   TN  FP  FN  Correct  Incorrect  Diff
0        ID01   84   53  15   4      137         19   118
1        ID02  112   47  11   6      159         17   142
2        ID03  131   48  11   7      179         18   161
3        ID04   97   64  12   9      161         21   140
4        ID05  101  116  14   8      217         22   195
5        ID06  138   44   9  14      182         23   159
6        ID07  124   77  10   5      201         15   186
7        ID08   93   72  14   7      165         21   144


In [4]:
stat, p_value = wilcoxon(summary["Diff"])

print(f"Wilcoxon signed-rank test statistic: {stat}")
print(f"p-value: {p_value:.4f}")

import numpy as np

# Rank-biserial correlation (effect size for Wilcoxon)
n = len(summary)
r = 1 - (2 * stat) / (n * (n + 1))

print(f"Rank-biserial correlation r = {r:.3f}")


Wilcoxon signed-rank test statistic: 0.0
p-value: 0.0078
Rank-biserial correlation r = 1.000


In [5]:
from scipy.stats import mannwhitneyu

# Use the summary dataframe you already created
# summary columns: Participant, TP, TN, FP, FN, Correct, Incorrect, Diff

u_stat, p_value_mw = mannwhitneyu(
    summary["Correct"],
    summary["Incorrect"],
    alternative="greater"   # we expect Correct > Incorrect
)

print(f"Mann–Whitney U statistic: {u_stat}")
print(f"p-value: {p_value_mw:.4f}")

import numpy as np

n1 = len(summary["Correct"])
n2 = len(summary["Incorrect"])

# Rank-biserial correlation for Mann–Whitney
r_mw = 1 - (2 * u_stat) / (n1 * n2)

print(f"Mann–Whitney effect size r = {r_mw:.3f}")


Mann–Whitney U statistic: 64.0
p-value: 0.0005
Mann–Whitney effect size r = -1.000
