In [2]:
import re
import numpy as np
import pandas as pd
from scipy import stats
from pathlib import Path
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import statsmodels.formula.api as smf
from pandas.api.types import CategoricalDtype
from scipy.stats import chi2, norm
from collections import defaultdict
from scipy.stats import pearsonr

**Calculate Mean and Standard Deviation**

In [12]:
# Values to summarise.
res = [0.877, 0.9067, 0.9406]

res_array = np.asarray(res)
mean_res = res_array.mean()
# ndarray.std defaults to ddof=0 (population standard deviation), same as np.std.
# NOTE(review): pandas' "std" aggregation defaults to ddof=1, so this number is
# not directly comparable to std columns produced by pandas.
std_res = res_array.std()

print(mean_res)
print(std_res)

0.9081
0.025983456275099348


**Get Mean Accuracies for different setups**

In [None]:
# Location of the plain-text result log to parse.
path = "path/to/result/file"

# errors="ignore" drops undecodable bytes instead of raising.
with open(path, encoding="utf-8", errors="ignore") as f:
    lines = list(f)

# Collected rows plus the parser state updated while scanning `lines`.
records = []
current_domain = None
current_mode = None
current_seed = None
current_target = None
in_seed_block = False

# Compiled patterns for the line kinds of the result log:
# test-domain header, mode header, seed header, target header, accuracy value.
re_domain   = re.compile(r"^=== TEST DOMAIN:\s+([A-Za-z0-9_]+)\s+===")
re_mode     = re.compile(r"^Mode:\s*([A-Z_0-9]+)\s*$")
re_seed     = re.compile(r"^Seed:\s*(\d+)\s*$")
re_target   = re.compile(r"^\s*Target\s+([A-Za-z0-9_]+):\s*$")
# The decimal point is optional so that integer-valued accuracies such as
# "1" or "0" are captured instead of being silently skipped.
re_accuracy = re.compile(r"^\s*Accuracy:\s*([0-9]*\.?[0-9]+)\s*$")

def norm_domain(d):
    """Return the domain name lower-cased with underscores turned into spaces."""
    lowered = d.lower()
    return lowered.replace("_", " ")
def norm_mode(m):
    """Return the mode name in lower case."""
    return m.lower()

# Walk the log line by line. Header lines (domain, mode, seed) update the
# parser state; "Target" lines select the target; "Accuracy" lines emit a record.
for line in lines:
    m = re_domain.match(line)
    if m:
        current_domain = norm_domain(m.group(1))
        # A new header ends the running seed block, so a stray accuracy line
        # cannot be booked under the previous seed/target.
        current_seed, current_target = None, None
        in_seed_block = False
        continue
    m = re_mode.match(line)
    if m:
        current_mode = norm_mode(m.group(1))
        current_seed, current_target = None, None
        in_seed_block = False
        continue
    m = re_seed.match(line)
    if m:
        current_seed = int(m.group(1))
        # Targets belong to one seed block; forget the one from the previous block.
        current_target = None
        in_seed_block = True
        continue
    if not in_seed_block:
        continue
    mt = re_target.match(line)
    if mt:
        current_target = mt.group(1).lower()
        continue
    ma = re_accuracy.match(line)
    if ma and current_target is not None:
        acc = float(ma.group(1))
        records.append({
            "domain": current_domain,
            "mode": current_mode,
            "seed": current_seed,
            "target": current_target,
            "accuracy": acc,
        })

# One row per parsed accuracy line.
df = pd.DataFrame(records)

def mean_excl_original(group):
    """Average the ``accuracy`` values of all rows whose ``target`` is not "original".

    Returns NaN when no such row exists.
    """
    kept = []
    for target_name, accuracy in zip(group["target"], group["accuracy"]):
        if target_name == "original":
            continue
        kept.append(accuracy)
    if not kept:
        return float("nan")
    return sum(kept) / len(kept)

# Mean accuracy per (domain, mode, seed), ignoring the "original" target.
# Only the value columns are handed to apply(): operating on the grouping
# columns is deprecated in pandas and triggers a DeprecationWarning.
agg = (
    df.groupby(["domain", "mode", "seed"])[["target", "accuracy"]]
      .apply(mean_excl_original)
      .reset_index(name="mean_accuracy")
)

for _, row in agg.iterrows():
    print(f"{row['domain']}, {row['mode']}, seed {row['seed']}: {row['mean_accuracy']}")


caltech101, average, seed 0: 0.6148
caltech101, average, seed 7: 0.6148
caltech101, average, seed 42: 0.6148
caltech101, selective_0_1, seed 0: 0.9644333333333334
caltech101, selective_0_1, seed 7: 0.9404
caltech101, selective_0_1, seed 42: 0.9686666666666667
caltech101, selective_0_2, seed 0: 0.8859666666666666
caltech101, selective_0_2, seed 7: 0.8379
caltech101, selective_0_2, seed 42: 0.9225
caltech101, selective_0_3, seed 0: 0.6148
caltech101, selective_0_3, seed 7: 0.43883333333333335
caltech101, selective_0_3, seed 42: 0.6148
caltech101, selective_1_2, seed 0: 0.8622
caltech101, selective_1_2, seed 7: 0.8158
caltech101, selective_1_2, seed 42: 0.9088333333333334
caltech101, selective_1_3, seed 0: 0.6148
caltech101, selective_1_3, seed 7: 0.43883333333333335
caltech101, selective_1_3, seed 42: 0.6148
caltech101, selective_2_3, seed 0: 0.6148
caltech101, selective_2_3, seed 7: 0.43883333333333335
caltech101, selective_2_3, seed 42: 0.6148
caltech101, single_0, seed 0: 0.9767
calte

  .apply(lambda g: mean_excl_original(g.reset_index(drop=True)))


In [None]:
# Result log to parse.
FILE = Path("path/to/file")

# %% Parser
# One compiled pattern per line kind; each captures the value of interest.
test_pat = re.compile(r"^=== TEST DOMAIN:\s+([A-Za-z0-9_]+)\s+===")  # test-domain header
mode_pat = re.compile(r"^Mode:\s+([A-Z0-9_]+)")                      # mode header
seed_pat = re.compile(r"^Seed:\s+(\d+)")                             # seed header
tgt_pat = re.compile(r"^\s*Target\s+([A-Za-z0-9_]+):\s*$")           # target header
acc_pat = re.compile(r"^\s*Accuracy:\s*([0-9]*\.?[0-9]+)\s*$")       # accuracy value

def normalize_domain(name):
    """Strip surrounding whitespace, lower-case, and replace spaces with underscores."""
    return name.strip().lower().replace(" ", "_")

# Parsed observations; each entry has the keys
# test_domain, mode, seed, target_domain and accuracy.
rows = []

# Parser state: the most recently seen header of each level.
cur_test = cur_mode = cur_seed = cur_target = None

with FILE.open("r", encoding="utf-8") as f:
    for raw_line in f:
        line = raw_line.rstrip("\n")

        m = test_pat.match(line)
        if m is not None:
            # New test domain: everything nested below it starts over.
            cur_test = normalize_domain(m.group(1))
            cur_mode = cur_seed = cur_target = None
            continue

        m = mode_pat.match(line)
        if m is not None:
            # e.g., SINGLE_0, SELECTIVE_0_1, AVERAGE
            cur_mode = m.group(1).upper()
            cur_seed = cur_target = None
            continue

        m = seed_pat.match(line)
        if m is not None:
            cur_seed, cur_target = int(m.group(1)), None
            continue

        m = tgt_pat.match(line)
        if m is not None:
            cur_target = normalize_domain(m.group(1))
            continue

        m = acc_pat.match(line)
        if m is None:
            continue
        # An accuracy only counts once all four context levels are known;
        # the seed is compared against None because seed 0 is a valid value.
        if not (cur_test and cur_mode and cur_target) or cur_seed is None:
            continue
        acc = float(m.group(1))
        rows.append(dict(
            test_domain=cur_test,
            mode=cur_mode,
            seed=cur_seed,
            target_domain=cur_target,
            accuracy=acc,
        ))

df = pd.DataFrame(rows)

# Statistics are computed over all targets except "original".
df_no_orig = df[df["target_domain"] != "original"].copy()

# Mean, standard deviation (pandas default, ddof=1) and row count
# per test_domain / mode / target_domain.
agg = (
    df_no_orig
    .groupby(["test_domain", "mode", "target_domain"], as_index=False)
    .agg(mean_acc=("accuracy", "mean"),
         std_acc =("accuracy", "std"),
         n_seeds=("accuracy", "size"))
)

# Preferred display order; alternative domain set kept for reference.
#domain_order = ["art_painting", "cartoon", "photo", "sketch"]
domain_order = ["caltech101", "labelme", "sun09", "voc2007"]
# Values missing from the category list would silently become NaN in the
# categorical casts below, so any unlisted domain found in the data is appended.
seen_domains = set(agg["test_domain"]) | set(agg["target_domain"])
domain_order = domain_order + sorted(seen_domains.difference(domain_order))

agg["test_domain"] = pd.Categorical(agg["test_domain"], domain_order, ordered=True)
agg["target_domain"] = pd.Categorical(agg["target_domain"], domain_order, ordered=True)
agg = agg.sort_values(["test_domain", "mode", "target_domain"]).reset_index(drop=True)

print("tidy table (one row per test_domain / mode / target_domain):")
display(agg)

def fmt(mean, std):
    """Render ``mean ± std`` with four decimals; a missing std is shown as ``nan``."""
    if pd.notna(std):
        return f"{mean:.4f} ± {std:.4f}"
    return f"{mean:.4f} ± nan"

# Target domains that actually occur, computed once instead of per list element.
present_targets = set(agg["target_domain"].unique())

# Wide view: rows = (test_domain, mode), columns = target_domain,
# cells = "mean ± std" strings.
# observed=False is passed explicitly because the keys are categorical and
# relying on the default raises a pandas FutureWarning (the default is changing).
pivot = (
    agg
    .assign(mean_std=[fmt(mean, std) for mean, std in zip(agg["mean_acc"], agg["std_acc"])])
    .pivot_table(index=["test_domain", "mode"], columns="target_domain", values="mean_std",
                 aggfunc="first", observed=False)
    .reindex(columns=[d for d in domain_order if d in present_targets])
    .sort_index()
)

print("\nPivot (mean ± std):")
display(pivot)

tidy table (one row per test_domain / mode / target_domain):


Unnamed: 0,test_domain,mode,target_domain,mean_acc,std_acc,n_seeds
0,caltech101,AVERAGE,labelme,0.614800,0.000000,3
1,caltech101,AVERAGE,sun09,0.614800,0.000000,3
2,caltech101,AVERAGE,voc2007,0.614800,0.000000,3
3,caltech101,SELECTIVE_0_1,labelme,0.957100,0.016074,3
4,caltech101,SELECTIVE_0_1,sun09,0.956433,0.017381,3
...,...,...,...,...,...,...
127,voc2007,SINGLE_2,labelme,0.663500,0.063876,3
128,voc2007,SINGLE_2,sun09,0.709033,0.016929,3
129,voc2007,SINGLE_3,caltech101,0.444000,0.000000,3
130,voc2007,SINGLE_3,labelme,0.286000,0.136832,3



Pivot (mean ± std):


  agg


Unnamed: 0_level_0,target_domain,caltech101,labelme,sun09,voc2007
test_domain,mode,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
caltech101,AVERAGE,,0.6148 ± 0.0000,0.6148 ± 0.0000,0.6148 ± 0.0000
caltech101,SELECTIVE_0_1,,0.9571 ± 0.0161,0.9564 ± 0.0174,0.9600 ± 0.0125
caltech101,SELECTIVE_0_2,,0.8582 ± 0.0481,0.8850 ± 0.0460,0.9032 ± 0.0336
caltech101,SELECTIVE_0_3,,0.4388 ± 0.3048,0.6148 ± 0.0000,0.6148 ± 0.0000
caltech101,SELECTIVE_1_2,,0.8349 ± 0.0582,0.8565 ± 0.0509,0.8954 ± 0.0315
caltech101,SELECTIVE_1_3,,0.4388 ± 0.3048,0.6148 ± 0.0000,0.6148 ± 0.0000
caltech101,SELECTIVE_2_3,,0.4388 ± 0.3048,0.6148 ± 0.0000,0.6148 ± 0.0000
caltech101,SINGLE_0,,0.9774 ± 0.0056,0.9750 ± 0.0057,0.9755 ± 0.0043
caltech101,SINGLE_1,,0.9628 ± 0.0138,0.9656 ± 0.0134,0.9668 ± 0.0105
caltech101,SINGLE_2,,0.8758 ± 0.0456,0.9008 ± 0.0402,0.9277 ± 0.0314


**EXPERIMENT: Linear Mixed Model for Statistical Evaluation**

In [None]:
df = pd.read_csv("csv-file/with/all/accuracies/per/mode/and/seed")

# Only "mean accuracy" needs a new name; the other columns are used as they are.
df = df.rename(columns={"mean accuracy": "accuracy"})

# Non-numeric accuracies become NaN and are dropped together with rows
# that miss any of the required columns.
df["accuracy"] = pd.to_numeric(df["accuracy"], errors="coerce")
required = ["accuracy", "domain", "seed", "approach", "mode"]
df = df.dropna(subset=required).copy()

# categories
df["approach"] = pd.Categorical(df["approach"], categories=["MixStyle", "TTA"])
# All MixStyle rows are collapsed into a single mode named "base".
df.loc[df["approach"] == "MixStyle", "mode"] = "base"
df["mode"] = pd.Categorical(df["mode"])

# domain × seed id for random effects
df["domain_seed"] = df["domain"].astype(str).str.cat(df["seed"].astype(str), sep="_")

# ----------------------
# 1) TTA-only
# ----------------------
# Keep the TTA rows only and drop mode categories that no longer occur.
df_tta = df.loc[df["approach"] == "TTA"].copy()
df_tta["mode"] = df_tta["mode"].cat.remove_unused_categories()

# Mode as fixed effect, grouped by domain, with a variance component
# over the domain_seed identifier.
vc_tta = {"seed": "0 + C(domain_seed)"}
fit_options_tta = dict(method="lbfgs", reml=False, maxiter=200)

model_tta = smf.mixedlm(
    "accuracy ~ C(mode)",
    data=df_tta,
    groups=df_tta["domain"],
    vc_formula=vc_tta,
)
res_tta = model_tta.fit(**fit_options_tta)

print("\n=== TTA-only: Mode-Effekte ===")
print(res_tta.summary())

# ----------------------
# 2) MixStyle (base) vs TTA modes
# ----------------------
best_modes = ["single_0", "single_1", "selective_0_1"]

# Rows to compare: the MixStyle baseline and the selected TTA modes.
is_base = (df["approach"] == "MixStyle") & (df["mode"] == "base")
is_best_tta = (df["approach"] == "TTA") & (df["mode"].isin(best_modes))
df_compare = df[is_base | is_best_tta].copy()

# "base" comes first; selected modes are kept only if they occur in the data.
present_modes = set(df_compare["mode"].unique())
df_compare["mode"] = pd.Categorical(
    df_compare["mode"],
    categories=["base"] + [m for m in best_modes if m in present_modes]
)

vc_compare = {"seed": "0 + C(domain_seed)"}

model_compare = smf.mixedlm(
    "accuracy ~ C(mode, Treatment(reference='base'))",
    data=df_compare,
    groups=df_compare["domain"],
    vc_formula=vc_compare
)

try:
    res_compare = model_compare.fit(method="lbfgs", reml=False, maxiter=200)
except Exception as exc:
    # Fall back to the model without the variance component, and say so:
    # otherwise the summary below would not reveal which model was fitted.
    print(
        f"Variance-component model failed ({type(exc).__name__}: {exc}); "
        "falling back to the model without vc_formula."
    )
    model_compare_simple = smf.mixedlm(
        "accuracy ~ C(mode, Treatment(reference='base'))",
        data=df_compare,
        groups=df_compare["domain"]
    )
    res_compare = model_compare_simple.fit(method="lbfgs", reml=False, maxiter=200)

print("\n=== MixStyle (base) vs. einzelne TTA-Modi ===")
print(res_compare.summary())

# ----------------------
# 3) Extract effect table
# ----------------------
def fixed_effects_table(res):
    """Combine a fitted result's parameters and confidence bounds into one table.

    Reads ``res.conf_int()`` (columns 0 and 1 are taken as lower/upper bound)
    and ``res.params``; returns a DataFrame with the columns ``coef``,
    ``ci_low`` and ``ci_high``.

    NOTE(review): every entry of ``res.params`` is included; for mixed models
    this may also contain variance parameters -- confirm whether that is wanted.
    """
    bounds = res.conf_int()
    columns = {
        "coef": res.params,
        "ci_low": bounds[0],
        "ci_high": bounds[1],
    }
    return pd.DataFrame(columns)

# Print the coefficient table of each fitted model under its own heading.
for heading, fitted in (
    ("\nFixed Effects (TTA-only):\n", res_tta),
    ("\nFixed Effects (MixStyle vs. beste TTA):\n", res_compare),
):
    print(heading, fixed_effects_table(fitted))

# ----------------------
# 4) MixStyle (base) vs domain x mode
# ----------------------

# Every compared mode except the baseline.
modes_available = [m for m in df_compare["mode"].cat.categories if m != "base"]

# Wide-Table: rows = (domain, seed), columns = mode, values = accuracy
# observed=False is passed explicitly: `mode` is categorical, and relying on
# the default raises a pandas FutureWarning because the default is changing.
wide = (
    df_compare.pivot_table(
        index=["domain", "seed"],
        columns="mode",
        values="accuracy",
        aggfunc="mean",
        observed=False,
    )
    .reindex(columns=["base"] + modes_available)
)

# Per domain and mode: paired difference to the baseline across seeds,
# with its mean, spread and a t-based 95% interval.
results = []
for dom in wide.index.get_level_values("domain").unique():
    sub = wide.loc[dom]  # rows: seeds of this domain, columns: base + modes
    for mode in modes_available:
        # Seeds missing either value drop out of the pairing.
        diffs = sub[mode].sub(sub["base"]).dropna()
        n = len(diffs)

        # Defaults describe the case without any usable seed pair.
        mean_diff = sd = se = ci_low = ci_high = np.nan
        if n >= 1:
            mean_diff = diffs.mean()
            # A single pair gets spread 0.0 and keeps an undefined interval.
            sd = se = 0.0
        if n > 1:
            sd = diffs.std(ddof=1)
            se = sd / np.sqrt(n)
            # 95%-CI with t-distribution
            t_crit = stats.t.ppf(0.975, df=n - 1)
            half_width = t_crit * se
            ci_low = mean_diff - half_width
            ci_high = mean_diff + half_width

        results.append(dict(
            domain=dom,
            mode=mode,
            n_seed_pairs=n,
            mean_improvement_over_base=mean_diff,
            sd_diff=sd,
            se_diff=se,
            ci95_low=ci_low,
            ci95_high=ci_high,
        ))

improvement_table = pd.DataFrame(results)
improvement_table = improvement_table.sort_values(["domain", "mode"]).reset_index(drop=True)
print("\n=== Domains × Mode: Improvement over base (paired per seed) ===")
print(improvement_table)



=== TTA-only: Mode-Effekte ===
              Mixed Linear Model Regression Results
Model:                MixedLM     Dependent Variable:     accuracy
No. Observations:     132         Method:                 ML      
No. Groups:           4           Scale:                  0.0021  
Min. group size:      33          Log-Likelihood:         188.1686
Max. group size:      33          Converged:              Yes     
Mean group size:      33.0                                        
------------------------------------------------------------------
                         Coef. Std.Err.   z    P>|z| [0.025 0.975]
------------------------------------------------------------------
Intercept                0.134    0.055  2.450 0.014  0.027  0.241
C(mode)[T.selective_0_1] 0.715    0.019 38.083 0.000  0.678  0.752
C(mode)[T.selective_0_2] 0.694    0.019 36.972 0.000  0.657  0.731
C(mode)[T.selective_0_3] 0.007    0.019  0.382 0.702 -0.030  0.044
C(mode)[T.selective_1_2] 0.692    0.019 36.86

  cov_aug_logdet = cov_re_logdet + np.sum(np.log(vc_var))
  solver = _smw_solver(1., ex_r, ex2_r, cov_re_inv, 1 / vc_var)
  ld = _smw_logdet(1., ex_r, ex2_r, cov_re_inv, 1 / vc_var,
  return B_logdet + ld + ld1
  solver = _smw_solver(1., ex_r, ex2_r, cov_re_inv, 1 / vc_var)
  df_compare.pivot_table(index=["domain", "seed"], columns="mode", values="accuracy", aggfunc="mean")


**Paired t-test**

In [None]:
# Uses `df` with the columns domain, mode and accuracy, where the MixStyle
# rows carry the mode "base".

# 1) Rows of the two settings being compared
pair_modes = ["base", "single_1"]
df_pair = df.loc[df["mode"].isin(pair_modes)].copy()

# 2) Average over seeds: one row per domain, one column per mode
seed_means = df_pair.groupby(["domain", "mode"], observed=True)["accuracy"].mean()
df_mean = seed_means.unstack("mode")

# 3) Paired t-test across domains (difference: single_1 - base)
t_stat, p_val = stats.ttest_rel(df_mean["single_1"], df_mean["base"])
mean_difference = (df_mean["single_1"] - df_mean["base"]).mean()

print("=== Paired t-Test: MixStyle (base) vs. TTA (single_1) ===")
print("t-Statistic:", t_stat)
print("p-Value:", p_val)
print("\nMean Accuracy (over seeds):")
print(df_mean)
print("\nMean Difference (single_1 - base):", mean_difference)


=== Paired t-Test: MixStyle (base) vs. TTA (single_1) ===
t-Statistic: -1.1724386859898208
p-Value: 0.3256505737153177

Mean Accuracy (over seeds):
mode            base  single_1
domain                        
Caltech101  0.977145  0.965067
LabelMe     0.648690  0.637889
SUN09       0.721142  0.701300
VOC2007     0.759445  0.770767

Mean Difference (single_1 - base): -0.007849908439288983
