In [1]:
import pandas as pd
import pingouin as pg
import os
from scipy.stats import ttest_ind
import numpy as np


In [None]:
# Reshape the long-format effect-size table into a wide table:
# one row per "Percent" label, one column per "Property", cells holding
# the independent-samples Cohen's d.
# NOTE(review): "/input.xlsx" resolves to the filesystem root — confirm this
# path is intended rather than a path relative to the notebook.
df = pd.read_excel("/input.xlsx")

# .copy() makes the column subset an independent frame, so the assignment
# below modifies df_selected only and does not raise SettingWithCopyWarning.
df_selected = df[["Percent", "Property", "Independent_Cohen's_d"]].copy()
# Remove every literal ".txt" from the percent labels (regex=False: the dot
# is matched as a plain character, not as a wildcard).
df_selected["Percent"] = df_selected["Percent"].str.replace(".txt", "", regex=False)

# pivot() requires each (Percent, Property) pair to be unique; it raises
# ValueError on duplicates. reset_index() turns "Percent" back into a column.
df_pivot = df_selected.pivot(
    index="Percent", columns="Property", values="Independent_Cohen's_d"
).reset_index()

df_pivot.to_excel("./output.xlsx", index=False)

In [None]:
# Average the Cohen's d values over all replicate workbooks found in input_dir.
input_dir =  "./"

# os.listdir returns entries in arbitrary order; sorting fixes the read order
# so the combined frame is built identically on every run.
files = sorted(
    f for f in os.listdir(input_dir)
    if f.startswith("SB_cohens_d_abun_rep") and f.endswith(".xlsx")
)
if not files:
    # Fail with an explicit message instead of pandas' generic
    # "No objects to concatenate" error from pd.concat([]).
    raise FileNotFoundError(
        f"No 'SB_cohens_d_abun_rep*.xlsx' files found in {input_dir!r}"
    )

# One DataFrame per replicate workbook.
df_list = []

for file in files:
    df = pd.read_excel(os.path.join(input_dir, file))
    df_list.append(df)

combined_df = pd.concat(df_list, ignore_index=True)

# Mean of both effect-size columns for each (Percent, Property) pair
# across the replicates.
mean_df = combined_df.groupby(["Percent", "Property"], as_index=False)[
    ["Paired_Cohen's_d", "Independent_Cohen's_d"]
].mean()

# Persist the averaged table.
mean_df.to_excel("./mean.xlsx", index=False)

print(mean_df.head())


In [None]:
## Combine paired and independent Cohen's d
# For every percent label, collect each patient's matching row from the
# time-point-1 and time-point-2 workbooks, then compute Cohen's d (paired and
# independent) for every property column and save one long-format table.
time1_root = "./tp1"
time2_root = "./tp2"


file = "target.xlsx"
percent_list = ["Comb_In_5.txt", "Comb_In_10.txt", "Comb_In_15.txt", "Comb_In_20.txt", "Comb_In_25.txt", "Comb_In_30.txt",
                    "In_sub_5.txt", "In_sub_10.txt", "In_sub_15.txt", "In_sub_20.txt", "In_sub_25.txt", "In_sub_30.txt"]


def _safe_effsize(x, y, paired):
    """Return Cohen's d for x vs. y via pingouin, or NaN when it cannot be computed.

    NaN is returned when either sample has fewer than two values or when
    pingouin raises; in the latter case the failure is printed rather than
    silently discarded.
    """
    if len(x) < 2 or len(y) < 2:
        return np.nan
    try:
        return pg.compute_effsize(x, y, paired=paired, eftype='cohen')
    except Exception as exc:
        print(f"Effect size computation failed (paired={paired}): {exc}")
        return np.nan


# Get the patient folders (taken from the time-point-1 root; hidden entries skipped).
patient_folders = sorted([
    f for f in os.listdir(time1_root)
    if os.path.isdir(os.path.join(time1_root, f)) and not f.startswith(".")
])

# Read each patient's two workbooks once, instead of once per percent label.
patient_tables = {}
for patient in patient_folders:
    file1 = os.path.join(time1_root, patient, file)
    file2 = os.path.join(time2_root, patient, file)

    if not os.path.exists(file1) or not os.path.exists(file2):
        print(f"Missing file in: {patient}")
        continue

    patient_tables[patient] = (pd.read_excel(file1), pd.read_excel(file2))

all_results = []

for target_percent in percent_list:
    data_time1 = []
    data_time2 = []
    columns = None

    for patient, (df1, df2) in patient_tables.items():
        row1 = df1[df1['percent'] == target_percent]
        row2 = df2[df2['percent'] == target_percent]

        if row1.empty or row2.empty:
            print(f"Missing target row {target_percent} in {patient}")
            continue

        if columns is None:
            # Property names come from the first row actually collected,
            # not from whichever workbook happened to be read last.
            columns = row1.columns[1:]

        # Everything after the first column is treated as a numeric property
        # (assumes 'percent' is the first column — TODO confirm).
        # Both lists are appended in lockstep, so row i of each list belongs
        # to the same patient.
        data_time1.append(row1.iloc[0, 1:].astype(float).values)
        data_time2.append(row2.iloc[0, 1:].astype(float).values)

    if len(data_time1) == 0 or len(data_time2) == 0:
        print(f"Skipping {target_percent} due to missing data")
        continue

    df_time1 = pd.DataFrame(data_time1, columns=columns)
    df_time2 = pd.DataFrame(data_time2, columns=columns)

    results = []

    for col in columns:
        values_t1 = df_time1[col]
        values_t2 = df_time2[col]

        # Paired samples: keep only patients with a value at BOTH time points.
        # Dropping NaNs from each side independently could compare different
        # patient sets, or discard the estimate when only one side has a NaN.
        complete = values_t1.notna() & values_t2.notna()
        paired_d = _safe_effsize(values_t1[complete], values_t2[complete], paired=True)

        # Independent samples: each time point uses all of its own values.
        independent_d = _safe_effsize(values_t1.dropna(), values_t2.dropna(), paired=False)

        results.append({
            "Percent": target_percent,
            "Property": col,
            "Paired_Cohen's_d": paired_d,
            "Independent_Cohen's_d": independent_d
        })

    all_results.extend(results)

all_df = pd.DataFrame(all_results)
# Despite its name, outdir is the full path of the output workbook.
outdir = "./effectsize/" + "_cohens_d_" + ".xlsx"
# to_excel does not create missing directories, so create the folder first.
os.makedirs(os.path.dirname(outdir), exist_ok=True)
all_df.to_excel(outdir, index=False)
print("All percent comparisons completed and saved.")
