In [33]:
import numpy as np
from scipy.stats import ttest_rel, t
from statsmodels.stats.power import TTestPower
from statsmodels.stats.weightstats import ttost_paired
from scipy.stats import shapiro
from scipy import stats
from scipy.stats import t, chi2, norm
import os
from os.path import join
import csv

In [34]:
def load_data(ques, path="./RawData/"):
    """Load the YORO and conventional score rows from every raw CSV file.

    Each regular, non-hidden file in ``path`` is parsed with ``csv.reader``.
    When ``ques == "Image Quality"`` the first two rows of the file are
    skipped; for any other value reading starts at the first row.  The next
    two rows are then taken as the YORO scores and the conventional scores,
    in that order.

    Parameters
    ----------
    ques : str
        Name of the question to load.  Only the exact value
        ``"Image Quality"`` changes which rows are read.
    path : str, optional
        Directory containing the raw CSV files.  Defaults to ``"./RawData/"``.

    Returns
    -------
    numpy.ndarray
        ``np.array([yoro, conventional])`` where each element holds one row
        of floats per file.  When every row has the same number of scores the
        result has shape ``(2, n_files, n_scores)``.

    Raises
    ------
    StopIteration
        If a file has fewer rows than the question requires.
    ValueError
        If a score field cannot be converted to ``float``.
    """
    yoro = []
    conventional = []

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # order of the per-file results reproducible across machines and runs.
    for filename in sorted(os.listdir(path)):
        file_path = join(path, filename)

        # Skip sub-directories (e.g. .ipynb_checkpoints) and hidden files
        # (e.g. .DS_Store) that would otherwise crash the loader.
        if filename.startswith(".") or not os.path.isfile(file_path):
            continue

        # newline="" is the documented way to open files for the csv module.
        with open(file_path, newline="") as f:
            f_csv = csv.reader(f)
            if ques == "Image Quality":
                # The "Image Quality" rows come after the first two rows.
                next(f_csv)
                next(f_csv)
            # The last field of each row is dropped before conversion
            # (presumably an empty field left by a trailing comma -- confirm
            # against the raw files).
            yoro.append([float(field) for field in next(f_csv)[:-1]])
            conventional.append([float(field) for field in next(f_csv)[:-1]])

    return np.array([np.array(yoro), np.array(conventional)])

In [35]:
def cohens_d_paired(data1, data2):
    """Return the absolute Cohen's d for two paired samples.

    The effect size is the absolute mean of the element-wise differences
    ``data1 - data2`` divided by their sample standard deviation
    (``ddof=1``).  Because the absolute value is taken, the result does not
    indicate which sample is larger.
    """
    delta = data1 - data2
    return np.abs(np.mean(delta)) / np.std(delta, ddof=1)


def cohens_d_ind(data1, data2):
    """Return the signed Cohen's d for two independent samples.

    The difference of the sample means (``data1`` minus ``data2``) is divided
    by the pooled standard deviation, which weights each sample's variance
    (``ddof=1``) by its degrees of freedom.
    """
    n1 = len(data1)
    n2 = len(data2)
    sd1 = np.std(data1, ddof=1)
    sd2 = np.std(data2, ddof=1)

    weighted_variance = (n1 - 1) * sd1 ** 2 + (n2 - 1) * sd2 ** 2
    pooled_sd = np.sqrt(weighted_variance / (n1 + n2 - 2))

    return (np.mean(data1) - np.mean(data2)) / pooled_sd

def calculate(ques, alpha=0.05, equivalence_bounds=(-0.5, 0.5)):
    """Run and print the paired comparison of YORO vs. conventional scores.

    The scores returned by ``load_data(ques)`` are averaged per row (one row
    per input file) and the two resulting vectors are compared with:

    * Shapiro-Wilk normality tests on each vector of averages,
    * a paired t-test,
    * the paired Cohen's d (absolute value) and the power of a one-sample
      t-test for that effect size and number of pairs,
    * a paired TOST equivalence test,
    * a two-sided ``1 - alpha`` confidence interval of the mean difference.

    All results are printed; nothing is returned.

    Parameters
    ----------
    ques : str
        Question name, forwarded to ``load_data`` and shown in the header.
    alpha : float, optional
        Significance level used consistently for the t-test decision, the
        power calculation, the TOST decision and the confidence interval.
        Defaults to ``0.05``.
    equivalence_bounds : tuple of (float, float), optional
        ``(lower, upper)`` equivalence margins for the TOST, expressed in the
        same units as the score difference.  Defaults to ``(-0.5, 0.5)``.

    Raises
    ------
    ValueError
        If ``alpha`` is not strictly between 0 and 1, or if the lower
        equivalence bound is not below the upper one.
    """
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")
    lower_bound, upper_bound = equivalence_bounds
    if not lower_bound < upper_bound:
        raise ValueError("equivalence_bounds must satisfy lower < upper")

    data = load_data(ques)
    yoro_scores = np.array(data[0])
    conventional_scores = np.array(data[1])

    # Average the scores within each row (one row per loaded file).
    yoro_avg = np.mean(yoro_scores, axis=1)
    conventional_avg = np.mean(conventional_scores, axis=1)

    # The paired differences and the number of pairs are used by several of
    # the steps below, so compute them once.
    differences = yoro_avg - conventional_avg
    n = len(yoro_avg)

    print(f"---------------{ques}---------------")
    print("---------------Normality---------------")
    print("Shapiro-Wilk test results for mode A:", shapiro(yoro_avg))
    print("Shapiro-Wilk test results for mode B:", shapiro(conventional_avg))

    print(f"YORO average scores: {yoro_avg}, mean: {np.mean(yoro_avg):.4f}, std: {np.std(yoro_avg, ddof=1):.4f}")
    print(f"Conventional average scores: {conventional_avg}, mean: {np.mean(conventional_avg):.4f}, std: {np.std(conventional_avg, ddof=1):.4f}")

    print()
    print("---------------Significance test (paired t test)---------------")

    # Paired t-test on the per-row averages.
    t_statistic, p_value = ttest_rel(yoro_avg, conventional_avg)

    print(f"Paired t-test results:\nT-statistic: {t_statistic:.4f}\nP-value: {p_value:.4f}")

    if p_value < alpha:
        print("There is a significant difference between the YORO and conventional modes.")
    else:
        print("There is no significant difference between the YORO and conventional modes.")

    # Effect size of the paired differences (absolute value).
    effect_size = cohens_d_paired(yoro_avg, conventional_avg)
    print(f"Effect size (Cohen's d): {effect_size:.4f}")

    # Power of a one-sample t-test on the differences for the observed
    # effect size and number of pairs.
    power_analysis = TTestPower()
    power = power_analysis.solve_power(effect_size=effect_size, nobs=n, alpha=alpha)
    print(f"Statistical power: {power}")

    print()
    print("----------------Equivalence test (TOST)-----------------")

    # ttost_paired returns (overall p-value, lower-test result, upper-test
    # result); index 1 of each one-sided result is its p-value.
    tost_result = ttost_paired(yoro_avg, conventional_avg, low=lower_bound, upp=upper_bound)
    print(f"tost_result: {tost_result}")
    tost_p_values = [tost_result[1][1], tost_result[2][1]]
    # Equivalence is claimed only when both one-sided tests reject.
    if all(p < alpha for p in tost_p_values):
        tost_conclusion = "The modes are equivalent."
    else:
        tost_conclusion = "The modes are not equivalent."

    mean_diff = np.mean(differences)
    std_diff = np.std(differences, ddof=1)

    # Two-sided (1 - alpha) confidence interval of the mean difference,
    # based on the t distribution with n - 1 degrees of freedom.
    dof = n - 1
    se = std_diff / np.sqrt(n)
    t_critical = stats.t.ppf(1 - alpha / 2, dof)
    confidence_interval = [mean_diff - t_critical * se, mean_diff + t_critical * se]
    print("Confidence interval of the mean difference: ", confidence_interval)

    print("Equivalence test (TOST) results:")
    print(f"Lower TOST p-value: {tost_p_values[0]}")
    print(f"Upper TOST p-value: {tost_p_values[1]}")
    print(tost_conclusion)
    print()

In [36]:
# Run the full analysis once per questionnaire item, in the original order.
for _question in ("Image Quality", "Overall Experience"):
    calculate(_question)

---------------Image Quality---------------
---------------Normality---------------
Shapiro-Wilk test results for mode A: ShapiroResult(statistic=0.968620777130127, pvalue=0.8576763868331909)
Shapiro-Wilk test results for mode B: ShapiroResult(statistic=0.9880751371383667, pvalue=0.9985337257385254)
YORO average scores: [3.55599913 3.0298935  3.490172   4.00148438 3.62571675 3.49344788
 3.81981975 3.06403562 4.19686775 3.383958   2.78035413 4.65274375
 3.02457    4.02620825], mean: 3.5818, std: 0.5218
Conventional average scores: [3.48029262 2.944308   3.388104   4.0142815  3.87525588 3.5012285
 3.80927538 3.12653562 4.21258213 3.26412787 2.41507975 4.64946775
 3.1158885  4.0342445 ], mean: 3.5593, std: 0.5823

---------------Significance test (paired t test)---------------
Paired t-test results:
T-statistic: 0.6196
P-value: 0.5462
There is no significant difference between the YORO and conventional modes.
Effect size (Cohen's d): 0.1656
Statistical power: 0.08866565882919376

--------