In [1]:
import pandas as pd
import numpy as np

In [5]:
# Load the scored result tables for the two systems being compared.
# Both files are read from the current working directory.
df_finetune = pd.read_csv(
    filepath_or_buffer='finetuned_scored_personalization_relevance.csv',
)
df_rag = pd.read_csv(
    filepath_or_buffer='rag_scored_personalization_relevance.csv',
)

In [7]:
# Display the column index of the fine-tuned results frame to see which
# fields are available for the analysis.
df_finetune.columns

Index(['name', 'age', 'class_level', 'syllabus', 'interests', 'personality',
       'hobbies', 'learning_style', 'pattern_of_learning',
       'study_routine_start', 'study_routine_end', 'preferred_subjects',
       'struggles_with', 'strengths', 'challenges', 'academic_progress',
       'marks_last_year_final', 'marks_last_internals', 'emotional_traits',
       'motivation_style', 'group_behavior', 'social_skills',
       'teacher_feedback', 'student_voice', 'accomplishments',
       'digital_learning', 'tech_savviness', 'home_environment',
       'interesting_stories', 'personalization_score',
       'personalization_explanation', 'relevance_score',
       'relevance_explanation'],
      dtype='object')

In [8]:
# Pull the two score columns out of each results frame as NumPy arrays.
# Suffix "_per" = personalization_score, "_rel" = relevance_score.

# Scores taken from df_finetune
finetune_scores_per = np.array(df_finetune['personalization_score'])
finetune_scores_rel = np.array(df_finetune['relevance_score'])

# Scores taken from df_rag
rag_scores_per = np.array(df_rag['personalization_score'])
rag_scores_rel = np.array(df_rag['relevance_score'])


In [10]:
from scipy.stats import shapiro

# Shapiro-Wilk normality check on the element-wise differences
# (fine-tuned minus RAG) of each score; the p-values are kept in
# p_shapiro1 / p_shapiro2 for later use.
stat1, p_shapiro1 = shapiro(np.subtract(finetune_scores_per, rag_scores_per))
print("Shapiro p-value for personalization:", p_shapiro1)

stat2, p_shapiro2 = shapiro(np.subtract(finetune_scores_rel, rag_scores_rel))
print("Shapiro p-value for relevance:", p_shapiro2)



Shapiro p-value for personalization: 3.303840517710954e-05
Shapiro p-value for relevance: 2.4024304785361146e-10


In [11]:
from scipy.stats import ttest_rel, wilcoxon

alpha = 0.05
def test_significance(p_shapiro, finetune_scores, rag_scores):
    if p_shapiro > alpha:
        stat, p_val = ttest_rel(finetune_scores, rag_scores)
        test_name = "paired t-test"
    else:
        stat, p_val = wilcoxon(finetune_scores, rag_scores)
        test_name = "Wilcoxon signed-rank"

    print(f"{test_name}: statistic={stat:.3f}, p-value={p_val:.4f}")
    if p_val < alpha:
        print("⇒ Reject H₀: the mean/median difference is significant.")
    else:
        print("⇒ Fail to reject H₀: no significant difference found.")


In [13]:
# Paired comparison of the fine-tuned and RAG scores for each metric; the
# Shapiro p-value computed for that metric selects which test is applied.
print("Personalization Score:")
test_significance(
    p_shapiro=p_shapiro1,
    finetune_scores=finetune_scores_per,
    rag_scores=rag_scores_per,
)

print("Relevance Score:")
test_significance(
    p_shapiro=p_shapiro2,
    finetune_scores=finetune_scores_rel,
    rag_scores=rag_scores_rel,
)

Personalization Score:
Wilcoxon signed-rank: statistic=71.500, p-value=0.0000
⇒ Reject H₀: the mean/median difference is significant.
Relevance Score:
Wilcoxon signed-rank: statistic=135.000, p-value=0.2608
⇒ Fail to reject H₀: no significant difference found.


In [14]:
def find_cohen(diffs):
    """Return Cohen's d for paired data: mean difference / sample SD of differences.

    Parameters
    ----------
    diffs : array-like
        Paired differences (one value per subject).

    Returns
    -------
    float
        ``mean(diffs) / std(diffs, ddof=1)``. Degenerate inputs are handled
        explicitly instead of emitting NumPy runtime warnings:

        * fewer than two observations -> ``nan`` (the sample SD is undefined);
        * zero spread and zero mean   -> ``0.0`` (no effect at all);
        * zero spread, non-zero mean  -> ``+inf`` / ``-inf`` by sign of the mean.
    """
    diffs = np.asarray(diffs)
    if diffs.size < 2:
        # ddof=1 needs at least two observations.
        return float("nan")

    mean_diff = np.mean(diffs)
    sd_diff = np.std(diffs, ddof=1)
    if sd_diff == 0:
        # Every difference is the same value, so the ratio is not finite.
        if mean_diff == 0:
            return 0.0
        return float(np.copysign(np.inf, mean_diff))

    return mean_diff / sd_diff

# Report the paired effect size (fine-tuned minus RAG) for each metric.
print(
    "Cohen's d for Personalization:",
    find_cohen(np.subtract(finetune_scores_per, rag_scores_per)),
)
print(
    "Cohen's d for Relevance:",
    find_cohen(np.subtract(finetune_scores_rel, rag_scores_rel)),
)


Cohen's d for Personalization: -0.575678471763037
Cohen's d for Relevance: 0.0


In [17]:
def find_CI(diffs, n_boot=5000, ci=95.0, seed=None):
    """Percentile-bootstrap confidence interval for Cohen's d of paired differences.

    Each bootstrap replicate resamples ``diffs`` with replacement (same size
    as the input) and computes ``mean / std(ddof=1)`` on the resample.

    Parameters
    ----------
    diffs : array-like
        One-dimensional paired differences.
    n_boot : int, default 5000
        Number of bootstrap resamples.
    ci : float, default 95.0
        Confidence level in percent, strictly between 0 and 100.
    seed : int or None, default None
        ``None`` draws from NumPy's global random state, so a prior
        ``np.random.seed`` call still governs the result. Any other value
        seeds a private generator, making the call reproducible on its own.

    Returns
    -------
    (ci_low, ci_high)
        Lower and upper percentile bounds of the bootstrap distribution.

    Raises
    ------
    ValueError
        If ``diffs`` is empty, ``n_boot`` is less than 1, or ``ci`` is
        outside the open interval (0, 100).
    """
    diffs = np.asarray(diffs)
    if diffs.size == 0:
        raise ValueError("diffs must contain at least one observation")
    if n_boot < 1:
        raise ValueError("n_boot must be a positive integer")
    if not 0 < ci < 100:
        raise ValueError("ci must lie strictly between 0 and 100")

    # Both the numpy.random module and a Generator expose a compatible choice().
    sampler = np.random if seed is None else np.random.default_rng(seed)

    boot_ds = []
    for _ in range(n_boot):
        sample = sampler.choice(diffs, size=len(diffs), replace=True)
        mean_diff = sample.mean()
        sd_diff = sample.std(ddof=1)
        # A resample without spread (or with undefined SD) has no finite d;
        # it is recorded as 0 so the percentiles stay finite.
        boot_ds.append(mean_diff / sd_diff if sd_diff > 0 else 0)

    # Split the excluded mass evenly between the tails (95 -> 2.5 and 97.5).
    tail = (100.0 - ci) / 2.0
    ci_low, ci_high = np.percentile(boot_ds, [tail, 100.0 - tail])
    return ci_low, ci_high

# The bootstrap resamples at random; seed NumPy's global generator first so
# the intervals printed below are the same on every re-run of this cell.
np.random.seed(42)

# Paired differences (fine-tuned minus RAG) for each metric.
diiffs_per = finetune_scores_per - rag_scores_per
diiffs_rel = finetune_scores_rel - rag_scores_rel

ci_low, ci_high = find_CI(diiffs_per)
print(f"95% CI for Cohen's d for personalization: [{ci_low:.2f}, {ci_high:.2f}]")
ci_low, ci_high = find_CI(diiffs_rel)
print(f"95% CI for Cohen's d for relevance: [{ci_low:.2f}, {ci_high:.2f}]")


95% CI for Cohen's d for personalization: [-0.94, -0.28]
95% CI for Cohen's d for relevance: [-0.45, 0.21]
