# 1. Install and Import Libraries

In [None]:
!pip install scikit_posthocs

In [None]:
!pip install scipy

In [None]:
import numpy as np
import pandas as pd
import scikit_posthocs as sp
from scipy import stats

# 2. Helper Functions

In [None]:
def run_model_comparison(data_array, names, *, alpha=0.05):
    """Compare models with a Friedman test and, if significant, a Nemenyi post-hoc.

    Each column of ``data_array`` is passed to ``stats.friedmanchisquare`` as
    one sample (one model); each row is one repeated measurement (block).
    When the Friedman p-value is below ``alpha``, the pairwise p-value matrix
    from ``sp.posthoc_nemenyi_friedman`` is computed and every pair whose
    p-value is below ``alpha`` is printed.

    Args:
        data_array: 2-D array-like of scores, shape (n_blocks, n_models).
            Nested lists are accepted as well as NumPy arrays.
        names: Sequence of model names, one per column of ``data_array``.
        alpha: Significance threshold used for both the Friedman test and
            the pairwise post-hoc comparison. Defaults to 0.05.

    Returns:
        The post-hoc p-value matrix (as returned by
        ``sp.posthoc_nemenyi_friedman``) when the Friedman test is
        significant, otherwise ``None``.

    Raises:
        ValueError: If ``data_array`` is not 2-D or ``names`` does not have
            exactly one entry per column. ``stats.friedmanchisquare`` may
            raise on its own for inputs it rejects.
    """
    # Coerce up front so nested lists work too; `.T` below needs an ndarray.
    scores = np.asarray(data_array)
    names = list(names)

    if scores.ndim != 2:
        raise ValueError(
            f"data_array must be 2-D (blocks x models), got {scores.ndim}-D"
        )
    # Fail early and uniformly: without this check a mismatch would surface
    # only in the significant branch, when the DataFrame is built.
    if scores.shape[1] != len(names):
        raise ValueError(
            f"expected {scores.shape[1]} model names (one per column), "
            f"got {len(names)}"
        )

    # Transpose so that each unpacked argument is one model's scores.
    stat, p_friedman = stats.friedmanchisquare(*scores.T)

    separator = "-" * 30
    print(separator)
    print("FRIEDMAN TEST RESULTS")
    print(f"Statistic: {stat:.4f}")
    print(f"P-value:   {p_friedman:.4f}")
    print(separator)

    # NOTE: a NaN p-value compares False here and is reported as
    # "no significant difference".
    if not p_friedman < alpha:
        print(f"\n[No significant difference found (p >= {alpha})]")
        return None

    print(f"\n[Significant difference found (p < {alpha})]")

    df = pd.DataFrame(scores, columns=names)
    p_matrix = sp.posthoc_nemenyi_friedman(df)

    # Walk the strict upper triangle only, so each unordered pair is
    # reported once and the diagonal is skipped.
    significant_pairs = []
    for i, first in enumerate(names):
        for j in range(i + 1, len(names)):
            p_val = p_matrix.iloc[i, j]
            if p_val < alpha:
                significant_pairs.append((first, names[j], p_val))

    if significant_pairs:
        print("Pairs with Statistically Significant Differences:")
        for m1, m2, p in significant_pairs:
            print(f" * {m1} vs {m2} (p = {p:.4f})")
    else:
        print("The Friedman test was significant, but the Nemenyi test")
        print(f"did not find specific pairs with p < {alpha}.")

    return p_matrix

In [None]:
# Model names used as column labels for every score matrix below; the order
# here must match the column order of each data array.
models = [
    "IndoBERT-base-p1",
    "IndoBERT-lite-base-p1",
    "IndoBERT-base-p2",
    "IndoBERT-lite-base-p2",
    "NusaBERT-base",
    "SVM",
]

# original

In [None]:
# Scores for the "original" configuration: one row per metric, one column per
# model (same order as `models`).
# NOTE(review): the Accuracy and Recall rows are identical, so the rows handed
# to the Friedman test are not independent measurements - confirm this is
# intended.
data_ori = np.array(
    [
        [0.9572, 0.9558, 0.9591, 0.9595, 0.9609, 0.9473],  # Accuracy
        [0.9583, 0.9573, 0.9603, 0.9608, 0.9618, 0.9491],  # Precision
        [0.9572, 0.9558, 0.9591, 0.9595, 0.9609, 0.9473],  # Recall
        [0.9574, 0.9560, 0.9593, 0.9597, 0.9611, 0.9476],  # F1
    ]
)

In [None]:
# Friedman / Nemenyi comparison on the "original" scores; the result (p-value
# matrix or None) is left as the final expression of this cell.
run_model_comparison(data_array=data_ori, names=models)

# ros

In [None]:
# Scores for the "ros" configuration: one row per metric, one column per
# model (same order as `models`).
# NOTE(review): the Accuracy and Recall rows are identical, so the rows handed
# to the Friedman test are not independent measurements - confirm this is
# intended.
data_ros = np.array(
    [
        [0.9595, 0.9600, 0.9586, 0.9576, 0.9600, 0.9464],  # Accuracy
        [0.9604, 0.9611, 0.9598, 0.9590, 0.9607, 0.9481],  # Precision
        [0.9595, 0.9600, 0.9586, 0.9576, 0.9600, 0.9464],  # Recall
        [0.9597, 0.9602, 0.9588, 0.9579, 0.9602, 0.9467],  # F1
    ]
)

In [None]:
# Friedman / Nemenyi comparison on the "ros" scores; the result (p-value
# matrix or None) is left as the final expression of this cell.
run_model_comparison(data_array=data_ros, names=models)

# ros_ncl

In [None]:
# Scores for the "ros_ncl" configuration: one row per metric, one column per
# model (same order as `models`).
# NOTE(review): the Accuracy and Recall rows are identical, so the rows handed
# to the Friedman test are not independent measurements - confirm this is
# intended.
data_rosncl = np.array(
    [
        [0.9614, 0.9562, 0.9586, 0.9562, 0.9553, 0.9478],  # Accuracy
        [0.9625, 0.9580, 0.9600, 0.9579, 0.9570, 0.9501],  # Precision
        [0.9614, 0.9562, 0.9586, 0.9562, 0.9553, 0.9478],  # Recall
        [0.9616, 0.9565, 0.9588, 0.9565, 0.9556, 0.9481],  # F1
    ]
)

In [None]:
# Friedman / Nemenyi comparison on the "ros_ncl" scores; the result (p-value
# matrix or None) is left as the final expression of this cell.
run_model_comparison(data_array=data_rosncl, names=models)