In [9]:
#!/usr/bin/env python3
import sys, math, argparse
from collections import defaultdict
import random

def read_tsv(path):
    d = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            d[line.strip().split("\t")[0]] = float(line.strip().split("\t")[1])
    return d

def wilcoxon_signed_rank(diffs):
    import math  # safe to keep at the top; or rely on the module-level import
    # remove zeros
    nz = [(abs(x), x) for x in diffs if x != 0]
    if not nz:
        return math.nan, 1.0

    # rank absolute diffs (average ranks for ties)
    ranked = sorted(nz, key=lambda t: t[0])
    ranks = []
    i = 0
    while i < len(ranked):
        j = i
        while j < len(ranked) and ranked[j][0] == ranked[i][0]:
            j += 1
        avg_rank = (i + 1 + j) / 2.0
        for k in range(i, j):
            ranks.append((avg_rank, ranked[k][1]))
        i = j

    Wpos = sum(r for r, x in ranks if x > 0)
    Wneg = sum(r for r, x in ranks if x < 0)
    W = min(Wpos, Wneg)

    n = len(ranks)
    if n <= 20:
        # exact p-value by enumerating sign flips
        from itertools import product
        rvals = [r for r,_ in ranks]
        Ws = []
        for signs in product([1, -1], repeat=n):
            Wp = sum(r for r, s in zip(rvals, signs) if s > 0)
            Wn = sum(r for r, s in zip(rvals, signs) if s < 0)
            Ws.append(min(Wp, Wn))
        p = sum(1 for w in Ws if w <= W) / len(Ws)
        p = min(1.0, 2*p)  # two-sided
    else:
        # normal approximation (no full tie correction)
        mu = n*(n+1)/4
        sigma2 = n*(n+1)*(2*n+1)/24
        z = (W - mu + 0.5) / math.sqrt(sigma2)
        # two-sided p from z
        p = math.erfc(abs(z)/math.sqrt(2))
    return W, p


def paired_permutation_pvalue(diffs, n_perm=100000, seed=0):
    random.seed(seed)
    diffs = [float(x) for x in diffs]
    obs = sum(diffs)/len(diffs)
    # exact when n small (<=20): enumerate all 2^n sign flips
    n = len(diffs)
    if n <= 20:
        from itertools import product
        stats = []
        for signs in product([1,-1], repeat=n):
            s = sum(si*di for si,di in zip(signs,diffs))/n
            stats.append(s)
        p = sum(1 for s in stats if abs(s) >= abs(obs)) / len(stats)
        return obs, p, True
    else:
        # Monte Carlo permutation
        count = 0
        for _ in range(n_perm):
            s = sum((1 if random.random()<0.5 else -1)*di for di in diffs)/n
            if abs(s) >= abs(obs): 
                count += 1
        p = count / n_perm
        return obs, p, False

a = read_tsv("/mnt/project/exonhancer/ZENODO_REPO/TF_randomisation/results/hg38_pos_matchingTF_TFBS_perc_all_scoreFiltered.tsv")
b = read_tsv("/mnt/project/exonhancer/ZENODO_REPO/TF_randomisation/results/hg38_pos_matchingTF_TFBS_perc_all_scoreFiltered_shuffled.tsv")

ids = sorted(set(a).intersection(b))

x = [a[i] for i in ids]
y = [b[i] for i in ids]
diffs = [xi - yi for xi, yi in zip(x,y)]  # not_shuffled - shuffled

n = len(diffs)
mean_diff = sum(diffs)/n
med_diff = sorted(diffs)[n//2] if n%2==1 else sum(sorted(diffs)[n//2-1:n//2+1])/2

obs, p_perm, exact = paired_permutation_pvalue(diffs)
W, p_wil = wilcoxon_signed_rank(diffs)

# Paired t-test (for reference)
import math
md = mean_diff
sd = math.sqrt(sum((d-md)**2 for d in diffs)/(n-1)) if n>1 else float('nan')
se = sd/math.sqrt(n) if n>0 else float('nan')
t = md/se if se and se>0 else float('nan')
# two-sided p (Student's t)
try:
    import mpmath as mp  # if available
    df = n-1
    p_t = 2*(1-mp.gammainc((df+1)/2, 0, df/(df+t*t))/mp.gamma((df+1)/2))
    p_t = float(p_t)
except Exception:
    p_t = float('nan')

print(f"Paired sample size (after ID match): {n}")
print(f"Mean difference (not_shuffled - shuffled): {mean_diff:.6g}")
print(f"Median difference: {med_diff:.6g}")
print(f"Permutation test on mean: p = {p_perm:.6g} ({'exact' if exact else 'Monte Carlo'})")
print(f"Wilcoxon signed-rank: W = {W:.3g}, p = {p_wil:.6g}")
print(f"Paired t-test (approx): t = {t:.3g}, df = {n-1}, p ≈ {p_t if not math.isnan(p_t) else 'N/A'}")


Paired sample size (after ID match): 394190
Mean difference (not_shuffled - shuffled): 28.8478
Median difference: 25
Permutation test on mean: p = 0 (Monte Carlo)
Wilcoxon signed-rank: W = 3.12e+09, p = 0
Paired t-test (approx): t = 587, df = 394189, p ≈ N/A
