In [8]:
### Load Data

#https://doi.org/10.1093/molbev/msz063

def _load_count_columns(path, first_col, second_col):
    """Read two integer columns from a whitespace-delimited table.

    Every line is split on whitespace. Field 0 is the dictionary key; the
    fields at ``first_col`` and ``second_col`` are parsed with ``int``.

    Args:
        path: File to read.
        first_col: Zero-based field index stored in the first returned dict.
        second_col: Zero-based field index stored in the second returned dict.

    Returns:
        ``(first, second)``: two dicts mapping field 0 to the parsed integers.

    Lines that are blank or have too few fields are skipped; without this
    guard they raise an IndexError that the ValueError handler does not catch.
    Lines whose values are not integers (e.g. a header row) are skipped too.
    """
    first, second = {}, {}
    min_fields = max(first_col, second_col) + 1
    with open(path) as handle:
        for line in handle:
            fields = line.split()
            if len(fields) < min_fields:
                continue
            key = fields[0]
            try:
                # Assignment order matters: if only the second value fails to
                # parse, the first dict still keeps its entry for this key.
                first[key] = int(fields[first_col])
                second[key] = int(fields[second_col])
            except ValueError:
                continue
    return first, second


## EE
# pN is read from field 2 and pS from field 3.
ee_pn, ee_ps = _load_count_columns(
    "/mnt/project/exonhancer/ZENODO_REPO_DISCARDED_JC_DONT_TOUCH/gnomADv3_analysis/0_get_snp_gnomad/exons/vcf_by_chr/hg38_ee_gnomAD_pNpS.tsv",
    2,
    3,
)

# Note the swapped order: dN is read from field 2 and dS from field 1.
# NOTE(review): confirm this matches the column layout of the substitution file.
ee_dn, ee_ds = _load_count_columns(
    "/home/mouren/Data/revisions/dn_ds/ee_substitution_results_count_hg_mm.txt",
    2,
    1,
)

## NEG
neg_pn, neg_ps = _load_count_columns(
    "/mnt/project/exonhancer/ZENODO_REPO_DISCARDED_JC_DONT_TOUCH/gnomADv3_analysis/0_get_snp_gnomad/exons/vcf_by_chr/hg38_neg_gnomAD_pNpS.tsv",
    2,
    3,
)

# Same field layout as the EE substitution file: dN from field 2, dS from field 1.
neg_dn, neg_ds = _load_count_columns(
    "/home/mouren/Data/revisions/dn_ds/neg_substitution_results_count_hg_mm.txt",
    2,
    1,
)


In [9]:
import pandas as pd
import numpy as np
from scipy.stats import mannwhitneyu

# ========== Compute DoS and FI ==========
def compute_dos_fi(pn, ps, dn, ds):
    """Compute the DoS and FI statistics from four counts.

    The formulas are:

        DoS = dn / (dn + ds) - pn / (pn + ps)
        FI  = (dn / ds) / (pn / ps)

    Args:
        pn, ps: The two counts used in the ``pn / (pn + ps)`` and ``pn / ps`` terms.
        dn, ds: The two counts used in the ``dn / (dn + ds)`` and ``dn / ds`` terms.

    Returns:
        ``(dos, fi)`` as floats, or ``(None, None)`` when any denominator
        (``pn + ps``, ``dn + ds``, ``ds``, ``ps``) is not strictly positive
        or when ``pn / ps`` is zero. Both values are dropped together: DoS is
        not returned on its own when only FI is undefined.
    """
    # Every plain denominator must be strictly positive.
    if any(x <= 0 for x in (pn + ps, dn + ds, ds, ps)):
        return None, None

    # FI divides by pn / ps. Test the ratio explicitly instead of relying on
    # ZeroDivisionError: NumPy scalars return inf with a warning rather than
    # raising, which would let an infinite FI through.
    poly_ratio = pn / ps
    if poly_ratio == 0:
        return None, None

    dos = (dn / (dn + ds)) - (pn / (pn + ps))
    fi = (dn / ds) / poly_ratio
    return dos, fi

# Column layout shared by the EE and NEG result tables.
_RESULT_COLUMNS = ["Exon", "pN", "pS", "dN", "dS", "DoS", "FixationIndex"]


def _collect_results(pn_by_exon, ps_by_exon, dn_by_exon, ds_by_exon):
    """Build one result row per exon that has all four counts and a defined DoS.

    Exons are visited in the iteration order of ``pn_by_exon``. An exon is
    skipped when it is missing from any of the other three dicts, or when
    ``compute_dos_fi`` returns ``None`` for DoS.

    Returns:
        A list of ``[exon, pn, ps, dn, ds, dos, fi]`` rows.
    """
    rows = []
    for exon, pn in pn_by_exon.items():
        if exon not in ps_by_exon or exon not in dn_by_exon or exon not in ds_by_exon:
            continue
        ps = ps_by_exon[exon]
        dn = dn_by_exon[exon]
        ds = ds_by_exon[exon]
        dos, fi = compute_dos_fi(pn, ps, dn, ds)
        if dos is not None:
            rows.append([exon, pn, ps, dn, ds, dos, fi])
    return rows


# Store results
# === EE Dataset ===
ee_results = _collect_results(ee_pn, ee_ps, ee_dn, ee_ds)

# === NEG Dataset ===
neg_results = _collect_results(neg_pn, neg_ps, neg_dn, neg_ds)

# Convert to DataFrames
ee_df = pd.DataFrame(ee_results, columns=_RESULT_COLUMNS)
neg_df = pd.DataFrame(neg_results, columns=_RESULT_COLUMNS)

# Save as TSVs
ee_df.to_csv("/home/mouren/Data/revisions/dn_ds/EE_DoS_FI.tsv", sep="\t", index=False)
neg_df.to_csv("/home/mouren/Data/revisions/dn_ds/NEG_DoS_FI.tsv", sep="\t", index=False)

# ========== Statistical Tests ==========
def _mannwhitney_report(header, column):
    """Compare one column of ``ee_df`` against ``neg_df`` and print a summary.

    Runs a two-sided Mann-Whitney U test on the column, then prints the
    header, the median of each group, and the U statistic with its p-value.

    Returns:
        ``(statistic, p_value)`` as returned by ``mannwhitneyu``.
    """
    ee_values = ee_df[column]
    neg_values = neg_df[column]
    # Run the test before printing so a failure leaves no partial output.
    u_stat, p_value = mannwhitneyu(ee_values, neg_values, alternative="two-sided")
    print(header)
    print(f"  EE median: {np.median(ee_values)}")
    print(f"  NEG median: {np.median(neg_values)}")
    print(f"  Mann-Whitney U: {u_stat:.3f}, p-value: {p_value:.4g}")
    return u_stat, p_value


# Mann-Whitney U test for DoS
dos_stat, dos_p = _mannwhitney_report("\nDoS Test:", "DoS")

# Mann-Whitney U test for Fixation Index
fi_stat, fi_p = _mannwhitney_report("\nFixation Index Test:", "FixationIndex")



DoS Test:
  EE median: 0.042763157894736836
  NEG median: -0.08333333333333337
  Mann-Whitney U: 24696462.500, p-value: 4.206e-39

Fixation Index Test:
  EE median: 1.2352941176470589
  NEG median: 0.6666666666666666
  Mann-Whitney U: 24842135.000, p-value: 4.688e-43
