# PhyloP, AlphaMissense, and pext scores
Generate summary statistics of phyloP, AlphaMissense, and pext scores in constrained / unconstrained regions.

In [61]:
# Enable IPython's autoreload extension so that edits to imported modules
# are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [62]:
# Imports
import numpy as np
import pandas as pd
from scipy import stats

from src import constants as C
from src.statistics_for_plots import utils

In [63]:
# Load the tab-separated summary statistics and recode any "distal_nmd" value
# to "distal".
region_stats = pd.read_csv(C.PHYLOP_PEXT_MISSENSE_STATS, sep="\t").replace(
    {"distal_nmd": "distal"}
)

# Order / label the regions using the reversed project-level region lists
# (presumably for plotting order -- see utils.sort_region_column), then discard
# rows that have no constraint annotation.
df = utils.sort_region_column(
    region_stats,
    categories=C.REGIONS[::-1],
    labels=C.REGION_LABELS[::-1],
).dropna(subset=["constraint"])

df.head()

Unnamed: 0,region,constraint,metric,mean,n,sem,ci_95
0,Distal,all,phylop,2.803871,7160361,0.001329,0.002606
25,Distal,constrained,pext,0.857092,468935,0.000304,0.000596
30,Distal,unconstrained,pext,0.759885,1660241,0.000208,0.000407
45,Distal,constrained,alpha_mis,0.379032,357900,0.00057,0.001117
50,Distal,unconstrained,alpha_mis,0.279877,1752135,0.00021,0.000412


In [64]:
# Save the filtered summary table as a tab-separated file, omitting the row index.
df.to_csv(path_or_buf=C.STATS_PHYLOP_MISSENSE_PEXT, sep="\t", index=False)

## Welch's T-test
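Welch's t-tests (unequal variances, two-sided) comparing mean scores in constrained vs unconstrained regions, computed from the summary statistics (mean, standard deviation, n).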

In [65]:
# Quick reminder of the table layout before deriving the t-test inputs.
df.head(n=2)

Unnamed: 0,region,constraint,metric,mean,n,sem,ci_95
0,Distal,all,phylop,2.803871,7160361,0.001329,0.002606
25,Distal,constrained,pext,0.857092,468935,0.000304,0.000596


In [66]:
# Recover standard deviations from the standard errors
# (assuming `sem` = SD / sqrt(n), so SD = sem * sqrt(n)).
df["std"] = df["sem"].mul(np.sqrt(df["n"]))

In [67]:
# Move the grouping columns into a three-level MultiIndex, with "constraint"
# as the outermost level (level 0).
df = df.set_index(keys=["constraint", "region", "metric"])

In [70]:
# Take one cross-section per constraint class, keeping only the summary
# statistics needed for the t-test. Each result is indexed by (region, metric).
constrained, unconstrained = (
    df.xs(constraint_class, level="constraint")[["mean", "n", "std"]]
    for constraint_class in ("constrained", "unconstrained")
)

In [74]:
# Welch's t-test (unequal variances, two-sided) computed from summary
# statistics, one test per (region, metric) pair.
#
# The two groups must be paired row-by-row on (region, metric). Align them
# explicitly and hand SciPy plain NumPy arrays: when Series are passed directly
# the result comes back named after the `n` column, which suggests SciPy keeps
# pandas labels only for the sample sizes and consumes the means / standard
# deviations positionally. Correct pairing would then silently depend on both
# frames sharing exactly the same row order.
unconstrained_aligned = unconstrained.reindex(constrained.index)
assert unconstrained_aligned["n"].notna().all(), (
    "every constrained (region, metric) pair needs an unconstrained counterpart"
)

welch = stats.ttest_ind_from_stats(
    mean1=constrained["mean"].to_numpy(),
    std1=constrained["std"].to_numpy(),
    nobs1=constrained["n"].to_numpy(),
    mean2=unconstrained_aligned["mean"].to_numpy(),
    std2=unconstrained_aligned["std"].to_numpy(),
    nobs2=unconstrained_aligned["n"].to_numpy(),
    equal_var=False,  # Welch's variant: no equal-variance assumption
    alternative="two-sided",
)

# Label each p-value with its (region, metric) pair. With sample sizes in the
# hundreds of thousands to millions, p-values can underflow to exactly 0.0 in
# float64; read those as "smaller than representable", not literally zero.
pd.Series(welch.pvalue, index=constrained.index, name="p_value")

region            metric   
Distal            alpha_mis    0.000000e+00
                  pext         0.000000e+00
                  phylop       0.000000e+00
Long exon         alpha_mis    0.000000e+00
                  pext         0.000000e+00
                  phylop       0.000000e+00
NMD target        alpha_mis    0.000000e+00
                  pext         0.000000e+00
                  phylop       0.000000e+00
Start proximal    alpha_mis    0.000000e+00
                  pext         1.681103e-21
                  phylop       0.000000e+00
Whole transcript  alpha_mis    0.000000e+00
                  pext         0.000000e+00
                  phylop       0.000000e+00
Name: n, dtype: float64

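All scores differ significantly between constrained and unconstrained regions (Welch's t-test, two-sided). P-values displayed as 0 are below floating-point precision rather than exactly zero.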