# Conservation statistics
This script calculates phyloP scores across NMD regions. 
Later, it also stratifies by constrained and unconstrained regions.
Conservation is summarised as "fracCdsCons", the fraction of sites which are conserved. Conserved sites are defined as those with phyloP ≥ 2.27, as per the Zoonomia papers.

## Preliminaries

Download the NMD annotations, phyloP scores, pext scores, HMC annotations and gene IDs from the UKB RAP.

In [2]:
%%bash
# Download each required input file from outputs/ into the local ../outputs/ directory
for remote_path in \
    outputs/nmd_annotations.tsv \
    outputs/phylop_all_sites.tsv \
    outputs/pext_38.bed \
    outputs/hmc_38.tsv \
    outputs/gene_ids.tsv
do
    dx download -f -o ../outputs/ "${remote_path}"
done

In [3]:
# Import relevant modules
import numpy as np
import pandas as pd

In [5]:
# Load the per-site NMD annotations, keeping only the columns used below
nmd = pd.read_csv(
    "../outputs/nmd_annotations.tsv",
    sep="\t",
    usecols=["chr", "pos", "transcript_id", "nmd_definitive"],
)
# Use shorter column names: the transcript id becomes "enst" and the
# definitive NMD annotation becomes "region"
nmd = nmd.rename(columns={"transcript_id": "enst", "nmd_definitive": "region"})

# Load the per-site phyloP scores (all columns are kept)
phylop = pd.read_csv("../outputs/phylop_all_sites.tsv", sep="\t")

# Report the number of rows in each table
print(f"Sites with an NMD annotation: {len(nmd)}")
print(f"Sites with a phyloP annotation: {len(phylop)}")

### pext annotations

In [None]:
# Load the pext BED file (no header row). The interval end is used as the site position.
# NOTE(review): this assumes single-base intervals, so that the BED end equals
# the 1-based position used by the other tables — confirm.
pext = pd.read_csv(
    "../outputs/pext_38.bed",
    sep="\t",
    header=None,
    names=["chr", "start", "end", "ensg", "pext"],
    usecols=["chr", "end", "ensg", "pext"],
)
pext = pext.rename(columns={"end": "pos"})

# First collapse rows which are exact repeats of each other ...
pext = pext.drop_duplicates()
# ... then discard every site/gene pair which still occurs more than once,
# i.e. pairs carrying conflicting pext values (keep=False removes all copies).
pext = pext.drop_duplicates(subset=["chr", "pos", "ensg"], keep=False)

print(f"Valid pext annotations: {len(pext)}")

In [None]:
# Load the gene / transcript id table. The file's own header row is replaced
# by the names given here, and only the two Ensembl id columns are kept.
ids = (
    pd.read_csv(
        "../outputs/gene_ids.tsv",
        sep="\t",
        header=0,
        names=["ensg", "enst", "hgnc"],
        usecols=["ensg", "enst"],
    )
    # Keep only the part of each id before the first "." (drops any version suffix)
    .assign(
        ensg=lambda d: d["ensg"].str.split(".").str[0],
        enst=lambda d: d["enst"].str.split(".").str[0],
    )
    .drop_duplicates()
)

# Attach transcript ids to the pext annotations. The merge uses the column(s)
# shared by both frames; rows with any missing value afterwards (including
# genes without a matching transcript id) are dropped, as is the gene id.
pext = pext.merge(ids, how="left")
pext = pext.dropna()
pext = pext.drop(columns="ensg")
print(f"Valid pext annotations in genes with a MANE transcript: {len(pext)}")

### HMC annotations

## Regional annotations
Calculate the proportion of highly conserved sites across all regions, irrespective of constraint.

In [22]:
# Inner-join the NMD annotations with the phyloP scores on the columns they share,
# keeping only sites present in both tables
df = pd.merge(nmd, phylop, how="inner")
print(f"Sites after merging NMD and phyloP annotations: {len(df)}")

Sites after merging NMD and phyloP annotations: 34055400


In [23]:
# In order to get transcript-level statistics, every site is appended a second
# time with its "region" annotation overwritten as "transcript".
#
# Any "transcript" rows already present are excluded first, so re-running this
# cell does not append them again (which would inflate the site counts).
# A named intermediate is used rather than `_`, which IPython reserves for the
# most recent cell output.
per_region_sites = df[df["region"] != "transcript"]
df = pd.concat([per_region_sites, per_region_sites.assign(region="transcript")])

# Release the intermediate; only the concatenated frame is needed from here on
del per_region_sites

In [24]:
# Flag conserved sites: 1 where phyloP is at least 2.27, otherwise 0
df["fracCdsCons"] = (df["phylop"] >= 2.27).astype(int)

# Get summary statistics per region
def se(p, n):
    """Return the standard error of a proportion.

    Computes sqrt(p * (1 - p) / n), where ``p`` is the proportion and ``n`` is
    the number of observations it was estimated from. Works element-wise on
    scalars, numpy arrays or pandas Series.
    """
    variance = (p * (1 - p)) / n
    return np.sqrt(variance)

# Per region: the fraction of conserved sites and the number of sites counted
stats = df.groupby("region").agg(
    fracCdsCons=("fracCdsCons", "mean"),
    n=("pos", "count"),
)

# Standard error of each fraction, and 1.96 standard errors as the 95% interval half-width
stats["se"] = se(stats["fracCdsCons"], stats["n"])
stats["ci95"] = stats["se"] * 1.96

stats

Unnamed: 0_level_0,fracCdsCons,n,se,ci95
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
distal_nmd,0.494952,7474963,0.000183,0.000358
long_exon,0.460485,2703867,0.000303,0.000594
nmd_target,0.625169,20893422,0.000106,0.000208
start_proximal,0.547126,2983148,0.000288,0.000565
transcript,0.576675,34055400,8.5e-05,0.000166


In [12]:
# Write regional statistics to output.
# `stats` is indexed by "region" at this point, so the index is first moved
# back into an ordinary column. Writing with index=False alone would drop the
# region labels and leave the rows of the output file unidentifiable.
stats.reset_index().to_csv("../outputs/phylop_stats_region.tsv", sep="\t", index=False)

## Stratify by constraint

In [31]:
# Load the constraint statistics
constraint = pd.read_csv(
    "../outputs/expected_variants_all_regions_no_cpg_stats.tsv",
    sep="\t",
    usecols=["region", "enst", "csq", "n_obs", "oe", "z", "p", "fdr_p"],
)

# Reshape to one row per (region, enst), with every statistic spread across the
# consequences in "csq". This is needed because, for example, synonymous
# Z-scores are used for later filtering.
constraint = constraint.pivot(
    index=["region", "enst"],
    columns="csq",
    values=["n_obs", "oe", "z", "p", "fdr_p"],
)

# Swap the two column levels so the consequence comes before the statistic,
# then turn the (region, enst) index back into ordinary columns
constraint = constraint.swaplevel(axis=1)
constraint = constraint.reset_index()

In [31]:
# Label regions / transcripts as constrained or unconstrained

## Flatten the two-level column labels into single underscore-joined names
## (e.g. "nonsense_oe"); the former index columns keep their plain names
constraint.columns = [
    "_".join(levels).strip("_") for levels in constraint.columns.values
]

## Restrict to the columns needed for the filters below
constraint = constraint[
    [
        "region",
        "enst",
        "nonsense_n_obs",
        "nonsense_oe",
        "synonymous_z",
        "nonsense_p",
        "nonsense_fdr_p",
    ]
]

## Constrained: nonsense O/E below 0.35, synonymous Z above -1,
## and FDR-adjusted nonsense P below 0.05
is_constrained = (
    (constraint["nonsense_oe"] < 0.35)
    & (constraint["synonymous_z"] > -1)
    & (constraint["nonsense_fdr_p"] < 0.05)
)

## Unconstrained: nominal nonsense P of at least 0.05, with at least one
## observed nonsense variant
is_unconstrained = (constraint["nonsense_p"] >= 0.05) & (
    constraint["nonsense_n_obs"] >= 1
)

constraint.loc[is_constrained, "constraint"] = "constrained"
constraint.loc[is_unconstrained, "constraint"] = "unconstrained"

## Keep only the identifiers and the label, discarding rows which received
## neither label
constraint = constraint[["region", "enst", "constraint"]].dropna()

## Show how many constrained and unconstrained entries each region has
print(constraint.groupby(["region"])["constraint"].value_counts())
constraint.head(3)

region          constraint   
distal_nmd      unconstrained    11233
                constrained        548
long_exon       unconstrained     1519
                constrained        388
nmd_target      unconstrained     7300
                constrained       2380
start_proximal  unconstrained    10933
transcript      unconstrained     8880
                constrained       3538
Name: constraint, dtype: int64


Unnamed: 0,region,enst,constraint
3,distal_nmd,ENST00000001008,unconstrained
5,distal_nmd,ENST00000002125,unconstrained
6,distal_nmd,ENST00000002165,unconstrained


In [25]:
# Attach the constraint label to every site via an inner join on the shared
# columns; sites without a constraint label are dropped
df = pd.merge(df, constraint, how="inner")

In [28]:
# Summarise conservation for every combination of constraint label and region:
# fraction of conserved sites, site count, standard error and 1.96 * SE
stats = (
    df.groupby(["constraint", "region"])
    .agg(
        fracCdsCons=("fracCdsCons", "mean"),
        n=("pos", "count"),
    )
    .reset_index()
    .assign(
        se=lambda t: se(t["fracCdsCons"], t["n"]),
        ci95=lambda t: t["se"] * 1.96,
    )
)

stats

Unnamed: 0,constraint,region,fracCdsCons,n,se,ci95
0,constrained,distal_nmd,0.617795,745153,0.000563,0.001103
1,constrained,long_exon,0.560887,725185,0.000583,0.001142
2,constrained,nmd_target,0.712081,4583708,0.000211,0.000415
3,constrained,transcript,0.69206,7666226,0.000167,0.000327
4,unconstrained,distal_nmd,0.467971,4112955,0.000246,0.000482
5,unconstrained,long_exon,0.415423,796455,0.000552,0.001082
6,unconstrained,nmd_target,0.553133,5933980,0.000204,0.0004
7,unconstrained,start_proximal,0.514563,1638562,0.00039,0.000765
8,unconstrained,transcript,0.492196,10329521,0.000156,0.000305


In [29]:
# Save the constraint-stratified statistics as a tab-separated file.
# The grouping keys are ordinary columns here, so the row index is not written.
stats.to_csv(
    "../outputs/phylop_stats_region_constraint.tsv",
    sep="\t",
    index=False,
)